In [1]:
import os
import numpy as np
import json
import pandas as pd
import itertools
from tqdm import tqdm
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import matplotlib.pyplot as plt

# Utilities

In [2]:
def create_dataframe(dataset_path, source):
    """Function used to create a Pandas DataFrame containing specifications page titles

    Reads products specifications from the file system ("dataset_path" variable in the main function) and creates a Pandas DataFrame where each row is a
    specification. The columns are 'source' (e.g. www.sourceA.com), 'spec_number' (e.g. 1) and the 'page title'. Note that this script will consider only
    the page title attribute for simplicity.

    Args:
        dataset_path (str): The path to the dataset

    Returns:
        df (pd.DataFrame): The Pandas DataFrame containing specifications and page titles
    """

    print('>>> Creating dataframe...\n')
    columns_df = ['source', 'spec_number', 'spec_id', 'page_title']
    
    df = pd.DataFrame()
    progressive_id = 0
    progressive_id2row_df = {}
    
    for specification in tqdm(os.listdir(os.path.join(dataset_path, source))):
        specification_number = specification.replace('.json', '')
        specification_id = '{}//{}'.format(source, specification_number)
        with open(os.path.join(dataset_path, source, specification)) as specification_file:
            columns_df = ['source', 'spec_number', 'spec_id']
            specification_data = json.load(specification_file)
            attrs = []
            for k, v in specification_data.items():
                columns_df.append(k)
                attrs.append(v)
            row = [source, specification_number, specification_id]
            row.extend(attrs)
            progressive_id2row_df.update({progressive_id: row})
            progressive_id += 1
            df = df.append(pd.DataFrame.from_dict(progressive_id2row_df, orient='index', columns=columns_df))
            progressive_id2row_df = {}
            
    print('>>> Dataframe created successfully!\n')
    return df

In [3]:
stop_words = set(stopwords.words('english'))
stop_words = set(['itself', 'down', 'by', 'with', 'doesn', 'wouldn', 'other', 'ours', 'of', 'then', 'where', 'don', 'these', 'nor', 'she', "should've", 'won', 'ma', 'from', 'had', "you're", 'our', 'did', 'them', 'too', 'her', 'that', 'haven', 'after', "you'll", 'hers', 'because', 'yourself', 'against', 'mightn', 'as', 'll', 'whom', 'how', 'couldn', 'further', 'aren', "you'd", 'and', 'needn', "couldn't", 'those', 'to', "doesn't", "weren't", 'both', 'ourselves', 'in', 'which', 'yours', 'under', 'some', 'what', 'during', 'before', "needn't", "shan't", 'here', 'having', 'hasn', 'your', "hasn't", 'between', 'me', "she's", 'into', 'all', 'at', 'shan', 'who', 'o', 'an', 'very', 'can', 'you', 'shouldn', 'such', 'but', 'do', 'out', 'am', "shouldn't", 'above', 'wasn', 'or', 'were', 'own', 'didn', "you've", 'on', 'will', 'my', 'it', 'have', 'once', 'only', 'been', 'themselves', 'his', 'be', "mightn't", 'they', 'not', 'so', 'up', 'any', 'most', 'has', 'myself', 't', 'yourselves', 'isn', "it's", 'y', 'm', 'now', 'until', 're', 'there', 'their', 'mustn', "mustn't", 'again', 'being', 'hadn', 'doing', 'just', 'no', 'if', 've', "wasn't", "won't", 'we', 'below', 'does', 'more', 'this', 'should', "isn't", 'ain', "don't", 'i', "haven't", 'than', "didn't", 'are', 'about', 'off', 'him', 'for', 'few', "wouldn't", 'was', 'weren', 'why', 'he', "that'll", 'd', 'the', 'its', 'a', 'each', 'is', 'while', "aren't", 'when', 'theirs', 'same', 's', 'himself', 'herself', "hadn't", 'through', 'over'])
punctuation = "!#$%&'()*+,-./:;<=>?@[\]^_`{|}~€£¥₹₽"
def replace_punctuation(word):
    return ''.join(c for c in word if c not in punctuation)

def tokenize_stop_words_punctuation(x):
    return [i.lower() for i in list(map(lambda y: replace_punctuation(y), word_tokenize(x))) if i and i.lower() not in stop_words]

In [14]:
def count_nan(df):
    return len(df) - df.count()

In [21]:
def remove_punctuation_and_lower(camera):
    if pd.isna(camera):
        return camera
    
    for p in punctuation:
        camera = camera.replace(p, "")
    return camera.lower()

In [32]:
def weight_to_grams_conversion(value):
    if pd.isna(value):
        return value
   
    tokens = word_tokenize(value)
    to_convert = tokens[0]
    metric = tokens[1]
    if metric == "oz":
        return int(round(float(to_convert) * 28.35))
    elif metric == "lbs":
        return int(round(float(to_convert) * 454))
    else:
        return int(round(float(to_convert)))

In [39]:
def pixels_to_megapixels(value):
    if pd.isna(value):
        return value
    
    tokens = word_tokenize(value)
    metric = tokens[1]
    if metric == "pixels":
        to_convert = tokens[0].replace(",", "")
        return float((to_convert)) / (10 ** 6)
    else:
        return float(tokens[0])

In [46]:
def keep_inches(value):
    if pd.isna(value):
        return value
    
    return word_tokenize(value)[0]

# priceme.co.nz

In [36]:
df_priceme = create_dataframe('../datasets/unlabeled/2013_camera_specs', "www.priceme.co.nz")

  1%|▏         | 10/740 [00:00<00:07, 92.06it/s]

>>> Creating dataframe...



100%|██████████| 740/740 [00:04<00:00, 179.00it/s]

>>> Dataframe created successfully!






In [37]:
df_priceme.head()

Unnamed: 0,source,spec_number,spec_id,<page title>,light sensitivity,max image resolution,optical sensor,resolution,shutter speed,still image format,...,3d support,image stabilizer,lens aperture,max focal length,min focal length,optical zoom,storage media,focus adjustment,colour,internal memory
0,www.priceme.co.nz,2407,www.priceme.co.nz//2407,Canon EOS 70D + 18-55/3.5-5.6 New Zealand Pri...,"100 - 12,800 ISO",5472x3648,CMOS,The amount of detail that the camera can captu...,30-1/8000s,"JPEG, RAW",...,,,,,,,,,,
1,www.priceme.co.nz,2050,www.priceme.co.nz//2050,Canon EOS 700D + 18-55/3.5-5.6 IS STM New Zea...,"100 - 12,800 ISO",5184x3456,CMOS,The amount of detail that the camera can captu...,30-1/4000s,"JPEG, RAW",...,,,,,,,,,,
2,www.priceme.co.nz,1984,www.priceme.co.nz//1984,Nikon Coolpix P310 New Zealand Prices - PriceMe,"100 - 12,800 ISO",4608x3456,,The amount of detail that the camera can captu...,,JPEG,...,Yes,Optical,F/1.8-4.9,100mm,24mm,The number of times the image can be enlarged ...,,,,
3,www.priceme.co.nz,2262,www.priceme.co.nz//2262,Nikon D5200 + 18-105/3.5-5.6 VR New Zealand P...,"100 - 6,400 ISO",6000x4000,CMOS,The amount of detail that the camera can captu...,30-1/4000s,"JPEG, RAW",...,,,,,,,,,,
4,www.priceme.co.nz,1937,www.priceme.co.nz//1937,Olympus Stylus TG-3 New Zealand Prices - PriceMe,"100 - 6,400 ISO",4608x3456,,The amount of detail that the camera can captu...,,JPEG,...,No,Optical,F/2-4.9,100mm,25mm,The number of times the image can be enlarged ...,,,,


In [38]:
cols = ["spec_id", "max image resolution", "<page title>"]

In [39]:
df_priceme = df_priceme[cols]
df_priceme.head()

Unnamed: 0,spec_id,max image resolution,<page title>
0,www.priceme.co.nz//2407,5472x3648,Canon EOS 70D + 18-55/3.5-5.6 New Zealand Pri...
1,www.priceme.co.nz//2050,5184x3456,Canon EOS 700D + 18-55/3.5-5.6 IS STM New Zea...
2,www.priceme.co.nz//1984,4608x3456,Nikon Coolpix P310 New Zealand Prices - PriceMe
3,www.priceme.co.nz//2262,6000x4000,Nikon D5200 + 18-105/3.5-5.6 VR New Zealand P...
4,www.priceme.co.nz//1937,4608x3456,Olympus Stylus TG-3 New Zealand Prices - PriceMe


### Page title

In [44]:
df_priceme['page_title'] = df_priceme['<page title>'].apply(tokenize_stop_words_punctuation)
df_priceme.drop(columns=['<page title>'], inplace=True)

In [45]:
df_priceme.head()

Unnamed: 0,spec_id,max image resolution,page_title
0,www.priceme.co.nz//2407,5472x3648,"[canon, eos, 70d, 18553556, new, zealand, pric..."
1,www.priceme.co.nz//2050,5184x3456,"[canon, eos, 700d, 18553556, stm, new, zealand..."
2,www.priceme.co.nz//1984,4608x3456,"[nikon, coolpix, p310, new, zealand, prices, p..."
3,www.priceme.co.nz//2262,6000x4000,"[nikon, d5200, 181053556, vr, new, zealand, pr..."
4,www.priceme.co.nz//1937,4608x3456,"[olympus, stylus, tg3, new, zealand, prices, p..."


### Max image resolution

In [46]:
df_priceme['max image resolution'].value_counts()

4608x3456      132
4000x3000       69
5184x3456       61
6000x4000       53
4928x3264       38
3648x2736       32
4320x3240       30
5472x3648       29
4912x3264       26
4896x3264       23
4592x3448       17
4608x3072       14
3872x2592       14
4896x3672       13
4288x3216       12
4592x3056       11
4032x3024       10
6016x4016        9
4288x2848        9
7360x4912        7
6016x4000        7
5456x3632        7
4272x2848        6
3968x2976        6
3264x2448        3
7360x4144        3
5760x3840        3
2640 × 1760      2
4256x2832        2
4352x3264        2
5212x3472        2
1920x1080        2
4240x2384        2
5184x3888        2
3888x2592        2
2272x1704        1
4672x3120        1
6048x4032        1
3296x2472        1
4288 x3216       1
4928x3280        1
3664x2748        1
5152x3864        1
3456x3456        1
3072x3204        1
3616x2712        1
4160x3120        1
4672x3104        1
4224x3168        1
4752x3168        1
4344x3258        1
4256x2848        1
Name: max im

In [51]:
# Number of nan
count_nan(df_priceme)

spec_id                  0
max image resolution    63
page_title               0
dtype: int64

In [56]:
def clean(val):
    if pd.isna(val):
        return val
    
    if ' × ' in val:
        l, r = tuple(val.split(' × '))
        return 'x'.join([l.strip(), r.strip()])
    
    l, r = tuple(val.split('x'))
    return 'x'.join([l.strip(), r.strip()])

In [57]:
df_priceme['max image resolution'] = df_priceme['max image resolution'].apply(clean)
df_priceme.head()

Unnamed: 0,spec_id,max image resolution,page_title
0,www.priceme.co.nz//2407,5472x3648,"[canon, eos, 70d, 18553556, new, zealand, pric..."
1,www.priceme.co.nz//2050,5184x3456,"[canon, eos, 700d, 18553556, stm, new, zealand..."
2,www.priceme.co.nz//1984,4608x3456,"[nikon, coolpix, p310, new, zealand, prices, p..."
3,www.priceme.co.nz//2262,6000x4000,"[nikon, d5200, 181053556, vr, new, zealand, pr..."
4,www.priceme.co.nz//1937,4608x3456,"[olympus, stylus, tg3, new, zealand, prices, p..."


In [58]:
df_priceme['max image resolution'].value_counts()

4608x3456    132
4000x3000     69
5184x3456     61
6000x4000     53
4928x3264     38
3648x2736     32
4320x3240     30
5472x3648     29
4912x3264     26
4896x3264     23
4592x3448     17
4608x3072     14
3872x2592     14
4896x3672     13
4288x3216     13
4592x3056     11
4032x3024     10
6016x4016      9
4288x2848      9
7360x4912      7
6016x4000      7
5456x3632      7
3968x2976      6
4272x2848      6
7360x4144      3
3264x2448      3
5760x3840      3
4352x3264      2
4256x2832      2
2640x1760      2
5212x3472      2
1920x1080      2
4240x2384      2
5184x3888      2
3888x2592      2
6048x4032      1
4672x3120      1
2272x1704      1
3296x2472      1
4928x3280      1
3072x3204      1
3456x3456      1
5152x3864      1
3664x2748      1
4256x2848      1
4160x3120      1
4672x3104      1
4752x3168      1
4224x3168      1
4344x3258      1
3616x2712      1
Name: max image resolution, dtype: int64

### Save to CSV

In [59]:
df_priceme.to_csv("../datasets/unlabeled/cleaned/priceme.co.nz.csv", index=False)

# gosale.com

In [5]:
df_gosale = create_dataframe('../datasets/unlabeled/2013_camera_specs', "www.gosale.com")

  1%|▏         | 13/1002 [00:00<00:07, 124.46it/s]

>>> Creating dataframe...



100%|██████████| 1002/1002 [00:06<00:00, 163.15it/s]

>>> Dataframe created successfully!






In [6]:
df_gosale.head()

Unnamed: 0,source,spec_number,spec_id,<page title>,camera type,deal first added on,ean13,last updated,manufactured in,manufacturer,...,upc,dimensions,ean14,feature,megapixel range,weight,megapixels,optical zoom,lcd screen size,redeye reduction
0,www.gosale.com,840,www.gosale.com//840,Canon PowerShot A2300 IS 16MP Digital on sale ...,Point-and-Shoot,2-December-2012,13803146677,09-October-2014,Made in USA,Canon,...,13803146677,,,,,,,,,
1,www.gosale.com,1476,www.gosale.com//1476,Canon PowerShot ELPH 320 HS 16.1MP Wi-Fi on sa...,Point-and-Shoot,30-May-2012,13803145588,09-October-2014,,Canon,...,13803145588,,,,,,,,,
2,www.gosale.com,1146,www.gosale.com//1146,Panasonic Lumix DMC-FZ1000 4K QFHD/HD 16X on s...,Point and Shoot,18-September-2014,840102106244,09-October-2014,,Panasonic,...,840102106244,,,,,,,,,
3,www.gosale.com,1317,www.gosale.com//1317,Panasonic Lumix DMC-FZ8K 7.2MP Digital on sale...,,10-April-2007,37988986163,09-October-2014,,Panasonic,...,37988986163,4.43 x 3.11 x 2.84 in.,37988986163.0,Image Stabilization,7.0 - 7.9 Megapixels,2 lbs.,,,,
4,www.gosale.com,633,www.gosale.com//633,Nikon Coolpix S1100pj 14MP Digital Camera on s...,Point-and-Shoot,19-October-2010,18208262359,09-October-2014,,Nikon,...,18208262359,,18208262359.0,,,,14.1 MP,5 x,,


In [11]:
cols = ['spec_id', '<page title>', 'product name', 'manufacturer', 'camera type', 'weight', 'megapixels', 'lcd screen size', 'dimensions']

In [12]:
df_gosale = df_gosale[cols]
df_gosale.head()

Unnamed: 0,spec_id,<page title>,product name,manufacturer,camera type,weight,megapixels,lcd screen size,dimensions
0,www.gosale.com//840,Canon PowerShot A2300 IS 16MP Digital on sale ...,Canon PowerShot A2300 IS 16MP Digital Camera w...,Canon,Point-and-Shoot,,,,
1,www.gosale.com//1476,Canon PowerShot ELPH 320 HS 16.1MP Wi-Fi on sa...,Canon PowerShot ELPH 320 HS 16.1MP Wi-Fi Digit...,Canon,Point-and-Shoot,,,,
2,www.gosale.com//1146,Panasonic Lumix DMC-FZ1000 4K QFHD/HD 16X on s...,Panasonic Lumix DMC-FZ1000 4K QFHD/HD 16X Long...,Panasonic,Point and Shoot,,,,
3,www.gosale.com//1317,Panasonic Lumix DMC-FZ8K 7.2MP Digital on sale...,Panasonic Lumix DMC-FZ8K 7.2MP Digital Camera ...,Panasonic,,2 lbs.,,,4.43 x 3.11 x 2.84 in.
4,www.gosale.com//633,Nikon Coolpix S1100pj 14MP Digital Camera on s...,Nikon Coolpix S1100pj 14MP Digital Camera with...,Nikon,Point-and-Shoot,,14.1 MP,,


### Page title

In [16]:
df_gosale['page_title'] = df_gosale['<page title>'].apply(tokenize_stop_words_punctuation)
df_gosale.drop(columns=['<page title>'], inplace=True)
df_gosale.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Unnamed: 0,spec_id,product name,manufacturer,camera type,weight,megapixels,lcd screen size,dimensions,page_title
0,www.gosale.com//840,Canon PowerShot A2300 IS 16MP Digital Camera w...,Canon,Point-and-Shoot,,,,,"[canon, powershot, a2300, 16mp, digital, sale,..."
1,www.gosale.com//1476,Canon PowerShot ELPH 320 HS 16.1MP Wi-Fi Digit...,Canon,Point-and-Shoot,,,,,"[canon, powershot, elph, 320, hs, 161mp, wifi,..."
2,www.gosale.com//1146,Panasonic Lumix DMC-FZ1000 4K QFHD/HD 16X Long...,Panasonic,Point and Shoot,,,,,"[panasonic, lumix, dmcfz1000, 4k, qfhdhd, 16x,..."
3,www.gosale.com//1317,Panasonic Lumix DMC-FZ8K 7.2MP Digital Camera ...,Panasonic,,2 lbs.,,,4.43 x 3.11 x 2.84 in.,"[panasonic, lumix, dmcfz8k, 72mp, digital, sal..."
4,www.gosale.com//633,Nikon Coolpix S1100pj 14MP Digital Camera with...,Nikon,Point-and-Shoot,,14.1 MP,,,"[nikon, coolpix, s1100pj, 14mp, digital, camer..."


### Product name

In [18]:
df_gosale['product name'] = df_gosale['product name'].apply(tokenize_stop_words_punctuation)
df_gosale.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,spec_id,product name,manufacturer,camera type,weight,megapixels,lcd screen size,dimensions,page_title
0,www.gosale.com//840,"[canon, powershot, a2300, 16mp, digital, camer...",Canon,Point-and-Shoot,,,,,"[canon, powershot, a2300, 16mp, digital, sale,..."
1,www.gosale.com//1476,"[canon, powershot, elph, 320, hs, 161mp, wifi,...",Canon,Point-and-Shoot,,,,,"[canon, powershot, elph, 320, hs, 161mp, wifi,..."
2,www.gosale.com//1146,"[panasonic, lumix, dmcfz1000, 4k, qfhdhd, 16x,...",Panasonic,Point and Shoot,,,,,"[panasonic, lumix, dmcfz1000, 4k, qfhdhd, 16x,..."
3,www.gosale.com//1317,"[panasonic, lumix, dmcfz8k, 72mp, digital, cam...",Panasonic,,2 lbs.,,,4.43 x 3.11 x 2.84 in.,"[panasonic, lumix, dmcfz8k, 72mp, digital, sal..."
4,www.gosale.com//633,"[nikon, coolpix, s1100pj, 14mp, digital, camer...",Nikon,Point-and-Shoot,,14.1 MP,,,"[nikon, coolpix, s1100pj, 14mp, digital, camer..."


### Manufacturer

In [22]:
df_gosale['manufacturer'] = df_gosale['manufacturer'].apply(remove_punctuation_and_lower)
df_gosale.head()

Unnamed: 0,spec_id,product name,manufacturer,camera type,weight,megapixels,lcd screen size,dimensions,page_title
0,www.gosale.com//840,"[canon, powershot, a2300, 16mp, digital, camer...",canon,Point-and-Shoot,,,,,"[canon, powershot, a2300, 16mp, digital, sale,..."
1,www.gosale.com//1476,"[canon, powershot, elph, 320, hs, 161mp, wifi,...",canon,Point-and-Shoot,,,,,"[canon, powershot, elph, 320, hs, 161mp, wifi,..."
2,www.gosale.com//1146,"[panasonic, lumix, dmcfz1000, 4k, qfhdhd, 16x,...",panasonic,Point and Shoot,,,,,"[panasonic, lumix, dmcfz1000, 4k, qfhdhd, 16x,..."
3,www.gosale.com//1317,"[panasonic, lumix, dmcfz8k, 72mp, digital, cam...",panasonic,,2 lbs.,,,4.43 x 3.11 x 2.84 in.,"[panasonic, lumix, dmcfz8k, 72mp, digital, sal..."
4,www.gosale.com//633,"[nikon, coolpix, s1100pj, 14mp, digital, camer...",nikon,Point-and-Shoot,,14.1 MP,,,"[nikon, coolpix, s1100pj, 14mp, digital, camer..."


### Camera type

In [23]:
df_gosale['camera type'] = df_gosale['camera type'].apply(remove_punctuation_and_lower)
df_gosale.head()

Unnamed: 0,spec_id,product name,manufacturer,camera type,weight,megapixels,lcd screen size,dimensions,page_title
0,www.gosale.com//840,"[canon, powershot, a2300, 16mp, digital, camer...",canon,pointandshoot,,,,,"[canon, powershot, a2300, 16mp, digital, sale,..."
1,www.gosale.com//1476,"[canon, powershot, elph, 320, hs, 161mp, wifi,...",canon,pointandshoot,,,,,"[canon, powershot, elph, 320, hs, 161mp, wifi,..."
2,www.gosale.com//1146,"[panasonic, lumix, dmcfz1000, 4k, qfhdhd, 16x,...",panasonic,point and shoot,,,,,"[panasonic, lumix, dmcfz1000, 4k, qfhdhd, 16x,..."
3,www.gosale.com//1317,"[panasonic, lumix, dmcfz8k, 72mp, digital, cam...",panasonic,,2 lbs.,,,4.43 x 3.11 x 2.84 in.,"[panasonic, lumix, dmcfz8k, 72mp, digital, sal..."
4,www.gosale.com//633,"[nikon, coolpix, s1100pj, 14mp, digital, camer...",nikon,pointandshoot,,14.1 MP,,,"[nikon, coolpix, s1100pj, 14mp, digital, camer..."


In [27]:
def remove_and(value):
    if pd.isna(value):
        return value
    
    return value.replace('and', ' ')

In [28]:
df_gosale['camera type'] = df_gosale['camera type'].apply(remove_and)
df_gosale.head()

Unnamed: 0,spec_id,product name,manufacturer,camera type,weight,megapixels,lcd screen size,dimensions,page_title
0,www.gosale.com//840,"[canon, powershot, a2300, 16mp, digital, camer...",canon,point shoot,,,,,"[canon, powershot, a2300, 16mp, digital, sale,..."
1,www.gosale.com//1476,"[canon, powershot, elph, 320, hs, 161mp, wifi,...",canon,point shoot,,,,,"[canon, powershot, elph, 320, hs, 161mp, wifi,..."
2,www.gosale.com//1146,"[panasonic, lumix, dmcfz1000, 4k, qfhdhd, 16x,...",panasonic,point shoot,,,,,"[panasonic, lumix, dmcfz1000, 4k, qfhdhd, 16x,..."
3,www.gosale.com//1317,"[panasonic, lumix, dmcfz8k, 72mp, digital, cam...",panasonic,,2 lbs.,,,4.43 x 3.11 x 2.84 in.,"[panasonic, lumix, dmcfz8k, 72mp, digital, sal..."
4,www.gosale.com//633,"[nikon, coolpix, s1100pj, 14mp, digital, camer...",nikon,point shoot,,14.1 MP,,,"[nikon, coolpix, s1100pj, 14mp, digital, camer..."


### Weight

In [35]:
df_gosale['weight'] = df_gosale['weight'].apply(weight_to_grams_conversion)
df_gosale.head()

### Megapixels

In [41]:
df_gosale['megapixels'] = df_gosale['megapixels'].apply(pixels_to_megapixels)
df_gosale.head()

### LCD Screen Size

In [47]:
df_gosale['lcd screen size'] = df_gosale['lcd screen size'].apply(keep_inches)
df_gosale.head()

### Dimensions

In [56]:
def clean_dimensions(value):
    if pd.isna(value):
        return value
    
    dims = list(map(lambda x: str(round(float(keep_inches(x.strip())), 1)), value.split('x')))
    return 'h' + dims[0] + 'w' + dims[1] + 'd' + dims[2]

In [58]:
df_gosale['dimensions'] = df_gosale['dimensions'].apply(clean_dimensions)
df_gosale.head()

### Save to CSV

In [61]:
df_gosale.head()

Unnamed: 0,spec_id,product name,manufacturer,camera type,weight,megapixels,lcd screen size,dimensions,page_title
0,www.gosale.com//840,"[canon, powershot, a2300, 16mp, digital, camer...",canon,point shoot,,,,,"[canon, powershot, a2300, 16mp, digital, sale,..."
1,www.gosale.com//1476,"[canon, powershot, elph, 320, hs, 161mp, wifi,...",canon,point shoot,,,,,"[canon, powershot, elph, 320, hs, 161mp, wifi,..."
2,www.gosale.com//1146,"[panasonic, lumix, dmcfz1000, 4k, qfhdhd, 16x,...",panasonic,point shoot,,,,,"[panasonic, lumix, dmcfz1000, 4k, qfhdhd, 16x,..."
3,www.gosale.com//1317,"[panasonic, lumix, dmcfz8k, 72mp, digital, cam...",panasonic,,908.0,,,h4.4w3.1d2.8,"[panasonic, lumix, dmcfz8k, 72mp, digital, sal..."
4,www.gosale.com//633,"[nikon, coolpix, s1100pj, 14mp, digital, camer...",nikon,point shoot,,14.1,,,"[nikon, coolpix, s1100pj, 14mp, digital, camer..."


In [63]:
df_gosale.to_csv("../datasets/unlabeled/cleaned/gosale.com.csv", index=False)