In [3]:
import os
import numpy as np
import json
import pandas as pd
import itertools
import re
from tqdm import tqdm
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import matplotlib.pyplot as plt

# Utilities

In [4]:
def create_dataframe(dataset_path, source):
    """Function used to create a Pandas DataFrame containing specifications page titles

    Reads products specifications from the file system ("dataset_path" variable in the main function) and creates a Pandas DataFrame where each row is a
    specification. The columns are 'source' (e.g. www.sourceA.com), 'spec_number' (e.g. 1) and the 'page title'. Note that this script will consider only
    the page title attribute for simplicity.

    Args:
        dataset_path (str): The path to the dataset

    Returns:
        df (pd.DataFrame): The Pandas DataFrame containing specifications and page titles
    """

    print('>>> Creating dataframe...\n')
    columns_df = ['source', 'spec_number', 'spec_id', 'page_title']
    
    df = pd.DataFrame()
    progressive_id = 0
    progressive_id2row_df = {}
    
    for specification in tqdm(os.listdir(os.path.join(dataset_path, source))):
        specification_number = specification.replace('.json', '')
        specification_id = '{}//{}'.format(source, specification_number)
        with open(os.path.join(dataset_path, source, specification)) as specification_file:
            columns_df = ['source', 'spec_number', 'spec_id']
            specification_data = json.load(specification_file)
            attrs = []
            for k, v in specification_data.items():
                columns_df.append(k)
                attrs.append(v)
            row = [source, specification_number, specification_id]
            row.extend(attrs)
            progressive_id2row_df.update({progressive_id: row})
            progressive_id += 1
            df = df.append(pd.DataFrame.from_dict(progressive_id2row_df, orient='index', columns=columns_df))
            progressive_id2row_df = {}
            
    print('>>> Dataframe created successfully!\n')
    return df

In [5]:
stop_words = set(stopwords.words('english'))
stop_words = set(['itself', 'down', 'by', 'with', 'doesn', 'wouldn', 'other', 'ours', 'of', 'then', 'where', 'don', 'these', 'nor', 'she', "should've", 'won', 'ma', 'from', 'had', "you're", 'our', 'did', 'them', 'too', 'her', 'that', 'haven', 'after', "you'll", 'hers', 'because', 'yourself', 'against', 'mightn', 'as', 'll', 'whom', 'how', 'couldn', 'further', 'aren', "you'd", 'and', 'needn', "couldn't", 'those', 'to', "doesn't", "weren't", 'both', 'ourselves', 'in', 'which', 'yours', 'under', 'some', 'what', 'during', 'before', "needn't", "shan't", 'here', 'having', 'hasn', 'your', "hasn't", 'between', 'me', "she's", 'into', 'all', 'at', 'shan', 'who', 'o', 'an', 'very', 'can', 'you', 'shouldn', 'such', 'but', 'do', 'out', 'am', "shouldn't", 'above', 'wasn', 'or', 'were', 'own', 'didn', "you've", 'on', 'will', 'my', 'it', 'have', 'once', 'only', 'been', 'themselves', 'his', 'be', "mightn't", 'they', 'not', 'so', 'up', 'any', 'most', 'has', 'myself', 't', 'yourselves', 'isn', "it's", 'y', 'm', 'now', 'until', 're', 'there', 'their', 'mustn', "mustn't", 'again', 'being', 'hadn', 'doing', 'just', 'no', 'if', 've', "wasn't", "won't", 'we', 'below', 'does', 'more', 'this', 'should', "isn't", 'ain', "don't", 'i', "haven't", 'than', "didn't", 'are', 'about', 'off', 'him', 'for', 'few', "wouldn't", 'was', 'weren', 'why', 'he', "that'll", 'd', 'the', 'its', 'a', 'each', 'is', 'while', "aren't", 'when', 'theirs', 'same', 's', 'himself', 'herself', "hadn't", 'through', 'over'])
punctuation = "!#$%&'()*+,-./:;<=>?@[\]^_`{|}~€£¥₹₽"
def replace_punctuation(word):
    return ''.join(c for c in word if c not in punctuation)

def tokenize_stop_words_punctuation(x):
    return [i.lower() for i in list(map(lambda y: replace_punctuation(y), word_tokenize(x))) if i and i.lower() not in stop_words]

In [6]:
def count_nan(df):
    return len(df) - df.count()

In [7]:
def remove_punctuation_and_lower(camera):
    if pd.isna(camera):
        return camera
    
    for p in punctuation:
        camera = camera.replace(p, "")
    return camera.lower()

In [8]:
def weight_to_grams_conversion(value):
    if pd.isna(value):
        return value
   
    tokens = word_tokenize(value)
    to_convert = tokens[0]
    if len(tokens) > 1:
        metric = tokens[1]
        if metric == "oz":
            return int(round(float(to_convert) * 28.35))
        elif metric == "lbs":
            return int(round(float(to_convert) * 454))
        else:
            return int(round(float(to_convert.replace('g', ''))))
    else:
        
        return int(round(float(re.sub(r'gm|g', '', to_convert))))

In [9]:
def pixels_to_megapixels(value):
    if pd.isna(value):
        return value
    
    tokens = word_tokenize(value)
    metric = tokens[1]
    if metric == "pixels":
        to_convert = tokens[0].replace(",", "")
        return round(float((to_convert)) / (10 ** 6), 1)
    else:
        return round(float(tokens[0]), 1)

In [10]:
def keep_inches(value):
    if pd.isna(value):
        return value
    elif value.strip() == 'NA':
        return np.nan
    
    return round(float(word_tokenize(value)[0]), 1)

In [11]:
def clean_dots(value):
    if pd.isna(value):
        return value
    else:
        value = ' '.join(value.split())
        value = value.replace(",", "")
        value = value.replace("Dots", "d")
        value = value.replace("dots", "d")
        if bool(re.match(r"[0-9]* [0-9]* d", value)):
            return value.replace(" ", "")
        elif bool(re.match(r"[0-9]* d [0-9]*", value)):
            splitted = value.split()
            return splitted[2] + splitted[0] + splitted[1]            
        elif bool(re.match(r"[0-9]* d", value)):
            return value.replace(" ", "")
                
        return value.replace(" ", "")

In [12]:
def parse_megapixels(value):
    if pd.isna(value):
        return value
    else:
        match2 = re.search('(\d+)x(\d+)', str(value))
        if match2 is None:
            print(value)
            return float("NaN")
        return round(float(match2.group(1))*float(match2.group(2))/(10 ** 6),1)


        

# priceme.co.nz

In [13]:
df_priceme = create_dataframe('../datasets/unlabeled/2013_camera_specs', "www.priceme.co.nz")

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)
  1%|▏         | 11/740 [00:00<00:06, 105.03it/s]

>>> Creating dataframe...



100%|██████████| 740/740 [00:11<00:00, 64.12it/s] 

>>> Dataframe created successfully!






In [14]:
df_priceme.head()

Unnamed: 0,3d support,<page title>,colour,digital zoom,focus adjustment,image stabilizer,internal memory,lens aperture,light sensitivity,max focal length,...,min focal length,optical sensor,optical zoom,resolution,shutter speed,source,spec_id,spec_number,still image format,storage media
0,,Nikon D4 New Zealand Prices - PriceMe,,,,Lens based only,,,"100 - 12,800 ISO",,...,,CMOS,,The amount of detail that the camera can captu...,30-1/8000s,www.priceme.co.nz,www.priceme.co.nz//2130,2130,"JPEG, NEF (RAW), TIFF",
1,No,Fujifilm FinePix S2980 New Zealand Prices - P...,,The number of times the image can be enlarged ...,,Optical,,F/3.1,64 ISO,504mm,...,28mm,,The number of times the image can be enlarged ...,The amount of detail that the camera can captu...,,www.priceme.co.nz,www.priceme.co.nz//2075,2075,JPEG,
2,,Leica V-LUX 4 New Zealand Prices - PriceMe,,The number of times the image can be enlarged ...,,,,F/2.8,"100 - 6,400 ISO",600mm,...,25mm,,The number of times the image can be enlarged ...,The amount of detail that the camera can captu...,,www.priceme.co.nz,www.priceme.co.nz//1823,1823,"JPEG, RAW",
3,,Sony Cybershot DSC-HX20 New Zealand Prices - ...,,The number of times the image can be enlarged ...,,Optical,,,"100 - 3,200 ISO",,...,,,,The amount of detail that the camera can captu...,,www.priceme.co.nz,www.priceme.co.nz//1989,1989,,
4,,Canon EOS 650D + 18-55/3.5-5.6 IS New Zealand...,,The number of times the image can be enlarged ...,,Lens based only,,,"100 - 12,800 ISO",,...,,CMOS,,The amount of detail that the camera can captu...,30-1/4000s,www.priceme.co.nz,www.priceme.co.nz//1966,1966,"JPEG, RAW",


In [15]:
cols = ["spec_id", "max image resolution"]

In [16]:
df_priceme = df_priceme[cols]
df_priceme.head()

Unnamed: 0,spec_id,max image resolution
0,www.priceme.co.nz//2130,4928x3264
1,www.priceme.co.nz//2075,4288 x3216
2,www.priceme.co.nz//1823,
3,www.priceme.co.nz//1989,4896x3672
4,www.priceme.co.nz//1966,5184x3456


### Max image resolution

In [17]:
df_priceme['max image resolution'].value_counts()

4608x3456      132
4000x3000       69
5184x3456       61
6000x4000       53
4928x3264       38
3648x2736       32
4320x3240       30
5472x3648       29
4912x3264       26
4896x3264       23
4592x3448       17
4608x3072       14
3872x2592       14
4896x3672       13
4288x3216       12
4592x3056       11
4032x3024       10
4288x2848        9
6016x4016        9
7360x4912        7
6016x4000        7
5456x3632        7
4272x2848        6
3968x2976        6
3264x2448        3
7360x4144        3
5760x3840        3
4352x3264        2
4256x2832        2
4240x2384        2
2640 × 1760      2
1920x1080        2
5212x3472        2
5184x3888        2
3888x2592        2
2272x1704        1
4672x3104        1
3296x2472        1
4224x3168        1
3664x2748        1
3456x3456        1
4928x3280        1
4752x3168        1
4288 x3216       1
6048x4032        1
3072x3204        1
4256x2848        1
4160x3120        1
4672x3120        1
5152x3864        1
4344x3258        1
3616x2712        1
Name: max im

In [18]:
# Number of nan
count_nan(df_priceme)

spec_id                  0
max image resolution    63
dtype: int64

In [19]:
def clean(val):
    if pd.isna(val):
        return val
    
    if ' × ' in val:
        l, r = tuple(val.split(' × '))
        return 'x'.join([l.strip(), r.strip()])
    
    l, r = tuple(val.split('x'))
    return 'x'.join([l.strip(), r.strip()])

In [20]:
df_priceme['max image resolution'] = df_priceme['max image resolution'].apply(clean).apply(parse_megapixels)
df_priceme.head()

Unnamed: 0,spec_id,max image resolution
0,www.priceme.co.nz//2130,16.1
1,www.priceme.co.nz//2075,13.8
2,www.priceme.co.nz//1823,
3,www.priceme.co.nz//1989,18.0
4,www.priceme.co.nz//1966,17.9


In [21]:
df_priceme['max image resolution'].value_counts()

15.9    132
12.0     69
17.9     61
24.0     53
16.0     49
10.0     46
14.0     41
16.1     38
20.0     29
12.2     25
15.8     17
14.2     17
18.0     13
13.8     13
24.2      9
19.8      7
36.2      7
24.1      7
11.8      6
10.1      5
22.1      3
12.1      3
8.0       3
30.5      3
2.1       2
4.6       2
20.2      2
9.8       2
18.1      2
14.6      1
11.9      1
14.5      1
13.0      1
3.9       1
16.2      1
8.1       1
13.4      1
19.9      1
24.4      1
15.1      1
Name: max image resolution, dtype: int64

### Save to CSV

In [22]:
df_priceme.to_csv("../datasets/unlabeled/cleaned/priceme.co.nz.csv", index=False)

# gosale.com

In [20]:
df_gosale = create_dataframe('../datasets/unlabeled/2013_camera_specs', "www.gosale.com")

  1%|          | 12/1002 [00:00<00:08, 115.23it/s]

>>> Creating dataframe...



100%|██████████| 1002/1002 [00:05<00:00, 169.45it/s]

>>> Dataframe created successfully!






In [21]:
df_gosale.head()

Unnamed: 0,source,spec_number,spec_id,<page title>,camera type,deal first added on,ean13,last updated,manufactured in,manufacturer,...,upc,dimensions,ean14,feature,megapixel range,weight,megapixels,optical zoom,lcd screen size,redeye reduction
0,www.gosale.com,840,www.gosale.com//840,Canon PowerShot A2300 IS 16MP Digital on sale ...,Point-and-Shoot,2-December-2012,13803146677,09-October-2014,Made in USA,Canon,...,13803146677,,,,,,,,,
1,www.gosale.com,1476,www.gosale.com//1476,Canon PowerShot ELPH 320 HS 16.1MP Wi-Fi on sa...,Point-and-Shoot,30-May-2012,13803145588,09-October-2014,,Canon,...,13803145588,,,,,,,,,
2,www.gosale.com,1146,www.gosale.com//1146,Panasonic Lumix DMC-FZ1000 4K QFHD/HD 16X on s...,Point and Shoot,18-September-2014,840102106244,09-October-2014,,Panasonic,...,840102106244,,,,,,,,,
3,www.gosale.com,1317,www.gosale.com//1317,Panasonic Lumix DMC-FZ8K 7.2MP Digital on sale...,,10-April-2007,37988986163,09-October-2014,,Panasonic,...,37988986163,4.43 x 3.11 x 2.84 in.,37988986163.0,Image Stabilization,7.0 - 7.9 Megapixels,2 lbs.,,,,
4,www.gosale.com,633,www.gosale.com//633,Nikon Coolpix S1100pj 14MP Digital Camera on s...,Point-and-Shoot,19-October-2010,18208262359,09-October-2014,,Nikon,...,18208262359,,18208262359.0,,,,14.1 MP,5 x,,


In [22]:
cols = ['spec_id', '<page title>', 'product name', 'manufacturer', 'camera type', 'weight', 'megapixels', 'lcd screen size', 'dimensions']

In [23]:
df_gosale = df_gosale[cols]
df_gosale.head()

Unnamed: 0,spec_id,<page title>,product name,manufacturer,camera type,weight,megapixels,lcd screen size,dimensions
0,www.gosale.com//840,Canon PowerShot A2300 IS 16MP Digital on sale ...,Canon PowerShot A2300 IS 16MP Digital Camera w...,Canon,Point-and-Shoot,,,,
1,www.gosale.com//1476,Canon PowerShot ELPH 320 HS 16.1MP Wi-Fi on sa...,Canon PowerShot ELPH 320 HS 16.1MP Wi-Fi Digit...,Canon,Point-and-Shoot,,,,
2,www.gosale.com//1146,Panasonic Lumix DMC-FZ1000 4K QFHD/HD 16X on s...,Panasonic Lumix DMC-FZ1000 4K QFHD/HD 16X Long...,Panasonic,Point and Shoot,,,,
3,www.gosale.com//1317,Panasonic Lumix DMC-FZ8K 7.2MP Digital on sale...,Panasonic Lumix DMC-FZ8K 7.2MP Digital Camera ...,Panasonic,,2 lbs.,,,4.43 x 3.11 x 2.84 in.
4,www.gosale.com//633,Nikon Coolpix S1100pj 14MP Digital Camera on s...,Nikon Coolpix S1100pj 14MP Digital Camera with...,Nikon,Point-and-Shoot,,14.1 MP,,


### Page title

In [24]:
df_gosale['page_title'] = df_gosale['<page title>'].apply(tokenize_stop_words_punctuation)
df_gosale.drop(columns=['<page title>'], inplace=True)
df_gosale.head()

Unnamed: 0,spec_id,product name,manufacturer,camera type,weight,megapixels,lcd screen size,dimensions,page_title
0,www.gosale.com//840,Canon PowerShot A2300 IS 16MP Digital Camera w...,Canon,Point-and-Shoot,,,,,"[canon, powershot, a2300, 16mp, digital, sale,..."
1,www.gosale.com//1476,Canon PowerShot ELPH 320 HS 16.1MP Wi-Fi Digit...,Canon,Point-and-Shoot,,,,,"[canon, powershot, elph, 320, hs, 161mp, wifi,..."
2,www.gosale.com//1146,Panasonic Lumix DMC-FZ1000 4K QFHD/HD 16X Long...,Panasonic,Point and Shoot,,,,,"[panasonic, lumix, dmcfz1000, 4k, qfhdhd, 16x,..."
3,www.gosale.com//1317,Panasonic Lumix DMC-FZ8K 7.2MP Digital Camera ...,Panasonic,,2 lbs.,,,4.43 x 3.11 x 2.84 in.,"[panasonic, lumix, dmcfz8k, 72mp, digital, sal..."
4,www.gosale.com//633,Nikon Coolpix S1100pj 14MP Digital Camera with...,Nikon,Point-and-Shoot,,14.1 MP,,,"[nikon, coolpix, s1100pj, 14mp, digital, camer..."


### Product name

In [25]:
df_gosale['product name'] = df_gosale['product name'].apply(tokenize_stop_words_punctuation)
df_gosale.head()

Unnamed: 0,spec_id,product name,manufacturer,camera type,weight,megapixels,lcd screen size,dimensions,page_title
0,www.gosale.com//840,"[canon, powershot, a2300, 16mp, digital, camer...",Canon,Point-and-Shoot,,,,,"[canon, powershot, a2300, 16mp, digital, sale,..."
1,www.gosale.com//1476,"[canon, powershot, elph, 320, hs, 161mp, wifi,...",Canon,Point-and-Shoot,,,,,"[canon, powershot, elph, 320, hs, 161mp, wifi,..."
2,www.gosale.com//1146,"[panasonic, lumix, dmcfz1000, 4k, qfhdhd, 16x,...",Panasonic,Point and Shoot,,,,,"[panasonic, lumix, dmcfz1000, 4k, qfhdhd, 16x,..."
3,www.gosale.com//1317,"[panasonic, lumix, dmcfz8k, 72mp, digital, cam...",Panasonic,,2 lbs.,,,4.43 x 3.11 x 2.84 in.,"[panasonic, lumix, dmcfz8k, 72mp, digital, sal..."
4,www.gosale.com//633,"[nikon, coolpix, s1100pj, 14mp, digital, camer...",Nikon,Point-and-Shoot,,14.1 MP,,,"[nikon, coolpix, s1100pj, 14mp, digital, camer..."


### Manufacturer

In [26]:
df_gosale['manufacturer'] = df_gosale['manufacturer'].apply(remove_punctuation_and_lower)
df_gosale.head()

Unnamed: 0,spec_id,product name,manufacturer,camera type,weight,megapixels,lcd screen size,dimensions,page_title
0,www.gosale.com//840,"[canon, powershot, a2300, 16mp, digital, camer...",canon,Point-and-Shoot,,,,,"[canon, powershot, a2300, 16mp, digital, sale,..."
1,www.gosale.com//1476,"[canon, powershot, elph, 320, hs, 161mp, wifi,...",canon,Point-and-Shoot,,,,,"[canon, powershot, elph, 320, hs, 161mp, wifi,..."
2,www.gosale.com//1146,"[panasonic, lumix, dmcfz1000, 4k, qfhdhd, 16x,...",panasonic,Point and Shoot,,,,,"[panasonic, lumix, dmcfz1000, 4k, qfhdhd, 16x,..."
3,www.gosale.com//1317,"[panasonic, lumix, dmcfz8k, 72mp, digital, cam...",panasonic,,2 lbs.,,,4.43 x 3.11 x 2.84 in.,"[panasonic, lumix, dmcfz8k, 72mp, digital, sal..."
4,www.gosale.com//633,"[nikon, coolpix, s1100pj, 14mp, digital, camer...",nikon,Point-and-Shoot,,14.1 MP,,,"[nikon, coolpix, s1100pj, 14mp, digital, camer..."


### Camera type

In [27]:
df_gosale['camera type'] = df_gosale['camera type'].apply(remove_punctuation_and_lower)
df_gosale.head()

Unnamed: 0,spec_id,product name,manufacturer,camera type,weight,megapixels,lcd screen size,dimensions,page_title
0,www.gosale.com//840,"[canon, powershot, a2300, 16mp, digital, camer...",canon,pointandshoot,,,,,"[canon, powershot, a2300, 16mp, digital, sale,..."
1,www.gosale.com//1476,"[canon, powershot, elph, 320, hs, 161mp, wifi,...",canon,pointandshoot,,,,,"[canon, powershot, elph, 320, hs, 161mp, wifi,..."
2,www.gosale.com//1146,"[panasonic, lumix, dmcfz1000, 4k, qfhdhd, 16x,...",panasonic,point and shoot,,,,,"[panasonic, lumix, dmcfz1000, 4k, qfhdhd, 16x,..."
3,www.gosale.com//1317,"[panasonic, lumix, dmcfz8k, 72mp, digital, cam...",panasonic,,2 lbs.,,,4.43 x 3.11 x 2.84 in.,"[panasonic, lumix, dmcfz8k, 72mp, digital, sal..."
4,www.gosale.com//633,"[nikon, coolpix, s1100pj, 14mp, digital, camer...",nikon,pointandshoot,,14.1 MP,,,"[nikon, coolpix, s1100pj, 14mp, digital, camer..."


In [28]:
def remove_and(value):
    if pd.isna(value):
        return value
    
    return value.replace('and', ' ')

In [29]:
df_gosale['camera type'] = df_gosale['camera type'].apply(remove_and)
df_gosale.head()

Unnamed: 0,spec_id,product name,manufacturer,camera type,weight,megapixels,lcd screen size,dimensions,page_title
0,www.gosale.com//840,"[canon, powershot, a2300, 16mp, digital, camer...",canon,point shoot,,,,,"[canon, powershot, a2300, 16mp, digital, sale,..."
1,www.gosale.com//1476,"[canon, powershot, elph, 320, hs, 161mp, wifi,...",canon,point shoot,,,,,"[canon, powershot, elph, 320, hs, 161mp, wifi,..."
2,www.gosale.com//1146,"[panasonic, lumix, dmcfz1000, 4k, qfhdhd, 16x,...",panasonic,point shoot,,,,,"[panasonic, lumix, dmcfz1000, 4k, qfhdhd, 16x,..."
3,www.gosale.com//1317,"[panasonic, lumix, dmcfz8k, 72mp, digital, cam...",panasonic,,2 lbs.,,,4.43 x 3.11 x 2.84 in.,"[panasonic, lumix, dmcfz8k, 72mp, digital, sal..."
4,www.gosale.com//633,"[nikon, coolpix, s1100pj, 14mp, digital, camer...",nikon,point shoot,,14.1 MP,,,"[nikon, coolpix, s1100pj, 14mp, digital, camer..."


### Weight

In [30]:
df_gosale['weight'] = df_gosale['weight'].apply(weight_to_grams_conversion)
df_gosale.head()

Unnamed: 0,spec_id,product name,manufacturer,camera type,weight,megapixels,lcd screen size,dimensions,page_title
0,www.gosale.com//840,"[canon, powershot, a2300, 16mp, digital, camer...",canon,point shoot,,,,,"[canon, powershot, a2300, 16mp, digital, sale,..."
1,www.gosale.com//1476,"[canon, powershot, elph, 320, hs, 161mp, wifi,...",canon,point shoot,,,,,"[canon, powershot, elph, 320, hs, 161mp, wifi,..."
2,www.gosale.com//1146,"[panasonic, lumix, dmcfz1000, 4k, qfhdhd, 16x,...",panasonic,point shoot,,,,,"[panasonic, lumix, dmcfz1000, 4k, qfhdhd, 16x,..."
3,www.gosale.com//1317,"[panasonic, lumix, dmcfz8k, 72mp, digital, cam...",panasonic,,908.0,,,4.43 x 3.11 x 2.84 in.,"[panasonic, lumix, dmcfz8k, 72mp, digital, sal..."
4,www.gosale.com//633,"[nikon, coolpix, s1100pj, 14mp, digital, camer...",nikon,point shoot,,14.1 MP,,,"[nikon, coolpix, s1100pj, 14mp, digital, camer..."


### Megapixels

In [31]:
df_gosale['megapixels'] = df_gosale['megapixels'].apply(pixels_to_megapixels)
df_gosale.head()

Unnamed: 0,spec_id,product name,manufacturer,camera type,weight,megapixels,lcd screen size,dimensions,page_title
0,www.gosale.com//840,"[canon, powershot, a2300, 16mp, digital, camer...",canon,point shoot,,,,,"[canon, powershot, a2300, 16mp, digital, sale,..."
1,www.gosale.com//1476,"[canon, powershot, elph, 320, hs, 161mp, wifi,...",canon,point shoot,,,,,"[canon, powershot, elph, 320, hs, 161mp, wifi,..."
2,www.gosale.com//1146,"[panasonic, lumix, dmcfz1000, 4k, qfhdhd, 16x,...",panasonic,point shoot,,,,,"[panasonic, lumix, dmcfz1000, 4k, qfhdhd, 16x,..."
3,www.gosale.com//1317,"[panasonic, lumix, dmcfz8k, 72mp, digital, cam...",panasonic,,908.0,,,4.43 x 3.11 x 2.84 in.,"[panasonic, lumix, dmcfz8k, 72mp, digital, sal..."
4,www.gosale.com//633,"[nikon, coolpix, s1100pj, 14mp, digital, camer...",nikon,point shoot,,14.1,,,"[nikon, coolpix, s1100pj, 14mp, digital, camer..."


### LCD Screen Size

In [32]:
df_gosale['lcd screen size'] = df_gosale['lcd screen size'].apply(keep_inches)
df_gosale.head()

Unnamed: 0,spec_id,product name,manufacturer,camera type,weight,megapixels,lcd screen size,dimensions,page_title
0,www.gosale.com//840,"[canon, powershot, a2300, 16mp, digital, camer...",canon,point shoot,,,,,"[canon, powershot, a2300, 16mp, digital, sale,..."
1,www.gosale.com//1476,"[canon, powershot, elph, 320, hs, 161mp, wifi,...",canon,point shoot,,,,,"[canon, powershot, elph, 320, hs, 161mp, wifi,..."
2,www.gosale.com//1146,"[panasonic, lumix, dmcfz1000, 4k, qfhdhd, 16x,...",panasonic,point shoot,,,,,"[panasonic, lumix, dmcfz1000, 4k, qfhdhd, 16x,..."
3,www.gosale.com//1317,"[panasonic, lumix, dmcfz8k, 72mp, digital, cam...",panasonic,,908.0,,,4.43 x 3.11 x 2.84 in.,"[panasonic, lumix, dmcfz8k, 72mp, digital, sal..."
4,www.gosale.com//633,"[nikon, coolpix, s1100pj, 14mp, digital, camer...",nikon,point shoot,,14.1,,,"[nikon, coolpix, s1100pj, 14mp, digital, camer..."


### Dimensions

In [34]:
def clean_dimensions(value):
    if pd.isna(value):
        return value
    
    dims = list(map(lambda x: str(round(float(keep_inches(x.strip())), 1)), value.split('x')))
    return 'h' + dims[0] + 'w' + dims[1] + 'd' + dims[2]

In [35]:
df_gosale['dimensions'] = df_gosale['dimensions'].apply(clean_dimensions)
df_gosale.head()

Unnamed: 0,spec_id,product name,manufacturer,camera type,weight,megapixels,lcd screen size,dimensions,page_title
0,www.gosale.com//840,"[canon, powershot, a2300, 16mp, digital, camer...",canon,point shoot,,,,,"[canon, powershot, a2300, 16mp, digital, sale,..."
1,www.gosale.com//1476,"[canon, powershot, elph, 320, hs, 161mp, wifi,...",canon,point shoot,,,,,"[canon, powershot, elph, 320, hs, 161mp, wifi,..."
2,www.gosale.com//1146,"[panasonic, lumix, dmcfz1000, 4k, qfhdhd, 16x,...",panasonic,point shoot,,,,,"[panasonic, lumix, dmcfz1000, 4k, qfhdhd, 16x,..."
3,www.gosale.com//1317,"[panasonic, lumix, dmcfz8k, 72mp, digital, cam...",panasonic,,908.0,,,h4.4w3.1d2.8,"[panasonic, lumix, dmcfz8k, 72mp, digital, sal..."
4,www.gosale.com//633,"[nikon, coolpix, s1100pj, 14mp, digital, camer...",nikon,point shoot,,14.1,,,"[nikon, coolpix, s1100pj, 14mp, digital, camer..."


### Save to CSV

In [37]:
df_gosale.head()

Unnamed: 0,spec_id,product name,manufacturer,camera type,weight,megapixels,lcd screen size,dimensions,page_title
0,www.gosale.com//840,"[canon, powershot, a2300, 16mp, digital, camer...",canon,point shoot,,,,,"[canon, powershot, a2300, 16mp, digital, sale,..."
1,www.gosale.com//1476,"[canon, powershot, elph, 320, hs, 161mp, wifi,...",canon,point shoot,,,,,"[canon, powershot, elph, 320, hs, 161mp, wifi,..."
2,www.gosale.com//1146,"[panasonic, lumix, dmcfz1000, 4k, qfhdhd, 16x,...",panasonic,point shoot,,,,,"[panasonic, lumix, dmcfz1000, 4k, qfhdhd, 16x,..."
3,www.gosale.com//1317,"[panasonic, lumix, dmcfz8k, 72mp, digital, cam...",panasonic,,908.0,,,h4.4w3.1d2.8,"[panasonic, lumix, dmcfz8k, 72mp, digital, sal..."
4,www.gosale.com//633,"[nikon, coolpix, s1100pj, 14mp, digital, camer...",nikon,point shoot,,14.1,,,"[nikon, coolpix, s1100pj, 14mp, digital, camer..."


In [38]:
df_gosale.to_csv("../datasets/unlabeled/cleaned/gosale.com.csv", index=False)

# garricks.com.au

In [85]:
df_garricks = create_dataframe('../datasets/unlabeled/2013_camera_specs', "www.garricks.com.au")
df_garricks.head()

 12%|█▏        | 16/130 [00:00<00:00, 151.32it/s]

>>> Creating dataframe...



100%|██████████| 130/130 [00:00<00:00, 195.71it/s]

>>> Dataframe created successfully!






Unnamed: 0,source,spec_number,spec_id,<page title>,flash unit,memory type,resolution,sensor details,viewfinder type,zoom range,image format,lens mount,special attribute,image stabilization,video recording format,weight,autofocus array
0,www.garricks.com.au,50,www.garricks.com.au//50,Nikon Coolpix S6800 Digital Still Camera,Inbuilt,SDHC-SDXC,16mp,CMOS,Electronic,12x Optical,,,,,,,
1,www.garricks.com.au,34,www.garricks.com.au//34,Olympus Stylus 1 Digital Compact Camera,Inbuilt,SDHC-SDXC,12mp,CMOS,,10x Optical Constant F/2.8 Aperture,,,,,,,
2,www.garricks.com.au,47,www.garricks.com.au//47,Sony α A58 Twin Lens Kit,Inbuilt,"Memory Stick PRO Duo, Memory Stick PRO-HG Duo,...",20 mp,CMOS,EVF - Electronic OLED,,APS-C,Sony A,,,,,
3,www.garricks.com.au,40,www.garricks.com.au//40,Panasonic Lumix DMC-FZ200 Digital Compact Camera,Inbuilt,SDHC-SDXC,12.1mp,CMOS,,24x Optical,,,Constant f2.8 aperture,,,,
4,www.garricks.com.au,139,www.garricks.com.au//139,Olympus OM-D E-M10 Silver with 14-42mm EZ and ...,,SDHC-SDXC,16.1 mp,CMOS,EVF - Electronic OLED,,M-43,Micro 4/3,,IBIS (In Body Stabilization),Full HD,,


In [86]:
cols = ['spec_id', '<page title>', 'resolution', 'weight']

In [87]:
df_garricks = df_garricks[cols]
df_garricks.head()

Unnamed: 0,spec_id,<page title>,resolution,weight
0,www.garricks.com.au//50,Nikon Coolpix S6800 Digital Still Camera,16mp,
1,www.garricks.com.au//34,Olympus Stylus 1 Digital Compact Camera,12mp,
2,www.garricks.com.au//47,Sony α A58 Twin Lens Kit,20 mp,
3,www.garricks.com.au//40,Panasonic Lumix DMC-FZ200 Digital Compact Camera,12.1mp,
4,www.garricks.com.au//139,Olympus OM-D E-M10 Silver with 14-42mm EZ and ...,16.1 mp,


### Page title

In [88]:
df_garricks['page_title'] = df_garricks['<page title>'].apply(tokenize_stop_words_punctuation)
df_garricks.drop(columns=['<page title>'], inplace=True)
df_garricks.head()

Unnamed: 0,spec_id,resolution,weight,page_title
0,www.garricks.com.au//50,16mp,,"[nikon, coolpix, s6800, digital, still, camera]"
1,www.garricks.com.au//34,12mp,,"[olympus, stylus, 1, digital, compact, camera]"
2,www.garricks.com.au//47,20 mp,,"[sony, α, a58, twin, lens, kit]"
3,www.garricks.com.au//40,12.1mp,,"[panasonic, lumix, dmcfz200, digital, compact,..."
4,www.garricks.com.au//139,16.1 mp,,"[olympus, omd, em10, silver, 1442mm, ez, 40150..."


### Resolution

In [89]:
def clean_megapixels(value):
    if pd.isna(value):
        return value
    
    tokens = word_tokenize(value)
    if len(tokens) == 1:
        if tokens[0].strip() == '1080p':
            return 2.1
        else:
            return round(float(re.sub(r'mp|mb', '', tokens[0])), 1) 
    else:
        return round(float(tokens[0]), 1)

In [90]:
df_garricks['resolution'] = df_garricks['resolution'].apply(clean_megapixels)
df_garricks.head()

Unnamed: 0,spec_id,resolution,weight,page_title
0,www.garricks.com.au//50,16.0,,"[nikon, coolpix, s6800, digital, still, camera]"
1,www.garricks.com.au//34,12.0,,"[olympus, stylus, 1, digital, compact, camera]"
2,www.garricks.com.au//47,20.0,,"[sony, α, a58, twin, lens, kit]"
3,www.garricks.com.au//40,12.1,,"[panasonic, lumix, dmcfz200, digital, compact,..."
4,www.garricks.com.au//139,16.1,,"[olympus, omd, em10, silver, 1442mm, ez, 40150..."


### Weight

In [92]:
df_garricks['weight'] = df_garricks['weight'].apply(weight_to_grams_conversion)
df_garricks.head()

Unnamed: 0,spec_id,resolution,weight,page_title
0,www.garricks.com.au//50,16.0,,"[nikon, coolpix, s6800, digital, still, camera]"
1,www.garricks.com.au//34,12.0,,"[olympus, stylus, 1, digital, compact, camera]"
2,www.garricks.com.au//47,20.0,,"[sony, α, a58, twin, lens, kit]"
3,www.garricks.com.au//40,12.1,,"[panasonic, lumix, dmcfz200, digital, compact,..."
4,www.garricks.com.au//139,16.1,,"[olympus, omd, em10, silver, 1442mm, ez, 40150..."


### Save to CSV

In [93]:
df_garricks.head()

Unnamed: 0,spec_id,resolution,weight,page_title
0,www.garricks.com.au//50,16.0,,"[nikon, coolpix, s6800, digital, still, camera]"
1,www.garricks.com.au//34,12.0,,"[olympus, stylus, 1, digital, compact, camera]"
2,www.garricks.com.au//47,20.0,,"[sony, α, a58, twin, lens, kit]"
3,www.garricks.com.au//40,12.1,,"[panasonic, lumix, dmcfz200, digital, compact,..."
4,www.garricks.com.au//139,16.1,,"[olympus, omd, em10, silver, 1442mm, ez, 40150..."


In [94]:
df_garricks.to_csv("../datasets/unlabeled/cleaned/garricks.com.au.csv", index=False)

# pricedekho.com

In [16]:
df_pricedekho = create_dataframe('../datasets/unlabeled/2013_camera_specs', "www.pricedekho.com")
df_pricedekho.head()

  2%|▏         | 8/366 [00:00<00:04, 77.58it/s]

>>> Creating dataframe...



100%|██████████| 366/366 [00:04<00:00, 87.23it/s]

>>> Dataframe created successfully!






Unnamed: 0,source,spec_number,spec_id,<page title>,amazon,announced,auto focus,bangalore,brand,color,...,pictbridge,snapdeal,indiatimes,ebay,wifi,infibeam,seventymm,naaptol,usb charge,shopclues
0,www.pricedekho.com,1146,www.pricedekho.com//1146,Pentax Optio LS465 Price in India with Offers...,Infibeam Snapdeal Indiatimes,"2014, March",Yes,Hyderabad Chennai Mumbai Delhi Pune,Pentax,Pink,...,,,,,,,,,,
1,www.pricedekho.com,1160,www.pricedekho.com//1160,Fujifilm Instax mini 25 Instant Price in India...,,,,Hyderabad Chennai Mumbai Delhi Pune,,,...,,,,,,,,,,
2,www.pricedekho.com,1195,www.pricedekho.com//1195,Sony CyberShot DSC-WX200 Point & Shoot Price i...,,,Center Weighted AF,Hyderabad Chennai Mumbai Delhi Pune,Sony,Black,...,,,,,,,,,,
3,www.pricedekho.com,1094,www.pricedekho.com//1094,Canon PowerShot SX120 IS Price in India with O...,,,Yes,Hyderabad Chennai Mumbai Delhi Pune,,Black,...,,,,,,,,,,
4,www.pricedekho.com,1052,www.pricedekho.com//1052,"Samsung PL200 Price in India with Offers, Revi...",,,,,,,...,,,,,,,,,,


In [17]:
cols = ['spec_id', '<page title>', 'screen size', 'image display resolution']

In [18]:
df_pricedekho = df_pricedekho[cols]
df_pricedekho.head()

Unnamed: 0,spec_id,<page title>,screen size,image display resolution
0,www.pricedekho.com//1146,Pentax Optio LS465 Price in India with Offers...,2.7 Inches,230000 Dots
1,www.pricedekho.com//1160,Fujifilm Instax mini 25 Instant Price in India...,,
2,www.pricedekho.com//1195,Sony CyberShot DSC-WX200 Point & Shoot Price i...,2.7 Inches,460800 dots
3,www.pricedekho.com//1094,Canon PowerShot SX120 IS Price in India with O...,3 Inches,
4,www.pricedekho.com//1052,"Samsung PL200 Price in India with Offers, Revi...",,


### Page title

In [19]:
df_pricedekho['page_title'] = df_pricedekho['<page title>'].apply(tokenize_stop_words_punctuation)
df_pricedekho.drop(columns=['<page title>'], inplace=True)
df_pricedekho.head()

Unnamed: 0,spec_id,screen size,image display resolution,page_title
0,www.pricedekho.com//1146,2.7 Inches,230000 Dots,"[pentax, optio, ls465, price, india, offers, f..."
1,www.pricedekho.com//1160,,,"[fujifilm, instax, mini, 25, instant, price, i..."
2,www.pricedekho.com//1195,2.7 Inches,460800 dots,"[sony, cybershot, dscwx200, point, shoot, pric..."
3,www.pricedekho.com//1094,3 Inches,,"[canon, powershot, sx120, price, india, offers..."
4,www.pricedekho.com//1052,,,"[samsung, pl200, price, india, offers, reviews..."


### Screen size

In [20]:
df_pricedekho['screen size'] = df_pricedekho['screen size'].apply(keep_inches)
df_pricedekho.head()

Unnamed: 0,spec_id,screen size,image display resolution,page_title
0,www.pricedekho.com//1146,2.7,230000 Dots,"[pentax, optio, ls465, price, india, offers, f..."
1,www.pricedekho.com//1160,,,"[fujifilm, instax, mini, 25, instant, price, i..."
2,www.pricedekho.com//1195,2.7,460800 dots,"[sony, cybershot, dscwx200, point, shoot, pric..."
3,www.pricedekho.com//1094,3.0,,"[canon, powershot, sx120, price, india, offers..."
4,www.pricedekho.com//1052,,,"[samsung, pl200, price, india, offers, reviews..."


### Image display resolution

In [22]:
df_pricedekho['image display resolution'] = df_pricedekho['image display resolution'].apply(clean_dots)
df_pricedekho = df_pricedekho[df_pricedekho['image display resolution'] != '-']
df_pricedekho.head()

Unnamed: 0,spec_id,screen size,image display resolution,page_title
0,www.pricedekho.com//1146,2.7,230000d,"[pentax, optio, ls465, price, india, offers, f..."
1,www.pricedekho.com//1160,,,"[fujifilm, instax, mini, 25, instant, price, i..."
2,www.pricedekho.com//1195,2.7,460800d,"[sony, cybershot, dscwx200, point, shoot, pric..."
3,www.pricedekho.com//1094,3.0,,"[canon, powershot, sx120, price, india, offers..."
4,www.pricedekho.com//1052,,,"[samsung, pl200, price, india, offers, reviews..."


### Save to CSV

In [23]:
df_pricedekho.head()

Unnamed: 0,spec_id,screen size,image display resolution,page_title
0,www.pricedekho.com//1146,2.7,230000d,"[pentax, optio, ls465, price, india, offers, f..."
1,www.pricedekho.com//1160,,,"[fujifilm, instax, mini, 25, instant, price, i..."
2,www.pricedekho.com//1195,2.7,460800d,"[sony, cybershot, dscwx200, point, shoot, pric..."
3,www.pricedekho.com//1094,3.0,,"[canon, powershot, sx120, price, india, offers..."
4,www.pricedekho.com//1052,,,"[samsung, pl200, price, india, offers, reviews..."


In [24]:
df_pricedekho.to_csv("../datasets/unlabeled/cleaned/pricedekho.com.csv", index=False)

# ukdigitalcameras.co.uk

In [18]:
df_ukdigital = create_dataframe('../datasets/unlabeled/2013_camera_specs', "www.ukdigitalcameras.co.uk")

 12%|█▏        | 15/129 [00:00<00:00, 144.64it/s]

>>> Creating dataframe...



100%|██████████| 129/129 [00:00<00:00, 195.20it/s]

>>> Dataframe created successfully!






In [19]:
df_ukdigital.head()

Unnamed: 0,source,spec_number,spec_id,35mm equivalent,<page title>,brand,camera resolution,colour,features,hd video,lcd size,lens tele mm,lens wide mm,mpn,optical zoom,optical zoom range,variangle lcd,waterproof depth,included lens
0,www.ukdigitalcameras.co.uk,50,www.ukdigitalcameras.co.uk//50,25-750mm,Canon Powershot SX700 HS Digital Camera (Red) ...,Canon,16 Megapixels,Red,Compact Super-Zoom,Full HD (1080p),"3.0""",750,25,9339B014AA,30,18x and higher,,,
1,www.ukdigitalcameras.co.uk,34,www.ukdigitalcameras.co.uk//34,24-90mm,Panasonic Lumix LX7 Digital Camera (Black) | U...,Panasonic,10.1 Megapixels,Black,Raw Shooting\nSemi-Pro\nWide-Angle,Full HD (1080P),3.0'',90,24,DMC-LX5EB-K,3.8,4x or less,No,,
2,www.ukdigitalcameras.co.uk,47,www.ukdigitalcameras.co.uk//47,25-100mm,Ricoh WG-4 GPS Digital Camera (Black) | UK Dig...,Ricoh,16 Megapixels,Black,GPS\nWaterproof,Full HD (1080P),"3.0""",100,25,08541,4x,4x to 7x,No,14m,
3,www.ukdigitalcameras.co.uk,40,www.ukdigitalcameras.co.uk//40,23mm,Fuji X100T Digital Camera (Black) | UK Digital...,Fuji,16 Megapixels,Black,Raw Shooting\nWide-Angle,Full HD (1080P),2.8'',23,23,P10NC13250A,,,,,
4,www.ukdigitalcameras.co.uk,227,www.ukdigitalcameras.co.uk//227,25-200mm,Nikon Coolpix S3600 Digital Camera (Silver) | ...,Nikon,20 Megapixels,Black,Slimline,HD (720P),2.7'',200,25,VNA551E1,8,8x to 11x,,,


In [20]:
cols = ['spec_id', '<page title>', 'brand', 'camera resolution', 'lcd size']

In [21]:
df_ukdigital = df_ukdigital[cols]
df_ukdigital.head()

Unnamed: 0,spec_id,<page title>,brand,camera resolution,lcd size
0,www.ukdigitalcameras.co.uk//50,Canon Powershot SX700 HS Digital Camera (Red) ...,Canon,16 Megapixels,"3.0"""
1,www.ukdigitalcameras.co.uk//34,Panasonic Lumix LX7 Digital Camera (Black) | U...,Panasonic,10.1 Megapixels,3.0''
2,www.ukdigitalcameras.co.uk//47,Ricoh WG-4 GPS Digital Camera (Black) | UK Dig...,Ricoh,16 Megapixels,"3.0"""
3,www.ukdigitalcameras.co.uk//40,Fuji X100T Digital Camera (Black) | UK Digital...,Fuji,16 Megapixels,2.8''
4,www.ukdigitalcameras.co.uk//227,Nikon Coolpix S3600 Digital Camera (Silver) | ...,Nikon,20 Megapixels,2.7''


### Page title

In [22]:
df_ukdigital['page_title'] = df_ukdigital['<page title>'].apply(tokenize_stop_words_punctuation)
df_ukdigital.drop(columns=['<page title>'], inplace=True)
df_ukdigital.head()

Unnamed: 0,spec_id,brand,camera resolution,lcd size,page_title
0,www.ukdigitalcameras.co.uk//50,Canon,16 Megapixels,"3.0""","[canon, powershot, sx700, hs, digital, camera,..."
1,www.ukdigitalcameras.co.uk//34,Panasonic,10.1 Megapixels,3.0'',"[panasonic, lumix, lx7, digital, camera, black..."
2,www.ukdigitalcameras.co.uk//47,Ricoh,16 Megapixels,"3.0""","[ricoh, wg4, gps, digital, camera, black, uk, ..."
3,www.ukdigitalcameras.co.uk//40,Fuji,16 Megapixels,2.8'',"[fuji, x100t, digital, camera, black, uk, digi..."
4,www.ukdigitalcameras.co.uk//227,Nikon,20 Megapixels,2.7'',"[nikon, coolpix, s3600, digital, camera, silve..."


### Brand

In [23]:
df_ukdigital['brand'] = df_ukdigital['brand'].apply(lambda x: x.lower())
df_ukdigital.head()

Unnamed: 0,spec_id,brand,camera resolution,lcd size,page_title
0,www.ukdigitalcameras.co.uk//50,canon,16 Megapixels,"3.0""","[canon, powershot, sx700, hs, digital, camera,..."
1,www.ukdigitalcameras.co.uk//34,panasonic,10.1 Megapixels,3.0'',"[panasonic, lumix, lx7, digital, camera, black..."
2,www.ukdigitalcameras.co.uk//47,ricoh,16 Megapixels,"3.0""","[ricoh, wg4, gps, digital, camera, black, uk, ..."
3,www.ukdigitalcameras.co.uk//40,fuji,16 Megapixels,2.8'',"[fuji, x100t, digital, camera, black, uk, digi..."
4,www.ukdigitalcameras.co.uk//227,nikon,20 Megapixels,2.7'',"[nikon, coolpix, s3600, digital, camera, silve..."


### Camera resolution

In [25]:
df_ukdigital['camera resolution'] = df_ukdigital['camera resolution'].apply(pixels_to_megapixels)
df_ukdigital.head()

Unnamed: 0,spec_id,brand,camera resolution,lcd size,page_title
0,www.ukdigitalcameras.co.uk//50,canon,16.0,"3.0""","[canon, powershot, sx700, hs, digital, camera,..."
1,www.ukdigitalcameras.co.uk//34,panasonic,10.1,3.0'',"[panasonic, lumix, lx7, digital, camera, black..."
2,www.ukdigitalcameras.co.uk//47,ricoh,16.0,"3.0""","[ricoh, wg4, gps, digital, camera, black, uk, ..."
3,www.ukdigitalcameras.co.uk//40,fuji,16.0,2.8'',"[fuji, x100t, digital, camera, black, uk, digi..."
4,www.ukdigitalcameras.co.uk//227,nikon,20.0,2.7'',"[nikon, coolpix, s3600, digital, camera, silve..."


### LCD size

In [28]:
def clean_screen_size(value):
    if pd.isna(value):
        return value
    
    return value[:3]

In [29]:
df_ukdigital['lcd size'] = df_ukdigital['lcd size'].apply(clean_screen_size)
df_ukdigital.head()

Unnamed: 0,spec_id,brand,camera resolution,lcd size,page_title
0,www.ukdigitalcameras.co.uk//50,canon,16.0,3.0,"[canon, powershot, sx700, hs, digital, camera,..."
1,www.ukdigitalcameras.co.uk//34,panasonic,10.1,3.0,"[panasonic, lumix, lx7, digital, camera, black..."
2,www.ukdigitalcameras.co.uk//47,ricoh,16.0,3.0,"[ricoh, wg4, gps, digital, camera, black, uk, ..."
3,www.ukdigitalcameras.co.uk//40,fuji,16.0,2.8,"[fuji, x100t, digital, camera, black, uk, digi..."
4,www.ukdigitalcameras.co.uk//227,nikon,20.0,2.7,"[nikon, coolpix, s3600, digital, camera, silve..."


### Save to CSV

In [31]:
df_ukdigital.head()

Unnamed: 0,spec_id,brand,camera resolution,lcd size,page_title
0,www.ukdigitalcameras.co.uk//50,canon,16.0,3.0,"[canon, powershot, sx700, hs, digital, camera,..."
1,www.ukdigitalcameras.co.uk//34,panasonic,10.1,3.0,"[panasonic, lumix, lx7, digital, camera, black..."
2,www.ukdigitalcameras.co.uk//47,ricoh,16.0,3.0,"[ricoh, wg4, gps, digital, camera, black, uk, ..."
3,www.ukdigitalcameras.co.uk//40,fuji,16.0,2.8,"[fuji, x100t, digital, camera, black, uk, digi..."
4,www.ukdigitalcameras.co.uk//227,nikon,20.0,2.7,"[nikon, coolpix, s3600, digital, camera, silve..."


In [32]:
df_ukdigital.to_csv("../datasets/unlabeled/cleaned/ukdigitalcameras.co.uk.csv", index=False)

# camerafarm.com.au

In [111]:
df_camerafarm = create_dataframe('../datasets/unlabeled/2013_camera_specs', "www.camerafarm.com.au")

  5%|▌         | 6/120 [00:00<00:01, 58.61it/s]

>>> Creating dataframe...



100%|██████████| 120/120 [00:01<00:00, 74.94it/s]

>>> Dataframe created successfully!






In [112]:
df_camerafarm.head()

Unnamed: 0,source,spec_number,spec_id,<page title>,afarea mode,autofocus system,brand,builtin flash,bulb shutter setting,card slot,...,drive modes,modes in movie,movies,slow sync speed,weight,zoom,focal length,image stabilisation,charging time,power sources
0,www.camerafarm.com.au,819,www.camerafarm.com.au//819,Nikon 1 J2 Digital Camera - Black (VVK161XH) |...,Single-point AF: 135 focus areas Auto-area AF:...,Hybrid autofocus (phase detection/contrast-det...,Nikon Web Site,Yes,Yes,1 Secure Digital (SD),...,,,,,,,,,,
1,www.camerafarm.com.au,824,www.camerafarm.com.au//824,Canon EOS 1100D Digital SLR Camera - 12.2MP Bl...,,,Canon Web Site,,,,...,,,,,,,,,,
2,www.camerafarm.com.au,726,www.camerafarm.com.au//726,Canon IXUS155R Digital Camera - Red | Camerafa...,,,Canon Web Site,,,,...,,,,,,,,,,
3,www.camerafarm.com.au,796,www.camerafarm.com.au//796,Nikon D800 Digital SLR Camera - 36.3MP - Black...,"9, 21 or 51 point Dynamic-area AF\nAuto-area A...",Nikon Advanced Multi-CAM 3500FX autofocus sens...,,Yes,Yes,1 CompactFlash (CF) card and 1 Secure Digital ...,...,,,,,,,,,,
4,www.camerafarm.com.au,810,www.camerafarm.com.au//810,Canon 600DKIS EOS 600D Digital SLR Camera - 18...,,,Canon Web Site,,,,...,,,,,,,,,,


In [113]:
cols = ['spec_id', '<page title>', 'brand']

In [114]:
df_camerafarm = df_camerafarm[cols]
df_camerafarm.head()

Unnamed: 0,spec_id,<page title>,brand
0,www.camerafarm.com.au//819,Nikon 1 J2 Digital Camera - Black (VVK161XH) |...,Nikon Web Site
1,www.camerafarm.com.au//824,Canon EOS 1100D Digital SLR Camera - 12.2MP Bl...,Canon Web Site
2,www.camerafarm.com.au//726,Canon IXUS155R Digital Camera - Red | Camerafa...,Canon Web Site
3,www.camerafarm.com.au//796,Nikon D800 Digital SLR Camera - 36.3MP - Black...,
4,www.camerafarm.com.au//810,Canon 600DKIS EOS 600D Digital SLR Camera - 18...,Canon Web Site


### Page title

In [115]:
df_camerafarm['page_title'] = df_camerafarm['<page title>'].apply(tokenize_stop_words_punctuation)
df_camerafarm.drop(columns=['<page title>'], inplace=True)
df_camerafarm.head()

Unnamed: 0,spec_id,brand,page_title
0,www.camerafarm.com.au//819,Nikon Web Site,"[nikon, 1, j2, digital, camera, black, vvk161x..."
1,www.camerafarm.com.au//824,Canon Web Site,"[canon, eos, 1100d, digital, slr, camera, 122m..."
2,www.camerafarm.com.au//726,Canon Web Site,"[canon, ixus155r, digital, camera, red, camera..."
3,www.camerafarm.com.au//796,,"[nikon, d800, digital, slr, camera, 363mp, bla..."
4,www.camerafarm.com.au//810,Canon Web Site,"[canon, 600dkis, eos, 600d, digital, slr, came..."


### Brand

In [116]:
def clean_brand(value):
    if pd.isna(value):
        return value
    
    return word_tokenize(value)[0].lower()

In [117]:
df_camerafarm['brand'] = df_camerafarm['brand'].apply(clean_brand)
df_camerafarm.head()

Unnamed: 0,spec_id,brand,page_title
0,www.camerafarm.com.au//819,nikon,"[nikon, 1, j2, digital, camera, black, vvk161x..."
1,www.camerafarm.com.au//824,canon,"[canon, eos, 1100d, digital, slr, camera, 122m..."
2,www.camerafarm.com.au//726,canon,"[canon, ixus155r, digital, camera, red, camera..."
3,www.camerafarm.com.au//796,,"[nikon, d800, digital, slr, camera, 363mp, bla..."
4,www.camerafarm.com.au//810,canon,"[canon, 600dkis, eos, 600d, digital, slr, came..."


### Save to CSV

In [119]:
df_camerafarm.head()

Unnamed: 0,spec_id,brand,page_title
0,www.camerafarm.com.au//819,nikon,"[nikon, 1, j2, digital, camera, black, vvk161x..."
1,www.camerafarm.com.au//824,canon,"[canon, eos, 1100d, digital, slr, camera, 122m..."
2,www.camerafarm.com.au//726,canon,"[canon, ixus155r, digital, camera, red, camera..."
3,www.camerafarm.com.au//796,,"[nikon, d800, digital, slr, camera, 363mp, bla..."
4,www.camerafarm.com.au//810,canon,"[canon, 600dkis, eos, 600d, digital, slr, came..."


In [120]:
df_camerafarm.to_csv("../datasets/unlabeled/cleaned/camerafarm.com.au.csv", index=False)

# mypriceindia.com

In [60]:
df_myprice = create_dataframe('../datasets/unlabeled/2013_camera_specs', "www.mypriceindia.com")

  2%|▏         | 7/347 [00:00<00:05, 64.22it/s]

>>> Creating dataframe...



100%|██████████| 347/347 [00:04<00:00, 85.61it/s]

>>> Dataframe created successfully!






In [61]:
df_myprice.head()

Unnamed: 0,source,spec_number,spec_id,<page title>,aperture range,audio formats,auto focus,camera resolution,digital zoom,focal length,...,dimensions,macro mode,type,weight,flash range,internal memory,memory card type,upgradable memory,battery type,power supply
0,www.mypriceindia.com,50,www.mypriceindia.com//50,"Panasonic HC V130 Price In India, Bangalore, H...",F1.8 (W) - F4.2 (T),"HA, HG, HE: Dolby Digital (2ch) / iFrame, MP4:...",Yes,8.9 MP,100x 2500x,2.35 - 89.3 mm (35 mm Equivalent to 32.9 - 164...,...,,,,,,,,,,
1,www.mypriceindia.com,34,www.mypriceindia.com//34,Canon EOS 1100D (EF-S 18-55 mm IS II) Price In...,F3.5 (W) - F5.6 (T),Linear PCM,"Yes, Contrast Detect, Phase Detect, Multi-Area...",12.2 MP,,18 - 55 mm (35 mm Equivalent to 28.8 - 88 mm),...,,,,,,,,,,
2,www.mypriceindia.com,47,www.mypriceindia.com//47,"Panasonic Lumix DMC TZ30 Price In India, Banga...",F3.3 (W) - F6.4 (T),,"Yes, Contrast Detect, Multi-Area, Center, Trac...",14 MP,4x,4.3 - 86 mm (35 mm Equivalent to 24 - 480 mm),...,,,,,,,,,,
3,www.mypriceindia.com,40,www.mypriceindia.com//40,"Sony Alpha ILCE 7S (Body Only) Price In India,...",F3.5 (W) - F5.6 (T),"AVCHD: Dolby Digital (AC-3) 2ch, Dolby Digital...","Yes, Contrast-detection AF, AF Mode (Single-sh...",12.4 MP,,24 - 70 mm,...,,,,,,,,,,
4,www.mypriceindia.com,726,www.mypriceindia.com//726,"Samsung ST72 Price In India, Bangalore, Hydera...",F/2.5 - F/6.3,AAC Stereo,"Yes, TTL Auto Focus",16.2 MP,"1x 5x Still Image Mode, 1x 14.4x Play Mode",4.5 - 22.5 mm (35 mm Equivalent to 25 - 125 mm),...,94.4 (W) x 58 (H) x 17.7 (D) mm,"Yes, 5 cm 80 cm (W), 100 cm 250 cm (T)",Point & Shoot,114 g (without Battery and Memory Card Media),,,,,,


In [62]:
cols = ['spec_id', '<page title>', 'camera resolution']

In [63]:
df_myprice = df_myprice[cols]
df_myprice.head()

Unnamed: 0,spec_id,<page title>,camera resolution
0,www.mypriceindia.com//50,"Panasonic HC V130 Price In India, Bangalore, H...",8.9 MP
1,www.mypriceindia.com//34,Canon EOS 1100D (EF-S 18-55 mm IS II) Price In...,12.2 MP
2,www.mypriceindia.com//47,"Panasonic Lumix DMC TZ30 Price In India, Banga...",14 MP
3,www.mypriceindia.com//40,"Sony Alpha ILCE 7S (Body Only) Price In India,...",12.4 MP
4,www.mypriceindia.com//726,"Samsung ST72 Price In India, Bangalore, Hydera...",16.2 MP


### Page title

In [64]:
df_myprice['page_title'] = df_myprice['<page title>'].apply(tokenize_stop_words_punctuation)
df_myprice.drop(columns=['<page title>'], inplace=True)
df_myprice.head()

Unnamed: 0,spec_id,camera resolution,page_title
0,www.mypriceindia.com//50,8.9 MP,"[panasonic, hc, v130, price, india, bangalore,..."
1,www.mypriceindia.com//34,12.2 MP,"[canon, eos, 1100d, efs, 1855, mm, ii, price, ..."
2,www.mypriceindia.com//47,14 MP,"[panasonic, lumix, dmc, tz30, price, india, ba..."
3,www.mypriceindia.com//40,12.4 MP,"[sony, alpha, ilce, 7s, body, price, india, ba..."
4,www.mypriceindia.com//726,16.2 MP,"[samsung, st72, price, india, bangalore, hyder..."


### Camera resolution

In [65]:
df_myprice['camera resolution'] = df_myprice['camera resolution'].apply(pixels_to_megapixels)
df_myprice.head()

Unnamed: 0,spec_id,camera resolution,page_title
0,www.mypriceindia.com//50,8.9,"[panasonic, hc, v130, price, india, bangalore,..."
1,www.mypriceindia.com//34,12.2,"[canon, eos, 1100d, efs, 1855, mm, ii, price, ..."
2,www.mypriceindia.com//47,14.0,"[panasonic, lumix, dmc, tz30, price, india, ba..."
3,www.mypriceindia.com//40,12.4,"[sony, alpha, ilce, 7s, body, price, india, ba..."
4,www.mypriceindia.com//726,16.2,"[samsung, st72, price, india, bangalore, hyder..."


### Save to CSV

In [68]:
df_myprice.head()

Unnamed: 0,spec_id,camera resolution,page_title
0,www.mypriceindia.com//50,8.9,"[panasonic, hc, v130, price, india, bangalore,..."
1,www.mypriceindia.com//34,12.2,"[canon, eos, 1100d, efs, 1855, mm, ii, price, ..."
2,www.mypriceindia.com//47,14.0,"[panasonic, lumix, dmc, tz30, price, india, ba..."
3,www.mypriceindia.com//40,12.4,"[sony, alpha, ilce, 7s, body, price, india, ba..."
4,www.mypriceindia.com//726,16.2,"[samsung, st72, price, india, bangalore, hyder..."


In [69]:
df_myprice.to_csv("../datasets/unlabeled/cleaned/mypriceindia.com.csv", index=False)