In [1]:
import os
import numpy as np
import json
import pandas as pd
import itertools
from tqdm import tqdm
import nltk
from nltk.tokenize import word_tokenize
import re
import matplotlib.pyplot as plt

In [2]:
def create_dataframe(dataset_path, source):
    """Function used to create a Pandas DataFrame containing specifications page titles

    Reads products specifications from the file system ("dataset_path" variable in the main function) and creates a Pandas DataFrame where each row is a
    specification. The columns are 'source' (e.g. www.sourceA.com), 'spec_number' (e.g. 1) and the 'page title'. Note that this script will consider only
    the page title attribute for simplicity.

    Args:
        dataset_path (str): The path to the dataset

    Returns:
        df (pd.DataFrame): The Pandas DataFrame containing specifications and page titles
    """

    print('>>> Creating dataframe...\n')
    columns_df = ['source', 'spec_number', 'spec_id', 'page_title']
    
    df = pd.DataFrame()
    progressive_id = 0
    progressive_id2row_df = {}
    #for source in tqdm(os.listdir(dataset_path)):
    #source = "buy.net"
    for specification in os.listdir(os.path.join(dataset_path, source)):
        specification_number = specification.replace('.json', '')
        specification_id = '{}//{}'.format(source, specification_number)
        with open(os.path.join(dataset_path, source, specification)) as specification_file:
            columns_df = ['source', 'spec_number', 'spec_id']
            specification_data = json.load(specification_file)
            attrs = []
            for k, v in specification_data.items():
                columns_df.append(k)
                attrs.append(v)
            row = [source, specification_number, specification_id]
            row.extend(attrs)
            progressive_id2row_df.update({progressive_id: row})
            progressive_id += 1
            df = df.append(pd.DataFrame.from_dict(progressive_id2row_df, orient='index', columns=columns_df))
            progressive_id2row_df = {}
    #print(df)
    print('>>> Dataframe created successfully!\n')
    return df

In [3]:
df = create_dataframe('../../datasets/unlabeled/2013_camera_specs', "www.price-hunt.com")

>>> Creating dataframe...



of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


>>> Dataframe created successfully!



In [4]:
df.head()

Unnamed: 0,3d images,<page title>,accelerometer,accessory shoe,additional features,ae lockexposure lock,all,altimeter,aperture range,audio formats,...,warranty service type,warranty summary,water resistance depth,weight,weight without battery,white balancing,wifi,wifi connectivity,wifi standard,wireless connectivity
0,,Pentax WG 10 Point & Shoot Camera best price i...,,,"Voice Memo, Index (6 or 12 Thumbnails), Croppi...",,,,"), f3.5 (w) - f5.5 (t","Pcm, WAV, Monaural",...,,3 Year Pentax India Warranty and Free Transit ...,,167.26 g,147.41,"Manual Setting, Shade, Daylight, Tungsten Ligh...",,,,
1,3d Images,Sony NEX 7K DSLR Camera best price in India 20...,,,"Creative Style: Standard, Vivid, Neutral, Clea...",Ae is Locked with Focus Locked,,,F3.5 - f5.6,"Audio Recording Format: Dolby Digital (AC-3), ...",...,,2 Year Sony India Warranty and Free Transit In...,,350 g (with Battery and Memory Stick pro Duo),291.0,"Auto wb, Daylight, Shade, Cloudy, Incandescent...",,,,
2,,Panasonic SDR H101 Camcorder Camera best price...,,,,,,,F1.9(Wide) / 5.7(Tele),,...,Customer Needs to Carry the Product to the Nea...,3 Year Panasonic India Warranty and Free Trans...,,273 g,,Auto / Indoor1 / Indoor2 / Sunny / Cloudy / Wh...,,,,
3,,Sony HVR HD1000P Camcorder Camera best price i...,,,"Speaker: 16 mm, Shoulder-Mount Design, Full Co...",,,,,,...,,,,,2700.0,,,,,
4,,Panasonic HX DC2 Camcorder Camera best price i...,,,"Zoom: 15x Intelligent Zoom, 12x w-Range Zoom, ...",,,,F3.5 (w) - f3.7 (t),Aac (2ch),...,Customer Needs to Carry the Product to the Nea...,3 Year Panasonic India Warranty and Free Trans...,,"180 g (with Battery and SD Memory Card), 162 g...",,"Manual, Auto, Cloudy, Indoor 1, Indoor 2, Whit...",,,,


In [5]:
cols = ["spec_id", "lcd screen size", "brand", "type", "optical sensor resolution in megapixel", "image display resolution", "weight", "dimensions"]

In [6]:
df = df[cols]

In [7]:
df.head()

Unnamed: 0,spec_id,lcd screen size,brand,type,optical sensor resolution in megapixel,image display resolution,weight,dimensions
0,www.price-hunt.com//729,2.7 Inch,Pentax,Point & Shoot,14 Megapixels,"230,000 Dots",167.26 g,") x 27.94 (d) mm, 114.3 (w) x 58.42 (h"
1,www.price-hunt.com//9793,3 Inch,Sony,DSLR,24.3 Megapixels,"921,600 Dots",350 g (with Battery and Memory Stick pro Duo),119.9 (w) x 66.9 (h) x 42.6 (d) mm
2,www.price-hunt.com//9681,2.7 Inch,Panasonic,Camcorder,,"123,200 Dots",273 g,55.1 (w) x 64.0 (h) x 107.3 (d) mm
3,www.price-hunt.com//791,2.7 Inch,Sony,Camcorder,,"211,200 Dots",,
4,www.price-hunt.com//9814,3 Inch,Panasonic,Camcorder,14.3 Megapixels,"230,400 Dots","180 g (with Battery and SD Memory Card), 162 g...",86.2 (w) x 120.8 (h) x 38.1 (d) mm


In [8]:
df.isna().sum()

spec_id                                    0
lcd screen size                           12
brand                                      0
type                                      26
optical sensor resolution in megapixel    13
image display resolution                  45
weight                                    16
dimensions                                59
dtype: int64

## Lcd screen size

In [9]:
def clean_size(value):
    if pd.isna(value):
        return value
    elif len(value.split()) == 2:
        return float(word_tokenize(value)[0])
    else:
        return float(value)

In [10]:
df["lcd screen size"].value_counts()

3 Inch       165
2.7 Inch     105
3.2 Inch      11
2.4 Inch       6
1.5 Inch       6
2.8 Inch       5
3.5 Inch       4
2.5 Inch       4
1.4 Inch       3
2 Inch         2
4.77 Inch      1
3.0 Inch       1
2.7            1
1.8 Inch       1
Name: lcd screen size, dtype: int64

In [11]:
df["lcd screen size"] = df["lcd screen size"].apply(clean_size)

In [12]:
df["lcd screen size"].value_counts()

3.00    166
2.70    106
3.20     11
2.40      6
1.50      6
2.80      5
2.50      4
3.50      4
1.40      3
2.00      2
1.80      1
4.77      1
Name: lcd screen size, dtype: int64

In [13]:
df.head()

Unnamed: 0,spec_id,lcd screen size,brand,type,optical sensor resolution in megapixel,image display resolution,weight,dimensions
0,www.price-hunt.com//729,2.7,Pentax,Point & Shoot,14 Megapixels,"230,000 Dots",167.26 g,") x 27.94 (d) mm, 114.3 (w) x 58.42 (h"
1,www.price-hunt.com//9793,3.0,Sony,DSLR,24.3 Megapixels,"921,600 Dots",350 g (with Battery and Memory Stick pro Duo),119.9 (w) x 66.9 (h) x 42.6 (d) mm
2,www.price-hunt.com//9681,2.7,Panasonic,Camcorder,,"123,200 Dots",273 g,55.1 (w) x 64.0 (h) x 107.3 (d) mm
3,www.price-hunt.com//791,2.7,Sony,Camcorder,,"211,200 Dots",,
4,www.price-hunt.com//9814,3.0,Panasonic,Camcorder,14.3 Megapixels,"230,400 Dots","180 g (with Battery and SD Memory Card), 162 g...",86.2 (w) x 120.8 (h) x 38.1 (d) mm


## Brand

In [14]:
df["brand"].value_counts()

Sony         83
Panasonic    45
Canon        43
Nikon        42
Fujifilm     29
Olympus      21
Samsung      12
Kodak        10
Pentax       10
Ricoh         5
Yourdeal      5
Aiptek        3
Garmin        3
Wespro        3
Casio         2
Tvc           2
Benq          2
Drift         2
Rollei        2
Gopro         1
Polaroid      1
Jvc           1
Name: brand, dtype: int64

In [15]:
df["brand"] = df["brand"].apply(lambda brand : brand.lower())

## Type

In [16]:
df["type"].value_counts()

Point & Shoot                                    149
DSLR                                              52
Camcorder                                         49
Advanced Point & Shoot                            15
Mirrorless                                        15
Sports & Action                                   10
Instant                                            4
SLR                                                2
Video Camera                                       2
Lens Style Camera                                  1
Digital Interchangeable Lens                       1
Micro Four Thirds Interchangeable Lens System      1
Name: type, dtype: int64

In [17]:
def clean_camera_type(camera):
    if pd.isna(camera):
        return camera
    else:
        punctuation = "!#$%&'()*+,-./:;<=>?@[\]^_`{|}~€£¥₹₽"
        for p in punctuation:
            camera = camera.replace(p, "")
        camera = camera.replace("camera", "")
        camera = ' '.join(camera.split())
        return camera.lower()

In [18]:
df["type"] = df["type"].apply(clean_camera_type)

In [19]:
df["type"].value_counts()

point shoot                                      149
dslr                                              52
camcorder                                         49
advanced point shoot                              15
mirrorless                                        15
sports action                                     10
instant                                            4
slr                                                2
video camera                                       2
lens style camera                                  1
micro four thirds interchangeable lens system      1
digital interchangeable lens                       1
Name: type, dtype: int64

In [20]:
df.head()

Unnamed: 0,spec_id,lcd screen size,brand,type,optical sensor resolution in megapixel,image display resolution,weight,dimensions
0,www.price-hunt.com//729,2.7,pentax,point shoot,14 Megapixels,"230,000 Dots",167.26 g,") x 27.94 (d) mm, 114.3 (w) x 58.42 (h"
1,www.price-hunt.com//9793,3.0,sony,dslr,24.3 Megapixels,"921,600 Dots",350 g (with Battery and Memory Stick pro Duo),119.9 (w) x 66.9 (h) x 42.6 (d) mm
2,www.price-hunt.com//9681,2.7,panasonic,camcorder,,"123,200 Dots",273 g,55.1 (w) x 64.0 (h) x 107.3 (d) mm
3,www.price-hunt.com//791,2.7,sony,camcorder,,"211,200 Dots",,
4,www.price-hunt.com//9814,3.0,panasonic,camcorder,14.3 Megapixels,"230,400 Dots","180 g (with Battery and SD Memory Card), 162 g...",86.2 (w) x 120.8 (h) x 38.1 (d) mm


## Optical sensor resolution in megapixel

In [21]:
def megapixels_metrics(value):
    if pd.isna(value):
        return value
    else:
        return word_tokenize(value)[1]

In [22]:
def clean_megapixels(value):
    if pd.isna(value):
        return value
    else:
        return round(float(word_tokenize(value)[0]), 1)

In [23]:
df["optical sensor resolution in megapixel"].apply(megapixels_metrics).value_counts()

Megapixels    294
mp             20
Name: optical sensor resolution in megapixel, dtype: int64

In [24]:
df["optical sensor resolution in megapixel"] = df["optical sensor resolution in megapixel"].apply(clean_megapixels)

## Image display resolution

In [25]:
df["image display resolution"].value_counts()

230,000 Dots        92
460,000 Dots        31
921,600 Dots        22
230,400 Dots        17
921,000 Dots        16
460,800 Dots        15
461,000 Dots         9
000 Dots, 230        8
921, 000 Dots        6
1,040,000 Dots       6
1,228,800 Dots       6
460, 000 Dots        5
230, 000 Dots        4
400 Dots, 230        4
460, 800 Dots        3
000 Dots, 460        3
123,200 Dots         3
461, 000 Dots        2
460000 Dots          2
1,037,000 Dots       2
614,000 Dots         2
920,000 Dots         2
230400 Dots          2
922,000 Dots         2
112,000 Dots         2
000 Dots, 921        2
819,000 Dots         1
110,000 Dots         1
1,230,000 Dots       1
230, 400 Dots        1
1,036,800 Dots       1
211,200 Dots         1
040, 1, 000 Dots     1
921, 600 Dots        1
922, 000 Dots        1
920, 000 Dots        1
1,152,000 Dots       1
641,000 Dots         1
288,000 Dots         1
460,000dots          1
Name: image display resolution, dtype: int64

In [26]:
def clean_dots(value):
    if pd.isna(value):
        return value
    else:
        value = ' '.join(value.split())
        value = value.replace(",", "")
        value = value.replace("Dots", "d")
        value = value.replace("dots", "d")
        if bool(re.match(r"[0-9]* [0-9]* d", value)):
            return value.replace(" ", "")
        elif bool(re.match(r"[0-9]* d [0-9]*", value)):
            splitted = value.split()
            return splitted[2] + splitted[0] + splitted[1]            
        elif bool(re.match(r"[0-9]* d", value)):
            return value.replace(" ", "")
                
        return value.replace(" ", "")

In [27]:
df["image display resolution"] = df["image display resolution"].apply(clean_dots)

In [28]:
df.head()

Unnamed: 0,spec_id,lcd screen size,brand,type,optical sensor resolution in megapixel,image display resolution,weight,dimensions
0,www.price-hunt.com//729,2.7,pentax,point shoot,14.0,230000d,167.26 g,") x 27.94 (d) mm, 114.3 (w) x 58.42 (h"
1,www.price-hunt.com//9793,3.0,sony,dslr,24.3,921600d,350 g (with Battery and Memory Stick pro Duo),119.9 (w) x 66.9 (h) x 42.6 (d) mm
2,www.price-hunt.com//9681,2.7,panasonic,camcorder,,123200d,273 g,55.1 (w) x 64.0 (h) x 107.3 (d) mm
3,www.price-hunt.com//791,2.7,sony,camcorder,,211200d,,
4,www.price-hunt.com//9814,3.0,panasonic,camcorder,14.3,230400d,"180 g (with Battery and SD Memory Card), 162 g...",86.2 (w) x 120.8 (h) x 38.1 (d) mm


## Weight

In [31]:
df["weight"].value_counts()

210 g                                                                                  7
100                                                                                    6
480 g (Cipa Guidelines)                                                                4
589.67 g (Without Battery and Removable Memory), 649.20 g (Loaded and Ready)           4
411 g (with Battery and Memory Stick pro Duo)                                          3
177.4 g (with Battery)                                                                 3
344 g (with Battery and Memory Stick pro Duo)                                          3
269 g (with Battery and Memory Stick pro Duo)                                          3
135 g (Camera Body Only), 155 g (with Battery and Memory Card)                         2
325 g                                                                                  2
142 g (Camera Body Only), 163 g (with Battery and Memory Card)                         2
474 g (with Battery a

In [61]:
def clean_weight(value):
    if pd.isna(value):
        return value
    else:
        tokenized = word_tokenize(value)
        if len(tokenized) >= 2:
            return int(round(float(tokenized[0].replace("g", ""))))
        else:
            return int(round(float(value)))

In [64]:
df["weight"] = df["weight"].apply(clean_weight)

In [66]:
df.head()

Unnamed: 0,spec_id,lcd screen size,brand,type,optical sensor resolution in megapixel,image display resolution,weight,dimensions
0,www.price-hunt.com//729,2.7,pentax,point shoot,14.0,230000d,167.0,") x 27.94 (d) mm, 114.3 (w) x 58.42 (h"
1,www.price-hunt.com//9793,3.0,sony,dslr,24.3,921600d,350.0,119.9 (w) x 66.9 (h) x 42.6 (d) mm
2,www.price-hunt.com//9681,2.7,panasonic,camcorder,,123200d,273.0,55.1 (w) x 64.0 (h) x 107.3 (d) mm
3,www.price-hunt.com//791,2.7,sony,camcorder,,211200d,,
4,www.price-hunt.com//9814,3.0,panasonic,camcorder,14.3,230400d,180.0,86.2 (w) x 120.8 (h) x 38.1 (d) mm


## Dimensions

In [67]:
df["dimensions"].value_counts()

128.6 (w) x 95.5 (h) x 77.7 (d) mm         4
129.54 (w) x 96.52 (h) x 71.12 (d) mm      4
5.10 (w) x 3.93 (h) x 3.07 (d) Inch        4
53 (w) x 60 (h) x 114 (d) mm               3
126.9 (w) x 94.4 (h) x 48.2 (d) mm         3
53 (w) x 32 (h) x 111 (d)                  3
128 (w) x 90.9 (h) x 84.5 (d) mm           2
141 (w) x 113 (h) x 82 (d) mm              2
24.5 (w) x 47 (h) x 82 (d) mm              2
86.2 (w) x 120.8 (h) x 38.1 (d) mm         2
95.4 (w) x 58.2 (h, ) x 21.0 (d) mm        2
119.9 (w) x 66.9 (h) x 42.6 (d) mm         2
53 (w) x 59 (h) x 116 (d) mm               2
132.1 (w) x 97.5 (h) x 80.7 (d) mm         2
130.4 (w) x 93.5 (h) x 63.1 (d) mm         2
92.9 (w) x 52.4 (h) x 21.6 (d) mm          2
) x 45.1 (d) mm, 120 (w) x 66.9 (h         2
111.1 (w) x 76.3 (h) x 83.1 (d) mm         2
142.6 (w) x 104.0 (h) x 80.9 (d) mm        2
109.6 (w) x 62.8 (h) x 35.7 (d) mm         2
65 (w) x 73 (h) x 139 (d) mm               2
152 (w) x 116.4 (h) x 76.4 (d) mm          2
95.3 (w) x

In [80]:
print(bool(re.match(r"[0-9]+[.][0-9]+ \(w\) x [0-9]+[.][0-9]+ \(h\) x [0-9]+[.][0-9]+ \(d\) mm", "85.8 (w) x 53.5 (h) x 19.8 (d) mm")))

True


In [83]:
first_regex = r"[0-9]+[.][0-9]+ \(w\) x [0-9]+[.][0-9]+ \(h\) x [0-9]+[.][0-9]+ \(d\) mm" #69
second_regex = r"\) x [0-9]+[.][0-9]+ \(d\) mm, [0-9]+[.][0-9]+ \(w\) x [0-9]+[.][0-9]+ \(h" #57

In [84]:
df.isna().sum()

spec_id                                    0
lcd screen size                           12
brand                                      0
type                                      26
optical sensor resolution in megapixel    13
image display resolution                  45
weight                                    16
dimensions                                59
dtype: int64

In [82]:
sum(df["dimensions"].apply(lambda dim : bool(re.match(r"[0-9]+[.][0-9]+ \(w\) x [0-9]+[.][0-9]+ \(h\) x [0-9]+[.][0-9]+ \(d\) mm", str(dim)))))

69

In [99]:
sum(df["dimensions"].apply(lambda dim : bool(re.match(r"\) x [0-9]+[.][0-9]+ \(d\) mm, [0-9]+[.][0-9]+ \(w\) x [0-9]+[.][0-9]+ \(h", str(dim)))))

57

In [314]:
def mm_to_inches(value):
    if pd.isna(value):
        return value
    else:
        return round(float(value) / 25.4, 1)

In [325]:
def clean_dimensions_regex(value):
    first_regex = r"([0-9]+[.]?[0-9]+?) \(w\) x ([0-9]+[.]?[0-9]+?) \(h\) x ([0-9]+[.]?[0-9]+?) \(d\) mm" #69
    second_regex = r"\) x ([0-9]+[.]?[0-9]+?) \(d\) mm, ([0-9]+[.]?[0-9]+?) \(w\) x ([0-9]+[.]?[0-9]+?) x?[ ]?\(h" #79
    third_regex = r"([0-9]+[.]?[0-9]+?) \(w\) x ([0-9]+[.]?[0-9]+?) \(h, \) x ([0-9]+[.]?[0-9]+?) \(d\) mm" #56
    fourth_regex = r"([0-9]+[.]?[0-9]+?) \(w\) x?[ ]?([0-9]+[.]?[0-9]+?) \(h,?[ ]?\),? x?[ ]?([0-9]+[.]?[0-9]+?) [(]?(d|l)[)][ ]?m?m?"#14
    final_regex = r"([0-9]+[.]?[0-9]+?) x ([0-9]+[.]?[0-9]+?) x ([0-9]+[.]?[0-9]+?) mm"
    if pd.isna(value):
        return value
    else:
        if re.match(first_regex, value):
            groups = re.match(first_regex, value)
            dimension = str(mm_to_inches(groups.group(1))) + "h" + str(mm_to_inches(groups.group(2))) + "w" + str(mm_to_inches(groups.group(3))) + "d"
            return dimension
        elif re.match(second_regex, value):
            groups = re.match(second_regex, value)
            dimension = str(mm_to_inches(groups.group(3))) + "h" + str(mm_to_inches(groups.group(2))) + "w" + str(mm_to_inches(groups.group(1))) + "d"
            return dimension
        elif re.match(third_regex, value):
            groups = re.match(third_regex, value)
            dimension = str(mm_to_inches(groups.group(2))) + "h" + str(mm_to_inches(groups.group(1))) + "w" + str(mm_to_inches(groups.group(3))) + "d"
            return dimension
        elif re.match(fourth_regex, value):
            groups = re.match(fourth_regex, value)
            dimension = str(mm_to_inches(groups.group(2))) + "h" + str(mm_to_inches(groups.group(1))) + "w" + str(mm_to_inches(groups.group(3))) + "d"
            return dimension
        elif re.match(final_regex, value):
            groups = re.match(final_regex, value)
            dimension = str(mm_to_inches(groups.group(2))) + "h" + str(mm_to_inches(groups.group(1))) + "w" + str(mm_to_inches(groups.group(3))) + "d"
            return dimension
        else:
            return value

In [328]:
df["dimensions"] = df["dimensions"].apply(clean_dimensions_regex)