In [1]:
import os
import numpy as np
import json
import pandas as pd
import itertools
from tqdm import tqdm
import nltk
from nltk.tokenize import word_tokenize
import re
import matplotlib.pyplot as plt

In [2]:
def create_dataframe(dataset_path, source):
    """Function used to create a Pandas DataFrame containing specifications page titles

    Reads products specifications from the file system ("dataset_path" variable in the main function) and creates a Pandas DataFrame where each row is a
    specification. The columns are 'source' (e.g. www.sourceA.com), 'spec_number' (e.g. 1) and the 'page title'. Note that this script will consider only
    the page title attribute for simplicity.

    Args:
        dataset_path (str): The path to the dataset

    Returns:
        df (pd.DataFrame): The Pandas DataFrame containing specifications and page titles
    """

    print('>>> Creating dataframe...\n')
    columns_df = ['source', 'spec_number', 'spec_id', 'page_title']
    
    df = pd.DataFrame()
    progressive_id = 0
    progressive_id2row_df = {}
    #for source in tqdm(os.listdir(dataset_path)):
    #source = "buy.net"
    for specification in os.listdir(os.path.join(dataset_path, source)):
        specification_number = specification.replace('.json', '')
        specification_id = '{}//{}'.format(source, specification_number)
        with open(os.path.join(dataset_path, source, specification)) as specification_file:
            columns_df = ['source', 'spec_number', 'spec_id']
            specification_data = json.load(specification_file)
            attrs = []
            for k, v in specification_data.items():
                columns_df.append(k)
                attrs.append(v)
            row = [source, specification_number, specification_id]
            row.extend(attrs)
            progressive_id2row_df.update({progressive_id: row})
            progressive_id += 1
            df = df.append(pd.DataFrame.from_dict(progressive_id2row_df, orient='index', columns=columns_df))
            progressive_id2row_df = {}
    #print(df)
    print('>>> Dataframe created successfully!\n')
    return df

In [3]:
df = create_dataframe('../../datasets/unlabeled/2013_camera_specs', "www.price-hunt.com")

>>> Creating dataframe...

>>> Dataframe created successfully!



In [4]:
df.head()

Unnamed: 0,source,spec_number,spec_id,<page title>,additional features,aperture range,audio formats,audio video interface,auto focus,battery type,...,3d images,still image 3d,dynamic range adjustment,recording time,still image mode,view magnification,iso speed range,flash bracketing,maximum recording time,eyefi ready
0,www.price-hunt.com,840,www.price-hunt.com//840,Canon IXUS 132 Advanced Point & Shoot Camera b...,"Humidity: 10% - 90%, 100% Coverage, Intelligen...",F3.2 (w) - f6.9 (t),Linear pcm,"Audio / Video Output (Ntsc, Pal)",Ttl,Lithium Battery,...,,,,,,,,,,
1,www.price-hunt.com,9603,www.price-hunt.com//9603,Sony Cyber Shot DSC W380 best price in India 2...,,F3.5 - f6.3,,,,Lithium-ion,...,,,,,,,,,,
2,www.price-hunt.com,9809,www.price-hunt.com//9809,Samsung PL120 Point & Shoot Camera best price ...,Operating Temperature: 0deg - 40deg c,,Aac,"Audio / Video Output (Ntsc, Pal)","Smart Face Recognition af, Face Detection af, ...",Lithium Battery,...,,,,,,,,,,
3,www.price-hunt.com,940,www.price-hunt.com//940,Nikon Coolpix P530 Point & Shoot Camera best p...,"Viewfinder: 0.5 cm (0.2 Inch), 201,000 Dots Eq...",F3 - f5.9,"Audio: Lpcm Stereo, WAV",,Contrast Detect af,Lithium Battery,...,,,,,,,,,,
4,www.price-hunt.com,9777,www.price-hunt.com//9777,Kodak Pixpro FZ41 Point & Shoot Camera best pr...,,"F 3.0 (Wide) - f6.6 (Tele, )",,,,Aa,...,,,,,,,,,,


In [5]:
cols = ["spec_id", "lcd screen size", "brand", "type", "optical sensor resolution in megapixel", "image display resolution", "weight", "dimensions"]

In [6]:
df = df[cols]

In [7]:
df.head()

Unnamed: 0,spec_id,lcd screen size,brand,type,optical sensor resolution in megapixel,image display resolution,weight,dimensions
0,www.price-hunt.com//840,2.7 Inch,Canon,Advanced Point & Shoot,16.0 Megapixels,"230,000 Dots","117 g (Body Only), 133 g (with Battery and Meo...",92.9 (w) x 52.4 (h) x 21.6 (d) mm
1,www.price-hunt.com//9603,2.7 Inch,Sony,,14.1 mp,,108,
2,www.price-hunt.com//9809,2.7 Inch,Samsung,Point & Shoot,14.2 Megapixels,"230,000 Dots",110 g,") x 18.8 (d) mm, 94.0 (w) x 54.5 (h"
3,www.price-hunt.com//940,3 Inch,Nikon,Point & Shoot,16.1 Megapixels,"921,000 Dots",494 g (with Battery and SD Memory Card),") x 98.2 (d) mm, 122.8 (w) x 84.1 (h"
4,www.price-hunt.com//9777,2.7 Inch,Kodak,Point & Shoot,16 Megapixels,"230,000 Dots",116 g,


In [8]:
df.isna().sum()

spec_id                                    0
lcd screen size                           12
brand                                      0
type                                      26
optical sensor resolution in megapixel    13
image display resolution                  45
weight                                    16
dimensions                                59
dtype: int64

## Lcd screen size

In [9]:
def clean_size(value):
    if pd.isna(value):
        return value
    elif len(value.split()) == 2:
        return float(word_tokenize(value)[0])
    else:
        return float(value)

In [10]:
df["lcd screen size"].value_counts()

3 Inch       165
2.7 Inch     105
3.2 Inch      11
1.5 Inch       6
2.4 Inch       6
2.8 Inch       5
2.5 Inch       4
3.5 Inch       4
1.4 Inch       3
2 Inch         2
1.8 Inch       1
2.7            1
3.0 Inch       1
4.77 Inch      1
Name: lcd screen size, dtype: int64

In [11]:
df["lcd screen size"] = df["lcd screen size"].apply(clean_size)

In [12]:
df["lcd screen size"].value_counts()

3.00    166
2.70    106
3.20     11
2.40      6
1.50      6
2.80      5
2.50      4
3.50      4
1.40      3
2.00      2
1.80      1
4.77      1
Name: lcd screen size, dtype: int64

In [13]:
df.head()

Unnamed: 0,spec_id,lcd screen size,brand,type,optical sensor resolution in megapixel,image display resolution,weight,dimensions
0,www.price-hunt.com//840,2.7,Canon,Advanced Point & Shoot,16.0 Megapixels,"230,000 Dots","117 g (Body Only), 133 g (with Battery and Meo...",92.9 (w) x 52.4 (h) x 21.6 (d) mm
1,www.price-hunt.com//9603,2.7,Sony,,14.1 mp,,108,
2,www.price-hunt.com//9809,2.7,Samsung,Point & Shoot,14.2 Megapixels,"230,000 Dots",110 g,") x 18.8 (d) mm, 94.0 (w) x 54.5 (h"
3,www.price-hunt.com//940,3.0,Nikon,Point & Shoot,16.1 Megapixels,"921,000 Dots",494 g (with Battery and SD Memory Card),") x 98.2 (d) mm, 122.8 (w) x 84.1 (h"
4,www.price-hunt.com//9777,2.7,Kodak,Point & Shoot,16 Megapixels,"230,000 Dots",116 g,


## Brand

In [14]:
df["brand"].value_counts()

Sony         83
Panasonic    45
Canon        43
Nikon        42
Fujifilm     29
Olympus      21
Samsung      12
Kodak        10
Pentax       10
Ricoh         5
Yourdeal      5
Aiptek        3
Garmin        3
Wespro        3
Benq          2
Rollei        2
Casio         2
Tvc           2
Drift         2
Jvc           1
Polaroid      1
Gopro         1
Name: brand, dtype: int64

In [15]:
df["brand"] = df["brand"].apply(lambda brand : brand.lower())

## Type

In [16]:
df["type"].value_counts()

Point & Shoot                                    149
DSLR                                              52
Camcorder                                         49
Advanced Point & Shoot                            15
Mirrorless                                        15
Sports & Action                                   10
Instant                                            4
Video Camera                                       2
SLR                                                2
Lens Style Camera                                  1
Micro Four Thirds Interchangeable Lens System      1
Digital Interchangeable Lens                       1
Name: type, dtype: int64

In [17]:
def clean_camera_type(camera):
    if pd.isna(camera):
        return camera
    else:
        punctuation = "!#$%&'()*+,-./:;<=>?@[\]^_`{|}~€£¥₹₽"
        for p in punctuation:
            camera = camera.replace(p, "")
        camera = camera.replace("camera", "")
        camera = ' '.join(camera.split())
        return camera.lower()

In [18]:
df["type"] = df["type"].apply(clean_camera_type)

In [19]:
df["type"].value_counts()

point shoot                                      149
dslr                                              52
camcorder                                         49
mirrorless                                        15
advanced point shoot                              15
sports action                                     10
instant                                            4
slr                                                2
video camera                                       2
lens style camera                                  1
micro four thirds interchangeable lens system      1
digital interchangeable lens                       1
Name: type, dtype: int64

In [20]:
df.head()

Unnamed: 0,spec_id,lcd screen size,brand,type,optical sensor resolution in megapixel,image display resolution,weight,dimensions
0,www.price-hunt.com//840,2.7,canon,advanced point shoot,16.0 Megapixels,"230,000 Dots","117 g (Body Only), 133 g (with Battery and Meo...",92.9 (w) x 52.4 (h) x 21.6 (d) mm
1,www.price-hunt.com//9603,2.7,sony,,14.1 mp,,108,
2,www.price-hunt.com//9809,2.7,samsung,point shoot,14.2 Megapixels,"230,000 Dots",110 g,") x 18.8 (d) mm, 94.0 (w) x 54.5 (h"
3,www.price-hunt.com//940,3.0,nikon,point shoot,16.1 Megapixels,"921,000 Dots",494 g (with Battery and SD Memory Card),") x 98.2 (d) mm, 122.8 (w) x 84.1 (h"
4,www.price-hunt.com//9777,2.7,kodak,point shoot,16 Megapixels,"230,000 Dots",116 g,


## Optical sensor resolution in megapixel

In [21]:
def megapixels_metrics(value):
    if pd.isna(value):
        return value
    else:
        return word_tokenize(value)[1]

In [22]:
def clean_megapixels(value):
    if pd.isna(value):
        return value
    else:
        return round(float(word_tokenize(value)[0]), 1)

In [23]:
df["optical sensor resolution in megapixel"].apply(megapixels_metrics).value_counts()

Megapixels    294
mp             20
Name: optical sensor resolution in megapixel, dtype: int64

In [24]:
df["optical sensor resolution in megapixel"] = df["optical sensor resolution in megapixel"].apply(clean_megapixels)

## Image display resolution

In [25]:
df["image display resolution"].value_counts()

230,000 Dots        92
460,000 Dots        31
921,600 Dots        22
230,400 Dots        17
921,000 Dots        16
460,800 Dots        15
461,000 Dots         9
000 Dots, 230        8
921, 000 Dots        6
1,228,800 Dots       6
1,040,000 Dots       6
460, 000 Dots        5
400 Dots, 230        4
230, 000 Dots        4
123,200 Dots         3
000 Dots, 460        3
460, 800 Dots        3
460000 Dots          2
1,037,000 Dots       2
112,000 Dots         2
920,000 Dots         2
614,000 Dots         2
461, 000 Dots        2
000 Dots, 921        2
922,000 Dots         2
230400 Dots          2
288,000 Dots         1
1,036,800 Dots       1
641,000 Dots         1
920, 000 Dots        1
110,000 Dots         1
1,152,000 Dots       1
040, 1, 000 Dots     1
921, 600 Dots        1
460,000dots          1
211,200 Dots         1
819,000 Dots         1
1,230,000 Dots       1
230, 400 Dots        1
922, 000 Dots        1
Name: image display resolution, dtype: int64

In [26]:
def clean_dots(value):
    if pd.isna(value):
        return value
    else:
        value = ' '.join(value.split())
        value = value.replace(",", "")
        value = value.replace("Dots", "d")
        value = value.replace("dots", "d")
        if bool(re.match(r"[0-9]* [0-9]* d", value)):
            return value.replace(" ", "")
        elif bool(re.match(r"[0-9]* d [0-9]*", value)):
            splitted = value.split()
            return splitted[2] + splitted[0] + splitted[1]            
        elif bool(re.match(r"[0-9]* d", value)):
            return value.replace(" ", "")
                
        return value.replace(" ", "")

In [27]:
df["image display resolution"] = df["image display resolution"].apply(clean_dots)

In [28]:
df.head()

Unnamed: 0,spec_id,lcd screen size,brand,type,optical sensor resolution in megapixel,image display resolution,weight,dimensions
0,www.price-hunt.com//840,2.7,canon,advanced point shoot,16.0,230000d,"117 g (Body Only), 133 g (with Battery and Meo...",92.9 (w) x 52.4 (h) x 21.6 (d) mm
1,www.price-hunt.com//9603,2.7,sony,,14.1,,108,
2,www.price-hunt.com//9809,2.7,samsung,point shoot,14.2,230000d,110 g,") x 18.8 (d) mm, 94.0 (w) x 54.5 (h"
3,www.price-hunt.com//940,3.0,nikon,point shoot,16.1,921000d,494 g (with Battery and SD Memory Card),") x 98.2 (d) mm, 122.8 (w) x 84.1 (h"
4,www.price-hunt.com//9777,2.7,kodak,point shoot,16.0,230000d,116 g,


## Weight

In [29]:
df["weight"].value_counts()

210 g                                                                           7
100                                                                             6
589.67 g (Without Battery and Removable Memory), 649.20 g (Loaded and Ready)    4
480 g (Cipa Guidelines)                                                         4
269 g (with Battery and Memory Stick pro Duo)                                   3
                                                                               ..
113 g (with Battery and Memory Stick), 97 g (Body Only)                         1
765 g (with Battery and Meomry Card), 675 g (Body Only)                         1
218 g (Cipa Guideline Compliant, Including Batteries, Card)                     1
160 g (Without Battery and SD Meomry Card)                                      1
494 g (with Battery and SD Memory Card)                                         1
Name: weight, Length: 257, dtype: int64

In [30]:
def clean_weight(value):
    if pd.isna(value):
        return value
    else:
        tokenized = word_tokenize(value)
        if len(tokenized) >= 2:
            return int(round(float(tokenized[0].replace("g", ""))))
        else:
            return int(round(float(value)))

In [31]:
df["weight"] = df["weight"].apply(clean_weight)

In [32]:
df.head()

Unnamed: 0,spec_id,lcd screen size,brand,type,optical sensor resolution in megapixel,image display resolution,weight,dimensions
0,www.price-hunt.com//840,2.7,canon,advanced point shoot,16.0,230000d,117.0,92.9 (w) x 52.4 (h) x 21.6 (d) mm
1,www.price-hunt.com//9603,2.7,sony,,14.1,,108.0,
2,www.price-hunt.com//9809,2.7,samsung,point shoot,14.2,230000d,110.0,") x 18.8 (d) mm, 94.0 (w) x 54.5 (h"
3,www.price-hunt.com//940,3.0,nikon,point shoot,16.1,921000d,494.0,") x 98.2 (d) mm, 122.8 (w) x 84.1 (h"
4,www.price-hunt.com//9777,2.7,kodak,point shoot,16.0,230000d,116.0,


## Dimensions

In [33]:
df["dimensions"].value_counts()

5.10 (w) x 3.93 (h) x 3.07 (d) Inch       4
129.54 (w) x 96.52 (h) x 71.12 (d) mm     4
128.6 (w) x 95.5 (h) x 77.7 (d) mm        4
53 (w) x 60 (h) x 114 (d) mm              3
126.9 (w) x 94.4 (h) x 48.2 (d) mm        3
                                         ..
) x 16.90 (d) mm, 90.05 (w) x 52.60 (h    1
) x 22.2 (d) mm, 103.7 (w) x 62.5 (h      1
205 (w) x 217 (h) x 479 (d) mm            1
) x 19 (d) mm, 129 (w) x 71 (h            1
92.2 (w) x 52.0 (h, ) x 19.1 (d) mm       1
Name: dimensions, Length: 237, dtype: int64

In [34]:
print(bool(re.match(r"[0-9]+[.][0-9]+ \(w\) x [0-9]+[.][0-9]+ \(h\) x [0-9]+[.][0-9]+ \(d\) mm", "85.8 (w) x 53.5 (h) x 19.8 (d) mm")))

True


In [35]:
first_regex = r"[0-9]+[.][0-9]+ \(w\) x [0-9]+[.][0-9]+ \(h\) x [0-9]+[.][0-9]+ \(d\) mm" #69
second_regex = r"\) x [0-9]+[.][0-9]+ \(d\) mm, [0-9]+[.][0-9]+ \(w\) x [0-9]+[.][0-9]+ \(h" #57

In [36]:
df.isna().sum()

spec_id                                    0
lcd screen size                           12
brand                                      0
type                                      26
optical sensor resolution in megapixel    13
image display resolution                  45
weight                                    16
dimensions                                59
dtype: int64

In [37]:
sum(df["dimensions"].apply(lambda dim : bool(re.match(r"[0-9]+[.][0-9]+ \(w\) x [0-9]+[.][0-9]+ \(h\) x [0-9]+[.][0-9]+ \(d\) mm", str(dim)))))

69

In [38]:
sum(df["dimensions"].apply(lambda dim : bool(re.match(r"\) x [0-9]+[.][0-9]+ \(d\) mm, [0-9]+[.][0-9]+ \(w\) x [0-9]+[.][0-9]+ \(h", str(dim)))))

57

In [39]:
def mm_to_inches(value):
    if pd.isna(value):
        return value
    else:
        return round(float(value) / 25.4, 1)

In [40]:
def clean_dimensions_regex(value):
    first_regex = r"([0-9]+[.]?[0-9]+?) \(w\) x ([0-9]+[.]?[0-9]+?) \(h\) x ([0-9]+[.]?[0-9]+?) \(d\) mm" #69
    second_regex = r"\) x ([0-9]+[.]?[0-9]+?) \(d\) mm, ([0-9]+[.]?[0-9]+?) \(w\) x ([0-9]+[.]?[0-9]+?) x?[ ]?\(h" #79
    third_regex = r"([0-9]+[.]?[0-9]+?) \(w\) x ([0-9]+[.]?[0-9]+?) \(h, \) x ([0-9]+[.]?[0-9]+?) \(d\) mm" #56
    fourth_regex = r"([0-9]+[.]?[0-9]+?) \(w\) x?[ ]?([0-9]+[.]?[0-9]+?) \(h,?[ ]?\),? x?[ ]?([0-9]+[.]?[0-9]+?) [(]?(d|l)[)][ ]?m?m?"#14
    final_regex = r"([0-9]+[.]?[0-9]+?) x ([0-9]+[.]?[0-9]+?) x ([0-9]+[.]?[0-9]+?) mm"
    if pd.isna(value):
        return value
    else:
        if re.match(first_regex, value):
            groups = re.match(first_regex, value)
            dimension = str(mm_to_inches(groups.group(1))) + "h" + str(mm_to_inches(groups.group(2))) + "w" + str(mm_to_inches(groups.group(3))) + "d"
            return dimension
        elif re.match(second_regex, value):
            groups = re.match(second_regex, value)
            dimension = str(mm_to_inches(groups.group(3))) + "h" + str(mm_to_inches(groups.group(2))) + "w" + str(mm_to_inches(groups.group(1))) + "d"
            return dimension
        elif re.match(third_regex, value):
            groups = re.match(third_regex, value)
            dimension = str(mm_to_inches(groups.group(2))) + "h" + str(mm_to_inches(groups.group(1))) + "w" + str(mm_to_inches(groups.group(3))) + "d"
            return dimension
        elif re.match(fourth_regex, value):
            groups = re.match(fourth_regex, value)
            dimension = str(mm_to_inches(groups.group(2))) + "h" + str(mm_to_inches(groups.group(1))) + "w" + str(mm_to_inches(groups.group(3))) + "d"
            return dimension
        elif re.match(final_regex, value):
            groups = re.match(final_regex, value)
            dimension = str(mm_to_inches(groups.group(2))) + "h" + str(mm_to_inches(groups.group(1))) + "w" + str(mm_to_inches(groups.group(3))) + "d"
            return dimension
        else:
            return value

In [41]:
df["dimensions"] = df["dimensions"].apply(clean_dimensions_regex)

## Saving

In [42]:
df.rename(columns={'lcd screen size': 'screen_size', 'optical sensor resolution in megapixel': 'megapixels', 'image display resolution': 'dots'}, inplace=True)
df.head()

Unnamed: 0,spec_id,screen_size,brand,type,megapixels,dots,weight,dimensions
0,www.price-hunt.com//840,2.7,canon,advanced point shoot,16.0,230000d,117.0,3.7h2.1w0.9d
1,www.price-hunt.com//9603,2.7,sony,,14.1,,108.0,
2,www.price-hunt.com//9809,2.7,samsung,point shoot,14.2,230000d,110.0,2.1h3.7w0.7d
3,www.price-hunt.com//940,3.0,nikon,point shoot,16.1,921000d,494.0,3.3h4.8w3.9d
4,www.price-hunt.com//9777,2.7,kodak,point shoot,16.0,230000d,116.0,


In [43]:
df.to_csv("../../datasets/unlabeled/cleaned/pricehunt.csv", index=False)