In [1]:
import os
import numpy as np
import json
import pandas as pd
import itertools
from tqdm import tqdm
import nltk
from nltk.tokenize import word_tokenize
import re
import matplotlib.pyplot as plt

In [2]:
def create_dataframe(dataset_path, source):
    """Function used to create a Pandas DataFrame containing specifications page titles

    Reads products specifications from the file system ("dataset_path" variable in the main function) and creates a Pandas DataFrame where each row is a
    specification. The columns are 'source' (e.g. www.sourceA.com), 'spec_number' (e.g. 1) and the 'page title'. Note that this script will consider only
    the page title attribute for simplicity.

    Args:
        dataset_path (str): The path to the dataset

    Returns:
        df (pd.DataFrame): The Pandas DataFrame containing specifications and page titles
    """

    print('>>> Creating dataframe...\n')
    columns_df = ['source', 'spec_number', 'spec_id', 'page_title']
    
    df = pd.DataFrame()
    progressive_id = 0
    progressive_id2row_df = {}
    #for source in tqdm(os.listdir(dataset_path)):
    #source = "buy.net"
    for specification in os.listdir(os.path.join(dataset_path, source)):
        specification_number = specification.replace('.json', '')
        specification_id = '{}//{}'.format(source, specification_number)
        with open(os.path.join(dataset_path, source, specification)) as specification_file:
            columns_df = ['source', 'spec_number', 'spec_id']
            specification_data = json.load(specification_file)
            attrs = []
            for k, v in specification_data.items():
                columns_df.append(k)
                attrs.append(v)
            row = [source, specification_number, specification_id]
            row.extend(attrs)
            progressive_id2row_df.update({progressive_id: row})
            progressive_id += 1
            df = df.append(pd.DataFrame.from_dict(progressive_id2row_df, orient='index', columns=columns_df))
            progressive_id2row_df = {}
    #print(df)
    print('>>> Dataframe created successfully!\n')
    return df

In [3]:
df = create_dataframe('../../datasets/unlabeled/2013_camera_specs', "cammarkt.com")

>>> Creating dataframe...



of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


>>> Dataframe created successfully!



In [4]:
df.head()

Unnamed: 0,35mm zoom lens,3d digital camera,<page title>,additional features,aperture range,auto focus type,battery life,battery type,brand,builtin microphone,...,video resolutions,video speed,viewfinder,viewfinder field coverage,viewfinder magnification,viewfinder type,warranty,weight,white balance,width
0,,,CamMarkt | Pentax K-5 IIs 16.3 MP DSLR Camera ...,,,,,,Pentax,SDHC Card\nSDXC Card\nWith Built-in Microphone...,...,640 x 480 (VGA)\nA CCD sensor will provide a s...,,Digital\nLCD stands for 'Liquid Crystal Displa...,,,,,"[739.35 gr, 1.63 lb., 0 pounds, 1.63 lb.]",Auto\nThe shutter speed of a camera depends on...,"[5.2 in., 13.2 cm\nA viewfinder is the small s..."
1,,,CamMarkt | Nikon D5200 24.1 MP CMOS Digital SL...,,,,,,Nikon,,...,,,,,,,,"[1,360.78 gr, 3 lb.\nA CCD sensor will provide...",,
2,,,CamMarkt | Canon EOS Rebel T3i 18 MP CMOS APS-...,,,,,,Canon,,...,,,,,,,,[1.14 lb.\nThe sensor in a digital camera is c...,,
3,,,CamMarkt | Pentax K-30 Weather-Sealed 16 MP CM...,,,,,,Pentax,SDHC Card\nSDXC Card\nWith Built-in Microphone...,...,,,Optical (Through-the-lens)\nLCD stands for 'Li...,,,,1 year parts and labor,1.4 pounds,,[13 cm\nA viewfinder is the small square on th...
4,,,CamMarkt | Nikon 1 J3 14.2 MP HD Digital Camer...,,,,,,Nikon,,...,,,,,,,,[1.19 lb.\nThe optical zoom ratio of a digital...,,


In [5]:
cols = ["spec_id", 'brand', "manufacturer", 'weight', "lcd screen size", "height", "depth", "width"]

In [6]:
df = df[cols]

In [7]:
df.head()

Unnamed: 0,spec_id,brand,manufacturer,weight,lcd screen size,height,depth,width
0,cammarkt.com//457,Pentax,Pentax,"[739.35 gr, 1.63 lb., 0 pounds, 1.63 lb.]",3 in.,"[3.8 in., 9.7 cm]","[2.9 in., 7.4 cm]","[5.2 in., 13.2 cm\nA viewfinder is the small s..."
1,cammarkt.com//292,Nikon,Nikon,"[1,360.78 gr, 3 lb.\nA CCD sensor will provide...",3 in.,,,
2,cammarkt.com//504,Canon,Canon,[1.14 lb.\nThe sensor in a digital camera is c...,3 in.,,,
3,cammarkt.com//441,Pentax,Pentax,1.4 pounds,,"[3.8 in., 9.7 cm]","[2.8 in., 7.1 cm]",[13 cm\nA viewfinder is the small square on th...
4,cammarkt.com//103,Nikon,Nikon,[1.19 lb.\nThe optical zoom ratio of a digital...,,,,


## Brand

In [8]:
df["brand"].value_counts()

Nikon                 50
Pentax                46
Canon                 42
Olympus               12
[Pentax, Pentax]       9
[Canon, Canon]         8
Sony                   8
[Nikon, Nikon]         8
Panasonic              4
[Olympus, Olympus]     2
[NikonEu, Nikon]       1
[Focus, Sony]          1
Blackmagic Design      1
Sigma                  1
Leica                  1
Samsung                1
[Sony, Sony]           1
Opteka                 1
[Samsung, Samsung]     1
Name: brand, dtype: int64

In [9]:
def clean_brand(value):
    if isinstance(value, list):
        return value[1].lower()
    else:
        value = value.replace(" design", "")
        return value.lower()

In [10]:
df["brand"] = df["brand"].apply(clean_brand)

In [11]:
df["brand"].value_counts()

nikon                59
pentax               55
canon                50
olympus              14
sony                 10
panasonic             4
samsung               2
leica                 1
sigma                 1
opteka                1
blackmagic design     1
Name: brand, dtype: int64

In [12]:
df.head()

Unnamed: 0,spec_id,brand,manufacturer,weight,lcd screen size,height,depth,width
0,cammarkt.com//457,pentax,Pentax,"[739.35 gr, 1.63 lb., 0 pounds, 1.63 lb.]",3 in.,"[3.8 in., 9.7 cm]","[2.9 in., 7.4 cm]","[5.2 in., 13.2 cm\nA viewfinder is the small s..."
1,cammarkt.com//292,nikon,Nikon,"[1,360.78 gr, 3 lb.\nA CCD sensor will provide...",3 in.,,,
2,cammarkt.com//504,canon,Canon,[1.14 lb.\nThe sensor in a digital camera is c...,3 in.,,,
3,cammarkt.com//441,pentax,Pentax,1.4 pounds,,"[3.8 in., 9.7 cm]","[2.8 in., 7.1 cm]",[13 cm\nA viewfinder is the small square on th...
4,cammarkt.com//103,nikon,Nikon,[1.19 lb.\nThe optical zoom ratio of a digital...,,,,


## Manufacturer

In [13]:
df["manufacturer"].value_counts()

Nikon                      54
Pentax                     53
Canon                      48
Olympus                    14
Sony                       10
NIKO9                       4
Panasonic                   4
PENX9                       1
Pentax-Ricoh                1
Samsung                     1
Canon Cameras, US           1
Samsung Pleomax - Zirex     1
CANU9                       1
Blackmagic                  1
Leica                       1
NikonEu                     1
SIGMA                       1
Opteka                      1
Name: manufacturer, dtype: int64

In [14]:
def clean_manufacturer(camera):
    if pd.isna(camera):
        return camera
    else:
        punctuation = "!#$%&'()*+,-./:;<=>?@[\]^_`{|}~€£¥₹₽"
        for p in punctuation:
            camera = camera.replace(p, "")
        camera = ' '.join(camera.split())
        return camera.lower()

In [15]:
df["manufacturer"] = df["manufacturer"].apply(clean_manufacturer)

In [16]:
df.head()

Unnamed: 0,spec_id,brand,manufacturer,weight,lcd screen size,height,depth,width
0,cammarkt.com//457,pentax,pentax,"[739.35 gr, 1.63 lb., 0 pounds, 1.63 lb.]",3 in.,"[3.8 in., 9.7 cm]","[2.9 in., 7.4 cm]","[5.2 in., 13.2 cm\nA viewfinder is the small s..."
1,cammarkt.com//292,nikon,nikon,"[1,360.78 gr, 3 lb.\nA CCD sensor will provide...",3 in.,,,
2,cammarkt.com//504,canon,canon,[1.14 lb.\nThe sensor in a digital camera is c...,3 in.,,,
3,cammarkt.com//441,pentax,pentax,1.4 pounds,,"[3.8 in., 9.7 cm]","[2.8 in., 7.1 cm]",[13 cm\nA viewfinder is the small square on th...
4,cammarkt.com//103,nikon,nikon,[1.19 lb.\nThe optical zoom ratio of a digital...,,,,


## Weight

In [17]:
def clean_weight_regex(value):
    regex = r"([0-9]+([.][0-9]+)?) ([A-Z]+)*"
    if pd.isna(value):
        return value
    else:
        if re.match(regex, value):
            groups = re.match(regex, value)
        
            amount = groups.group(1)
            unit = value.split("\n")[0].split()[1].replace('.', "")
            unit = unit.lower()
            if unit == "pounds" or unit == "lb":
                return int(round(float(amount) * 454))
            elif unit == "oz":
                return int(round(float(amount) * 28.35))
            else:
                return int(round(float(amount)))
        else:
            value = int(round(float(value.split()[0].replace(",", ""))))
            return value

In [18]:
df["weight"] = df["weight"].apply(lambda shit : shit[0] if isinstance(shit, list) else shit)

In [19]:
df["weight"] = df["weight"].apply(clean_weight_regex)

In [20]:
df.head()

Unnamed: 0,spec_id,brand,manufacturer,weight,lcd screen size,height,depth,width
0,cammarkt.com//457,pentax,pentax,739,3 in.,"[3.8 in., 9.7 cm]","[2.9 in., 7.4 cm]","[5.2 in., 13.2 cm\nA viewfinder is the small s..."
1,cammarkt.com//292,nikon,nikon,1361,3 in.,,,
2,cammarkt.com//504,canon,canon,518,3 in.,,,
3,cammarkt.com//441,pentax,pentax,636,,"[3.8 in., 9.7 cm]","[2.8 in., 7.1 cm]",[13 cm\nA viewfinder is the small square on th...
4,cammarkt.com//103,nikon,nikon,540,,,,


## Lcd screen size

In [21]:
df["lcd screen size"].value_counts()

3 in.             88
2.5 in.           20
2.7 in.           14
3.2 in.            7
1.8 in.            7
2 in.              3
[3 in., 3 in.]     1
7.6 in.            1
Name: lcd screen size, dtype: int64

In [22]:
def clean_screen_size(value):
    if not isinstance(value, list) and pd.isna(value):
        return value
    else:
        if isinstance(value, list):
            return float(value[0].split()[0])
        else:
            return float(value.split()[0])

In [23]:
df["lcd screen size"] = df["lcd screen size"].apply(clean_screen_size)

In [24]:
df.head()

Unnamed: 0,spec_id,brand,manufacturer,weight,lcd screen size,height,depth,width
0,cammarkt.com//457,pentax,pentax,739,3.0,"[3.8 in., 9.7 cm]","[2.9 in., 7.4 cm]","[5.2 in., 13.2 cm\nA viewfinder is the small s..."
1,cammarkt.com//292,nikon,nikon,1361,3.0,,,
2,cammarkt.com//504,canon,canon,518,3.0,,,
3,cammarkt.com//441,pentax,pentax,636,,"[3.8 in., 9.7 cm]","[2.8 in., 7.1 cm]",[13 cm\nA viewfinder is the small square on th...
4,cammarkt.com//103,nikon,nikon,540,,,,


## Height

In [25]:
df["height"].value_counts()

[9.7 cm, 3.8 in.]                                                                               20
[3.8 in., 9.7 cm]                                                                               13
[4.2 in., 10.7 cm]                                                                               7
[3.6 in., 9.1 cm]                                                                                6
[3.7 in., 9.4 cm]                                                                                6
[9.1 cm, 3.6 in.]                                                                                4
3.9 in.\nDisplay Rotation                                                                        4
[9.4 cm, 3.71 in.]                                                                               3
[6.6 cm, 2.6 in.]                                                                                3
4.2 in.\nDiagonal Screen Size is the size of your TV screen, measured diagonally in inches.      3
3.8 in.\nD

In [26]:
def clean_dim(value):
    if not isinstance(value, list) and pd.isna(value):
        return value
    else:
        if isinstance(value, list):
            for e in value:
                if "in." in e:
                    return round(float(e.split()[0]), 1)
        else:
            return round(float(value.split("\n")[0].split()[0]), 1)

In [27]:
df["height"] = df["height"].apply(clean_dim)

In [28]:
df.head()

Unnamed: 0,spec_id,brand,manufacturer,weight,lcd screen size,height,depth,width
0,cammarkt.com//457,pentax,pentax,739,3.0,3.8,"[2.9 in., 7.4 cm]","[5.2 in., 13.2 cm\nA viewfinder is the small s..."
1,cammarkt.com//292,nikon,nikon,1361,3.0,,,
2,cammarkt.com//504,canon,canon,518,3.0,,,
3,cammarkt.com//441,pentax,pentax,636,,3.8,"[2.8 in., 7.1 cm]",[13 cm\nA viewfinder is the small square on th...
4,cammarkt.com//103,nikon,nikon,540,,,,


## Depth

In [30]:
df["depth"] = df["depth"].apply(clean_dim)

In [31]:
df.head()

Unnamed: 0,spec_id,brand,manufacturer,weight,lcd screen size,height,depth,width
0,cammarkt.com//457,pentax,pentax,739,3.0,3.8,2.9,"[5.2 in., 13.2 cm\nA viewfinder is the small s..."
1,cammarkt.com//292,nikon,nikon,1361,3.0,,,
2,cammarkt.com//504,canon,canon,518,3.0,,,
3,cammarkt.com//441,pentax,pentax,636,,3.8,2.8,[13 cm\nA viewfinder is the small square on th...
4,cammarkt.com//103,nikon,nikon,540,,,,


## Width

In [34]:
df["width"] = df["width"].apply(clean_dim)

In [35]:
df.head()

Unnamed: 0,spec_id,brand,manufacturer,weight,lcd screen size,height,depth,width
0,cammarkt.com//457,pentax,pentax,739,3.0,3.8,2.9,5.2
1,cammarkt.com//292,nikon,nikon,1361,3.0,,,
2,cammarkt.com//504,canon,canon,518,3.0,,,
3,cammarkt.com//441,pentax,pentax,636,,3.8,2.8,5.1
4,cammarkt.com//103,nikon,nikon,540,,,,


In [44]:
df["dimensions"] = df.apply(lambda row : str(row.height) + "h" + str(row.width) + "w" + str(row.depth) + "d" if not pd.isna(row.depth) and not pd.isna(row.height) and not pd.isna(row.width) else np.nan, axis = 1)

In [45]:
df.head()

Unnamed: 0,spec_id,brand,manufacturer,weight,lcd screen size,height,depth,width,dimensions
0,cammarkt.com//457,pentax,pentax,739,3.0,3.8,2.9,5.2,3.8h5.2w2.9d
1,cammarkt.com//292,nikon,nikon,1361,3.0,,,,
2,cammarkt.com//504,canon,canon,518,3.0,,,,
3,cammarkt.com//441,pentax,pentax,636,,3.8,2.8,5.1,3.8h5.1w2.8d
4,cammarkt.com//103,nikon,nikon,540,,,,,


In [46]:
df = df.drop(columns=["height", "depth", "width"], axis = 1)

In [47]:
df.head()

Unnamed: 0,spec_id,brand,manufacturer,weight,lcd screen size,dimensions
0,cammarkt.com//457,pentax,pentax,739,3.0,3.8h5.2w2.9d
1,cammarkt.com//292,nikon,nikon,1361,3.0,
2,cammarkt.com//504,canon,canon,518,3.0,
3,cammarkt.com//441,pentax,pentax,636,,3.8h5.1w2.8d
4,cammarkt.com//103,nikon,nikon,540,,


## Saving

In [48]:
df.to_csv("../../datasets/unlabeled/cleaned/cammarkt.csv", index=False)