In [9]:
import os
import numpy as np
import json
import pandas as pd
import itertools
from tqdm import tqdm
import nltk
from nltk.tokenize import word_tokenize
import matplotlib.pyplot as plt

In [2]:
def create_dataframe(dataset_path, source):
    """Function used to create a Pandas DataFrame containing specifications page titles

    Reads products specifications from the file system ("dataset_path" variable in the main function) and creates a Pandas DataFrame where each row is a
    specification. The columns are 'source' (e.g. www.sourceA.com), 'spec_number' (e.g. 1) and the 'page title'. Note that this script will consider only
    the page title attribute for simplicity.

    Args:
        dataset_path (str): The path to the dataset

    Returns:
        df (pd.DataFrame): The Pandas DataFrame containing specifications and page titles
    """

    print('>>> Creating dataframe...\n')
    columns_df = ['source', 'spec_number', 'spec_id', 'page_title']
    
    df = pd.DataFrame()
    progressive_id = 0
    progressive_id2row_df = {}
    #for source in tqdm(os.listdir(dataset_path)):
    #source = "buy.net"
    for specification in os.listdir(os.path.join(dataset_path, source)):
        specification_number = specification.replace('.json', '')
        specification_id = '{}//{}'.format(source, specification_number)
        with open(os.path.join(dataset_path, source, specification)) as specification_file:
            columns_df = ['source', 'spec_number', 'spec_id']
            specification_data = json.load(specification_file)
            attrs = []
            for k, v in specification_data.items():
                columns_df.append(k)
                attrs.append(v)
            row = [source, specification_number, specification_id]
            row.extend(attrs)
            progressive_id2row_df.update({progressive_id: row})
            progressive_id += 1
            df = df.append(pd.DataFrame.from_dict(progressive_id2row_df, orient='index', columns=columns_df))
            progressive_id2row_df = {}
    #print(df)
    print('>>> Dataframe created successfully!\n')
    return df

In [3]:
df = create_dataframe('../datasets/unlabeled/2013_camera_specs', "buy.net")

>>> Creating dataframe...



of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


>>> Dataframe created successfully!



In [4]:
df.head()

Unnamed: 0,<page title>,analog video out,autofocus points,battery builtin,battery include,battery model supported,battery rechargeable,brand name,builtin flash,bulb setting,...,video capture format,viewfinder type,warranty information,weight,white balance,white balance modes,wide angle,width,wifi,you are reviewing
0,Canon PowerShot SX170 IS Red Digital Camera (1...,,,,,,,,,,...,,,1 Year,8.04 oz,,,,4.3 in,,
1,"Leica V-LUX 4 Black Digital Camera (12.1 MP, 2...",,,,,,,,,,...,,,,1.30 lbs,,,,4.9 in,,
2,Olympus Stylus SH-1 Silver Digital Camera (16 ...,,,,,,,,,,...,,,1 year(s),9.5 oz,,,,4.3 in,Yes,
3,Panasonic Lumix DMC-FX48 12.1 Megapixel Compac...,,,,,,,,,,...,,,1 Year Limited,4.48 oz,,,,3.8 in,,
4,"Olympus VR-370 Black Digital Camera (16 MP, 12...",,,,,,,,,,...,,,1 Year,6.07 oz,,,,4.1 in,,


In [5]:
cols = ["spec_id", "lcd screen size", "camera type", "effective megapixels", "height", "width", "depth", "weight", "total pixels"]

In [6]:
df = df[cols]

### Screen size

In [7]:
df.head()

Unnamed: 0,spec_id,lcd screen size,camera type,effective megapixels,height,width,depth,weight,total pixels
0,buy.net//6036,3 in,Compact Camera,16 Megapixel,2.8 in,4.3 in,1.7 in,8.04 oz,16600000.0
1,buy.net//5860,3 in,Bridge Camera,12.1 Megapixel,3.4 in,4.9 in,4.3 in,1.30 lbs,12800000.0
2,buy.net//5925,3 in,Point & Shoot Digital Camera,16000000 pixels,2.5 in,4.3 in,1.7 in,9.5 oz,
3,buy.net//6061,2.5 in,Compact Camera,12.1 Megapixel,2.1 in,3.8 in,0.9 in,4.48 oz,12700000.0
4,buy.net//5837,3 in,Compact Camera,16 Megapixel,2.4 in,4.1 in,1.1 in,6.07 oz,16500000.0


In [8]:
screen_size = df["lcd screen size"]

In [10]:
def clean_size(value):
    if pd.isna(value):
        return value
    else:
        return word_tokenize(value)[1]

In [11]:
#screen size values FIX
pd.Series(screen_size.apply(clean_size)).value_counts()

in    334
Name: lcd screen size, dtype: int64

In [12]:
sum(pd.isna(screen_size))

24

In [13]:
def keep_inches(value):
    if pd.isna(value):
        return value
    else:
        return word_tokenize(value)[0]

In [14]:
df["lcd screen size"] = df["lcd screen size"].apply(keep_inches)

In [15]:
df.head()

Unnamed: 0,spec_id,lcd screen size,camera type,effective megapixels,height,width,depth,weight,total pixels
0,buy.net//6036,3.0,Compact Camera,16 Megapixel,2.8 in,4.3 in,1.7 in,8.04 oz,16600000.0
1,buy.net//5860,3.0,Bridge Camera,12.1 Megapixel,3.4 in,4.9 in,4.3 in,1.30 lbs,12800000.0
2,buy.net//5925,3.0,Point & Shoot Digital Camera,16000000 pixels,2.5 in,4.3 in,1.7 in,9.5 oz,
3,buy.net//6061,2.5,Compact Camera,12.1 Megapixel,2.1 in,3.8 in,0.9 in,4.48 oz,12700000.0
4,buy.net//5837,3.0,Compact Camera,16 Megapixel,2.4 in,4.1 in,1.1 in,6.07 oz,16500000.0


### Effective Megapixels

In [16]:
#screen size values
pd.Series(df["effective megapixels"].apply(clean_size)).value_counts()

Megapixel    193
pixels       139
Name: effective megapixels, dtype: int64

In [17]:
def pixels_to_megapixels(value):
    if pd.isna(value):
        return value
    else:
        metric = word_tokenize(value)[1]
        if metric == "pixels":
            to_convert = word_tokenize(value)[0].replace(",", "")
            return float((to_convert)) / (10 ** 6)
        else:
            return float(word_tokenize(value)[0])

In [18]:
df["effective megapixels"] = df["effective megapixels"].apply(pixels_to_megapixels).head()

In [19]:
df.head()

Unnamed: 0,spec_id,lcd screen size,camera type,effective megapixels,height,width,depth,weight,total pixels
0,buy.net//6036,3.0,Compact Camera,16.0,2.8 in,4.3 in,1.7 in,8.04 oz,16600000.0
1,buy.net//5860,3.0,Bridge Camera,12.1,3.4 in,4.9 in,4.3 in,1.30 lbs,12800000.0
2,buy.net//5925,3.0,Point & Shoot Digital Camera,16.0,2.5 in,4.3 in,1.7 in,9.5 oz,
3,buy.net//6061,2.5,Compact Camera,12.1,2.1 in,3.8 in,0.9 in,4.48 oz,12700000.0
4,buy.net//5837,3.0,Compact Camera,16.0,2.4 in,4.1 in,1.1 in,6.07 oz,16500000.0


### Height width depth

In [20]:
pd.Series(df["height"].apply(clean_size)).value_counts()

in    316
mm      2
Name: height, dtype: int64

In [21]:
pd.Series(df["width"].apply(clean_size)).value_counts()

in    316
mm      2
Name: width, dtype: int64

In [22]:
pd.Series(df["depth"].apply(clean_size)).value_counts()

in    315
mm      2
Name: depth, dtype: int64

In [23]:
for index, row in df.iterrows():
    if pd.isna(row["height"]) or pd.isna(row["width"]) or pd.isna(row["depth"]):
        df.at[index, "height"] = np.nan
        df.at[index, "width"] = np.nan
        df.at[index, "depth"] = np.nan

In [24]:
len(df["depth"])

358

In [25]:
def mm_to_inches(value):
    if pd.isna(value):
        return value
    else:
        to_convert = word_tokenize(value)[0]
        metric = word_tokenize(value)[1]
        if metric == "mm":
            return round(float(to_convert) / 25.4, 1)
        else:
            return float(to_convert)

In [26]:
df["height"] = df["height"].apply(mm_to_inches)

In [27]:
df["depth"] = df["depth"].apply(mm_to_inches)
df["width"] = df["width"].apply(mm_to_inches)

In [28]:
df.head()

Unnamed: 0,spec_id,lcd screen size,camera type,effective megapixels,height,width,depth,weight,total pixels
0,buy.net//6036,3.0,Compact Camera,16.0,2.8,4.3,1.7,8.04 oz,16600000.0
1,buy.net//5860,3.0,Bridge Camera,12.1,3.4,4.9,4.3,1.30 lbs,12800000.0
2,buy.net//5925,3.0,Point & Shoot Digital Camera,16.0,2.5,4.3,1.7,9.5 oz,
3,buy.net//6061,2.5,Compact Camera,12.1,2.1,3.8,0.9,4.48 oz,12700000.0
4,buy.net//5837,3.0,Compact Camera,16.0,2.4,4.1,1.1,6.07 oz,16500000.0


In [29]:
def merge_dimensions(row):
    if pd.isna(row["height"]) or pd.isna(row["width"]) or pd.isna(row["depth"]):
        return np.nan
    row["dimension"] = 'h' + str(row["height"]) + 'w' + str(row["width"]) + "d" + str(row["depth"])
    return row

In [30]:
df = df.apply(merge_dimensions, axis = 1)

In [31]:
df.head()

Unnamed: 0,spec_id,lcd screen size,camera type,effective megapixels,height,width,depth,weight,total pixels,dimension
0,buy.net//6036,3.0,Compact Camera,16.0,2.8,4.3,1.7,8.04 oz,16600000.0,h2.8w4.3d1.7
1,buy.net//5860,3.0,Bridge Camera,12.1,3.4,4.9,4.3,1.30 lbs,12800000.0,h3.4w4.9d4.3
2,buy.net//5925,3.0,Point & Shoot Digital Camera,16.0,2.5,4.3,1.7,9.5 oz,,h2.5w4.3d1.7
3,buy.net//6061,2.5,Compact Camera,12.1,2.1,3.8,0.9,4.48 oz,12700000.0,h2.1w3.8d0.9
4,buy.net//5837,3.0,Compact Camera,16.0,2.4,4.1,1.1,6.07 oz,16500000.0,h2.4w4.1d1.1


### Weight

In [32]:
pd.Series(df["weight"].apply(clean_size)).value_counts()

oz     252
lbs     58
g        3
Name: weight, dtype: int64

In [33]:
def weight_to_grams_conversion(value):
    if pd.isna(value):
        return value
    else:
        to_convert = word_tokenize(value)[0]
        metric = word_tokenize(value)[1]
        if metric == "oz":
            return int(round(float(to_convert) * 28.35))
        elif metric == "lbs":
            return int(round(float(to_convert) * 454))
        else:
            return int(round(float(to_convert)))

In [34]:
df["weight"] = df["weight"].apply(weight_to_grams_conversion)

In [35]:
df.head()

Unnamed: 0,spec_id,lcd screen size,camera type,effective megapixels,height,width,depth,weight,total pixels,dimension
0,buy.net//6036,3.0,Compact Camera,16.0,2.8,4.3,1.7,228.0,16600000.0,h2.8w4.3d1.7
1,buy.net//5860,3.0,Bridge Camera,12.1,3.4,4.9,4.3,590.0,12800000.0,h3.4w4.9d4.3
2,buy.net//5925,3.0,Point & Shoot Digital Camera,16.0,2.5,4.3,1.7,269.0,,h2.5w4.3d1.7
3,buy.net//6061,2.5,Compact Camera,12.1,2.1,3.8,0.9,127.0,12700000.0,h2.1w3.8d0.9
4,buy.net//5837,3.0,Compact Camera,16.0,2.4,4.1,1.1,172.0,16500000.0,h2.4w4.1d1.1


### Camera type

In [41]:
def clean_camera_type(camera):
    if pd.isna(camera):
        return camera
    else:
        punctuation = "!#$%&'()*+,-./:;<=>?@[\]^_`{|}~€£¥₹₽"
        for p in punctuation:
            camera = camera.replace(p, "")
        return camera.lower()

In [42]:
df["camera type"] = df["camera type"].apply(clean_camera_type)

In [43]:
df.head()

Unnamed: 0,spec_id,lcd screen size,camera type,effective megapixels,weight,dimension
0,buy.net//6036,3.0,compact camera,16.0,228.0,h2.8w4.3d1.7
1,buy.net//5860,3.0,bridge camera,12.1,590.0,h3.4w4.9d4.3
2,buy.net//5925,3.0,point shoot digital camera,16.0,269.0,h2.5w4.3d1.7
3,buy.net//6061,2.5,compact camera,12.1,127.0,h2.1w3.8d0.9
4,buy.net//5837,3.0,compact camera,16.0,172.0,h2.4w4.1d1.1


### Final cleaning

In [44]:
df = df.drop(["height", "width", "depth", "total pixels"], axis = 1)

KeyError: "['height' 'width' 'depth' 'total pixels'] not found in axis"

In [45]:
df.head()

Unnamed: 0,spec_id,lcd screen size,camera type,effective megapixels,weight,dimension
0,buy.net//6036,3.0,compact camera,16.0,228.0,h2.8w4.3d1.7
1,buy.net//5860,3.0,bridge camera,12.1,590.0,h3.4w4.9d4.3
2,buy.net//5925,3.0,point shoot digital camera,16.0,269.0,h2.5w4.3d1.7
3,buy.net//6061,2.5,compact camera,12.1,127.0,h2.1w3.8d0.9
4,buy.net//5837,3.0,compact camera,16.0,172.0,h2.4w4.1d1.1


In [46]:
df.to_csv("../datasets/unlabeled/cleaned/buy_net.csv", index=False)