In [1]:
import os
import numpy as np
import json
import pandas as pd
import itertools
from tqdm import tqdm
import nltk
from nltk.tokenize import word_tokenize
import matplotlib.pyplot as plt
import re

In [2]:
def create_dataframe(dataset_path, source, columns_df):
    """Function used to create a Pandas DataFrame containing specifications page titles

    Reads products specifications from the file system ("dataset_path" variable in the main function) and creates a Pandas DataFrame where each row is a
    specification. The columns are 'source' (e.g. www.sourceA.com), 'spec_number' (e.g. 1) and the 'page title'. Note that this script will consider only
    the page title attribute for simplicity.

    Args:
        dataset_path (str): The path to the dataset

    Returns:
        df (pd.DataFrame): The Pandas DataFrame containing specifications and page titles
    """

    print('>>> Creating dataframe...\n')

    progressive_id = 0
    progressive_id2row_df = {}
    for specification in os.listdir(os.path.join(dataset_path, source)):
        specification_number = specification.replace('.json', '')
        specification_id = '{}//{}'.format(source, specification_number)
        with open(os.path.join(dataset_path, source, specification)) as specification_file:
            specification_data = json.load(specification_file)
            brand = specification_data.get("brand")
            megapixels = specification_data.get("megapixels")
            screen_size = specification_data.get("screen size")
            weight = specification_data.get("weight")
            row = (specification_id, brand, megapixels, screen_size, weight)
            progressive_id2row_df.update({progressive_id: row})
            progressive_id += 1
    df = pd.DataFrame.from_dict(progressive_id2row_df, orient='index', columns=columns_df)
    print('>>> Dataframe created successfully!\n')
    return df

In [3]:
df = create_dataframe('../../datasets/unlabeled/2013_camera_specs', "www.ebay.com", ["spec_id","brand", "megapixels", "screen size", "weight"])

>>> Creating dataframe...

>>> Dataframe created successfully!



In [4]:
df.head()

Unnamed: 0,spec_id,brand,megapixels,screen size,weight
0,www.ebay.com//56784,Sony,4.0 MP,"1.8""",
1,www.ebay.com//24141,Olympus,14.0 MP,,
2,www.ebay.com//59471,Canon,16.0 MP,"3""",
3,www.ebay.com//47195,Canon,6.3 MP,"1.8""",
4,www.ebay.com//41942,Panasonic,12.0 MP,,


In [5]:
df2=df.copy(deep=True)

### megapixels

In [6]:
df[df['megapixels'].notnull()]

Unnamed: 0,spec_id,brand,megapixels,screen size,weight
0,www.ebay.com//56784,Sony,4.0 MP,"1.8""",
1,www.ebay.com//24141,Olympus,14.0 MP,,
2,www.ebay.com//59471,Canon,16.0 MP,"3""",
3,www.ebay.com//47195,Canon,6.3 MP,"1.8""",
4,www.ebay.com//41942,Panasonic,12.0 MP,,
5,www.ebay.com//54243,Canon,22.3 MP,"3.2""",
6,www.ebay.com//53586,Canon,18.0 MP,,
8,www.ebay.com//45002,Canon,8.0 MP,"2.5""",
9,www.ebay.com//59021,Fujifilm,16.0 MP,"3""",
11,www.ebay.com//48630,Nikon,5.1 MP,"2""",


In [7]:
df=df2.copy(deep=True)

In [8]:
def parse_megapixels(value):
    if(isinstance(value, list)):
        
        value=value[0]
    if pd.isna(value):
        return value
    else:
        match1 = re.search('(\d*\,\d+|\d*\.\d+|\d+)( MP|)', str(value))
        if match1 is None: 
            return float("NaN")


        return (match1.group(1).replace(",","."))

        

In [9]:
df["megapixels"] = df["megapixels"].apply(parse_megapixels)

In [10]:
df.head()

Unnamed: 0,spec_id,brand,megapixels,screen size,weight
0,www.ebay.com//56784,Sony,4.0,"1.8""",
1,www.ebay.com//24141,Olympus,14.0,,
2,www.ebay.com//59471,Canon,16.0,"3""",
3,www.ebay.com//47195,Canon,6.3,"1.8""",
4,www.ebay.com//41942,Panasonic,12.0,,


### Weight

In [11]:
df[df['weight'].notnull()]

Unnamed: 0,spec_id,brand,megapixels,screen size,weight
17,www.ebay.com//46784,,16.0,,20.8 oz. (589g)
19,www.ebay.com//47896,Nikon,6.1,"2""",600 gr
60,www.ebay.com//60120,Canon,12.2,"2.7""",7.1 oz (201 g)
75,www.ebay.com//44886,Canon,18.0,"3""","[15.3 oz. (434g) (body only), 7.1 oz. (200g)]"
129,www.ebay.com//53928,Nikon,10.2,"2.5""",16.1 Oz.
175,www.ebay.com//58372,Sharper Image,,"45""",48.5 lbs
186,www.ebay.com//53247,Canon,18.0,"3""","[1.0 lb (454 g), 23.8 oz / 675 g]"
213,www.ebay.com//57204,Casio,6.0,"2.5""",4.16 Oz.
241,www.ebay.com//45413,Nikon,16.2,"3""",10.6 ounces
275,www.ebay.com//43482,Sony,10.2,"2.7""",18.8 Oz.


In [12]:
def parse_weight(value):
    if (isinstance(value, list)):
        value=value[0]
        

    if pd.isna(value):
        return value
    else:
        value = value.lower()
       
        match1 = re.search('()(\d*\,\d+|\d*\.\d+|\d+)( g|g| kg|kg|lbs| lbs| oz| ounce)', str(value))
        if match1 is None: 
            return float("NaN")




        to_convert= match1.group(2).replace(" ","").replace(",",".")
        metric= match1.group(3).replace(" ","")
        converted=float("NaN")
        if (metric == "oz" or metric== "ounce"):
            converted= int(round(float(to_convert) * 28.35))
        elif metric == "lbs":
            converted= int(round(float(to_convert) * 454))
        elif metric == "kg":
            converted= int(round(float(to_convert) * 1000))
        else:
            converted= int(round(float(to_convert)))

        return converted
        

In [13]:
df["weight"] = df["weight"].apply(parse_weight)

In [14]:
df.head()

Unnamed: 0,spec_id,brand,megapixels,screen size,weight
0,www.ebay.com//56784,Sony,4.0,"1.8""",
1,www.ebay.com//24141,Olympus,14.0,,
2,www.ebay.com//59471,Canon,16.0,"3""",
3,www.ebay.com//47195,Canon,6.3,"1.8""",
4,www.ebay.com//41942,Panasonic,12.0,,


## screen size

In [15]:
df[df['screen size'].notnull()]

Unnamed: 0,spec_id,brand,megapixels,screen size,weight
0,www.ebay.com//56784,Sony,4.0,"1.8""",
2,www.ebay.com//59471,Canon,16.0,"3""",
3,www.ebay.com//47195,Canon,6.3,"1.8""",
5,www.ebay.com//54243,Canon,22.3,"3.2""",
8,www.ebay.com//45002,Canon,8.0,"2.5""",
9,www.ebay.com//59021,Fujifilm,16.0,"3""",
11,www.ebay.com//48630,Nikon,5.1,"2""",
12,www.ebay.com//55901,Leica,10.0,"3""",
13,www.ebay.com//57195,Nikon,16.2,"3""",
15,www.ebay.com//43586,Sanyo,14.0,"3""",


In [16]:
def parse_screen_size(value):
    if (isinstance(value, list)):
        return float("NaN")
    if pd.isna(value):
        return value

    else:
        m = re.search('(\d*\.\d+|\d*\,\d+|\d+)(cm|\"| \"| in|)', str(value))
        if m is None:

            return float("NaN")

        to_convert= m.group(1).replace(" ","").replace(",",".")
        metric= m.group(2).replace(" ","")
        converted=float("NaN")
        if (metric == "cm"):

            converted= int(round(float(to_convert) / 2.54))
        else:
            converted= int(round(float(to_convert)))

        return converted

In [17]:
df["screen size"] = df["screen size"].apply(parse_screen_size)

In [18]:
df.head()

Unnamed: 0,spec_id,brand,megapixels,screen size,weight
0,www.ebay.com//56784,Sony,4.0,2.0,
1,www.ebay.com//24141,Olympus,14.0,,
2,www.ebay.com//59471,Canon,16.0,3.0,
3,www.ebay.com//47195,Canon,6.3,2.0,
4,www.ebay.com//41942,Panasonic,12.0,,


## Brand

In [19]:
df['brand'] = df['brand'].apply(lambda x: str(x).lower())

In [20]:
pd.set_option('display.max_rows', 500)
print(df.brand.value_counts())

canon                                                           3580
nikon                                                           2959
sony                                                            1866
fujifilm                                                         835
olympus                                                          833
kodak                                                            646
samsung                                                          602
panasonic                                                        583
pentax                                                           301
none                                                             290
vivitar                                                          162
casio                                                            147
leica                                                            111
polaroid                                                          99
ge                                

In [21]:
df.loc[df['brand'] == 'blackmagicdesign', 'brand'] = "blackmagic"
df.loc[df['brand'] == 'lg electronics', 'brand'] = "lg"
df.loc[df['brand'] == 'new', 'brand'] = float("NaN")
df.loc[df['brand'] == 'panasonic limix', 'brand'] = "panasonic"
df.loc[df['brand'] == 'unbrand', 'brand'] = float("NaN")
df.loc[df['brand'] == 'dxg technology', 'brand'] = "dxg"
df.loc[df['brand'] == 'unknown', 'brand'] = float("NaN")
df.loc[df['brand'] == 'panasonic / lumix', 'brand'] = "panasonic"
df.loc[df['brand'] == str(['nikon\ntype:\ndigital slr', 'nikon']), 'brand'] = "nikon"
df.loc[df['brand'] == 'canon/japan', 'brand'] = "canon"
df.loc[df['brand'] == '2000 ixla', 'brand'] = float("NaN")
df.loc[df['brand'] == 'insigniaâ¢', 'brand'] = "insignia"
df.loc[df['brand'] == 'vizio, inc.', 'brand'] = "vizio"
df.loc[df['brand'] == str(['nikon megapixels: 12.1 mp', 'nikon\nmegapixels:\n12.1 mp']), 'brand'] = "nikon"
df.loc[df['brand'] == 'motorolathis is a nice, motorola', 'brand'] = "motorola"
df.loc[df['brand'] == str(['kodak', 'kodak'])  , 'brand'] = "kodak"
df.loc[df['brand'] == 'panasonic/lumix ', 'brand'] = "panasonic"
df.loc[df['brand'] == 'unbranded 252 generic', 'brand'] = float("NaN")
df.loc[df['brand'] == 'mamiya afd ii', 'brand'] = float("NaN")
df.loc[df['brand'] == 'unbranded/generic', 'brand'] = float("NaN")
df.loc[df['brand'] == 'sj4000', 'brand'] = float("NaN")
df.loc[df['brand'] == "akai (built by samsung)", 'brand'] = "akai"
df.loc[df['brand'] == 'vivitar & samsung', 'brand'] = "samsung"
df.loc[df['brand'] == '[\'pentax\', \'pentax\']', 'brand'] = "pentax"
df.loc[df['brand'] == '"easy shot" clip', 'brand'] = float("NaN")
df.loc[df['brand'] == 'jazz dv150', 'brand'] = "jazz"
df.loc[df['brand'] == 'spectra merchandising international', 'brand'] = "spectra"
df.loc[df['brand'] == 'vistaquestâ', 'brand'] = "pentax"
df.loc[df['brand'] == 'canon power shot sx130', 'brand'] = 'canon'
df.loc[df['brand'] == '6.0 mp', 'brand'] = float("NaN")
df.loc[df['brand'] == 'olympu', 'brand'] = "olympus"
df.loc[df['brand'] == 'blackmagic design', 'brand'] = "blackmagic"
df.loc[df['brand'] == 'vivitar, kodak, sanyo, nikon', 'brand'] = "vivitar"
df.loc[df['brand'] == 'pentax corporation', 'brand'] = "pentax"
df.loc[df['brand'] == 'kodak, samsung, vivitar, canon , olympus', 'brand'] = "kodak"
df.loc[df['brand'] == 'i_p.mium', 'brand'] = float("NaN")
df.loc[df['brand'] == str(['kodak optical zoom: 8x', 'kodak\noptical zoom:\n8x']), 'brand'] = "kodak"
df.loc[df['brand'] == 'kobian group', 'brand'] = "kobian"
df.loc[df['brand'] == 'fujifilm finepix telephoto digitalcamera', 'brand'] = "fujifilm"
df.loc[df['brand'] == 'hewlett packard', 'brand'] = "hp"
df.loc[df['brand'] == 'unbranded', 'brand'] = float("NaN")
df.loc[df['brand'] == 'pioneer research', 'brand'] = "pioneer"
df.loc[df['brand'] == 'kodak, samsung, vivitar, canon , olympus ', 'brand'] = "kodak "

### Final cleaning

In [24]:
df.head()

Unnamed: 0,spec_id,brand,megapixels,screen_size,weight
0,www.ebay.com//56784,sony,4.0,2.0,
1,www.ebay.com//24141,olympus,14.0,,
2,www.ebay.com//59471,canon,16.0,3.0,
3,www.ebay.com//47195,canon,6.3,2.0,
4,www.ebay.com//41942,panasonic,12.0,,


In [23]:
df.columns=['spec_id','brand', 'megapixels', 'screen_size',"weight" ]

In [25]:
df.to_csv("../../datasets/unlabeled/cleaned/ebay.csv", index=False)