In [1]:
import os
import numpy as np
import json
import pandas as pd
import itertools
from tqdm import tqdm
import nltk
from nltk.tokenize import word_tokenize
import matplotlib.pyplot as plt
import re

In [2]:
def create_dataframe(dataset_path, source):
    """Function used to create a Pandas DataFrame containing specifications page titles

    Reads products specifications from the file system ("dataset_path" variable in the main function) and creates a Pandas DataFrame where each row is a
    specification. The columns are 'source' (e.g. www.sourceA.com), 'spec_number' (e.g. 1) and the 'page title'. Note that this script will consider only
    the page title attribute for simplicity.

    Args:
        dataset_path (str): The path to the dataset

    Returns:
        df (pd.DataFrame): The Pandas DataFrame containing specifications and page titles
    """

    print('>>> Creating dataframe...\n')
    columns_df = ['source', 'spec_number', 'spec_id', 'page_title']
    
    df = pd.DataFrame()
    progressive_id = 0
    progressive_id2row_df = {}
    #for source in tqdm(os.listdir(dataset_path)):
    #source = "buy.net"
    for specification in os.listdir(os.path.join(dataset_path, source)):
        specification_number = specification.replace('.json', '')
        specification_id = '{}//{}'.format(source, specification_number)
        with open(os.path.join(dataset_path, source, specification)) as specification_file:
            columns_df = ['source', 'spec_number', 'spec_id']
            specification_data = json.load(specification_file)
            attrs = []
            for k, v in specification_data.items():
                columns_df.append(k)
                attrs.append(v)
            row = [source, specification_number, specification_id]
            row.extend(attrs)
            progressive_id2row_df.update({progressive_id: row})
            progressive_id += 1
            df = df.append(pd.DataFrame.from_dict(progressive_id2row_df, orient='index', columns=columns_df), sort=False)
            progressive_id2row_df = {}
    #print(df)
    print('>>> Dataframe created successfully!\n')
    return df

In [None]:
df = create_dataframe('../../datasets/unlabeled/2013_camera_specs', "www.ebay.com")

>>> Creating dataframe...



In [None]:
df.head()

In [None]:
cols = ["spec_id","brand", "megapixels", "screen size", "weight"]

In [None]:
df = df[cols]

In [None]:
df.head()

In [None]:
df2=df.copy(deep=True)

### megapixels

In [None]:
df[df['megapixels'].notnull()]

In [None]:
df=df2.copy(deep=True)

In [None]:
def parse_megapixels(value):
    if(isinstance(value, list)):
        
        value=value[0]
    if pd.isna(value):
        return value
    else:
        match1 = re.search('(\d*\,\d+|\d*\.\d+|\d+)( MP|)', str(value))
        if match1 is None: 
            return float("NaN")


        return (match1.group(1).replace(",","."))

        

In [None]:
df["megapixels"] = df["megapixels"].apply(parse_megapixels)

In [None]:
df.head()

### Weight

In [None]:
df[df['weight'].notnull()]

In [None]:
def parse_weight(value):
    if (isinstance(value, list)):
        value=value[0]
        

    if pd.isna(value):
        return value
    else:
        value = value.lower()
       
        match1 = re.search('()(\d*\,\d+|\d*\.\d+|\d+)( g|g| kg|kg|lbs| lbs| oz| ounce)', str(value))
        if match1 is None: 
            return float("NaN")




        to_convert= match1.group(2).replace(" ","").replace(",",".")
        metric= match1.group(3).replace(" ","")
        converted=float("NaN")
        if (metric == "oz" or metric== "ounce"):
            converted= int(round(float(to_convert) * 28.35))
        elif metric == "lbs":
            converted= int(round(float(to_convert) * 454))
        elif metric == "kg":
            converted= int(round(float(to_convert) * 1000))
        else:
            converted= int(round(float(to_convert)))

        return converted
        

In [None]:
df["weight"] = df["weight"].apply(parse_weight)

In [None]:
df.head()

## screen size

In [None]:
df[df['screen size'].notnull()]

In [None]:
def parse_screen_size(value):
    if (isinstance(value, list)):
        return float("NaN")
    if pd.isna(value):
        return value

    else:
        m = re.search('(\d*\.\d+|\d*\,\d+|\d+)(cm|\"| \"| in|)', str(value))
        if m is None:

            return float("NaN")

        to_convert= m.group(1).replace(" ","").replace(",",".")
        metric= m.group(2).replace(" ","")
        converted=float("NaN")
        if (metric == "cm"):

            converted= int(round(float(to_convert) / 2.54))
        else:
            converted= int(round(float(to_convert)))

        return converted

In [None]:
df["screen size"] = df["screen size"].apply(parse_screen_size)

In [None]:
df.head()

## Brand

In [96]:
df['brand'] = df['brand'].apply(lambda x: str(x).lower())

In [104]:
df.brand.value_counts()

canon                                       3580
nikon                                       2959
sony                                        1866
fujifilm                                     835
olympus                                      833
kodak                                        646
samsung                                      602
panasonic                                    583
pentax                                       301
nan                                          290
vivitar                                      162
casio                                        147
leica                                        111
polaroid                                      99
ge                                            90
hp                                            87
gopro                                         59
ricoh                                         58
konica minolta                                54
svp                                           51
sanyo               

### Final cleaning

In [105]:
df.to_csv("../../datasets/unlabeled/cleaned/ebay.csv", index=False)