In [1]:
import os
import numpy as np
import json
import pandas as pd
import itertools
from tqdm import tqdm
import nltk
from nltk.tokenize import word_tokenize
import matplotlib.pyplot as plt
import re

In [2]:
def create_dataframe(dataset_path, source):
    """Function used to create a Pandas DataFrame containing specifications page titles

    Reads products specifications from the file system ("dataset_path" variable in the main function) and creates a Pandas DataFrame where each row is a
    specification. The columns are 'source' (e.g. www.sourceA.com), 'spec_number' (e.g. 1) and the 'page title'. Note that this script will consider only
    the page title attribute for simplicity.

    Args:
        dataset_path (str): The path to the dataset

    Returns:
        df (pd.DataFrame): The Pandas DataFrame containing specifications and page titles
    """

    print('>>> Creating dataframe...\n')
    columns_df = ['source', 'spec_number', 'spec_id', 'page_title']
    
    df = pd.DataFrame()
    progressive_id = 0
    progressive_id2row_df = {}
    #for source in tqdm(os.listdir(dataset_path)):
    #source = "buy.net"
    for specification in os.listdir(os.path.join(dataset_path, source)):
        specification_number = specification.replace('.json', '')
        specification_id = '{}//{}'.format(source, specification_number)
        with open(os.path.join(dataset_path, source, specification)) as specification_file:
            columns_df = ['source', 'spec_number', 'spec_id']
            specification_data = json.load(specification_file)
            attrs = []
            for k, v in specification_data.items():
                columns_df.append(k)
                attrs.append(v)
            row = [source, specification_number, specification_id]
            row.extend(attrs)
            progressive_id2row_df.update({progressive_id: row})
            progressive_id += 1
            df = df.append(pd.DataFrame.from_dict(progressive_id2row_df, orient='index', columns=columns_df))
            progressive_id2row_df = {}
    #print(df)
    print('>>> Dataframe created successfully!\n')
    return df

In [3]:
df = create_dataframe('../../datasets/unlabeled/2013_camera_specs', "www.eglobalcentral.co.uk")

>>> Creating dataframe...



of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


>>> Dataframe created successfully!



In [4]:
cols = ["spec_id", "weight", "dimensions w x h x d", "max resolution"]

In [5]:
df = df[cols]

### Effective Megapixels

In [6]:
df[df['max resolution'].notnull()]

Unnamed: 0,spec_id,weight,dimensions w x h x d,max resolution
0,www.eglobalcentral.co.uk//729,760 g (1.68 lb / 26.81 oz),144 x 110 x 67 mm (5.67 x 4.33 x 2.64″),4928 x 3280
13,www.eglobalcentral.co.uk//441,23.28 oz / 660 g,,20MP: 5184 x 3888
14,www.eglobalcentral.co.uk//380,,125 x 98 x 76 mm (4.92 x 3.86 x 2.99″),6000 x 4000
15,www.eglobalcentral.co.uk//553,,102 x 58 x 36 mm,5472 x 3648
16,www.eglobalcentral.co.uk//103,,125 x 98 x 78 mm,6000 x 4000
17,www.eglobalcentral.co.uk//416,"[1.60 lb / 726 g with battery and memory card,...",,24MP: 6000 x 4000
21,www.eglobalcentral.co.uk//744,,125 x 98 x 76 mm (4.92 x 3.86 x 2.99″),6000 x 4000
22,www.eglobalcentral.co.uk//528,1.79 lb / 813 g with battery and memory card,,20MP: 5472 x 3648 @ 3:2
27,www.eglobalcentral.co.uk//713,,133.1 x 99.5 x 79.7 mm,5184 x 3456
35,www.eglobalcentral.co.uk//617,350 g (0.77 lb / 12.35 oz),129 x 75 x 37 mm (5.08 x 2.95 x 1.46″),4896 x 3264


In [7]:
def parse_megapixels(value):
    if pd.isna(value):
        return value
    else:
        match1 = re.search('(\d*\.\d+|\d+)MP:', str(value))
        if match1 is None: 
            match2 = re.search('(\d+) x (\d+)', str(value))
            if match2 is None:
                return float("NaN")

            return round(float(match2.group(1))*float(match2.group(2))/(10 ** 6),1)

        return match1.group(1)

        

In [8]:
df["max resolution"] = df["max resolution"].apply(parse_megapixels)

In [9]:
df.head()

Unnamed: 0,spec_id,weight,dimensions w x h x d,max resolution
0,www.eglobalcentral.co.uk//729,760 g (1.68 lb / 26.81 oz),144 x 110 x 67 mm (5.67 x 4.33 x 2.64″),16.2
1,www.eglobalcentral.co.uk//683,Approx. 165g / 5.82oz,,
2,www.eglobalcentral.co.uk//379,,,
3,www.eglobalcentral.co.uk//396,1910g / 67.4oz.,,
4,www.eglobalcentral.co.uk//545,,,


### Weight

In [10]:
pattern = re.compile("(\S*[A-Za-z]\S*[0-9]\S*|\S*[0-9]\S*[A-Za-z]\S*)")

In [11]:

def weight_to_grams_conversion(value):
    if(isinstance(value, list)): #Keep only the weight that inludes the battery
        value=value[0]
    if pd.isna(value):
        return value
    else:
        ret=[]
        to_convert = ''
        metric = ''

        for w in word_tokenize(value):
            if (len(ret)>=2):
                to_convert = ret[0]
                metric = ret[1]
                break
            if pattern.match(w):
                l=re.split('(\d*\.\d+|\d+)',w)
                to_convert = l[1]
                metric = l[2]
                break
            if (w=='Approximately'or w=='Approx'):
                continue
            ret.append(w)
            

        if(metric.isalpha() and  bool(re.search('(\d*\.\d+|\d+)', to_convert))):

            if metric == "oz":
                return int(round(float(to_convert) * 28.35))
            elif metric == "lbs":
                return int(round(float(to_convert) * 454))
            else:
                return int(round(float(to_convert)))
        else:
            return float("NaN")

In [12]:
df["weight"] = df["weight"].apply(weight_to_grams_conversion)

In [13]:
df.head()

Unnamed: 0,spec_id,weight,dimensions w x h x d,max resolution
0,www.eglobalcentral.co.uk//729,760.0,144 x 110 x 67 mm (5.67 x 4.33 x 2.64″),16.2
1,www.eglobalcentral.co.uk//683,165.0,,
2,www.eglobalcentral.co.uk//379,,,
3,www.eglobalcentral.co.uk//396,1910.0,,
4,www.eglobalcentral.co.uk//545,,,


## w x h x d

In [14]:
df[df['dimensions w x h x d'].notnull()]

Unnamed: 0,spec_id,weight,dimensions w x h x d,max resolution
0,www.eglobalcentral.co.uk//729,760.0,144 x 110 x 67 mm (5.67 x 4.33 x 2.64″),16.2
14,www.eglobalcentral.co.uk//380,,125 x 98 x 76 mm (4.92 x 3.86 x 2.99″),24
15,www.eglobalcentral.co.uk//553,,102 x 58 x 36 mm,20
16,www.eglobalcentral.co.uk//103,,125 x 98 x 78 mm,24
21,www.eglobalcentral.co.uk//744,,125 x 98 x 76 mm (4.92 x 3.86 x 2.99″),24
27,www.eglobalcentral.co.uk//713,,133.1 x 99.5 x 79.7 mm,17.9
35,www.eglobalcentral.co.uk//617,350.0,129 x 75 x 37 mm (5.08 x 2.95 x 1.46″),16
36,www.eglobalcentral.co.uk//247,,125 x 98 x 76 mm (4.92 x 3.86 x 2.99″),24
41,www.eglobalcentral.co.uk//549,,139.0 x 104.3 x 78.5 mm,20
43,www.eglobalcentral.co.uk//375,,99 x 55 x 30 mm (3.88 x 2.16 x 1.2″),15.8


In [15]:
def mm_to_inches(value):
    
    return round(float(value) / 25.4, 1)


In [16]:
def parse_whd(value):
    if pd.isna(value):
        return value
    else:
        m = re.search('(\d*\.\d+|\d+) x (\d*\.\d+|\d+) x (\d*\.\d+|\d+)', str(value))
        if m is None:
            return float("NaN")

        return str(mm_to_inches(m.group(2))) + 'h' + str(mm_to_inches(m.group(1))) + 'w' + str(mm_to_inches(m.group(3))) + "d"
        

In [17]:
df["dimensions w x h x d"] = df["dimensions w x h x d"].apply(parse_whd)

In [18]:
df.head()

Unnamed: 0,spec_id,weight,dimensions w x h x d,max resolution
0,www.eglobalcentral.co.uk//729,760.0,4.3h5.7w2.6d,16.2
1,www.eglobalcentral.co.uk//683,165.0,,
2,www.eglobalcentral.co.uk//379,,,
3,www.eglobalcentral.co.uk//396,1910.0,,
4,www.eglobalcentral.co.uk//545,,,


### Final cleaning

In [19]:
df.to_csv("../../datasets/unlabeled/cleaned/eglobalcentral.csv", index=False)