In [1]:
import os
import numpy as np
import json
import pandas as pd
import itertools
from tqdm import tqdm
import nltk
from nltk.tokenize import word_tokenize
import matplotlib.pyplot as plt
import re

In [2]:
def create_dataframe(dataset_path, source, columns_df):
    """Build a DataFrame with one row per JSON specification file of a source.

    Every ``*.json`` entry found in ``<dataset_path>/<source>`` is loaded and
    reduced to five values: a specification id of the form
    ``"<source>//<file name without extension>"`` followed by the values stored
    under the keys "dimensions", "lcd size", "sensor resolution" and "weight".
    A key that is missing from a specification yields ``None`` in that cell.
    Entries without a ``.json`` extension (e.g. checkpoint folders or OS
    metadata files) are ignored instead of being fed to the JSON parser.

    Args:
        dataset_path (str): Directory containing one sub-directory per source.
        source (str): Name of the source sub-directory
            (e.g. "www.shopbot.com.au").
        columns_df (list of str): Names for the five resulting columns, in the
            order: spec id, dimensions, lcd size, sensor resolution, weight.

    Returns:
        pd.DataFrame: One row per specification, indexed 0..n-1 in the order
        the files were returned by ``os.listdir``.
    """

    print('>>> Creating dataframe...\n')

    source_dir = os.path.join(dataset_path, source)
    rows = []
    for file_name in os.listdir(source_dir):
        # Split on the real extension so that only a trailing ".json" is removed.
        specification_number, extension = os.path.splitext(file_name)
        if extension.lower() != '.json':
            continue

        specification_id = '{}//{}'.format(source, specification_number)
        # JSON is UTF-8 by specification; do not depend on the locale encoding.
        with open(os.path.join(source_dir, file_name), encoding='utf-8') as specification_file:
            specification_data = json.load(specification_file)

        rows.append((
            specification_id,
            specification_data.get("dimensions"),
            specification_data.get("lcd size"),
            specification_data.get("sensor resolution"),
            specification_data.get("weight"),
        ))

    # Build the frame once from the collected records.
    df = pd.DataFrame(rows, columns=columns_df)
    print('>>> Dataframe created successfully!\n')
    return df

In [3]:
# Load the raw shopbot specifications into a five-column frame.
df = create_dataframe(
    '../../datasets/unlabeled/2013_camera_specs',
    "www.shopbot.com.au",
    ["spec_id", "dimensions", "lcd size", "sensor resolution", "weight"],
)

>>> Creating dataframe...

>>> Dataframe created successfully!



In [4]:
# Preview the first five rows of the frame.
df.head(n=5)

Unnamed: 0,spec_id,dimensions,lcd size,sensor resolution,weight
0,www.shopbot.com.au//379,,,,
1,www.shopbot.com.au//1659,125 x 96 x 77 mm,"3 ""","24,2 Mpx",505 g
2,www.shopbot.com.au//1209,,,,
3,www.shopbot.com.au//1570,,,,
4,www.shopbot.com.au//284,,,,


### Sensor resolution

In [5]:
# Show only the rows that carry a 'sensor resolution' value.
df.loc[df['sensor resolution'].notna()]

Unnamed: 0,spec_id,dimensions,lcd size,sensor resolution,weight
1,www.shopbot.com.au//1659,125 x 96 x 77 mm,"3 ""","24,2 Mpx",505 g
6,www.shopbot.com.au//1198,104 x 63 x 33 mm,"3 """,16 Mpx,215 g
8,www.shopbot.com.au//1618,96 x 57 x 19 mm,"3 ""","16,1 Mpx",123 g
13,www.shopbot.com.au//1264,140 x 82 x 43 mm,"3 ""","16,3 Mpx",450 g
18,www.shopbot.com.au//1399,116 x 68 x 39 mm,"3 """,16 Mpx,318 g
21,www.shopbot.com.au//1376,132 x 105 x 77 mm,"3 ""","16,2 Mpx",780 g
28,www.shopbot.com.au//247,133 x 100 x 80 mm,"3 """,18 Mpx,570 g
29,www.shopbot.com.au//1272,146 x 123 x 82 mm,"3,2 ""","36,3 Mpx",900 g
32,www.shopbot.com.au//1439,122 x 69 x 34 mm,"3 ""","12,3 Mpx",369 g
34,www.shopbot.com.au//1340,108 x 67 x 32 mm,"3 ""","12,1 Mpx",264 g


In [6]:
def parse_megapixels(value):
    """Extract the megapixel count from a string such as "24,2 Mpx".

    Both "," and "." are accepted as decimal separator; previously a value
    written with "." (e.g. "24.2 Mpx") matched only its fractional digits.

    Args:
        value: Raw "sensor resolution" cell; a string or a missing value.

    Returns:
        ``value`` unchanged when ``pd.isna(value)`` is true. Otherwise the
        number that precedes " Mpx", as a string using "." as decimal
        separator (e.g. "24.2"), or ``float("NaN")`` when no such number is
        present.
    """
    if pd.isna(value):
        return value

    # Raw string: "\d" is a regex escape, not a Python string escape.
    match = re.search(r'(\d*[.,]\d+|\d+) Mpx', str(value))
    if match is None:
        return float("NaN")

    return match.group(1).replace(",", ".")

        

In [7]:
# Replace each raw 'sensor resolution' string with its parsed value.
df["sensor resolution"] = df["sensor resolution"].map(parse_megapixels)

In [8]:
# Preview the first five rows of the frame.
df.head(n=5)

Unnamed: 0,spec_id,dimensions,lcd size,sensor resolution,weight
0,www.shopbot.com.au//379,,,,
1,www.shopbot.com.au//1659,125 x 96 x 77 mm,"3 """,24.2,505 g
2,www.shopbot.com.au//1209,,,,
3,www.shopbot.com.au//1570,,,,
4,www.shopbot.com.au//284,,,,


### Weight

In [9]:
# Show only the rows that carry a 'weight' value.
df.loc[df['weight'].notna()]

Unnamed: 0,spec_id,dimensions,lcd size,sensor resolution,weight
1,www.shopbot.com.au//1659,125 x 96 x 77 mm,"3 """,24.2,505 g
6,www.shopbot.com.au//1198,104 x 63 x 33 mm,"3 """,16,215 g
8,www.shopbot.com.au//1618,96 x 57 x 19 mm,"3 """,16.1,123 g
13,www.shopbot.com.au//1264,140 x 82 x 43 mm,"3 """,16.3,450 g
18,www.shopbot.com.au//1399,116 x 68 x 39 mm,"3 """,16,318 g
21,www.shopbot.com.au//1376,132 x 105 x 77 mm,"3 """,16.2,780 g
28,www.shopbot.com.au//247,133 x 100 x 80 mm,"3 """,18,570 g
29,www.shopbot.com.au//1272,146 x 123 x 82 mm,"3,2 """,36.3,900 g
32,www.shopbot.com.au//1439,122 x 69 x 34 mm,"3 """,12.3,369 g
34,www.shopbot.com.au//1340,108 x 67 x 32 mm,"3 """,12.1,264 g


In [10]:
def parse_weight(value):
    """Extract the number of grams from a string such as "505 g".

    Both "," and "." are accepted as decimal separator and the result always
    uses ".", matching the other parsers in this notebook. Previously a value
    written with "." (e.g. "0.5 g") matched only its fractional digits, and a
    comma decimal was returned un-normalised.

    Args:
        value: Raw "weight" cell; a string or a missing value.

    Returns:
        ``value`` unchanged when ``pd.isna(value)`` is true. Otherwise the
        number that precedes " g", as a string (e.g. "505" or "1.5"), or
        ``float("NaN")`` when no such number is present.
    """
    if pd.isna(value):
        return value

    # Raw string: "\d" is a regex escape, not a Python string escape.
    match = re.search(r'(\d*[.,]\d+|\d+) g', str(value))
    if match is None:
        return float("NaN")

    return match.group(1).replace(",", ".")
        

In [11]:
# Replace each raw 'weight' string with its parsed value.
df["weight"] = df["weight"].map(parse_weight)

In [12]:
# Preview the first five rows of the frame.
df.head(n=5)

Unnamed: 0,spec_id,dimensions,lcd size,sensor resolution,weight
0,www.shopbot.com.au//379,,,,
1,www.shopbot.com.au//1659,125 x 96 x 77 mm,"3 """,24.2,505.0
2,www.shopbot.com.au//1209,,,,
3,www.shopbot.com.au//1570,,,,
4,www.shopbot.com.au//284,,,,


### Dimensions (w x h x d)

In [13]:
# Show only the rows that carry a 'dimensions' value.
df.loc[df['dimensions'].notna()]

Unnamed: 0,spec_id,dimensions,lcd size,sensor resolution,weight
1,www.shopbot.com.au//1659,125 x 96 x 77 mm,"3 """,24.2,505
6,www.shopbot.com.au//1198,104 x 63 x 33 mm,"3 """,16,215
8,www.shopbot.com.au//1618,96 x 57 x 19 mm,"3 """,16.1,123
13,www.shopbot.com.au//1264,140 x 82 x 43 mm,"3 """,16.3,450
18,www.shopbot.com.au//1399,116 x 68 x 39 mm,"3 """,16,318
21,www.shopbot.com.au//1376,132 x 105 x 77 mm,"3 """,16.2,780
28,www.shopbot.com.au//247,133 x 100 x 80 mm,"3 """,18,570
29,www.shopbot.com.au//1272,146 x 123 x 82 mm,"3,2 """,36.3,900
32,www.shopbot.com.au//1439,122 x 69 x 34 mm,"3 """,12.3,369
34,www.shopbot.com.au//1340,108 x 67 x 32 mm,"3 """,12.1,264


In [14]:
def mm_to_inches(value):
    """Convert a length in millimetres to inches, rounded to one decimal.

    Args:
        value: Length in millimetres; anything accepted by ``float()``
            (a number or a numeric string).

    Returns:
        float: ``value`` divided by 25.4, rounded to one decimal place.
    """
    millimetres = float(value)
    inches = millimetres / 25.4
    return round(inches, 1)


In [15]:
def parse_whd(value):
    """Convert an "A x B x C" dimensions string into "<B>h<A>w<C>d".

    The three numbers are read in the order they appear (treated as width,
    height, depth), each is passed through ``mm_to_inches`` and the result is
    written height first, e.g. "125 x 96 x 77 mm" -> "3.8h4.9w3.0d".

    NOTE(review): the unit suffix is not inspected; every value is assumed to
    be in millimetres -- confirm for sources that report other units.

    Args:
        value: Raw "dimensions" cell; a string or a missing value.

    Returns:
        ``value`` unchanged when ``pd.isna(value)`` is true. Otherwise the
        formatted string, or ``float("NaN")`` when the text does not contain
        three numbers separated by " x ".
    """
    if pd.isna(value):
        return value

    # Raw string: "\d" is a regex escape, not a Python string escape.
    # A number is either a decimal using "." or "," or a plain integer.
    number = r'(\d*[.,]\d+|\d+)'
    m = re.search(number + ' x ' + number + ' x ' + number, str(value))
    if m is None:
        return float("NaN")

    width, height, depth = (
        mm_to_inches(part.replace(",", ".")) for part in m.groups()
    )
    return '{}h{}w{}d'.format(height, width, depth)
        

In [16]:
# Replace each raw 'dimensions' string with its parsed value.
df["dimensions"] = df["dimensions"].map(parse_whd)

In [17]:
# Preview the first five rows of the frame.
df.head(n=5)

Unnamed: 0,spec_id,dimensions,lcd size,sensor resolution,weight
0,www.shopbot.com.au//379,,,,
1,www.shopbot.com.au//1659,3.8h4.9w3.0d,"3 """,24.2,505.0
2,www.shopbot.com.au//1209,,,,
3,www.shopbot.com.au//1570,,,,
4,www.shopbot.com.au//284,,,,


### LCD size

In [18]:
# Show only the rows that carry an 'lcd size' value.
df.loc[df['lcd size'].notna()]

Unnamed: 0,spec_id,dimensions,lcd size,sensor resolution,weight
1,www.shopbot.com.au//1659,3.8h4.9w3.0d,"3 """,24.2,505
6,www.shopbot.com.au//1198,2.5h4.1w1.3d,"3 """,16,215
8,www.shopbot.com.au//1618,2.2h3.8w0.7d,"3 """,16.1,123
13,www.shopbot.com.au//1264,3.2h5.5w1.7d,"3 """,16.3,450
18,www.shopbot.com.au//1399,2.7h4.6w1.5d,"3 """,16,318
21,www.shopbot.com.au//1376,4.1h5.2w3.0d,"3 """,16.2,780
28,www.shopbot.com.au//247,3.9h5.2w3.1d,"3 """,18,570
29,www.shopbot.com.au//1272,4.8h5.7w3.2d,"3,2 """,36.3,900
32,www.shopbot.com.au//1439,2.7h4.8w1.3d,"3 """,12.3,369
34,www.shopbot.com.au//1340,2.6h4.3w1.3d,"3 """,12.1,264


In [19]:
def parse_screen_size(value):
    """Extract the screen diagonal from a string such as '3,2 "'.

    The number may use "," or "." as decimal separator and may be followed by
    optional whitespace before the inch mark, so both '3 "' and '3"' are
    accepted. Unparseable values are reported as NaN without printing, like
    the other parsers in this notebook.

    Args:
        value: Raw "lcd size" cell; a string or a missing value.

    Returns:
        ``value`` unchanged when ``pd.isna(value)`` is true. Otherwise the
        number that precedes the '"' character, as a string using "." as
        decimal separator (e.g. "3.2"), or ``float("NaN")`` when no such
        number is present.
    """
    if pd.isna(value):
        return value

    # Raw string: "\d" and "\s" are regex escapes, not Python string escapes.
    m = re.search(r'(\d*[.,]\d+|\d+)\s*"', str(value))
    if m is None:
        return float("NaN")

    return m.group(1).replace(",", ".")
        

In [20]:
# Replace each raw 'lcd size' string with its parsed value.
df["lcd size"] = df["lcd size"].map(parse_screen_size)

In [21]:
# Preview the first five rows of the frame.
df.head(n=5)

Unnamed: 0,spec_id,dimensions,lcd size,sensor resolution,weight
0,www.shopbot.com.au//379,,,,
1,www.shopbot.com.au//1659,3.8h4.9w3.0d,3.0,24.2,505.0
2,www.shopbot.com.au//1209,,,,
3,www.shopbot.com.au//1570,,,,
4,www.shopbot.com.au//284,,,,


### Final cleaning

In [22]:
# Rename the columns positionally to the names used in the exported CSV.
df.columns = [
    "spec_id",
    "dimensions",
    "screen_size",
    "megapixels",
    "weight",
]

In [23]:
# Make sure the output folder exists so the export also works on a fresh checkout.
os.makedirs("../../datasets/unlabeled/cleaned", exist_ok=True)
# Write the frame without the positional index column.
df.to_csv("../../datasets/unlabeled/cleaned/shopbot.csv", index=False)