In [1]:
import os
import numpy as np
import json
import pandas as pd
import itertools
from tqdm import tqdm
import nltk
from nltk.tokenize import word_tokenize
import matplotlib.pyplot as plt
import re

In [2]:
def create_dataframe(dataset_path, source, columns_df):
    """Function used to create a Pandas DataFrame containing specifications page titles

    Reads products specifications from the file system ("dataset_path" variable in the main function) and creates a Pandas DataFrame where each row is a
    specification. The columns are 'source' (e.g. www.sourceA.com), 'spec_number' (e.g. 1) and the 'page title'. Note that this script will consider only
    the page title attribute for simplicity.

    Args:
        dataset_path (str): The path to the dataset

    Returns:
        df (pd.DataFrame): The Pandas DataFrame containing specifications and page titles
    """

    print('>>> Creating dataframe...\n')

    progressive_id = 0
    progressive_id2row_df = {}
    for specification in os.listdir(os.path.join(dataset_path, source)):
        specification_number = specification.replace('.json', '')
        specification_id = '{}//{}'.format(source, specification_number)
        with open(os.path.join(dataset_path, source, specification)) as specification_file:
            specification_data = json.load(specification_file)
            dimensions = specification_data.get("dimensions w x h x d")
            weight = specification_data.get("weight")
            row = (specification_id, dimensions, weight)
            progressive_id2row_df.update({progressive_id: row})
            progressive_id += 1
    df = pd.DataFrame.from_dict(progressive_id2row_df, orient='index', columns=columns_df)
    print('>>> Dataframe created successfully!\n')
    return df

In [3]:
df = create_dataframe('../../datasets/unlabeled/2013_camera_specs', "www.cambuy.com.au",['spec_id',"dimensions w x h x d", "weight"])

>>> Creating dataframe...

>>> Dataframe created successfully!



In [5]:
df.head()

Unnamed: 0,spec_id,dimensions w x h x d,weight
0,www.cambuy.com.au//115,109.9 x 64.1 x 27.6mm,217g
1,www.cambuy.com.au//142,95.3 x 56.8 x 23.7 mm,Approx. 142 g (including battery/batteries and...
2,www.cambuy.com.au//103,,
3,www.cambuy.com.au//20,122.60x 70.7 x 54.6 mm / 4.83 x 2.78 x 2.15 inch,"Approx. 402g / 0.89 lb （SD card, Battery, Body..."
4,www.cambuy.com.au//98,,Approx. 304 g (including battery and memory card)


### Weight

In [6]:
df[df['weight'].notnull()]

Unnamed: 0,spec_id,dimensions w x h x d,weight
0,www.cambuy.com.au//115,109.9 x 64.1 x 27.6mm,217g
1,www.cambuy.com.au//142,95.3 x 56.8 x 23.7 mm,Approx. 142 g (including battery/batteries and...
3,www.cambuy.com.au//20,122.60x 70.7 x 54.6 mm / 4.83 x 2.78 x 2.15 inch,"Approx. 402g / 0.89 lb （SD card, Battery, Body..."
4,www.cambuy.com.au//98,,Approx. 304 g (including battery and memory card)
5,www.cambuy.com.au//77,Approx. 97.1 x 57.9 x 19.9 mm (3.9 x 2.3 x 0.8...,Approx. 125 g (4.5 oz) (with battery and SD me...
6,www.cambuy.com.au//139,116.2 x 87 x 56.5mm,Approx. 346g [CIPA with BLS-5 Battery and Memo...
7,www.cambuy.com.au//61,111.5 x 65.9 x 31.2mm,247g
8,www.cambuy.com.au//36,Approx. 124 x 98 x 75.5 mm (4.9 x 3.9 x 3 in.),Approx. 460 g (1 lb 0.2 oz) with battery and m...
9,www.cambuy.com.au//119,Approx. 143.5 × 110 × 66.5 mm (5.6 × 4.3 × 2.6...,Approx. 765 g (1 lb 11 oz) with battery and me...
11,www.cambuy.com.au//158,122.45 x 84.6 x 71.4 mm / 4.82 x 3.33 x 2.81 i...,Approx. 340g / 0.75 lb （Body only) / Approx. 6...


In [7]:
def parse_weight(value):

    if pd.isna(value):
        return value
    else:
#         print(value)
        match1 = re.search('(Approx.|Approx.|)(\d*\,\d+|\d+)( g|g)', str(value))
        if match1 is None: 
            print("SKIP")
            print(value)
            return float("NaN")

        return float(match1.group(2))
        

In [8]:
df["weight"] = df["weight"].apply(parse_weight)

In [9]:
df.head()

Unnamed: 0,spec_id,dimensions w x h x d,weight
0,www.cambuy.com.au//115,109.9 x 64.1 x 27.6mm,217.0
1,www.cambuy.com.au//142,95.3 x 56.8 x 23.7 mm,142.0
2,www.cambuy.com.au//103,,
3,www.cambuy.com.au//20,122.60x 70.7 x 54.6 mm / 4.83 x 2.78 x 2.15 inch,402.0
4,www.cambuy.com.au//98,,304.0


## w x h x d

In [10]:
df[df['dimensions w x h x d'].notnull()]

Unnamed: 0,spec_id,dimensions w x h x d,weight
0,www.cambuy.com.au//115,109.9 x 64.1 x 27.6mm,217.0
1,www.cambuy.com.au//142,95.3 x 56.8 x 23.7 mm,142.0
3,www.cambuy.com.au//20,122.60x 70.7 x 54.6 mm / 4.83 x 2.78 x 2.15 inch,402.0
5,www.cambuy.com.au//77,Approx. 97.1 x 57.9 x 19.9 mm (3.9 x 2.3 x 0.8...,125.0
6,www.cambuy.com.au//139,116.2 x 87 x 56.5mm,346.0
7,www.cambuy.com.au//61,111.5 x 65.9 x 31.2mm,247.0
8,www.cambuy.com.au//36,Approx. 124 x 98 x 75.5 mm (4.9 x 3.9 x 3 in.),460.0
9,www.cambuy.com.au//119,Approx. 143.5 × 110 × 66.5 mm (5.6 × 4.3 × 2.6...,765.0
10,www.cambuy.com.au//16,116.8 x 90.7 x 69.4mm,
11,www.cambuy.com.au//158,122.45 x 84.6 x 71.4 mm / 4.82 x 3.33 x 2.81 i...,340.0


In [11]:
def mm_to_inches(value):
    
    return round(float(value) / 25.4, 1)


In [12]:
def parse_whd(value):
    if pd.isna(value):
        return value
    else:
        m = re.search('(Approx. |)(\d*\.\d+|\d*\,\d+|\d+)(x | x |× | × )(\d*\.\d+|\d*\,\d+|\d+)(x | x |× | × )(\d*\.\d+|\d*\,\d+|\d+)( mm|mm)', str(value))
        if m is None:

            print("SKIP")
            print(value)
            return float("NaN")

        return 'h' + str(mm_to_inches(m.group(2).replace(",","."))) + 'w' + str(mm_to_inches(m.group(4).replace(",","."))) + "d" + str(mm_to_inches(m.group(6).replace(",",".")))
        

In [13]:
df["dimensions w x h x d"] = df["dimensions w x h x d"].apply(parse_whd)

SKIP
Approx. 96.4mm x 59.3mm x 15.4mm
SKIP
Approx.146, x123, x81.5mm (5.8, x4.9, x3.3in.)
SKIP
101.6 x 58.1x 41. mm


In [14]:
# Miss 3

In [17]:
df.head()

Unnamed: 0,spec_id,dimensions,weight
0,www.cambuy.com.au//115,h4.3w2.5d1.1,217.0
1,www.cambuy.com.au//142,h3.8w2.2d0.9,142.0
2,www.cambuy.com.au//103,,
3,www.cambuy.com.au//20,h4.8w2.8d2.1,402.0
4,www.cambuy.com.au//98,,304.0


### Final cleaning

In [16]:
df.columns=['spec_id','dimensions',"weight" ]

In [18]:
df.to_csv("../../datasets/unlabeled/cleaned/cambuy.csv", index=False)