In [1]:
import os
import numpy as np
import json
import pandas as pd
import itertools
from tqdm import tqdm
import nltk
from nltk.tokenize import word_tokenize
import matplotlib.pyplot as plt
import re

In [2]:
def create_dataframe(dataset_path, source):
    """Function used to create a Pandas DataFrame containing specifications page titles

    Reads products specifications from the file system ("dataset_path" variable in the main function) and creates a Pandas DataFrame where each row is a
    specification. The columns are 'source' (e.g. www.sourceA.com), 'spec_number' (e.g. 1) and the 'page title'. Note that this script will consider only
    the page title attribute for simplicity.

    Args:
        dataset_path (str): The path to the dataset

    Returns:
        df (pd.DataFrame): The Pandas DataFrame containing specifications and page titles
    """

    print('>>> Creating dataframe...\n')
    columns_df = ['source', 'spec_number', 'spec_id', 'page_title']
    
    df = pd.DataFrame()
    progressive_id = 0
    progressive_id2row_df = {}
    #for source in tqdm(os.listdir(dataset_path)):
    #source = "buy.net"
    for specification in os.listdir(os.path.join(dataset_path, source)):
        specification_number = specification.replace('.json', '')
        specification_id = '{}//{}'.format(source, specification_number)
        with open(os.path.join(dataset_path, source, specification)) as specification_file:
            columns_df = ['source', 'spec_number', 'spec_id']
            specification_data = json.load(specification_file)
            attrs = []
            for k, v in specification_data.items():
                columns_df.append(k)
                attrs.append(v)
            row = [source, specification_number, specification_id]
            row.extend(attrs)
            progressive_id2row_df.update({progressive_id: row})
            progressive_id += 1
            df = df.append(pd.DataFrame.from_dict(progressive_id2row_df, orient='index', columns=columns_df))
            progressive_id2row_df = {}
    #print(df)
    print('>>> Dataframe created successfully!\n')
    return df

In [3]:
df = create_dataframe('../../datasets/unlabeled/2013_camera_specs', "www.alibaba.com")

>>> Creating dataframe...



of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


>>> Dataframe created successfully!



In [4]:
cols = ["spec_id","weight"]

In [5]:
df = df[cols]

In [6]:
df.head()

Unnamed: 0,spec_id,weight
0,www.alibaba.com//37297,
1,www.alibaba.com//29289,
2,www.alibaba.com//23684,0.19KG
3,www.alibaba.com//24141,1200g
4,www.alibaba.com//22996,600 g (1.32 lbs)


In [7]:
df2=df.copy(deep=True)

In [9]:
df2.head()

Unnamed: 0,spec_id,weight
0,www.alibaba.com//37297,
1,www.alibaba.com//29289,
2,www.alibaba.com//23684,0.19KG
3,www.alibaba.com//24141,1200g
4,www.alibaba.com//22996,600 g (1.32 lbs)


### Weight

In [10]:
df[df['weight'].notnull()]

Unnamed: 0,spec_id,weight
2,www.alibaba.com//23684,0.19KG
3,www.alibaba.com//24141,1200g
4,www.alibaba.com//22996,600 g (1.32 lbs)
5,www.alibaba.com//24511,0.09Kg
6,www.alibaba.com//35300,1200g
9,www.alibaba.com//25750,0.25kg
13,www.alibaba.com//28027,670g (1.5 lbs)
20,www.alibaba.com//25245,0.1KG/pc
21,www.alibaba.com//37628,700g (1.5 lbs)
22,www.alibaba.com//35615,1.25kg


In [71]:
df=df2.copy(deep=True)

In [72]:
def parse_weight(value):
    if (isinstance(value, list)):
        return float("NaN")
        

    if pd.isna(value):
        return value
    else:
        value = value.lower()
       
        match1 = re.search('(approx.|)(\d*\,\d+|\d*\.\d+|\d+)( g|g| kg|kg|lbs| lbs)', str(value))
        if match1 is None: 
            return float("NaN")


        to_convert= match1.group(2).replace(" ","").replace(",",".")
        metric= match1.group(3).replace(" ","")
        converted=float("NaN")
        if metric == "oz":
            converted= int(round(float(to_convert) * 28.35))
        elif metric == "lbs":
            converted= int(round(float(to_convert) * 454))
        elif metric == "kg":
            converted= int(round(float(to_convert) * 1000))
        else:
            converted= int(round(float(to_convert)))
        if(converted<50):
        return converted
        

In [73]:
df["weight"] = df["weight"].apply(parse_weight)

20g
20g
0.01kg/pc
44g(without batteries),58g with batteries
20g
0.01kg/pc
0.02kg
20g
approx. 78±3g (without waterproof case)
0.04kg
20g
44g(without batteries),58g with batteries
0.02kg
approx. 78±3g (without waterproof case)
20g
about 46.5 grams
20g
22 g
10g
46.5g
approx. 1,000 g (2 lb 3.3 oz) with battery and sd memory card but without body cap; approx. 900 g/1 lb 15.7 oz (camera body only)
approx620g
44g(without batteries),58g with batteries
15g
44g(without batteries),58g with batteries
20g
approx. 78±3g (without waterproof case)
approx. 78±3g (without waterproof case)
approx. 78±3g (without waterproof case)
15g
20g
10g
0.04kg
approx. 78±3g (without waterproof case)
20g


### Final cleaning

In [74]:
df.to_csv("../../datasets/unlabeled/cleaned/alibaba.csv", index=False)