In [1]:
import os
import numpy as np
import json
import pandas as pd
import itertools
from tqdm import tqdm
import nltk
from nltk.tokenize import word_tokenize
import re
import matplotlib.pyplot as plt

In [2]:
def create_dataframe(dataset_path, source):
    """Function used to create a Pandas DataFrame containing specifications page titles

    Reads products specifications from the file system ("dataset_path" variable in the main function) and creates a Pandas DataFrame where each row is a
    specification. The columns are 'source' (e.g. www.sourceA.com), 'spec_number' (e.g. 1) and the 'page title'. Note that this script will consider only
    the page title attribute for simplicity.

    Args:
        dataset_path (str): The path to the dataset

    Returns:
        df (pd.DataFrame): The Pandas DataFrame containing specifications and page titles
    """

    print('>>> Creating dataframe...\n')
    columns_df = ['source', 'spec_number', 'spec_id', 'page_title']
    
    df = pd.DataFrame()
    progressive_id = 0
    progressive_id2row_df = {}
    #for source in tqdm(os.listdir(dataset_path)):
    #source = "buy.net"
    for specification in os.listdir(os.path.join(dataset_path, source)):
        specification_number = specification.replace('.json', '')
        specification_id = '{}//{}'.format(source, specification_number)
        with open(os.path.join(dataset_path, source, specification)) as specification_file:
            columns_df = ['source', 'spec_number', 'spec_id']
            specification_data = json.load(specification_file)
            attrs = []
            for k, v in specification_data.items():
                columns_df.append(k)
                attrs.append(v)
            row = [source, specification_number, specification_id]
            row.extend(attrs)
            progressive_id2row_df.update({progressive_id: row})
            progressive_id += 1
            df = df.append(pd.DataFrame.from_dict(progressive_id2row_df, orient='index', columns=columns_df))
            progressive_id2row_df = {}
    #print(df)
    print('>>> Dataframe created successfully!\n')
    return df

In [5]:
df = df = create_dataframe('../../datasets/unlabeled/2013_camera_specs', "www.walmart.com")

>>> Creating dataframe...



of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


>>> Dataframe created successfully!



In [6]:
cols = ["spec_id", "product in inches l x w x h", "shipping weight in pounds"]

In [7]:
df = df[cols]

In [8]:
df.head()

Unnamed: 0,spec_id,product in inches l x w x h,shipping weight in pounds
0,www.walmart.com//729,3.8 x 0.8 x 2.2,1.1
1,www.walmart.com//683,4.18 x 2.43 x 0.85,0.75
2,www.walmart.com//768,3.98 x 2.38 x 1.13,5.3
3,www.walmart.com//787,2.9 x 5.8 x 4.4,4.25
4,www.walmart.com//154,5.9 x 9.3 x 3.9,2.1


## Product in inches l x w x h

In [13]:
df = df.rename(columns = {"product in inches l x w x h" : "dimensions"})

In [15]:
df.head()

Unnamed: 0,spec_id,dimensions,shipping weight in pounds
0,www.walmart.com//729,3.8 x 0.8 x 2.2,1.1
1,www.walmart.com//683,4.18 x 2.43 x 0.85,0.75
2,www.walmart.com//768,3.98 x 2.38 x 1.13,5.3
3,www.walmart.com//787,2.9 x 5.8 x 4.4,4.25
4,www.walmart.com//154,5.9 x 9.3 x 3.9,2.1


In [17]:
df["dimensions"].value_counts()

4.47 x 2.56 x 0.98      5
4.0 x 3.0 x 2.0         5
4.0 x 2.0 x 3.0         5
1.0 x 4.1 x 2.4         4
4.5 x 3.1 x 2.8         4
3.8 x 2.4 x 1.2         4
3.0 x 4.92 x 3.9        4
3.97 x 2.41 x 0.81      4
7.0 x 6.0 x 3.0         4
5.0 x 3.8 x 3.1         4
5.1 x 3.9 x 3.1         4
4.0 x 3.0 x 1.0         4
3.72 x 2.28 x 0.71      3
5.3 x 4.2 x 3.0         3
6.0 x 4.0 x 3.0         3
4.3 x 3.2 x 2.9         3
3.81 x 2.28 x 0.9       3
3.1 x 5.5 x 4.1         2
4.9 x 3.41 x 3.62       2
4.875 x 3.33 x 3.625    2
4.33 x 2.6 x 1.0        2
0.7 x 3.1 x 2.1         2
2.46 x 2.8 x 2.3        2
4.87 x 3.5 x 3.38       2
3.2 x 5.6 x 3.2         2
4.37 x 2.53 x 1.59      2
10.35 x 7.3 x 8.13      2
1.38 x 4.63 x 2.5       2
4.4 x 3.0 x 3.6         2
5.08 x 2.8 x 0.75       2
                       ..
1.1 x 3.9 x 2.3         1
7.4 x 3.4 x 3.0         1
4.31 x 2.51 x 1.4       1
4.38 x 1.44 x 2.5       1
3.05 x 5.17 x 3.94      1
6.85 x 9.45 x 3.9       1
2.2 x 2.35 x 2.35       1
5.0 x 3.0 x 

In [20]:
def clean_dimensions_regex(value):
    regex = r"([0-9]+[.][0-9]+) x ([0-9]+[.][0-9]+) x ([0-9]+[.][0-9]+)"
    if pd.isna(value):
        return value
    else:
        if re.match(regex, value):
            groups = re.match(regex, value)
            dimension = str(round(float(groups.group(3)), 1)) + "h" + str(round(float(groups.group(2)), 1)) + "w" + str(round(float(groups.group(1)), 1)) + "d"
            return dimension
        else:
            return value

In [22]:
df["dimensions"] = df["dimensions"].apply(clean_dimensions_regex)

In [23]:
df.head()

Unnamed: 0,spec_id,dimensions,shipping weight in pounds
0,www.walmart.com//729,2.2h0.8w3.8d,1.1
1,www.walmart.com//683,0.8h2.4w4.2d,0.75
2,www.walmart.com//768,1.1h2.4w4.0d,5.3
3,www.walmart.com//787,4.4h5.8w2.9d,4.25
4,www.walmart.com//154,3.9h9.3w5.9d,2.1


## Shipping weight in pounds

In [24]:
df = df.rename(columns = {"shipping weight in pounds" : "weight"})

In [26]:
df["weight"].value_counts()

1.0      27
2.0      17
0.6       6
0.85      6
3.6       5
1.5       5
1.7       5
2.3       4
2.1       4
0.9       4
0.93      4
2.4       4
6.65      4
0.55      3
1.1       3
1.95      3
2.05      3
0.75      3
1.9       3
28.6      2
4.96      2
7.35      2
14.55     2
4.1       2
1.4       2
3.3       2
5.95      2
3.9       2
7.0       2
0.5       2
         ..
1.04      1
0.7       1
0.45      1
0.96      1
3.0       1
0.29      1
8.0       1
4.25      1
2.2       1
29.98     1
0.51      1
0.86      1
0.34      1
0.35      1
2.81      1
1.2       1
4.096     1
11.51     1
1.25      1
0.97      1
2.15      1
1.35      1
0.95      1
7.6       1
15.67     1
0.4       1
0.63      1
5.35      1
1.81      1
1.05      1
Name: weight, Length: 82, dtype: int64

In [27]:
def pounds_to_grams(value):
    if pd.isna(value):
        return value
    else:
        return int(round(float(value) * 454))

In [29]:
df["weight"] = df["weight"].apply(pounds_to_grams)

In [30]:
df.head()

Unnamed: 0,spec_id,dimensions,weight
0,www.walmart.com//729,2.2h0.8w3.8d,499.0
1,www.walmart.com//683,0.8h2.4w4.2d,340.0
2,www.walmart.com//768,1.1h2.4w4.0d,2406.0
3,www.walmart.com//787,4.4h5.8w2.9d,1930.0
4,www.walmart.com//154,3.9h9.3w5.9d,953.0


## Saving

In [31]:
df.to_csv("../../datasets/unlabeled/cleaned/walmart.csv", index=False)