In [1]:
import os
import numpy as np
import json
import pandas as pd
import itertools
from tqdm import tqdm
import nltk
from nltk.tokenize import word_tokenize
import re
import matplotlib.pyplot as plt

In [2]:
def create_dataframe(dataset_path, source, columns_df):
    """Function used to create a Pandas DataFrame containing specifications page titles

    Reads products specifications from the file system ("dataset_path" variable in the main function) and creates a Pandas DataFrame where each row is a
    specification. The columns are 'source' (e.g. www.sourceA.com), 'spec_number' (e.g. 1) and the 'page title'. Note that this script will consider only
    the page title attribute for simplicity.

    Args:
        dataset_path (str): The path to the dataset

    Returns:
        df (pd.DataFrame): The Pandas DataFrame containing specifications and page titles
    """

    print('>>> Creating dataframe...\n')

    progressive_id = 0
    progressive_id2row_df = {}
    for specification in os.listdir(os.path.join(dataset_path, source)):
        specification_number = specification.replace('.json', '')
        specification_id = '{}//{}'.format(source, specification_number)
        with open(os.path.join(dataset_path, source, specification)) as specification_file:
            specification_data = json.load(specification_file)
        
            screen_size = specification_data.get("lcd screen size")
            brand = specification_data.get("brand")
            disp = specification_data.get("image display resolution")

            row = (specification_id, brand, screen_size, disp)
            progressive_id2row_df.update({progressive_id: row})
            progressive_id += 1
    df = pd.DataFrame.from_dict(progressive_id2row_df, orient='index', columns=columns_df)
    print('>>> Dataframe created successfully!\n')
    return df

In [3]:
df = create_dataframe('../../datasets/unlabeled/2013_camera_specs', "www.flipkart.com", ["spec_id", "brand", "lcd screen size", "image display resolution"])

>>> Creating dataframe...

>>> Dataframe created successfully!



In [4]:
df.head()

Unnamed: 0,spec_id,brand,lcd screen size,image display resolution
0,www.flipkart.com//2219,,,
1,www.flipkart.com//2167,,,
2,www.flipkart.com//2188,Canon,3 inch,"1,040,000 dots"
3,www.flipkart.com//2171,Nikon,2.7 inch,"230,000 dots"
4,www.flipkart.com//2258,Nikon,3 inch,"460,000 dots"


In [5]:
cols = ["spec_id", "brand", "lcd screen size", "image display resolution"]

In [6]:
df = df[cols]

In [7]:
df.head()

Unnamed: 0,spec_id,brand,lcd screen size,image display resolution
0,www.flipkart.com//2219,,,
1,www.flipkart.com//2167,,,
2,www.flipkart.com//2188,Canon,3 inch,"1,040,000 dots"
3,www.flipkart.com//2171,Nikon,2.7 inch,"230,000 dots"
4,www.flipkart.com//2258,Nikon,3 inch,"460,000 dots"


## Brand

In [8]:
df["brand"].value_counts()

Nikon        50
Canon        26
Sony         23
Tamron        5
YourDeal      4
Lowepro       4
DigiFlip      3
Sigma         2
Pentax        1
Panasonic     1
Name: brand, dtype: int64

In [9]:
df["brand"] = df["brand"].apply(lambda brand : brand.lower() if not pd.isna(brand) else np.nan)

In [10]:
df.head()

Unnamed: 0,spec_id,brand,lcd screen size,image display resolution
0,www.flipkart.com//2219,,,
1,www.flipkart.com//2167,,,
2,www.flipkart.com//2188,canon,3 inch,"1,040,000 dots"
3,www.flipkart.com//2171,nikon,2.7 inch,"230,000 dots"
4,www.flipkart.com//2258,nikon,3 inch,"460,000 dots"


## Lcd screen size

In [11]:
df["lcd screen size"].value_counts()

3 inch      62
2.7 inch    35
2 inch       4
3.2 inch     2
2.8 inch     1
Name: lcd screen size, dtype: int64

In [12]:
def clean_size(value):
    if pd.isna(value):
        return value
    else:
        return word_tokenize(value)[0]

In [14]:
df["lcd screen size"] = df["lcd screen size"].apply(lambda value : float(clean_size(value)) if not pd.isna(value) else value)

In [15]:
df.head()

Unnamed: 0,spec_id,brand,lcd screen size,image display resolution
0,www.flipkart.com//2219,,,
1,www.flipkart.com//2167,,,
2,www.flipkart.com//2188,canon,3.0,"1,040,000 dots"
3,www.flipkart.com//2171,nikon,2.7,"230,000 dots"
4,www.flipkart.com//2258,nikon,3.0,"460,000 dots"


## Image display resolution

In [16]:
df["image display resolution"].value_counts()

230,000 dots      37
921,000 dots      25
460,000 dots       9
460,800 dots       9
230,400 dots       7
1,040,000 dots     7
1,036,800 dots     2
921,600 dots       2
461,000 dots       1
920,000 dots       1
Name: image display resolution, dtype: int64

In [17]:
def clean_dots(value):
    if pd.isna(value):
        return value
    else:
        value = ' '.join(value.split())
        value = value.replace(",", "")
        value = value.replace("Dots", "d")
        value = value.replace("dots", "d")
        if bool(re.match(r"[0-9]* [0-9]* d", value)):
            return value.replace(" ", "")
        elif bool(re.match(r"[0-9]* d [0-9]*", value)):
            splitted = value.split()
            return splitted[2] + splitted[0] + splitted[1]            
        elif bool(re.match(r"[0-9]* d", value)):
            return value.replace(" ", "")
                
        return value.replace(" ", "")

In [18]:
df["image display resolution"] = df["image display resolution"].apply(clean_dots)

In [19]:
df.head()

Unnamed: 0,spec_id,brand,lcd screen size,image display resolution
0,www.flipkart.com//2219,,,
1,www.flipkart.com//2167,,,
2,www.flipkart.com//2188,canon,3.0,1040000d
3,www.flipkart.com//2171,nikon,2.7,230000d
4,www.flipkart.com//2258,nikon,3.0,460000d


## Saving

In [20]:
df.head()

Unnamed: 0,spec_id,brand,lcd screen size,image display resolution
0,www.flipkart.com//2219,,,
1,www.flipkart.com//2167,,,
2,www.flipkart.com//2188,canon,3.0,1040000d
3,www.flipkart.com//2171,nikon,2.7,230000d
4,www.flipkart.com//2258,nikon,3.0,460000d


In [21]:
df.columns = ["spec_id", "brand", "screen_size", "dots"]

In [22]:
df.to_csv("../../datasets/unlabeled/cleaned/flipkart.csv", index=False)