In [1]:
import os
import numpy as np
import json
import pandas as pd
import itertools
from tqdm import tqdm
import nltk
from nltk.tokenize import word_tokenize
import matplotlib.pyplot as plt

In [2]:
def create_dataframe(dataset_path, source):
    """Function used to create a Pandas DataFrame containing specifications page titles

    Reads products specifications from the file system ("dataset_path" variable in the main function) and creates a Pandas DataFrame where each row is a
    specification. The columns are 'source' (e.g. www.sourceA.com), 'spec_number' (e.g. 1) and the 'page title'. Note that this script will consider only
    the page title attribute for simplicity.

    Args:
        dataset_path (str): The path to the dataset

    Returns:
        df (pd.DataFrame): The Pandas DataFrame containing specifications and page titles
    """

    print('>>> Creating dataframe...\n')
    columns_df = ['source', 'spec_number', 'spec_id', 'page_title']
    
    df = pd.DataFrame()
    progressive_id = 0
    progressive_id2row_df = {}
    #for source in tqdm(os.listdir(dataset_path)):
    #source = "buy.net"
    for specification in os.listdir(os.path.join(dataset_path, source)):
        specification_number = specification.replace('.json', '')
        specification_id = '{}//{}'.format(source, specification_number)
        with open(os.path.join(dataset_path, source, specification)) as specification_file:
            columns_df = ['source', 'spec_number', 'spec_id']
            specification_data = json.load(specification_file)
            attrs = []
            for k, v in specification_data.items():
                columns_df.append(k)
                attrs.append(v)
            row = [source, specification_number, specification_id]
            row.extend(attrs)
            progressive_id2row_df.update({progressive_id: row})
            progressive_id += 1
            df = df.append(pd.DataFrame.from_dict(progressive_id2row_df, orient='index', columns=columns_df))
            progressive_id2row_df = {}
    #print(df)
    print('>>> Dataframe created successfully!\n')
    return df

In [3]:
df = create_dataframe('../../datasets/unlabeled/2013_camera_specs', "www.wexphotographic.com")

>>> Creating dataframe...



of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


>>> Dataframe created successfully!



In [4]:
df.head()

Unnamed: 0,<page title>,an,battery model,cant find a,card format,colour,connectivity,exposure control,exposure modes,face detection technology,...,shutter speed max,shutter speed min,size,source,spec_id,spec_number,the description needs,the information is,viewfinder type,weight g
0,Nikon Coolpix AW120 Digital Camera - Camouflag...,image is incorrect,EN-EL12,related accessory,"SD, SDHC, SDXC",Camouflage,"Hi-Speed USB 2.0, HDMI, WiFi",,Scene Modes,,...,1/1500 sec,4 sec,110.0 x 66.0 x 25.8 mm,www.wexphotographic.com,www.wexphotographic.com//154,154,further detail,incorrect,3 Inch OLED Monitor,213
1,Canon IXUS 150 Digital Camera - Red (9148B007A...,image is incorrect,NB-11L,related accessory,"SD, SDHC, SDXC",Red,Hi-Speed USB (USB2.0),Â±2.0EV 1/3EV step,13 Scene Modes,Yes,...,1/2000 sec,15 sec,95 x 54 x 22 mm,www.wexphotographic.com,www.wexphotographic.com//553,553,further detail,incorrect,2.7 Inch TFT,130
2,Fuji FinePix S1 Digital Camera (P10NC12730A) -...,image is incorrect,NP-85,related accessory,"SD, SDHC, SDXC",Black,"Hi-Speed USB (USB2.0), HDMI Mini Connector",Â±2.0EV 1/3EV step,"Programmed AE, Shutter Priority AE, Aperture P...",Yes,...,1/2000 sec,30 sec,133 x 90.9 x 110.3 mm,www.wexphotographic.com,www.wexphotographic.com//601,601,further detail,incorrect,0.2 Inch Electronic Viewfinder,680
3,Nikon Coolpix S5300 Digital Camera - Black (VN...,image is incorrect,EN-EL19,related accessory,"25MB Internal, SD, SDHC, SDXC",Black,"Hi-Speed USB (USB2.0), HDMI",Â±2.0EV 1/3EV step,18 Scene Modes,Yes,...,1/1500 sec,4 sec,97 x 58 x 21 mm,www.wexphotographic.com,www.wexphotographic.com//197,197,further detail,incorrect,3 Inch LCD,138
4,Fuji FinePix S8600 Digital Camera - Red (P10NC...,image is incorrect,3 x AA type alkaline batteries or 3 x AA type ...,related accessory,"SD, SDHC, SDXC",Red,Hi-Speed USB (USB2.0),Â±2.0EV 1/3EV step,"Programmed AE, Shutter Priority AE, Manual exp...",Yes,...,1/2000 sec,,121 x 81 x 65 mm,www.wexphotographic.com,www.wexphotographic.com//178,178,further detail,incorrect,,450


In [5]:
cols = ["spec_id", "screen size inches", "megapixels"]

In [6]:
df = df[cols]

In [7]:
df.head()

Unnamed: 0,spec_id,screen size inches,megapixels
0,www.wexphotographic.com//154,3.0,16.0
1,www.wexphotographic.com//553,2.7,16.0
2,www.wexphotographic.com//601,3.0,16.4
3,www.wexphotographic.com//197,3.0,16.0
4,www.wexphotographic.com//178,3.0,16.0


### Screen size

In [8]:
def clean_size(value):
    if pd.isna(value):
        return value
    else:
        return word_tokenize(value)[1]

In [9]:
df["screen size inches"].value_counts()

3.0    72
3      43
2.7    22
0       4
2.8     3
4.8     2
2.5     1
Name: screen size inches, dtype: int64

In [10]:
df["screen size inches"] = df["screen size inches"].apply(lambda value : float(value))

In [11]:
df.head()

Unnamed: 0,spec_id,screen size inches,megapixels
0,www.wexphotographic.com//154,3.0,16.0
1,www.wexphotographic.com//553,2.7,16.0
2,www.wexphotographic.com//601,3.0,16.4
3,www.wexphotographic.com//197,3.0,16.0
4,www.wexphotographic.com//178,3.0,16.0


## Megapixels

In [12]:
df["megapixels"].value_counts()

16      30
16.0    18
20.1    14
16.2    12
16.3     9
16.1     8
12.1     7
20.2     6
12.8     5
20.4     4
12.0     4
13.2     3
12.2     3
18.2     3
16.4     3
14.0     3
24.3     2
46.0     2
10.1     2
18.0     2
29       1
0        1
15.4     1
20.0     1
18.1     1
10.0     1
16.8     1
Name: megapixels, dtype: int64

In [13]:
df["megapixels"] = df["megapixels"].apply(lambda value : float(value))

In [14]:
df["megapixels"].value_counts()

16.0    48
20.1    14
16.2    12
16.3     9
16.1     8
12.1     7
20.2     6
12.8     5
12.0     4
20.4     4
12.2     3
13.2     3
18.2     3
16.4     3
14.0     3
24.3     2
46.0     2
18.0     2
10.1     2
18.1     1
15.4     1
10.0     1
20.0     1
29.0     1
0.0      1
16.8     1
Name: megapixels, dtype: int64

In [15]:
df.head()

Unnamed: 0,spec_id,screen size inches,megapixels
0,www.wexphotographic.com//154,3.0,16.0
1,www.wexphotographic.com//553,2.7,16.0
2,www.wexphotographic.com//601,3.0,16.4
3,www.wexphotographic.com//197,3.0,16.0
4,www.wexphotographic.com//178,3.0,16.0


## Saving

In [17]:
df.to_csv("../../datasets/unlabeled/cleaned/wexphotographic.csv", index=False)