In [1]:
import os
import numpy as np
import json
import pandas as pd
import itertools
from tqdm import tqdm
import nltk
from nltk.tokenize import word_tokenize
import re
import matplotlib.pyplot as plt

In [2]:
def create_dataframe(dataset_path, source):
    """Function used to create a Pandas DataFrame containing specifications page titles

    Reads products specifications from the file system ("dataset_path" variable in the main function) and creates a Pandas DataFrame where each row is a
    specification. The columns are 'source' (e.g. www.sourceA.com), 'spec_number' (e.g. 1) and the 'page title'. Note that this script will consider only
    the page title attribute for simplicity.

    Args:
        dataset_path (str): The path to the dataset

    Returns:
        df (pd.DataFrame): The Pandas DataFrame containing specifications and page titles
    """

    print('>>> Creating dataframe...\n')
    columns_df = ['source', 'spec_number', 'spec_id', 'page_title']
    
    df = pd.DataFrame()
    progressive_id = 0
    progressive_id2row_df = {}
    #for source in tqdm(os.listdir(dataset_path)):
    #source = "buy.net"
    for specification in os.listdir(os.path.join(dataset_path, source)):
        specification_number = specification.replace('.json', '')
        specification_id = '{}//{}'.format(source, specification_number)
        with open(os.path.join(dataset_path, source, specification)) as specification_file:
            columns_df = ['source', 'spec_number', 'spec_id']
            specification_data = json.load(specification_file)
            attrs = []
            for k, v in specification_data.items():
                columns_df.append(k)
                attrs.append(v)
            row = [source, specification_number, specification_id]
            row.extend(attrs)
            progressive_id2row_df.update({progressive_id: row})
            progressive_id += 1
            df = df.append(pd.DataFrame.from_dict(progressive_id2row_df, orient='index', columns=columns_df))
            progressive_id2row_df = {}
    #print(df)
    print('>>> Dataframe created successfully!\n')
    return df

In [3]:
df = create_dataframe('../../datasets/unlabeled/2013_camera_specs', "www.pcconnection.com")

>>> Creating dataframe...

>>> Dataframe created successfully!



In [4]:
df.head()

Unnamed: 0,source,spec_number,spec_id,35mm equivalent focal length max,35mm equivalent focal length min,<page title>,camera type,color,display size,flash type,...,lens aperture,memory notes,storage capacity,digital video format,light sensitivity,video input type,still image format,battery run time max,red eye reduction,warranty details
0,www.pcconnection.com,4452,www.pcconnection.com//4452,100.00 mm,25.00 mm,"Buy Fujifilm XF1 Digital Camera, 12MP, 4x Zoom...",Compact,Red,3 in,Pop-up flash,...,,,,,,,,,,
1,www.pcconnection.com,4451,www.pcconnection.com//4451,,,Buy Nikon WR-T10 Wireless Remote Controller Ca...,,,,,...,,,,,,,,,,
2,www.pcconnection.com,12419,www.pcconnection.com//12419,450 mm,25 mm,"Buy Canon PowerShot SX600 HS, 16MP, 18x Zoom, ...",Point & Shoot camera,White,3 in,Pop-up flash,...,,,,,,,,,,
3,www.pcconnection.com,12378,www.pcconnection.com//12378,,,"Buy Fujifilm FinePix JX680 Digital Camera, 16M...",Compact,Red,3 in,Built-in flash,...,,,,,,,,,,
4,www.pcconnection.com,4475,www.pcconnection.com//4475,112.00 mm,28.00 mm,"Buy Olympus STYLUS XZ-2 iHS Digital Camera, 12...",Compact,Black,3 in,Pop-up flash,...,,,,,,,,,,


In [5]:
cols = ["spec_id", "camera type", "display size", "megapixels"]

In [6]:
df = df[cols]

In [7]:
df.head()

Unnamed: 0,spec_id,camera type,display size,megapixels
0,www.pcconnection.com//4452,Compact,3 in,12 megapixel(s)
1,www.pcconnection.com//4451,,,
2,www.pcconnection.com//12419,Point & Shoot camera,3 in,16 megapixel(s)
3,www.pcconnection.com//12378,Compact,3 in,16 megapixel(s)
4,www.pcconnection.com//4475,Compact,3 in,12 megapixel(s)


## Camera type

In [8]:
df["camera type"].value_counts()

Compact                        89
Mirrorless system              45
SLR camera                     35
Point & Shoot camera           16
Smartphone attachable           2
Point & Shoot / Zoom camera     1
Name: camera type, dtype: int64

In [9]:
def clean_camera_type(camera):
    if pd.isna(camera):
        return camera
    else:
        punctuation = "!#$%&'()*+,-./:;<=>?@[\]^_`{|}~€£¥₹₽"
        camera = camera.lower()
        for p in punctuation:
            camera = camera.replace(p, "")
        camera = camera.replace("camera", "")
        camera = camera.replace("zoom", "")
        camera = ' '.join(camera.split())
        return camera

In [10]:
df["camera type"].apply(clean_camera_type).value_counts()

compact                  89
mirrorless system        45
slr                      35
point shoot              17
smartphone attachable     2
Name: camera type, dtype: int64

In [11]:
df["camera type"] = df["camera type"].apply(clean_camera_type)

## Display size

In [12]:
df["display size"].value_counts()

3 in      134
2.7 in     27
3.2 in     14
3.3 in      3
2.5 in      2
4.8 in      2
2.4 in      2
3.5 in      1
2.8 in      1
Name: display size, dtype: int64

In [13]:
def clean_display_size(value):
    if pd.isna(value):
        return value
    else:
        return float(value.split()[0])

In [14]:
df["display size"] = df["display size"].apply(clean_display_size)

In [15]:
df.head()

Unnamed: 0,spec_id,camera type,display size,megapixels
0,www.pcconnection.com//4452,compact,3.0,12 megapixel(s)
1,www.pcconnection.com//4451,,,
2,www.pcconnection.com//12419,point shoot,3.0,16 megapixel(s)
3,www.pcconnection.com//12378,compact,3.0,16 megapixel(s)
4,www.pcconnection.com//4475,compact,3.0,12 megapixel(s)


## Megapixels

In [16]:
df["megapixels"].value_counts()

16.1 megapixel(s)     36
16 megapixel(s)       35
16.2 megapixel(s)     10
14.2 megapixel(s)      9
18 megapixel(s)        9
20.1 megapixel(s)      8
18.2 megapixel(s)      8
12 megapixel(s)        7
24.3 megapixel(s)      6
12.1 megapixel(s)      6
24.2 megapixel(s)      5
20.2 megapixel(s)      4
10.1 megapixel(s)      4
16.3 megapixel(s)      4
24.1 megapixel(s)      4
16.4 megapixel(s)      4
20 megapixel(s)        4
13.2 megapixel(s)      3
23.35 megapixel(s)     2
20.4 megapixel(s)      2
12.4 megapixel(s)      2
12.2 megapixel(s)      2
36.3 megapixel(s)      2
22.3 megapixel(s)      2
20.9 megapixel(s)      1
36.4 megapixel(s)      1
12.3 megapixel(s)      1
9.1 megapixel(s)       1
14.4 megapixel(s)      1
18.4 megapixel(s)      1
14.14 megapixel(s)     1
20.3 megapixel(s)      1
14 megapixel(s)        1
Name: megapixels, dtype: int64

In [17]:
def clean_megapixels(value):
    if pd.isna(value):
        return value
    else:
        return round(float(value.split()[0]), 1)

In [18]:
df["megapixels"] = df["megapixels"].apply(clean_megapixels)

In [19]:
df.head()

Unnamed: 0,spec_id,camera type,display size,megapixels
0,www.pcconnection.com//4452,compact,3.0,12.0
1,www.pcconnection.com//4451,,,
2,www.pcconnection.com//12419,point shoot,3.0,16.0
3,www.pcconnection.com//12378,compact,3.0,16.0
4,www.pcconnection.com//4475,compact,3.0,12.0


## Saving

In [20]:
df.rename(columns={'camera type': 'type', 'display size': 'screen_size'}, inplace=True)
df.head()

Unnamed: 0,spec_id,type,screen_size,megapixels
0,www.pcconnection.com//4452,compact,3.0,12.0
1,www.pcconnection.com//4451,,,
2,www.pcconnection.com//12419,point shoot,3.0,16.0
3,www.pcconnection.com//12378,compact,3.0,16.0
4,www.pcconnection.com//4475,compact,3.0,12.0


In [21]:
df.to_csv("../../datasets/unlabeled/cleaned/pcconnection.csv", index=False)