In [1]:
import os
import numpy as np
import json
import pandas as pd
import itertools
from tqdm import tqdm
import nltk
from nltk.tokenize import word_tokenize
import re
import matplotlib.pyplot as plt

In [2]:
def create_dataframe(dataset_path, source, columns_df):
    """Build a Pandas DataFrame with the camera type and effective pixels of one source.

    Reads every ``*.json`` specification stored under ``dataset_path/source`` and
    creates one row per file. Each row holds the specification identifier
    (``"<source>//<file name without extension>"``, e.g. ``www.sourceA.com//1``)
    followed by the values of the "camera type" and "effective pixels"
    attributes of that specification.

    Args:
        dataset_path (str): The path to the dataset
        source (str): Name of the source sub-directory (e.g. "www.henrys.com")
        columns_df (list of str): Names of the three output columns, in the
            order (identifier, camera type, effective pixels)

    Returns:
        df (pd.DataFrame): One row per specification file; an attribute that is
            absent from a file is stored as None
    """

    print('>>> Creating dataframe...\n')

    source_dir = os.path.join(dataset_path, source)
    rows = []
    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order reproducible across runs and machines.
    for specification in sorted(os.listdir(source_dir)):
        specification_number, extension = os.path.splitext(specification)
        if extension.lower() != '.json':
            # Stray entries (hidden files, checkpoint folders, ...) are not
            # specifications and would make json.load fail.
            continue
        specification_id = '{}//{}'.format(source, specification_number)
        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(os.path.join(source_dir, specification), encoding='utf-8') as specification_file:
            specification_data = json.load(specification_file)
        rows.append((
            specification_id,
            specification_data.get("camera type"),
            specification_data.get("effective pixels"),
        ))

    df = pd.DataFrame(rows, columns=columns_df)
    print('>>> Dataframe created successfully!\n')
    return df

In [3]:
# Load the www.henrys.com specifications; the three names label the identifier,
# camera-type and effective-pixels columns, in that order.
df = create_dataframe('../../datasets/unlabeled/2013_camera_specs', "www.henrys.com", ["spec_id", "camera type", "effective pixels"])

>>> Creating dataframe...

>>> Dataframe created successfully!



In [4]:
df.head()

Unnamed: 0,spec_id,camera type,effective pixels
0,www.henrys.com//115,Wi-Fi,12.1MP
1,www.henrys.com//142,Small & Easy,20.1MP
2,www.henrys.com//154,Super Zoom,16.1MP
3,www.henrys.com//197,Small & Easy,16.1MP
4,www.henrys.com//20,Small & Easy,14.0MP


In [5]:
cols = ["spec_id", "camera type", "effective pixels"]

In [6]:
df = df[cols]

In [7]:
df.head()

Unnamed: 0,spec_id,camera type,effective pixels
0,www.henrys.com//115,Wi-Fi,12.1MP
1,www.henrys.com//142,Small & Easy,20.1MP
2,www.henrys.com//154,Super Zoom,16.1MP
3,www.henrys.com//197,Small & Easy,16.1MP
4,www.henrys.com//20,Small & Easy,14.0MP


## Camera type

In [8]:
df["camera type"].value_counts()

Super Zoom                42
Advanced                  41
Small & Easy              31
Wi-Fi                     27
Waterproof                24
Zoom                       5
Microscope                 3
Compact Digital Camera     2
Digital SLR                1
Pocketable                 1
Compact System Camera      1
Name: camera type, dtype: int64

In [9]:
def clean_camera_type(camera):
    """Normalise a raw camera-type label.

    Lower-cases the text, deletes punctuation and currency symbols, drops every
    occurrence of the substring "camera" and collapses runs of whitespace into
    single spaces (e.g. "Compact Digital Camera" -> "compact digital").

    Args:
        camera: The raw label, or a missing value (None / NaN).

    Returns:
        The normalised label; missing values are returned unchanged.
    """
    if pd.isna(camera):
        return camera

    # Raw string on purpose: the backslash itself is one of the characters to
    # delete, and a plain "\]" is an invalid escape sequence that newer Python
    # versions warn about.
    punctuation = r"!#$%&'()*+,-./:;<=>?@[\]^_`{|}~€£¥₹₽"
    # A single translate pass deletes every listed character.
    camera = camera.lower().translate(str.maketrans('', '', punctuation))
    camera = camera.replace("camera", "")
    return ' '.join(camera.split())

In [10]:
# Normalise the labels so that variants differing only in case, punctuation or
# the word "camera" collapse to a single value.
df["camera type"] = df["camera type"].apply(clean_camera_type)

## Effective pixels

In [11]:
df["effective pixels"].value_counts()

16.0MP                             33
16.1MP                             18
20.2MP                             15
20.1MP                             15
12.1MP                             12
16.3MP                              9
16.2MP                              7
16MP                                6
16.4MP                              6
20.4MP                              4
14.0MP                              4
12.0MP                              4
18.1MP                              4
10.1MP                              3
13.2MP                              3
12MP                                3
12.2MP                              3
46MP                                3
20.0MP                              2
20MP                                2
18.2MP                              2
N/A                                 2
24.3MP                              2
18MP                                2
14.1MP                              2
16.1 MP                             2
12.0 million

In [12]:
def clean_pixels(value):
    """Convert a raw "effective pixels" string into a megapixel count.

    The number at the start of the string is parsed and rounded to one decimal
    place, whatever unit text follows it: "16.1MP", "16 MP", "8MP" and
    "12.0 million" all yield a float. The unit itself is not validated.

    Args:
        value: The raw attribute text, or a missing value (None / NaN).

    Returns:
        float: The megapixel count rounded to one decimal, or NaN when the text
            does not start with a number (e.g. "N/A" or an empty string).
            Missing inputs are returned unchanged.
    """
    if pd.isna(value):
        return value

    # re.match anchors at the start of the string; the optional fractional part
    # must have digits on both sides of the dot.
    leading_number = re.match(r"\s*([0-9]+(?:[.][0-9]+)?)", value)
    if leading_number is None:
        # Placeholders such as "N/A" carry no figure; returning NaN (rather
        # than a text token) keeps the resulting column purely numeric.
        return np.nan
    return round(float(leading_number.group(1)), 1)

In [13]:
# Replace the raw strings (e.g. "16.1MP") with numeric megapixel values wherever
# clean_pixels can parse them.
df["effective pixels"] = df["effective pixels"].apply(clean_pixels)

In [14]:
df.head()

Unnamed: 0,spec_id,camera type,effective pixels
0,www.henrys.com//115,wifi,12.1
1,www.henrys.com//142,small easy,20.1
2,www.henrys.com//154,super zoom,16.1
3,www.henrys.com//197,small easy,16.1
4,www.henrys.com//20,small easy,14.0


## Saving

In [15]:
df.head()

Unnamed: 0,spec_id,camera type,effective pixels
0,www.henrys.com//115,wifi,12.1
1,www.henrys.com//142,small easy,20.1
2,www.henrys.com//154,super zoom,16.1
3,www.henrys.com//197,small easy,16.1
4,www.henrys.com//20,small easy,14.0


In [16]:
# Switch to the short column names used in the exported CSV.
df.columns = ["spec_id", "type", "megapixels"]

In [17]:
# index=False keeps the positional row index out of the file.
# NOTE(review): the target folder presumably has to exist already — confirm before a fresh run.
df.to_csv("../../datasets/unlabeled/cleaned/henrys.csv", index=False)