In [1]:
import os
import numpy as np
import json
import pandas as pd
import itertools
from tqdm import tqdm
import nltk
from nltk.tokenize import word_tokenize
import matplotlib.pyplot as plt

In [2]:
def create_dataframe(dataset_path, source, columns_df):
    """Function used to create a Pandas DataFrame containing specifications page titles

    Reads products specifications from the file system ("dataset_path" variable in the main function) and creates a Pandas DataFrame where each row is a
    specification. The columns are 'source' (e.g. www.sourceA.com), 'spec_number' (e.g. 1) and the 'page title'. Note that this script will consider only
    the page title attribute for simplicity.

    Args:
        dataset_path (str): The path to the dataset

    Returns:
        df (pd.DataFrame): The Pandas DataFrame containing specifications and page titles
    """

    print('>>> Creating dataframe...\n')

    progressive_id = 0
    progressive_id2row_df = {}
    for specification in os.listdir(os.path.join(dataset_path, source)):
        specification_number = specification.replace('.json', '')
        specification_id = '{}//{}'.format(source, specification_number)
        with open(os.path.join(dataset_path, source, specification)) as specification_file:
            specification_data = json.load(specification_file)
            screen_size = specification_data.get("screen size inches")
            megapixels = specification_data.get("megapixels")
            row = (specification_id, screen_size, megapixels)
            progressive_id2row_df.update({progressive_id: row})
            progressive_id += 1
    df = pd.DataFrame.from_dict(progressive_id2row_df, orient='index', columns=columns_df)
    print('>>> Dataframe created successfully!\n')
    return df

In [3]:
df = create_dataframe('../../datasets/unlabeled/2013_camera_specs', "www.wexphotographic.com",["spec_id", "screen size inches", "megapixels"])

>>> Creating dataframe...

>>> Dataframe created successfully!



In [4]:
df.head()

Unnamed: 0,spec_id,screen size inches,megapixels
0,www.wexphotographic.com//154,3.0,16.0
1,www.wexphotographic.com//553,2.7,16.0
2,www.wexphotographic.com//601,3.0,16.4
3,www.wexphotographic.com//197,3.0,16.0
4,www.wexphotographic.com//178,3.0,16.0


### Screen size

In [5]:
def clean_size(value):
    if pd.isna(value):
        return value
    else:
        return word_tokenize(value)[1]

In [6]:
df["screen size inches"].value_counts()

3.0    72
3      43
2.7    22
0       4
2.8     3
4.8     2
2.5     1
Name: screen size inches, dtype: int64

In [7]:
df["screen size inches"] = df["screen size inches"].apply(lambda value : float(value))

In [8]:
df.head()

Unnamed: 0,spec_id,screen size inches,megapixels
0,www.wexphotographic.com//154,3.0,16.0
1,www.wexphotographic.com//553,2.7,16.0
2,www.wexphotographic.com//601,3.0,16.4
3,www.wexphotographic.com//197,3.0,16.0
4,www.wexphotographic.com//178,3.0,16.0


## Megapixels

In [9]:
df["megapixels"].value_counts()

16      30
16.0    18
20.1    14
16.2    12
16.3     9
16.1     8
12.1     7
20.2     6
12.8     5
12.0     4
20.4     4
13.2     3
16.4     3
14.0     3
18.2     3
12.2     3
18.0     2
46.0     2
24.3     2
10.1     2
10.0     1
18.1     1
29       1
0        1
15.4     1
20.0     1
16.8     1
Name: megapixels, dtype: int64

In [10]:
df["megapixels"] = df["megapixels"].apply(lambda value : float(value))

In [11]:
df["megapixels"].value_counts()

16.0    48
20.1    14
16.2    12
16.3     9
16.1     8
12.1     7
20.2     6
12.8     5
12.0     4
20.4     4
12.2     3
13.2     3
18.2     3
16.4     3
14.0     3
24.3     2
46.0     2
18.0     2
10.1     2
18.1     1
15.4     1
10.0     1
20.0     1
29.0     1
0.0      1
16.8     1
Name: megapixels, dtype: int64

In [12]:
df.head()

Unnamed: 0,spec_id,screen size inches,megapixels
0,www.wexphotographic.com//154,3.0,16.0
1,www.wexphotographic.com//553,2.7,16.0
2,www.wexphotographic.com//601,3.0,16.4
3,www.wexphotographic.com//197,3.0,16.0
4,www.wexphotographic.com//178,3.0,16.0


## Saving

In [14]:
df.columns=['spec_id','screen_size','megapixels']

In [15]:
df.to_csv("../../datasets/unlabeled/cleaned/wexphotographic.csv", index=False)