In [1]:
import os
import numpy as np
import json
import pandas as pd
import itertools
from tqdm import tqdm
import nltk
from nltk.tokenize import word_tokenize
import re
import matplotlib.pyplot as plt

In [2]:
def create_dataframe(dataset_path, source, columns_df):
    """Function used to create a Pandas DataFrame containing specifications page titles

    Reads products specifications from the file system ("dataset_path" variable in the main function) and creates a Pandas DataFrame where each row is a
    specification. The columns are 'source' (e.g. www.sourceA.com), 'spec_number' (e.g. 1) and the 'page title'. Note that this script will consider only
    the page title attribute for simplicity.

    Args:
        dataset_path (str): The path to the dataset

    Returns:
        df (pd.DataFrame): The Pandas DataFrame containing specifications and page titles
    """

    print('>>> Creating dataframe...\n')

    progressive_id = 0
    progressive_id2row_df = {}
    for specification in os.listdir(os.path.join(dataset_path, source)):
        specification_number = specification.replace('.json', '')
        specification_id = '{}//{}'.format(source, specification_number)
        with open(os.path.join(dataset_path, source, specification)) as specification_file:
            specification_data = json.load(specification_file)
        
            manufacturer = specification_data.get("manufacturer")
            descr = specification_data.get("short description")
            megapixels = specification_data.get("megapixel")
            weight = specification_data.get("weight")
            row = (specification_id, manufacturer, descr, megapixels,weight)
            progressive_id2row_df.update({progressive_id: row})
            progressive_id += 1
    df = pd.DataFrame.from_dict(progressive_id2row_df, orient='index', columns=columns_df)
    print('>>> Dataframe created successfully!\n')
    return df

In [3]:
df = create_dataframe('../../datasets/unlabeled/2013_camera_specs', "www.ilgs.net", ["spec_id", "manufacturer", "short description", "megapixel", "weight"])

>>> Creating dataframe...

>>> Dataframe created successfully!



In [4]:
df.head()

Unnamed: 0,spec_id,manufacturer,short description,megapixel,weight
0,www.ilgs.net//142,Sony,More detail even in low-light wiht 20x optical...,Approx. 18.2 MP,166\n139 g
1,www.ilgs.net//98,Fujifilm,"12MP, 2/3"" CMOS, Full-HD 1920x108060p/30p, Wi-...",12 MP,187g
2,www.ilgs.net//77,Canon,EOS 7D + EF-S 15-85mm f/3.5-5.6 IS USM - 22.3m...,"[18 MP, 18]","[820g, 820 g]"
3,www.ilgs.net//36,Samsung,"NX300 - 20.3 MP, CMOS 23.5 x 15.7 mm, 8.4074 c...",20.3 MP,280g
4,www.ilgs.net//119,Ricoh,Clearance Product - Box Damage.16 Megapixel 4x...,16,236 g


In [5]:
cols = ["spec_id", "manufacturer", "short description", "megapixel", "weight"]

In [6]:
df = df[cols]

In [7]:
df.head()

Unnamed: 0,spec_id,manufacturer,short description,megapixel,weight
0,www.ilgs.net//142,Sony,More detail even in low-light wiht 20x optical...,Approx. 18.2 MP,166\n139 g
1,www.ilgs.net//98,Fujifilm,"12MP, 2/3"" CMOS, Full-HD 1920x108060p/30p, Wi-...",12 MP,187g
2,www.ilgs.net//77,Canon,EOS 7D + EF-S 15-85mm f/3.5-5.6 IS USM - 22.3m...,"[18 MP, 18]","[820g, 820 g]"
3,www.ilgs.net//36,Samsung,"NX300 - 20.3 MP, CMOS 23.5 x 15.7 mm, 8.4074 c...",20.3 MP,280g
4,www.ilgs.net//119,Ricoh,Clearance Product - Box Damage.16 Megapixel 4x...,16,236 g


## Manufacturer

In [8]:
df["manufacturer"].value_counts()

Canon        28
Sony         16
Nikon        14
Pentax        9
Samsung       9
Panasonic     6
Olympus       4
Fujifilm      4
Ricoh         2
Name: manufacturer, dtype: int64

In [9]:
df["manufacturer"] = df["manufacturer"].apply(lambda brand : brand.lower() if not pd.isna(brand) else np.nan)

## Short description

In [10]:
df.head()

Unnamed: 0,spec_id,manufacturer,short description,megapixel,weight
0,www.ilgs.net//142,sony,More detail even in low-light wiht 20x optical...,Approx. 18.2 MP,166\n139 g
1,www.ilgs.net//98,fujifilm,"12MP, 2/3"" CMOS, Full-HD 1920x108060p/30p, Wi-...",12 MP,187g
2,www.ilgs.net//77,canon,EOS 7D + EF-S 15-85mm f/3.5-5.6 IS USM - 22.3m...,"[18 MP, 18]","[820g, 820 g]"
3,www.ilgs.net//36,samsung,"NX300 - 20.3 MP, CMOS 23.5 x 15.7 mm, 8.4074 c...",20.3 MP,280g
4,www.ilgs.net//119,ricoh,Clearance Product - Box Damage.16 Megapixel 4x...,16,236 g


In [11]:
stop_words = set(['itself', 'down', 'by', 'with', 'doesn', 'wouldn', 'other', 'ours', 'of', 'then', 'where', 'don', 'these', 'nor', 'she', "should've", 'won', 'ma', 'from', 'had', "you're", 'our', 'did', 'them', 'too', 'her', 'that', 'haven', 'after', "you'll", 'hers', 'because', 'yourself', 'against', 'mightn', 'as', 'll', 'whom', 'how', 'couldn', 'further', 'aren', "you'd", 'and', 'needn', "couldn't", 'those', 'to', "doesn't", "weren't", 'both', 'ourselves', 'in', 'which', 'yours', 'under', 'some', 'what', 'during', 'before', "needn't", "shan't", 'here', 'having', 'hasn', 'your', "hasn't", 'between', 'me', "she's", 'into', 'all', 'at', 'shan', 'who', 'o', 'an', 'very', 'can', 'you', 'shouldn', 'such', 'but', 'do', 'out', 'am', "shouldn't", 'above', 'wasn', 'or', 'were', 'own', 'didn', "you've", 'on', 'will', 'my', 'it', 'have', 'once', 'only', 'been', 'themselves', 'his', 'be', "mightn't", 'they', 'not', 'so', 'up', 'any', 'most', 'has', 'myself', 't', 'yourselves', 'isn', "it's", 'y', 'm', 'now', 'until', 're', 'there', 'their', 'mustn', "mustn't", 'again', 'being', 'hadn', 'doing', 'just', 'no', 'if', 've', "wasn't", "won't", 'we', 'below', 'does', 'more', 'this', 'should', "isn't", 'ain', "don't", 'i', "haven't", 'than', "didn't", 'are', 'about', 'off', 'him', 'for', 'few', "wouldn't", 'was', 'weren', 'why', 'he', "that'll", 'd', 'the', 'its', 'a', 'each', 'is', 'while', "aren't", 'when', 'theirs', 'same', 's', 'himself', 'herself', "hadn't", 'through', 'over'])
punctuation = "!#$%&'()*+,-./:;<=>?@[\]^_`{|}~€£¥₹₽"
def replace_punctuation(word):
    return ''.join(c for c in word if c not in punctuation)

def tokenize_stop_words_punctuation(x):
    return [i.lower() for i in list(map(lambda y: replace_punctuation(y), word_tokenize(x))) if i and i.lower() not in stop_words]

In [12]:
df['short description'] = df['short description'].apply(lambda value : tokenize_stop_words_punctuation(value) if not pd.isna(value) else np.nan)

In [13]:
df.head()

Unnamed: 0,spec_id,manufacturer,short description,megapixel,weight
0,www.ilgs.net//142,sony,"[detail, even, lowlight, wiht, 20x, optical, z...",Approx. 18.2 MP,166\n139 g
1,www.ilgs.net//98,fujifilm,"[12mp, 23, cmos, fullhd, 1920x108060p30p, wifi...",12 MP,187g
2,www.ilgs.net//77,canon,"[eos, 7d, efs, 1585mm, f3556, usm, 223mm, x, 1...","[18 MP, 18]","[820g, 820 g]"
3,www.ilgs.net//36,samsung,"[nx300, 203, mp, cmos, 235, x, 157, mm, 84074,...",20.3 MP,280g
4,www.ilgs.net//119,ricoh,"[clearance, product, box, damage16, megapixel,...",16,236 g


## Megapixel

In [14]:
df["megapixel"].value_counts()

16                 16
[18 MP, 18]         8
20.3 MP             6
18 MP               4
[16 MP, 16]         4
[20.4 MP, 20.4]     3
16 MP               3
16.49               3
[16, 16 MP]         3
16.3 MP             3
16.2 MP             2
12.1 MP             2
20.2 MP             2
20.1                2
12 MP               1
[20.2 MP, 20.2]     1
[16.1 MP, 16.1]     1
[16.1, 16.1 MP]     1
[20.1, 20.1 MP]     1
24.3 MP             1
[24.3 MP, 24.3]     1
[14.3 MP, 14.3]     1
[14, 14 MP]         1
[20.2, 20.2 MP]     1
18.2 MP             1
[14.1, 18.2 MP]     1
Approx. 18.2 MP     1
[12.1, 12.1 MP]     1
20.4 MP             1
20.1 MP             1
Name: megapixel, dtype: int64

In [15]:
def clean_megapixels(value):
    if not isinstance(value, list) and pd.isna(value):
        return np.nan
    else:
        if isinstance(value, str):
            if not "Approx." in value.split():
                return float(value.split()[0])
            else:
                return float(value.split()[1])
        else:
            return float(value[0].split()[0])

In [16]:
df["megapixel"] = df["megapixel"].apply(clean_megapixels)

In [17]:
df.head()

Unnamed: 0,spec_id,manufacturer,short description,megapixel,weight
0,www.ilgs.net//142,sony,"[detail, even, lowlight, wiht, 20x, optical, z...",18.2,166\n139 g
1,www.ilgs.net//98,fujifilm,"[12mp, 23, cmos, fullhd, 1920x108060p30p, wifi...",12.0,187g
2,www.ilgs.net//77,canon,"[eos, 7d, efs, 1585mm, f3556, usm, 223mm, x, 1...",18.0,"[820g, 820 g]"
3,www.ilgs.net//36,samsung,"[nx300, 203, mp, cmos, 235, x, 157, mm, 84074,...",20.3,280g
4,www.ilgs.net//119,ricoh,"[clearance, product, box, damage16, megapixel,...",16.0,236 g


## Weight

In [18]:
df["weight"].value_counts()

269 g            4
280g             4
271 g            3
650 g            3
218 g            3
300g             2
214 g            2
407 g            2
480 g            2
577g             2
[188 g, 188g]    2
580 g            2
236 g            2
247 g            2
755g             1
[820g, 820 g]    1
[820 g, 820g]    1
133g             1
[416g, 474 g]    1
[130g, 127 g]    1
[147g, 147 g]    1
534 g            1
[660 g, 633g]    1
[137g, 100 g]    1
125 g            1
228g             1
285g             1
[188g, 188 g]    1
[755g, 755 g]    1
[164 g, 143g]    1
245g             1
147g             1
[353 g, 353g]    1
760 g            1
109g             1
623g             1
[127 g, 130g]    1
165g             1
[147 g, 147g]    1
410g             1
[218 g, 218g]    1
216g             1
105g             1
90g              1
[240g, 213 g]    1
655 g            1
166\n139 g       1
[246g, 272 g]    1
375g             1
187g             1
Name: weight, dtype: int64

In [19]:
def clean_weight(value):
    if not isinstance(value, list) and pd.isna(value):
        return np.nan
    else:
        if isinstance(value, str):
            if not "\n" in value:
                value = value.replace("g", "")
                #print(value)
                return float(value)
            else:
                value = value.split("\n")[0]
        else:
            return float(value[0].split()[0].replace("g", ""))

In [20]:
df["weight"] = df["weight"].apply(clean_weight)

In [21]:
df.head()

Unnamed: 0,spec_id,manufacturer,short description,megapixel,weight
0,www.ilgs.net//142,sony,"[detail, even, lowlight, wiht, 20x, optical, z...",18.2,
1,www.ilgs.net//98,fujifilm,"[12mp, 23, cmos, fullhd, 1920x108060p30p, wifi...",12.0,187.0
2,www.ilgs.net//77,canon,"[eos, 7d, efs, 1585mm, f3556, usm, 223mm, x, 1...",18.0,820.0
3,www.ilgs.net//36,samsung,"[nx300, 203, mp, cmos, 235, x, 157, mm, 84074,...",20.3,280.0
4,www.ilgs.net//119,ricoh,"[clearance, product, box, damage16, megapixel,...",16.0,236.0


## Saving

In [22]:
df.head()

Unnamed: 0,spec_id,manufacturer,short description,megapixel,weight
0,www.ilgs.net//142,sony,"[detail, even, lowlight, wiht, 20x, optical, z...",18.2,
1,www.ilgs.net//98,fujifilm,"[12mp, 23, cmos, fullhd, 1920x108060p30p, wifi...",12.0,187.0
2,www.ilgs.net//77,canon,"[eos, 7d, efs, 1585mm, f3556, usm, 223mm, x, 1...",18.0,820.0
3,www.ilgs.net//36,samsung,"[nx300, 203, mp, cmos, 235, x, 157, mm, 84074,...",20.3,280.0
4,www.ilgs.net//119,ricoh,"[clearance, product, box, damage16, megapixel,...",16.0,236.0


In [23]:
df.columns = ["spec_id", "manufacturer", "short_descr", "megapixels", "weight"]

In [24]:
df.head()

Unnamed: 0,spec_id,manufacturer,short_descr,megapixels,weight
0,www.ilgs.net//142,sony,"[detail, even, lowlight, wiht, 20x, optical, z...",18.2,
1,www.ilgs.net//98,fujifilm,"[12mp, 23, cmos, fullhd, 1920x108060p30p, wifi...",12.0,187.0
2,www.ilgs.net//77,canon,"[eos, 7d, efs, 1585mm, f3556, usm, 223mm, x, 1...",18.0,820.0
3,www.ilgs.net//36,samsung,"[nx300, 203, mp, cmos, 235, x, 157, mm, 84074,...",20.3,280.0
4,www.ilgs.net//119,ricoh,"[clearance, product, box, damage16, megapixel,...",16.0,236.0


In [25]:
df.to_csv("../../datasets/unlabeled/cleaned/ilgs.csv", index=False)