In [1]:
import os
import numpy as np
import json
import pandas as pd
import itertools
from tqdm import tqdm
import nltk
from nltk.tokenize import word_tokenize
import matplotlib.pyplot as plt
import re

In [2]:
def create_dataframe(dataset_path, source):
    """Function used to create a Pandas DataFrame containing specifications page titles

    Reads products specifications from the file system ("dataset_path" variable in the main function) and creates a Pandas DataFrame where each row is a
    specification. The columns are 'source' (e.g. www.sourceA.com), 'spec_number' (e.g. 1) and the 'page title'. Note that this script will consider only
    the page title attribute for simplicity.

    Args:
        dataset_path (str): The path to the dataset

    Returns:
        df (pd.DataFrame): The Pandas DataFrame containing specifications and page titles
    """

    print('>>> Creating dataframe...\n')
    columns_df = ['source', 'spec_number', 'spec_id', 'page_title']
    
    df = pd.DataFrame()
    progressive_id = 0
    progressive_id2row_df = {}
    #for source in tqdm(os.listdir(dataset_path)):
    #source = "buy.net"
    for specification in os.listdir(os.path.join(dataset_path, source)):
        specification_number = specification.replace('.json', '')
        specification_id = '{}//{}'.format(source, specification_number)
        with open(os.path.join(dataset_path, source, specification)) as specification_file:
            columns_df = ['source', 'spec_number', 'spec_id']
            specification_data = json.load(specification_file)
            attrs = []
            for k, v in specification_data.items():
                columns_df.append(k)
                attrs.append(v)
            row = [source, specification_number, specification_id]
            row.extend(attrs)
            progressive_id2row_df.update({progressive_id: row})
            progressive_id += 1
            df = df.append(pd.DataFrame.from_dict(progressive_id2row_df, orient='index', columns=columns_df))
            progressive_id2row_df = {}
    #print(df)
    print('>>> Dataframe created successfully!\n')
    return df

In [3]:
df = create_dataframe('../../datasets/unlabeled/2013_camera_specs', "www.shopmania.in")

>>> Creating dataframe...



of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


>>> Dataframe created successfully!



In [4]:
cols = ['spec_id',"brand", "product name", "resolution", "screen size"]

In [5]:
df = df[cols]

In [6]:
df.head()

Unnamed: 0,spec_id,brand,product name,resolution,screen size
0,www.shopmania.in//1659,Sony,Sony DSC-W630,16.1 MP (more than 84%),"2.7"""
1,www.shopmania.in//1120,Samsung,Samsung ES75,14.2 MP (more than 75%),"2.7"""
2,www.shopmania.in//1570,Sony,Sony DSC-RX1,24.3 MP (more than 98%),"3"" (more than 62%)"
3,www.shopmania.in//1065,Canon,Canon EOS 60D kit 18-55mm,18 MP (more than 90%),"3"" (more than 62%)"
4,www.shopmania.in//1527,Fujifilm,Fujifilm XQ1,12 MP (more than 51%),"3"" (more than 62%)"


## Brand

In [7]:
df['brand'] = df['brand'].apply(lambda x: x.lower())

In [8]:
df.head()

Unnamed: 0,spec_id,brand,product name,resolution,screen size
0,www.shopmania.in//1659,sony,Sony DSC-W630,16.1 MP (more than 84%),"2.7"""
1,www.shopmania.in//1120,samsung,Samsung ES75,14.2 MP (more than 75%),"2.7"""
2,www.shopmania.in//1570,sony,Sony DSC-RX1,24.3 MP (more than 98%),"3"" (more than 62%)"
3,www.shopmania.in//1065,canon,Canon EOS 60D kit 18-55mm,18 MP (more than 90%),"3"" (more than 62%)"
4,www.shopmania.in//1527,fujifilm,Fujifilm XQ1,12 MP (more than 51%),"3"" (more than 62%)"


## Product name

In [9]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
stop_words = set(['itself', 'down', 'by', 'with', 'doesn', 'wouldn', 'other', 'ours', 'of', 'then', 'where', 'don', 'these', 'nor', 'she', "should've", 'won', 'ma', 'from', 'had', "you're", 'our', 'did', 'them', 'too', 'her', 'that', 'haven', 'after', "you'll", 'hers', 'because', 'yourself', 'against', 'mightn', 'as', 'll', 'whom', 'how', 'couldn', 'further', 'aren', "you'd", 'and', 'needn', "couldn't", 'those', 'to', "doesn't", "weren't", 'both', 'ourselves', 'in', 'which', 'yours', 'under', 'some', 'what', 'during', 'before', "needn't", "shan't", 'here', 'having', 'hasn', 'your', "hasn't", 'between', 'me', "she's", 'into', 'all', 'at', 'shan', 'who', 'o', 'an', 'very', 'can', 'you', 'shouldn', 'such', 'but', 'do', 'out', 'am', "shouldn't", 'above', 'wasn', 'or', 'were', 'own', 'didn', "you've", 'on', 'will', 'my', 'it', 'have', 'once', 'only', 'been', 'themselves', 'his', 'be', "mightn't", 'they', 'not', 'so', 'up', 'any', 'most', 'has', 'myself', 't', 'yourselves', 'isn', "it's", 'y', 'm', 'now', 'until', 're', 'there', 'their', 'mustn', "mustn't", 'again', 'being', 'hadn', 'doing', 'just', 'no', 'if', 've', "wasn't", "won't", 'we', 'below', 'does', 'more', 'this', 'should', "isn't", 'ain', "don't", 'i', "haven't", 'than', "didn't", 'are', 'about', 'off', 'him', 'for', 'few', "wouldn't", 'was', 'weren', 'why', 'he', "that'll", 'd', 'the', 'its', 'a', 'each', 'is', 'while', "aren't", 'when', 'theirs', 'same', 's', 'himself', 'herself', "hadn't", 'through', 'over'])
punctuation = "!#$%&'()*+,-./:;<=>?@[\]^_`{|}~€£¥₹₽"
def replace_punctuation(word):
    return ''.join(c for c in word if c not in punctuation)

def tokenize_stop_words_punctuation(x):
    return [i.lower() for i in list(map(lambda y: replace_punctuation(y), word_tokenize(str(x)))) if i and i.lower() not in stop_words]

In [10]:
df['product name'] = df['product name'].apply(tokenize_stop_words_punctuation)

In [11]:
df.head()

Unnamed: 0,spec_id,brand,product name,resolution,screen size
0,www.shopmania.in//1659,sony,"[sony, dscw630]",16.1 MP (more than 84%),"2.7"""
1,www.shopmania.in//1120,samsung,"[samsung, es75]",14.2 MP (more than 75%),"2.7"""
2,www.shopmania.in//1570,sony,"[sony, dscrx1]",24.3 MP (more than 98%),"3"" (more than 62%)"
3,www.shopmania.in//1065,canon,"[canon, eos, 60d, kit, 1855mm]",18 MP (more than 90%),"3"" (more than 62%)"
4,www.shopmania.in//1527,fujifilm,"[fujifilm, xq1]",12 MP (more than 51%),"3"" (more than 62%)"


### Resolution

In [12]:
df[df['resolution'].notnull()]

Unnamed: 0,spec_id,brand,product name,resolution,screen size
0,www.shopmania.in//1659,sony,"[sony, dscw630]",16.1 MP (more than 84%),"2.7"""
1,www.shopmania.in//1120,samsung,"[samsung, es75]",14.2 MP (more than 75%),"2.7"""
2,www.shopmania.in//1570,sony,"[sony, dscrx1]",24.3 MP (more than 98%),"3"" (more than 62%)"
3,www.shopmania.in//1065,canon,"[canon, eos, 60d, kit, 1855mm]",18 MP (more than 90%),"3"" (more than 62%)"
4,www.shopmania.in//1527,fujifilm,"[fujifilm, xq1]",12 MP (more than 51%),"3"" (more than 62%)"
5,www.shopmania.in//1177,fujifilm,"[fujifilm, finepix, s4800]",16 MP (more than 78%),"3"" (more than 62%)"
6,www.shopmania.in//904,nikon,"[nikon, d7100, kit, 18140mm]",24.1 MP (more than 98%),
7,www.shopmania.in//1462,sony,"[sony, cybershot, dsch100]",16.1 MP (more than 84%),"3"" (more than 62%)"
8,www.shopmania.in//1032,fujifilm,"[fujifilm, xs1]",12 MP (more than 51%),"3"" (more than 62%)"
9,www.shopmania.in//1618,sony,"[sony, slta99, body]",24.3 MP (more than 98%),"3"" (more than 62%)"


In [13]:
def parse_megapixels(value):
    if pd.isna(value):
        return value
    else:
        match1 = re.search('(\d*\,\d+|\d+) MP', str(value))
        if match1 is None: 
            print("SKIP")
            print(value)
            return float("NaN")


        return (match1.group(1).replace(",","."))

        

In [14]:
df["resolution"] = df["resolution"].apply(parse_megapixels)

In [15]:
df.head()

Unnamed: 0,spec_id,brand,product name,resolution,screen size
0,www.shopmania.in//1659,sony,"[sony, dscw630]",1,"2.7"""
1,www.shopmania.in//1120,samsung,"[samsung, es75]",2,"2.7"""
2,www.shopmania.in//1570,sony,"[sony, dscrx1]",3,"3"" (more than 62%)"
3,www.shopmania.in//1065,canon,"[canon, eos, 60d, kit, 1855mm]",18,"3"" (more than 62%)"
4,www.shopmania.in//1527,fujifilm,"[fujifilm, xq1]",12,"3"" (more than 62%)"


## Screen size

In [16]:
df[df['screen size'].notnull()]

Unnamed: 0,spec_id,brand,product name,resolution,screen size
0,www.shopmania.in//1659,sony,"[sony, dscw630]",1,"2.7"""
1,www.shopmania.in//1120,samsung,"[samsung, es75]",2,"2.7"""
2,www.shopmania.in//1570,sony,"[sony, dscrx1]",3,"3"" (more than 62%)"
3,www.shopmania.in//1065,canon,"[canon, eos, 60d, kit, 1855mm]",18,"3"" (more than 62%)"
4,www.shopmania.in//1527,fujifilm,"[fujifilm, xq1]",12,"3"" (more than 62%)"
5,www.shopmania.in//1177,fujifilm,"[fujifilm, finepix, s4800]",16,"3"" (more than 62%)"
7,www.shopmania.in//1462,sony,"[sony, cybershot, dsch100]",1,"3"" (more than 62%)"
8,www.shopmania.in//1032,fujifilm,"[fujifilm, xs1]",12,"3"" (more than 62%)"
9,www.shopmania.in//1618,sony,"[sony, slta99, body]",3,"3"" (more than 62%)"
10,www.shopmania.in//1248,canon,"[canon, powershot, sx150]",1,"3"" (more than 62%)"


In [17]:
def parse_screen_size(value):
    if pd.isna(value):
        return value
    else:
        m = re.search('(\d*\.\d+|\d*\,\d+|\d+)\"', str(value))
        if m is None:

            print("SKIP")
            return float("NaN")

        return m.group(1).replace(",",".")
        

In [18]:
df["screen size"] = df["screen size"].apply(parse_screen_size)

In [19]:
df.head()

Unnamed: 0,spec_id,brand,product name,resolution,screen size
0,www.shopmania.in//1659,sony,"[sony, dscw630]",1,2.7
1,www.shopmania.in//1120,samsung,"[samsung, es75]",2,2.7
2,www.shopmania.in//1570,sony,"[sony, dscrx1]",3,3.0
3,www.shopmania.in//1065,canon,"[canon, eos, 60d, kit, 1855mm]",18,3.0
4,www.shopmania.in//1527,fujifilm,"[fujifilm, xq1]",12,3.0


### Final cleaning

In [20]:
df.to_csv("../../datasets/unlabeled/cleaned/shopmania.csv", index=False)