In [9]:
import os
import numpy as np
import json
import pandas as pd
import itertools
from tqdm import tqdm
import nltk

import matplotlib.pyplot as plt

In [2]:
def create_dataframe(dataset_path):
    """Function used to create a Pandas DataFrame containing specifications page titles

    Reads products specifications from the file system ("dataset_path" variable in the main function) and creates a Pandas DataFrame where each row is a
    specification. The columns are 'source' (e.g. www.sourceA.com), 'spec_number' (e.g. 1) and the 'page title'. Note that this script will consider only
    the page title attribute for simplicity.

    Args:
        dataset_path (str): The path to the dataset

    Returns:
        df (pd.DataFrame): The Pandas DataFrame containing specifications and page titles
    """

    print('>>> Creating dataframe...\n')
    columns_df = ['source', 'spec_number', 'spec_id', 'page_title']
    
    df = pd.DataFrame()
    progressive_id = 0
    progressive_id2row_df = {}
    #for source in tqdm(os.listdir(dataset_path)):
    source = "buy.net"
    for specification in os.listdir(os.path.join(dataset_path, source)):
        specification_number = specification.replace('.json', '')
        specification_id = '{}//{}'.format(source, specification_number)
        with open(os.path.join(dataset_path, source, specification)) as specification_file:
            columns_df = ['source', 'spec_number', 'spec_id']
            specification_data = json.load(specification_file)
            attrs = []
            for k, v in specification_data.items():
                columns_df.append(k)
                attrs.append(v)
            row = [source, specification_number, specification_id]
            row.extend(attrs)
            progressive_id2row_df.update({progressive_id: row})
            progressive_id += 1
            df = df.append(pd.DataFrame.from_dict(progressive_id2row_df, orient='index', columns=columns_df))
            progressive_id2row_df = {}
    print(df)
    print('>>> Dataframe created successfully!\n')
    return df

In [3]:
df = create_dataframe('../datasets/unlabeled/2013_camera_specs')

>>> Creating dataframe...



of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


                                          <page title> analog video out  \
0    Canon PowerShot SX170 IS Red Digital Camera (1...              NaN   
1    Leica V-LUX 4 Black Digital Camera (12.1 MP, 2...              NaN   
2    Olympus Stylus SH-1 Silver Digital Camera (16 ...              NaN   
3    Panasonic Lumix DMC-FX48 12.1 Megapixel Compac...              NaN   
4    Olympus VR-370 Black Digital Camera (16 MP, 12...              NaN   
5    VTech Kidizoom 0.3 Megapixel Compact Camera - ...              NaN   
6    Canon PowerShot SX700 Red Digital Camera (16.1...              NaN   
7    Vivitar ViviCam 46 Red Digital Camera (4 MP, S...              NaN   
8    Fujifilm X Series X-E2 Silver Digital Camera (...              NaN   
9    Canon EOS Rebel T1i 15.1 Megapixel Digital SLR...              NaN   
10   Samsung WB30F Cobalt Black Digital Camera (16....              NaN   
11   Nikon D7100 Black SLR Digital Camera (24.1 MP,...              NaN   
12   Olympus SP-550 7.1 M

In [4]:
print(df.columns.values)

['<page title>' 'analog video out' 'autofocus points' 'battery builtin'
 'battery include' 'battery model supported' 'battery rechargeable'
 'brand name' 'builtin flash' 'bulb setting' 'camera body only'
 'camera modes' 'camera type' 'color' 'continuous shooting speed' 'depth'
 'digital zoom' 'effective megapixels' 'environmental protection'
 'exposure compensation range ' 'exposure control' 'exposure settings'
 'flash' 'focal length' 'focal length 35mm equivalent'
 'focal length conversion factor slr' 'focus features' 'frequency band'
 'general features' 'gps' 'gps enabled' 'green compliance' 'hd movie mode'
 'height' 'image format' 'image sensor' 'image sensor quantity'
 'image sensor size' 'image stabilization' 'included components'
 'installed memory' 'interface connection' 'iso equivalencies'
 'iso sensitivity' 'language support' 'lcd screen size'
 'lens construction' 'lens mount' 'longest shutter speed' 'max aperture'
 'max focal length' 'max horizontal image resolution'
 'max ve

In [5]:
df["spec_id"]

0      buy.net//6036
1      buy.net//5860
2      buy.net//5925
3      buy.net//6061
4      buy.net//5837
5      buy.net//5421
6      buy.net//5972
7      buy.net//6098
8      buy.net//6077
9      buy.net//5572
10     buy.net//5964
11     buy.net//6759
12     buy.net//5525
13     buy.net//5362
14     buy.net//5698
15     buy.net//6519
16     buy.net//5509
17     buy.net//5773
18     buy.net//5909
19     buy.net//6299
20     buy.net//6221
21     buy.net//6743
22     buy.net//6201
23     buy.net//6082
24     buy.net//6578
25     buy.net//5641
26     buy.net//5987
27     buy.net//6647
28     buy.net//6094
29     buy.net//5483
           ...      
328    buy.net//5855
329    buy.net//6146
330    buy.net//6003
331    buy.net//6296
332    buy.net//5455
333    buy.net//6783
334    buy.net//6279
335    buy.net//6500
336    buy.net//6015
337    buy.net//5793
338    buy.net//5814
339    buy.net//5951
340    buy.net//6384
341    buy.net//6170
342    buy.net//5475
343    buy.net//6259
344    buy.ne

In [28]:
most_freq_columns = df.isna().sum().sort_values()[:20].keys()

In [29]:
new_df = df[most_freq_columns]

In [30]:
new_df.head()

Unnamed: 0,<page title>,source,spec_id,spec_number,camera type,lcd screen size,url,memory card support,effective megapixels,height,width,depth,weight,image sensor,flash,maximum video capture resolution,warranty information,image stabilization,total pixels,focal length
0,Canon PowerShot SX170 IS Red Digital Camera (1...,buy.net,buy.net//6036,6036,Compact Camera,3 in,http://www.usa.canon.com,Secure Digital (SD) Card|Secure Digital Extend...,16 Megapixel,2.8 in,4.3 in,1.7 in,8.04 oz,CCD,Auto Flash|Flash OFF|Flash ON|Slow Sync,1280 x 720,1 Year,Optical,16600000.0,5 mm to 80 mm
1,"Leica V-LUX 4 Black Digital Camera (12.1 MP, 2...",buy.net,buy.net//5860,5860,Bridge Camera,3 in,http://www.leica-camera.com,Secure Digital (SD) Card|Secure Digital Extend...,12.1 Megapixel,3.4 in,4.9 in,4.3 in,1.30 lbs,CMOS,Auto Flash|Flash OFF|Flash ON|Slow Sync|Pre-fl...,1920 x 1080,,Optical,12800000.0,4.50 mm to 108 mm
2,Olympus Stylus SH-1 Silver Digital Camera (16 ...,buy.net,buy.net//5925,5925,Point & Shoot Digital Camera,3 in,http://www.getolympus.com/us/en/sh-1.html,Secure Digital High Capacity (SDHC) Card|Secur...,16000000 pixels,2.5 in,4.3 in,1.7 in,9.5 oz,CMOS,Built-in Flash,1920 x 1080,1 year(s),Sensor Shift,,4.5 mm to 108 mm
3,Panasonic Lumix DMC-FX48 12.1 Megapixel Compac...,buy.net,buy.net//6061,6061,Compact Camera,2.5 in,http://www.panasonic.com,Secure Digital (SD) Card|Secure Digital High C...,12.1 Megapixel,2.1 in,3.8 in,0.9 in,4.48 oz,CCD,Auto Flash|Flash OFF|Flash ON|Red-eye Reduction,848 x 480,1 Year Limited,Optical,12700000.0,4.40 mm to 22 mm
4,"Olympus VR-370 Black Digital Camera (16 MP, 12...",buy.net,buy.net//5837,5837,Compact Camera,3 in,http://www.olympusamerica.com,Secure Digital (SD) Card|Secure Digital Extend...,16 Megapixel,2.4 in,4.1 in,1.1 in,6.07 oz,CCD,Auto Flash|Flash OFF|Flash ON|Red-eye Reduction,1280 x 720,1 Year,Optical|Electronic,16500000.0,4.20 mm to 52.50 mm


## Title only

In [6]:
def create_dataframe(dataset_path):
    """Function used to create a Pandas DataFrame containing specifications page titles

    Reads products specifications from the file system ("dataset_path" variable in the main function) and creates a Pandas DataFrame where each row is a
    specification. The columns are 'source' (e.g. www.sourceA.com), 'spec_number' (e.g. 1) and the 'page title'. Note that this script will consider only
    the page title attribute for simplicity.

    Args:
        dataset_path (str): The path to the dataset

    Returns:
        df (pd.DataFrame): The Pandas DataFrame containing specifications and page titles
    """

    print('>>> Creating dataframe...\n')
    columns_df = ['source', 'spec_number', 'spec_id', 'page_title']

    progressive_id = 0
    progressive_id2row_df = {}
    for source in tqdm(os.listdir(dataset_path)):
        for specification in os.listdir(os.path.join(dataset_path, source)):
            specification_number = specification.replace('.json', '')
            specification_id = '{}//{}'.format(source, specification_number)
            with open(os.path.join(dataset_path, source, specification)) as specification_file:
                specification_data = json.load(specification_file)
                page_title = specification_data.get('<page title>').lower()
                row = (source, specification_number, specification_id, page_title)
                progressive_id2row_df.update({progressive_id: row})
                progressive_id += 1
    df = pd.DataFrame.from_dict(progressive_id2row_df, orient='index', columns=columns_df)
    print(df)
    print('>>> Dataframe created successfully!\n')
    return df

In [7]:
df = create_dataframe('../datasets/unlabeled/2013_camera_specs')

  8%|▊         | 2/24 [00:00<00:02, 10.70it/s]

>>> Creating dataframe...



100%|██████████| 24/24 [00:07<00:00,  3.37it/s]

                        source spec_number                       spec_id  \
0      www.wexphotographic.com         154  www.wexphotographic.com//154   
1      www.wexphotographic.com         553  www.wexphotographic.com//553   
2      www.wexphotographic.com         601  www.wexphotographic.com//601   
3      www.wexphotographic.com         197  www.wexphotographic.com//197   
4      www.wexphotographic.com         178  www.wexphotographic.com//178   
5      www.wexphotographic.com         206  www.wexphotographic.com//206   
6      www.wexphotographic.com         590  www.wexphotographic.com//590   
7      www.wexphotographic.com         210  www.wexphotographic.com//210   
8      www.wexphotographic.com         586  www.wexphotographic.com//586   
9      www.wexphotographic.com         569  www.wexphotographic.com//569   
10     www.wexphotographic.com         617  www.wexphotographic.com//617   
11     www.wexphotographic.com         181  www.wexphotographic.com//181   
12     www.w




In [8]:
df.head()

Unnamed: 0,source,spec_number,spec_id,page_title
0,www.wexphotographic.com,154,www.wexphotographic.com//154,nikon coolpix aw120 digital camera - camouflag...
1,www.wexphotographic.com,553,www.wexphotographic.com//553,canon ixus 150 digital camera - red (9148b007a...
2,www.wexphotographic.com,601,www.wexphotographic.com//601,fuji finepix s1 digital camera (p10nc12730a) -...
3,www.wexphotographic.com,197,www.wexphotographic.com//197,nikon coolpix s5300 digital camera - black (vn...
4,www.wexphotographic.com,178,www.wexphotographic.com//178,fuji finepix s8600 digital camera - red (p10nc...


In [10]:
 from nltk.tokenize import word_tokenize

In [15]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/gerald/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [16]:
from nltk.corpus import stopwords

stopWords = set(stopwords.words('english'))

In [17]:
stopWords

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [24]:
import string
punctuation = string.punctuation[:1] + string.punctuation[2:] + "€£¥₹₽"

In [25]:
punctuation

"!#$%&'()*+,-./:;<=>?@[\\]^_`{|}~€£¥₹₽"

In [44]:
def replace_punctuation(word):
    return ''.join(c for c in word if c not in punctuation)

In [51]:
tokenized = df["page_title"].apply(lambda x : [i.lower() for i in list(map(lambda y: replace_punctuation(y), word_tokenize(x))) if i and i.lower() not in stopWords])

In [52]:
tokenized

0        [nikon, coolpix, aw120, digital, camera, camou...
1        [canon, ixus, 150, digital, camera, red, 9148b...
2        [fuji, finepix, s1, digital, camera, p10nc1273...
3        [nikon, coolpix, s5300, digital, camera, black...
4        [fuji, finepix, s8600, digital, camera, red, p...
5        [nikon, coolpix, s3600, digital, camera, pink,...
6        [sony, cybershot, qx100, lens, style, digital,...
7        [nikon, coolpix, s5300, digital, camera, plum,...
8        [nikon, coolpix, s32, digital, camera, yellow,...
9        [samsung, wb1100f, digital, smart, camera, ecw...
10       [fuji, x100t, digital, camera, silver, p10nc13...
11       [nikon, coolpix, aw120, digital, camera, black...
12       [sony, cybershot, hx50, digital, camera, black...
13       [nikon, coolpix, p600, digital, camera, red, v...
14       [canon, ixus, 265, hs, digital, camera, pink, ...
15       [nikon, coolpix, digital, camera, silver, vna2...
16       [samsung, wb50f, digital, smart, camera, white.

## Model words tokenizer

In [53]:
import re
pattern = re.compile("(\S*[A-Za-z]\S*[0-9]\S*|\S*[0-9]\S*[A-Za-z]\S*)")

In [62]:
a = bool(pattern.match("mfmfm"))

In [63]:
print(a)

False


In [66]:
model_words = tokenized.apply(lambda line : list(filter(lambda word : bool(pattern.match(word)),line)))

In [69]:
tokenized.iloc[0]

['nikon',
 'coolpix',
 'aw120',
 'digital',
 'camera',
 'camouflage',
 'vna593e1',
 'wex',
 'photographic']

In [70]:
model_words

0               [aw120, vna593e1]
1                    [9148b007aa]
2               [s1, p10nc12730a]
3               [s5300, vna540e1]
4            [s8600, p10nc12690a]
5               [s3600, vna555e1]
6           [qx100, dscqx100bce7]
7               [s5300, vna542e1]
8                 [s32, vna583e1]
9        [wb1100f, ecwb1100bpbgb]
10           [x100t, p10nc13260a]
11              [aw120, vna590e1]
12         [hx50, dschx50casdnyg]
13               [p600, vna481e1]
14                   [9354b007aa]
15                     [vna231e1]
16         [wb50f, ecwb50fzbpwgb]
17            [rx1r, dscrx1rdiyg]
18             [x30, p10nc13270a]
19           [dmctz60, dmctz60eb]
20           [sh60, v107070se000]
21            [w830, dscw830pceh]
22            [sx700, 9339b014aa]
23                             []
24              [d30, 9337b012aa]
25                          [wg4]
26            [sh1, v107080be000]
27         [wb50f, ecwb50fzbpbgb]
28            [h400, dsch400bceh]
29            