In [1]:
import os
import numpy as np
import json
import pandas as pd
import itertools
from tqdm import tqdm
import nltk

import matplotlib.pyplot as plt

In [2]:
def create_dataframe(dataset_path):
    """Function used to create a Pandas DataFrame containing specifications page titles

    Reads products specifications from the file system ("dataset_path" variable in the main function) and creates a Pandas DataFrame where each row is a
    specification. The columns are 'source' (e.g. www.sourceA.com), 'spec_number' (e.g. 1) and the 'page title'. Note that this script will consider only
    the page title attribute for simplicity.

    Args:
        dataset_path (str): The path to the dataset

    Returns:
        df (pd.DataFrame): The Pandas DataFrame containing specifications and page titles
    """

    print('>>> Creating dataframe...\n')
    columns_df = ['source', 'spec_number', 'spec_id', 'page_title']
    
    df = pd.DataFrame()
    progressive_id = 0
    progressive_id2row_df = {}
    #for source in tqdm(os.listdir(dataset_path)):
    source = "buy.net"
    for specification in os.listdir(os.path.join(dataset_path, source)):
        specification_number = specification.replace('.json', '')
        specification_id = '{}//{}'.format(source, specification_number)
        with open(os.path.join(dataset_path, source, specification)) as specification_file:
            columns_df = ['source', 'spec_number', 'spec_id']
            specification_data = json.load(specification_file)
            attrs = []
            for k, v in specification_data.items():
                columns_df.append(k)
                attrs.append(v)
            row = [source, specification_number, specification_id]
            row.extend(attrs)
            progressive_id2row_df.update({progressive_id: row})
            progressive_id += 1
            df = df.append(pd.DataFrame.from_dict(progressive_id2row_df, orient='index', columns=columns_df))
            progressive_id2row_df = {}
    print(df)
    print('>>> Dataframe created successfully!\n')
    return df

In [3]:
df = create_dataframe('../datasets/unlabeled/2013_camera_specs')

>>> Creating dataframe...



of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


                                          <page title> analog video out  \
0    Canon PowerShot SX170 IS Red Digital Camera (1...              NaN   
1    Leica V-LUX 4 Black Digital Camera (12.1 MP, 2...              NaN   
2    Olympus Stylus SH-1 Silver Digital Camera (16 ...              NaN   
3    Panasonic Lumix DMC-FX48 12.1 Megapixel Compac...              NaN   
4    Olympus VR-370 Black Digital Camera (16 MP, 12...              NaN   
5    VTech Kidizoom 0.3 Megapixel Compact Camera - ...              NaN   
6    Canon PowerShot SX700 Red Digital Camera (16.1...              NaN   
7    Vivitar ViviCam 46 Red Digital Camera (4 MP, S...              NaN   
8    Fujifilm X Series X-E2 Silver Digital Camera (...              NaN   
9    Canon EOS Rebel T1i 15.1 Megapixel Digital SLR...              NaN   
10   Samsung WB30F Cobalt Black Digital Camera (16....              NaN   
11   Nikon D7100 Black SLR Digital Camera (24.1 MP,...              NaN   
12   Olympus SP-550 7.1 M

In [None]:
print(df.columns.values)

In [None]:
df["spec_id"]

In [None]:
most_freq_columns = df.isna().sum().sort_values()[:20].keys()

In [None]:
new_df = df[most_freq_columns]

In [None]:
new_df.head()

## Title only

In [4]:
def create_dataframe(dataset_path):
    """Function used to create a Pandas DataFrame containing specifications page titles

    Reads products specifications from the file system ("dataset_path" variable in the main function) and creates a Pandas DataFrame where each row is a
    specification. The columns are 'source' (e.g. www.sourceA.com), 'spec_number' (e.g. 1) and the 'page title'. Note that this script will consider only
    the page title attribute for simplicity.

    Args:
        dataset_path (str): The path to the dataset

    Returns:
        df (pd.DataFrame): The Pandas DataFrame containing specifications and page titles
    """

    print('>>> Creating dataframe...\n')
    columns_df = ['source', 'spec_number', 'spec_id', 'page_title']

    progressive_id = 0
    progressive_id2row_df = {}
    for source in tqdm(os.listdir(dataset_path)):
        for specification in os.listdir(os.path.join(dataset_path, source)):
            specification_number = specification.replace('.json', '')
            specification_id = '{}//{}'.format(source, specification_number)
            with open(os.path.join(dataset_path, source, specification)) as specification_file:
                specification_data = json.load(specification_file)
                page_title = specification_data.get('<page title>').lower()
                row = (source, specification_number, specification_id, page_title)
                progressive_id2row_df.update({progressive_id: row})
                progressive_id += 1
    df = pd.DataFrame.from_dict(progressive_id2row_df, orient='index', columns=columns_df)
    print(df)
    print('>>> Dataframe created successfully!\n')
    return df

In [5]:
df = create_dataframe('../datasets/unlabeled/2013_camera_specs')

  0%|          | 0/24 [00:00<?, ?it/s]

>>> Creating dataframe...



100%|██████████| 24/24 [00:06<00:00,  3.52it/s]

                        source spec_number                       spec_id  \
0      www.wexphotographic.com         154  www.wexphotographic.com//154   
1      www.wexphotographic.com         553  www.wexphotographic.com//553   
2      www.wexphotographic.com         601  www.wexphotographic.com//601   
3      www.wexphotographic.com         197  www.wexphotographic.com//197   
4      www.wexphotographic.com         178  www.wexphotographic.com//178   
5      www.wexphotographic.com         206  www.wexphotographic.com//206   
6      www.wexphotographic.com         590  www.wexphotographic.com//590   
7      www.wexphotographic.com         210  www.wexphotographic.com//210   
8      www.wexphotographic.com         586  www.wexphotographic.com//586   
9      www.wexphotographic.com         569  www.wexphotographic.com//569   
10     www.wexphotographic.com         617  www.wexphotographic.com//617   
11     www.wexphotographic.com         181  www.wexphotographic.com//181   
12     www.w




In [6]:
df.head()

Unnamed: 0,source,spec_number,spec_id,page_title
0,www.wexphotographic.com,154,www.wexphotographic.com//154,nikon coolpix aw120 digital camera - camouflag...
1,www.wexphotographic.com,553,www.wexphotographic.com//553,canon ixus 150 digital camera - red (9148b007a...
2,www.wexphotographic.com,601,www.wexphotographic.com//601,fuji finepix s1 digital camera (p10nc12730a) -...
3,www.wexphotographic.com,197,www.wexphotographic.com//197,nikon coolpix s5300 digital camera - black (vn...
4,www.wexphotographic.com,178,www.wexphotographic.com//178,fuji finepix s8600 digital camera - red (p10nc...


In [8]:
df['page_title'][0]

'nikon coolpix aw120 digital camera - camouflage (vna593e1) - wex photographic'

In [9]:
 from nltk.tokenize import word_tokenize

In [10]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/gerald/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [11]:
from nltk.corpus import stopwords

stopWords = set(stopwords.words('english'))

In [12]:
stopWords

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [13]:
import string
punctuation = string.punctuation[:1] + string.punctuation[2:] + "€£¥₹₽"

In [14]:
punctuation

"!#$%&'()*+,-./:;<=>?@[\\]^_`{|}~€£¥₹₽"

In [15]:
def replace_punctuation(word):
    return ''.join(c for c in word if c not in punctuation)

In [16]:
tokenized = df["page_title"].apply(lambda x : [i.lower() for i in list(map(lambda y: replace_punctuation(y), word_tokenize(x))) if i and i.lower() not in stopWords])

In [17]:
tokenized

0        [nikon, coolpix, aw120, digital, camera, camou...
1        [canon, ixus, 150, digital, camera, red, 9148b...
2        [fuji, finepix, s1, digital, camera, p10nc1273...
3        [nikon, coolpix, s5300, digital, camera, black...
4        [fuji, finepix, s8600, digital, camera, red, p...
5        [nikon, coolpix, s3600, digital, camera, pink,...
6        [sony, cybershot, qx100, lens, style, digital,...
7        [nikon, coolpix, s5300, digital, camera, plum,...
8        [nikon, coolpix, s32, digital, camera, yellow,...
9        [samsung, wb1100f, digital, smart, camera, ecw...
10       [fuji, x100t, digital, camera, silver, p10nc13...
11       [nikon, coolpix, aw120, digital, camera, black...
12       [sony, cybershot, hx50, digital, camera, black...
13       [nikon, coolpix, p600, digital, camera, red, v...
14       [canon, ixus, 265, hs, digital, camera, pink, ...
15       [nikon, coolpix, digital, camera, silver, vna2...
16       [samsung, wb50f, digital, smart, camera, white.

## Model words tokenizer

In [18]:
import re
pattern = re.compile("(\S*[A-Za-z]\S*[0-9]\S*|\S*[0-9]\S*[A-Za-z]\S*)")

In [75]:
brands = ['360fly', 'acer', 'achiever', 'acorn', 'action', 'actionpro', 'activeon', 'aee', 'agfa', 'agfaphoto', 'aiptek', 'akaso', 'alpine', 'alpine', 'amkov', 'andoer', 'annke', 'ansco', 'apeman', 'apex', 'apple', 'archos', 'argus', 'arlo', 'arri', 'axis', 'bell', 'benq', 'blackmagic', 'blackmagic', 'bosch', 'bower', 'brinno', 'brookstone', 'browning', 'cambo', 'campark', 'canon', 'carl', 'casio', 'celestron', 'chinon', 'cisco', 'cobra', 'coleman', 'concord', 'contax', 'contour', 'covert', 'craig', 'crayola', 'creative', 'creative', 'crosstour', 'crumpler', 'datavideo', 'delkin', 'dell', 'digitrex', 'discovery', 'disney', 'dji', 'd-link', 'domke', 'dörr', 'dragon', 'dsc', 'dxg', 'dxo', 'easypix', 'elecom', 'elmo', 'emerson', 'energizer', 'epson', 'fisher-price', 'flip', 'flir', 'foscam', 'fotoman', 'fotopro', 'fuji', 'fujifilm', 'fujinon', 'garmin', 'gateway', 'godox', 'goodmans', 'google', 'gopro', 'grundig', 'hahnel', 'hamilton', 'hasselblad', 'hello', 'herofiber', 'hitachi', 'holga', 'horseman', 'hoya', 'htc', 'huawei', 'ikelite', 'ilford', 'impossible', 'innovage', 'insignia', 'insta360', 'intel', 'intova', 'ion', 'iris', 'jazz', 'jenoptik', 'jjrc', 'jvc', 'kaiser', 'kenko', 'keyence', 'king', 'kitvision', 'kodak', 'konica', 'kyocera', 'leaf', 'lego', 'leica', 'lenovo', 'lexibook', 'linhof', 'liquid', 'little', 'logitech', 'lomography', 'lowepro', 'ltl', 'lumix', 'lytro', 'maginon', 'magnavox', 'mamiya', 'manfrotto', 'marshall', 'marumi', 'mattel', 'maxell', 'meade', 'medion', 'memorex', 'mercury', 'metz', 'microsoft', 'microtek', 'midland', 'minolta', 'minox', 'monster', 'motorola', 'moultrie', 'mustek', 'nabi', 'neewer', 'nest', 'netgear', 'night', 'nikkon', 'nikkor', 'nikon', 'nilox', 'nintendo', 'nippon', 'nokia', 'norcent', 'olympus', 'optech', 'ordro', 'oregon', 'packard', 'palm', 'panasonic', 'parrot', 'pelco', 'pentacon', 'pentax', 'phase', 'philips', 'philips', 'phoenix', 'pioneer', 'playskool', 'polaroid', 'polarpro', 'praktica', 'premier', 'promaster', 'proscan', 'pyle', 'radioshack', 'raymarine', 'raynox', 'rca', 'ricoh', 'ring', 'rode', 'rokinon', 'rollei', 'ryobi', 'sakar', 'samsung', 'sandisk', 'sanyo', 'schneider', 'schneider', 'schneider', 'scosche', 'seasea', 'sealife', 'sharp', 'sharper', 'sigma', 'sinar', 'sipix', 'sjcam', 'sony', 'soocoo', 'stealth', 'superheadz', 'svp', 'swann', 'tamrac', 'tamron', 'technika', 'tenba', 'think', 'thule', 'tokina', 'tomy', 'toshiba', 'transcend', 'traveler', 'trust', 'verbatim', 'vibe', 'victure', 'vistaquest', 'vivitar', 'voigtländer', 'vtech', 'vupoint', 'walimex', 'wyze', 'xiaomi', 'xit', 'xtreme', 'yashica', 'zeiss', 'zoom']

In [89]:
model_words = tokenized.apply(lambda line : list(set(filter(lambda word : bool(pattern.match(word)) or word in brands,line))))

In [63]:
tokenized.iloc[0]

['nikon',
 'coolpix',
 'aw120',
 'digital',
 'camera',
 'camouflage',
 'vna593e1',
 'wex',
 'photographic']

In [90]:
mw = pd.DataFrame(model_words)

In [91]:
count = 0
for index, row in mw.iterrows():
    if len(row['page_title']) == 0:
        count += 1
print(count)

2334


In [74]:
print(len(mw))

29787


In [80]:
mw.head()

Unnamed: 0,page_title
0,"[nikon, aw120, vna593e1]"
1,"[canon, 9148b007aa]"
2,"[fuji, s1, p10nc12730a]"
3,"[nikon, s5300, vna540e1]"
4,"[fuji, s8600, p10nc12690a]"


In [94]:
count = 0
for index, row in mw.iterrows():
    if len(row['page_title']) > 3:
        #print(row['page_title'])
        count += 1
print(count)

4343
