## Loading the data

In [1]:
import os
import numpy as np
import json
import pandas as pd
import itertools
from tqdm import tqdm
import nltk
from nltk.tokenize import word_tokenize
import re
import matplotlib.pyplot as plt

In [2]:
def create_dataframe(dataset_path):
    """Function used to create a Pandas DataFrame containing specifications page titles

    Reads products specifications from the file system ("dataset_path" variable in the main function) and creates a Pandas DataFrame where each row is a
    specification. The columns are 'source' (e.g. www.sourceA.com), 'spec_number' (e.g. 1) and the 'page title'. Note that this script will consider only
    the page title attribute for simplicity.

    Args:
        dataset_path (str): The path to the dataset

    Returns:
        df (pd.DataFrame): The Pandas DataFrame containing specifications and page titles
    """

    print('>>> Creating dataframe...\n')
    columns_df = ['source', 'spec_number', 'spec_id', 'page_title']

    progressive_id = 0
    progressive_id2row_df = {}
    for source in tqdm(os.listdir(dataset_path)):
        for specification in os.listdir(os.path.join(dataset_path, source)):
            specification_number = specification.replace('.json', '')
            specification_id = '{}//{}'.format(source, specification_number)
            with open(os.path.join(dataset_path, source, specification)) as specification_file:
                specification_data = json.load(specification_file)
                page_title = specification_data.get('<page title>').lower()
                row = (source, specification_number, specification_id, page_title)
                progressive_id2row_df.update({progressive_id: row})
                progressive_id += 1
    df = pd.DataFrame.from_dict(progressive_id2row_df, orient='index', columns=columns_df)
    print('>>> Dataframe created successfully!\n')
    return df

In [3]:
df = create_dataframe('../datasets/unlabeled/2013_camera_specs')

  0%|          | 0/24 [00:00<?, ?it/s]

>>> Creating dataframe...



100%|██████████| 24/24 [00:01<00:00, 12.57it/s]

>>> Dataframe created successfully!






## Cleaning

In [4]:
df = df.drop(columns = ["source", "spec_number"], axis = 1)

In [5]:
from nltk.corpus import stopwords

stopWords = set(stopwords.words('english'))

In [6]:
stopWords = set(['itself', 'down', 'by', 'with', 'doesn', 'wouldn', 'other', 'ours', 'of', 'then', 'where', 'don', 'these', 'nor', 'she', "should've", 'won', 'ma', 'from', 'had', "you're", 'our', 'did', 'them', 'too', 'her', 'that', 'haven', 'after', "you'll", 'hers', 'because', 'yourself', 'against', 'mightn', 'as', 'll', 'whom', 'how', 'couldn', 'further', 'aren', "you'd", 'and', 'needn', "couldn't", 'those', 'to', "doesn't", "weren't", 'both', 'ourselves', 'in', 'which', 'yours', 'under', 'some', 'what', 'during', 'before', "needn't", "shan't", 'here', 'having', 'hasn', 'your', "hasn't", 'between', 'me', "she's", 'into', 'all', 'at', 'shan', 'who', 'o', 'an', 'very', 'can', 'you', 'shouldn', 'such', 'but', 'do', 'out', 'am', "shouldn't", 'above', 'wasn', 'or', 'were', 'own', 'didn', "you've", 'on', 'will', 'my', 'it', 'have', 'once', 'only', 'been', 'themselves', 'his', 'be', "mightn't", 'they', 'not', 'so', 'up', 'any', 'most', 'has', 'myself', 't', 'yourselves', 'isn', "it's", 'y', 'm', 'now', 'until', 're', 'there', 'their', 'mustn', "mustn't", 'again', 'being', 'hadn', 'doing', 'just', 'no', 'if', 've', "wasn't", "won't", 'we', 'below', 'does', 'more', 'this', 'should', "isn't", 'ain', "don't", 'i', "haven't", 'than', "didn't", 'are', 'about', 'off', 'him', 'for', 'few', "wouldn't", 'was', 'weren', 'why', 'he', "that'll", 'd', 'the', 'its', 'a', 'each', 'is', 'while', "aren't", 'when', 'theirs', 'same', 's', 'himself', 'herself', "hadn't", 'through', 'over'])

In [7]:
punctuation = "!#$%&'()*+,-./:;<=>?@[\]^_`{|}~€£¥₹₽"

In [8]:
def replace_punctuation(word):
    return ''.join(c for c in word if c not in punctuation)

In [9]:
df["page_title"] = df["page_title"].apply(lambda x : [i.lower() for i in list(map(lambda y: replace_punctuation(y), word_tokenize(x))) if i and i.lower() not in stopWords])

## Model Words

In [10]:
pattern = re.compile("(\S*[A-Za-z]\S*[0-9]\S*|\S*[0-9]\S*[A-Za-z]\S*)")

In [11]:
## In the data replace lumix with panasonic

In [12]:
brands = ['360fly', 'acer', 'achiever', 'acorn', 'actionpro', 'activeon', 'aee', 'agfa', 'agfaphoto', 'aiptek', 'akaso', 'alpine', 'alpine', 'amkov', 'andoer', 'annke', 'ansco', 'apeman', 'apex', 'apple', 'archos', 'argus', 'arlo', 'arri', 'axis', 'bell', 'benq', 'blackmagic', 'blackmagic', 'bosch', 'bower', 'brinno', 'brookstone', 'browning', 'cambo', 'campark', 'canon', 'carl', 'casio', 'celestron', 'chinon', 'cisco', 'cobra', 'coleman', 'concord', 'contax', 'contour', 'covert', 'craig', 'crayola', 'creative', 'creative', 'crosstour', 'crumpler', 'datavideo', 'delkin', 'dell', 'digitrex', 'discovery', 'disney', 'dji', 'd-link', 'domke', 'dörr', 'dragon', 'dxg', 'dxo', 'easypix', 'elecom', 'elmo', 'emerson', 'energizer', 'epson', 'fisher-price', 'flip', 'flir', 'foscam', 'fotoman', 'fotopro', 'fuji', 'fujifilm', 'fujinon', 'garmin', 'gateway', 'godox', 'goodmans', 'google', 'gopro', 'grundig', 'hahnel', 'hamilton', 'hasselblad', 'hello', 'herofiber', 'hitachi', 'holga', 'horseman', 'hoya', 'htc', 'huawei', 'ikelite', 'ilford', 'impossible', 'innovage', 'insignia', 'insta360', 'intel', 'intova', 'ion', 'iris', 'jazz', 'jenoptik', 'jjrc', 'jvc', 'kaiser', 'kenko', 'keyence', 'king', 'kitvision', 'kodak', 'konica', 'kyocera', 'leaf', 'lego', 'leica', 'lenovo', 'lexibook', 'linhof', 'liquid', 'little', 'logitech', 'lomography', 'lowepro', 'ltl', 'lytro', 'maginon', 'magnavox', 'mamiya', 'manfrotto', 'marshall', 'marumi', 'mattel', 'maxell', 'meade', 'medion', 'memorex', 'mercury', 'metz', 'microsoft', 'microtek', 'midland', 'minolta', 'minox', 'monster', 'motorola', 'moultrie', 'mustek', 'nabi', 'neewer', 'nest', 'netgear', 'night', 'nikkon', 'nikkor', 'nikon', 'nilox', 'nintendo', 'nippon', 'nokia', 'norcent', 'olympus', 'optech', 'ordro', 'oregon', 'packard', 'palm', 'panasonic', 'parrot', 'pelco', 'pentacon', 'pentax', 'phase', 'philips', 'philips', 'phoenix', 'pioneer', 'playskool', 'polaroid', 'polarpro', 'praktica', 'premier', 'promaster', 'proscan', 'pyle', 'radioshack', 'raymarine', 'raynox', 'rca', 'ricoh', 'ring', 'rode', 'rokinon', 'rollei', 'ryobi', 'sakar', 'samsung', 'sandisk', 'sanyo', 'schneider', 'schneider', 'schneider', 'scosche', 'seasea', 'sealife', 'sharp', 'sharper', 'sigma', 'sinar', 'sipix', 'sjcam', 'sony', 'soocoo', 'stealth', 'superheadz', 'svp', 'swann', 'tamrac', 'tamron', 'technika', 'tenba', 'think', 'thule', 'tokina', 'tomy', 'toshiba', 'transcend', 'traveler', 'trust', 'verbatim', 'vibe', 'victure', 'vistaquest', 'vivitar', 'voigtländer', 'vtech', 'vupoint', 'walimex', 'wyze', 'xiaomi', 'xit', 'xtreme', 'yashica', 'zeiss']

In [13]:
df["page_title"] = df["page_title"].apply(lambda line : list(set(filter(lambda word : bool(pattern.match(word)) or word in brands,line))))

In [14]:
df.head()

Unnamed: 0,spec_id,page_title
0,www.wexphotographic.com//154,"[vna593e1, nikon, aw120]"
1,www.wexphotographic.com//553,"[9148b007aa, canon]"
2,www.wexphotographic.com//601,"[fuji, p10nc12730a, s1]"
3,www.wexphotographic.com//197,"[s5300, vna540e1, nikon]"
4,www.wexphotographic.com//178,"[fuji, s8600, p10nc12690a]"


In [15]:
df["brand"] = ""

In [16]:
# See how many products have more than one brand

times = 0
count = 0

for index, row in df.iterrows():
    for brand in row["page_title"]:
        if brand in brands:
            times += 1
            if times > 1:
                count += 1
                #print(row["page_title"])
            df["brand"].iloc[index] = brand
    times = 0

In [17]:
print(count)

964


In [18]:
df.head()

Unnamed: 0,spec_id,page_title,brand
0,www.wexphotographic.com//154,"[vna593e1, nikon, aw120]",nikon
1,www.wexphotographic.com//553,"[9148b007aa, canon]",canon
2,www.wexphotographic.com//601,"[fuji, p10nc12730a, s1]",fuji
3,www.wexphotographic.com//197,"[s5300, vna540e1, nikon]",nikon
4,www.wexphotographic.com//178,"[fuji, s8600, p10nc12690a]",fuji


In [26]:
count = 0
for index, row in df.iterrows():
    if row["brand"] == "":
        df["brand"].iloc[index] = "unbranded"

In [24]:
count

2388

In [27]:
# manage entries with multiple brands

In [37]:
df = df.groupby(["brand"])

In [51]:
canon = df.get_group("canon")

In [52]:
canon.drop(columns=["brand"], axis = 1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [53]:
canon.head()

Unnamed: 0,spec_id,page_title
1,www.wexphotographic.com//553,"[9148b007aa, canon]"
14,www.wexphotographic.com//621,"[9354b007aa, canon]"
22,www.wexphotographic.com//572,"[sx700, canon, 9339b014aa]"
24,www.wexphotographic.com//620,"[d30, 9337b012aa, canon]"
29,www.wexphotographic.com//568,"[9145b007aa, canon]"


In [54]:
type(canon)

pandas.core.frame.DataFrame

In [55]:
canon = (canon.merge(canon, on=canon.assign(key_col=1)['key_col'], suffixes=('', '_right'))
 .query('spec_id != spec_id_right') # filter out joins on the same row
 .reset_index(drop=True))

In [56]:
canon.head()

Unnamed: 0,key_0,spec_id,page_title,spec_id_right,page_title_right
0,1,www.wexphotographic.com//553,"[9148b007aa, canon]",www.wexphotographic.com//621,"[9354b007aa, canon]"
1,1,www.wexphotographic.com//553,"[9148b007aa, canon]",www.wexphotographic.com//572,"[sx700, canon, 9339b014aa]"
2,1,www.wexphotographic.com//553,"[9148b007aa, canon]",www.wexphotographic.com//620,"[d30, 9337b012aa, canon]"
3,1,www.wexphotographic.com//553,"[9148b007aa, canon]",www.wexphotographic.com//568,"[9145b007aa, canon]"
4,1,www.wexphotographic.com//553,"[9148b007aa, canon]",www.wexphotographic.com//211,"[g1, 9167b022aa, canon]"


In [57]:
canon.drop(columns = ["key_0"], axis = 1, inplace=True)

In [59]:
canon.rename(columns = {"spec_id" : "spec_id_left"}, inplace=True)

In [60]:
canon.head()

Unnamed: 0,spec_id_left,page_title,spec_id_right,page_title_right
0,www.wexphotographic.com//553,"[9148b007aa, canon]",www.wexphotographic.com//621,"[9354b007aa, canon]"
1,www.wexphotographic.com//553,"[9148b007aa, canon]",www.wexphotographic.com//572,"[sx700, canon, 9339b014aa]"
2,www.wexphotographic.com//553,"[9148b007aa, canon]",www.wexphotographic.com//620,"[d30, 9337b012aa, canon]"
3,www.wexphotographic.com//553,"[9148b007aa, canon]",www.wexphotographic.com//568,"[9145b007aa, canon]"
4,www.wexphotographic.com//553,"[9148b007aa, canon]",www.wexphotographic.com//211,"[g1, 9167b022aa, canon]"


In [61]:
len(canon)

29533790

## "Algorithm"

In [63]:
import time
t = time.time()

commons = 0
found_labels = []
for index, row in canon.iterrows():
    if index % 100000 == 0:
        print(index)
    for spec in row["page_title"]:
        if spec in row["page_title_right"]:
            commons += 1
    if commons >= 2:
        found_labels.append(1)
    else:
        found_labels.append(0)
    commons = 0
    
elapsed = time.time() - t
print(elapsed)

0
100000
200000
300000
400000
500000
600000
700000
800000
900000
1000000
1100000
1200000
1300000
1400000
1500000
1600000
1700000
1800000
1900000
2000000
2100000
2200000
2300000
2400000
2500000
2600000
2700000
2800000
2900000
3000000
3100000
3200000
3300000
3400000
3500000
3600000
3700000
3800000
3900000
4000000
4100000
4200000
4300000
4400000
4500000
4600000
4700000
4800000
4900000
5000000
5100000
5200000
5300000
5400000
5500000
5600000
5700000
5800000
5900000
6000000
6100000
6200000
6300000
6400000
6500000
6600000
6700000
6800000
6900000
7000000
7100000
7200000
7300000
7400000
7500000
7600000
7700000
7800000
7900000
8000000
8100000
8200000
8300000
8400000
8500000
8600000
8700000
8800000
8900000
9000000
9100000
9200000
9300000
9400000
9500000
9600000
9700000
9800000
9900000
10000000
10100000
10200000
10300000
10400000
10500000
10600000
10700000
10800000
10900000
11000000
11100000
11200000
11300000
11400000
11500000
11600000
11700000
11800000
11900000
12000000
12100000
12200000
12300000

In [65]:
canon["found_labels"] = found_labels

In [66]:
canon.head()

Unnamed: 0,spec_id_left,page_title,spec_id_right,page_title_right,found_labels
0,www.wexphotographic.com//553,"[9148b007aa, canon]",www.wexphotographic.com//621,"[9354b007aa, canon]",0
1,www.wexphotographic.com//553,"[9148b007aa, canon]",www.wexphotographic.com//572,"[sx700, canon, 9339b014aa]",0
2,www.wexphotographic.com//553,"[9148b007aa, canon]",www.wexphotographic.com//620,"[d30, 9337b012aa, canon]",0
3,www.wexphotographic.com//553,"[9148b007aa, canon]",www.wexphotographic.com//568,"[9145b007aa, canon]",0
4,www.wexphotographic.com//553,"[9148b007aa, canon]",www.wexphotographic.com//211,"[g1, 9167b022aa, canon]",0


In [70]:
matches = canon.loc[canon['found_labels'] == 1]

In [71]:
canon.to_csv("../datasets/created/canon_all_labeled.csv", index=False)

In [74]:
canon.drop(columns=["page_title", "page_title_right"], axis = 1).to_csv("../datasets/created/canon_compact_all_labeled.csv", index=False)

In [76]:
matches.drop(columns=["page_title", "page_title_right", "found_labels"], axis = 1).to_csv("../datasets/created/canon_compact_matches_labeled.csv",index = False)