### Loading the data

In [1]:
import os
import numpy as np
import json
import pandas as pd
import itertools
from tqdm import tqdm
import nltk

import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv("../datasets/labeled/labeled_with_titles_large.csv")

In [3]:
df.head()

Unnamed: 0,left_spec_id,right_spec_id,label,left_page_title,right_page_title
0,www.ebay.com//53278,www.garricks.com.au//31,1,nikon d3200 24 2 mp digital slr camera black k...,nikon d3200 black w/ 18-55mm vr lens
1,www.ebay.com//53278,www.priceme.co.nz//2246,0,nikon d3200 24 2 mp digital slr camera black k...,canon eos 1200d + 18-55/3.5-5.6 new zealand p...
2,www.ebay.com//53278,www.shopbot.com.au//1376,0,nikon d3200 24 2 mp digital slr camera black k...,nikon d7000 / 18-105mm vr kit - price comparis...
3,www.ebay.com//53278,www.flipkart.com//2193,0,nikon d3200 24 2 mp digital slr camera black k...,canon eos 1200d kit (ef s18-55 is ii + 55-250 ...
4,www.ebay.com//53278,www.ebay.com//58781,0,nikon d3200 24 2 mp digital slr camera black k...,canon eos 5d digital slr camera black body onl...


## Cleaning

In [4]:
 from nltk.tokenize import word_tokenize

In [5]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/gfotiadis/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/gfotiadis/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [7]:
from nltk.corpus import stopwords

stopWords = set(stopwords.words('english'))

In [8]:
from nltk.corpus import stopwords

stopWords = set(stopwords.words('english'))

In [9]:
import string
punctuation = string.punctuation[:1] + string.punctuation[2:] + "€£¥₹₽"

In [10]:
def replace_punctuation(word):
    return ''.join(c for c in word if c not in punctuation)

In [11]:
df["left_page_title"] = df["left_page_title"].apply(lambda x : [i.lower() for i in list(map(lambda y: replace_punctuation(y), word_tokenize(x))) if i and i.lower() not in stopWords])

In [12]:
df["right_page_title"] = df["right_page_title"].apply(lambda x : [i.lower() for i in list(map(lambda y: replace_punctuation(y), word_tokenize(x))) if i and i.lower() not in stopWords])

## Model words

In [13]:
import re
pattern = re.compile("(\S*[A-Za-z]\S*[0-9]\S*|\S*[0-9]\S*[A-Za-z]\S*)")

In [14]:
brands = ['360fly', 'acer', 'achiever', 'acorn', 'action', 'actionpro', 'activeon', 'aee', 'agfa', 'agfaphoto', 'aiptek', 'akaso', 'alpine', 'alpine', 'amkov', 'andoer', 'annke', 'ansco', 'apeman', 'apex', 'apple', 'archos', 'argus', 'arlo', 'arri', 'axis', 'bell', 'benq', 'blackmagic', 'blackmagic', 'bosch', 'bower', 'brinno', 'brookstone', 'browning', 'cambo', 'campark', 'canon', 'carl', 'casio', 'celestron', 'chinon', 'cisco', 'cobra', 'coleman', 'concord', 'contax', 'contour', 'covert', 'craig', 'crayola', 'creative', 'creative', 'crosstour', 'crumpler', 'datavideo', 'delkin', 'dell', 'digitrex', 'discovery', 'disney', 'dji', 'd-link', 'domke', 'dörr', 'dragon', 'dsc', 'dxg', 'dxo', 'easypix', 'elecom', 'elmo', 'emerson', 'energizer', 'epson', 'fisher-price', 'flip', 'flir', 'foscam', 'fotoman', 'fotopro', 'fuji', 'fujifilm', 'fujinon', 'garmin', 'gateway', 'godox', 'goodmans', 'google', 'gopro', 'grundig', 'hahnel', 'hamilton', 'hasselblad', 'hello', 'herofiber', 'hitachi', 'holga', 'horseman', 'hoya', 'htc', 'huawei', 'ikelite', 'ilford', 'impossible', 'innovage', 'insignia', 'insta360', 'intel', 'intova', 'ion', 'iris', 'jazz', 'jenoptik', 'jjrc', 'jvc', 'kaiser', 'kenko', 'keyence', 'king', 'kitvision', 'kodak', 'konica', 'kyocera', 'leaf', 'lego', 'leica', 'lenovo', 'lexibook', 'linhof', 'liquid', 'little', 'logitech', 'lomography', 'lowepro', 'ltl', 'lumix', 'lytro', 'maginon', 'magnavox', 'mamiya', 'manfrotto', 'marshall', 'marumi', 'mattel', 'maxell', 'meade', 'medion', 'memorex', 'mercury', 'metz', 'microsoft', 'microtek', 'midland', 'minolta', 'minox', 'monster', 'motorola', 'moultrie', 'mustek', 'nabi', 'neewer', 'nest', 'netgear', 'night', 'nikkon', 'nikkor', 'nikon', 'nilox', 'nintendo', 'nippon', 'nokia', 'norcent', 'olympus', 'optech', 'ordro', 'oregon', 'packard', 'palm', 'panasonic', 'parrot', 'pelco', 'pentacon', 'pentax', 'phase', 'philips', 'philips', 'phoenix', 'pioneer', 'playskool', 'polaroid', 'polarpro', 'praktica', 'premier', 'promaster', 'proscan', 'pyle', 'radioshack', 'raymarine', 'raynox', 'rca', 'ricoh', 'ring', 'rode', 'rokinon', 'rollei', 'ryobi', 'sakar', 'samsung', 'sandisk', 'sanyo', 'schneider', 'schneider', 'schneider', 'scosche', 'seasea', 'sealife', 'sharp', 'sharper', 'sigma', 'sinar', 'sipix', 'sjcam', 'sony', 'soocoo', 'stealth', 'superheadz', 'svp', 'swann', 'tamrac', 'tamron', 'technika', 'tenba', 'think', 'thule', 'tokina', 'tomy', 'toshiba', 'transcend', 'traveler', 'trust', 'verbatim', 'vibe', 'victure', 'vistaquest', 'vivitar', 'voigtländer', 'vtech', 'vupoint', 'walimex', 'wyze', 'xiaomi', 'xit', 'xtreme', 'yashica', 'zeiss', 'zoom']

In [15]:
df["left_page_title"] = df["left_page_title"].apply(lambda line : list(set(filter(lambda word : bool(pattern.match(word)) or word in brands,line))))

In [16]:
df["right_page_title"] = df["right_page_title"].apply(lambda line : list(set(filter(lambda word : bool(pattern.match(word)) or word in brands,line))))

In [17]:
ground_truth = df.loc[df['label'] == 1]

In [18]:
df.head()

Unnamed: 0,left_spec_id,right_spec_id,label,left_page_title,right_page_title
0,www.ebay.com//53278,www.garricks.com.au//31,1,"[d3200, nikon, 55mm]","[d3200, nikon, 1855mm]"
1,www.ebay.com//53278,www.priceme.co.nz//2246,0,"[d3200, nikon, 55mm]","[1200d, canon]"
2,www.ebay.com//53278,www.shopbot.com.au//1376,0,"[d3200, nikon, 55mm]","[nikon, d7000, 18105mm]"
3,www.ebay.com//53278,www.flipkart.com//2193,0,"[d3200, nikon, 55mm]","[rs26500, s1855, 1200d, canon]"
4,www.ebay.com//53278,www.ebay.com//58781,0,"[d3200, nikon, 55mm]","[5d, canon]"


In [26]:
import time

In [27]:
t = time.time()
commons = 0
found_labels = []
for index, row in df.iterrows():
    for spec in row["left_page_title"]:
        if spec in row["right_page_title"]:
            commons += 1
    if commons >= 2:
        found_labels.append(1)
    else:
        found_labels.append(0)
    commons = 0
elapsed = time.time() - t
print(elapsed)

30.684340953826904


In [21]:
df["found_labels"] = found_labels

In [23]:
sum = 0
for index, row in df.iterrows():
    if row["label"] == row["found_labels"]:
        sum += 1

In [32]:
print(sum / len(df))

0.9957569913211186


In [24]:
print(sum / len(df))

0.9980308532639349


In [25]:
len(df)

303177