In [1]:
import os
import sys
import random
import json
import collections
import itertools

import pandas as pd
import numpy as np
import scipy
import statsmodels


from tqdm import trange, tqdm_notebook as tqdm
from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline
plt.rcParams["figure.figsize"] = (10,7)

pd.options.display.max_columns = 999

sns.set()

plt.rcParams["figure.figsize"] = (10,7)

# Load data

In [2]:
labels_path = '../data/raw/sigmod_medium_labelled_dataset.csv'

In [3]:
labels_df = pd.read_csv(labels_path).sort_values(by=['left_spec_id', 'right_spec_id'])

In [4]:
labels_df.shape

(46665, 3)

In [5]:
labels_df.head()

Unnamed: 0,left_spec_id,right_spec_id,label
38956,buy.net//5641,buy.net//5698,0
39281,buy.net//5641,buy.net//5791,0
33358,buy.net//5641,buy.net//5946,0
40657,buy.net//5641,buy.net//6145,0
9198,buy.net//5641,cammarkt.com//203,0


In [6]:
labels_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 46665 entries, 38956 to 42821
Data columns (total 3 columns):
left_spec_id     46665 non-null object
right_spec_id    46665 non-null object
label            46665 non-null int64
dtypes: int64(1), object(2)
memory usage: 1.4+ MB


In [7]:
specs_df = pd.read_csv('../data/processed/specs_features.csv')
# specs_features = specs_df.drop(['site', 'page_title', 'page_title_stem', 'brand', 'spec_id'], axis=1)
# specs_df = specs_df[['spec_id', 'site', 'page_title', 'page_title_stem', 'brand']]
specs_df.shape

(29787, 131)

In [8]:
specs_df.head()

Unnamed: 0,spec_id,page_title,page_title_stem,brand,site,page_title__10,page_title__12,page_title__16,page_title__18,page_title__alibaba,page_title__alibaba com,page_title__and,page_title__bag,page_title__black,page_title__body,page_title__buy,page_title__camera,page_title__camera black,page_title__camera buy,page_title__camera case,page_title__camera product,page_title__camera with,page_title__cameras,page_title__canon,page_title__canon eos,page_title__canon powershot,page_title__case,page_title__cctv,page_title__com,page_title__comparison,page_title__coolpix,page_title__digital,page_title__digital camera,page_title__digital slr,page_title__dome,page_title__ds,page_title__eos,page_title__for,page_title__hd,page_title__hikvision,page_title__in,page_title__in india,page_title__india,page_title__ip,page_title__ip camera,page_title__ir,page_title__is,page_title__kit,page_title__lens,page_title__mp,page_title__mp digital,page_title__new,page_title__nikon,page_title__nikon coolpix,page_title__on,page_title__on alibaba,page_title__powershot,page_title__price,page_title__product,page_title__product on,page_title__reviews,page_title__silver,page_title__slr,page_title__slr camera,page_title__sony,page_title__waterproof,page_title__with,page_title__zoom,page_title_stem__10,page_title_stem__12,page_title_stem__16,page_title_stem__18,page_title_stem__alibaba,page_title_stem__alibaba com,page_title_stem__and,page_title_stem__bag,page_title_stem__black,page_title_stem__bodi,page_title_stem__buy,page_title_stem__camera,page_title_stem__camera black,page_title_stem__camera buy,page_title_stem__camera case,page_title_stem__camera product,page_title_stem__camera with,page_title_stem__canon,page_title_stem__canon eo,page_title_stem__canon powershot,page_title_stem__case,page_title_stem__cctv,page_title_stem__com,page_title_stem__comparison,page_title_stem__coolpix,page_title_stem__digit,page_title_stem__digit camera,page_title_stem__digit slr,page_title_stem__dome,page_title_stem__ds,page_title_stem__eo,page_title_stem__for,page_title_stem__hd,page_title_stem__hikvis,page_title_stem__in,page_title_stem__in india,page_title_stem__india,page_title_stem__ip,page_title_stem__ip camera,page_title_stem__ir,page_title_stem__is,page_title_stem__kit,page_title_stem__len,page_title_stem__mp,page_title_stem__mp digit,page_title_stem__new,page_title_stem__nikon,page_title_stem__nikon coolpix,page_title_stem__on,page_title_stem__on alibaba,page_title_stem__powershot,page_title_stem__price,page_title_stem__product,page_title_stem__product on,page_title_stem__review,page_title_stem__silver,page_title_stem__slr,page_title_stem__slr camera,page_title_stem__soni,page_title_stem__waterproof,page_title_stem__with,page_title_stem__zoom,site_enc
0,www.ebay.com//57656,Canon PowerShot ELPH 110 HS 16 1 MP Digital Ca...,canon powershot elph 110 hs 16 1 mp digit came...,canon,www.ebay.com,0.0,0.0,0.401447,0.0,0.0,0.0,0.0,0.0,0.300358,0.0,0.0,0.136441,0.336425,0.0,0.0,0.0,0.0,0.0,0.297131,0.0,0.401548,0.0,0.0,0.0,0.0,0.0,0.167003,0.21231,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.250333,0.27226,0.0,0.0,0.0,0.0,0.0,0.396917,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.402219,0.0,0.0,0.0,0.0,0.0,0.300936,0.0,0.0,0.132372,0.336987,0.0,0.0,0.0,0.0,0.297702,0.0,0.402321,0.0,0.0,0.0,0.0,0.0,0.167318,0.206452,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.250814,0.272784,0.0,0.0,0.0,0.0,0.0,0.39768,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7
1,www.ebay.com//60583,Canon Rebel 2000 35 mm Camera Great Condition ...,canon rebel 2000 35 mm camera great condit wit...,canon,www.ebay.com,0.0,0.0,0.0,0.0,0.0,0.0,0.595626,0.0,0.0,0.0,0.0,0.187118,0.0,0.0,0.0,0.0,0.0,0.0,0.407492,0.0,0.0,0.480595,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.461731,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.59699,0.0,0.0,0.0,0.0,0.181605,0.0,0.0,0.0,0.0,0.0,0.408425,0.0,0.0,0.479244,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.462749,0.0,7
2,www.ebay.com//60440,Canon EOS Rebel T3i Digital SLR Camera 18 55mm...,canon eo rebel t3i digit slr camera 18 55mm is...,canon,www.ebay.com,0.0,0.0,0.0,0.319139,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.115511,0.0,0.0,0.0,0.0,0.0,0.0,0.251551,0.332265,0.0,0.0,0.0,0.0,0.0,0.0,0.141385,0.0,0.298158,0.0,0.0,0.329144,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.341558,0.328608,0.313318,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.281382,0.303178,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.31902,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.111809,0.0,0.0,0.0,0.0,0.0,0.251457,0.332818,0.0,0.0,0.0,0.0,0.0,0.0,0.141327,0.0,0.298047,0.0,0.0,0.329675,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.341517,0.328332,0.314968,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.281276,0.302455,0.0,0.0,0.0,0.0,7
3,www.ebay.com//24139,"GE C1033 10 1 MP Digital Camera 3X Zoom 2 4"" L...","ge c1033 10 1 mp digit camera 3x zoom 2 4"" lcd...",ge,www.ebay.com,0.476225,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.329912,0.0,0.0,0.149867,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.183436,0.233201,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.274965,0.29905,0.422348,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.458761,0.477367,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.330703,0.0,0.0,0.145466,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.183869,0.226874,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.275624,0.299767,0.423313,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.459667,7
4,www.ebay.com//54903,Vivitar Clip Shot Digital Camera 1 1 MP,vivitar clip shot digit camera 1 1 mp,vivitar,www.ebay.com,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.347405,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.425221,0.54058,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.637393,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.340596,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.430512,0.531205,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.645349,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7


In [9]:
meta_columns = ['spec_id', 'site', 'page_title', 'page_title_stem', 'brand']
feature_columns = list(set(specs_df.columns).difference(meta_columns))

# LSH 

In [10]:
from sklearn.preprocessing import normalize

In [11]:
from annoy import AnnoyIndex

In [12]:
vector_dim = len(feature_columns)
trees_amount = 10

In [13]:
spec_vectors = normalize(specs_df[feature_columns].values)
spec_vectors.shape

(29787, 126)

In [14]:
t = AnnoyIndex(vector_dim, 'angular')  # Length of item vector that will be indexed
for i in trange(len(spec_vectors)):
    t.add_item(i, spec_vectors[i])
    
t.build(trees_amount)

100%|██████████| 29787/29787 [00:00<00:00, 48378.18it/s]


True

# Test on labelled dataset

In [15]:
labels_df[labels_df.label==1].head()

Unnamed: 0,left_spec_id,right_spec_id,label
27484,buy.net//5641,www.ebay.com//58588,1
24490,buy.net//5641,www.gosale.com//849,1
12459,buy.net//5641,www.price-hunt.com//9794,1
5120,buy.net//5698,cammarkt.com//501,1
31627,buy.net//5698,www.ebay.com//41940,1


In [16]:
def get_vector_for_spec_id(spec_id):
    return spec_vectors[specs_df[specs_df.spec_id == spec_id].index][0]
get_vector_for_spec_id('buy.net//5641')

array([0.25977742, 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.26439247, 0.        ,
       0.        , 0.        , 0.25534078, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.28814191, 0.        , 0.        , 0.15653586, 0.        ,
       0.20370053, 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.15851752,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.20627926, 0.19781552, 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.21889649,
       0.        , 0.        , 0.10861302, 0.        , 0.        ,
       0.        , 0.19534259, 0.        , 0.        , 0.     

In [23]:
def get_specs_for_vector(v, n=10):
    spec_idxs, dists = t.get_nns_by_vector(v, n+1,  include_distances=True)
    print(dists)
    return specs_df['spec_id'].iloc[spec_idxs]

def get_specs_for_spec_id(spec_id): 
    specs = list(get_specs_for_vector(get_vector_for_spec_id(spec_id)).values)
    return specs 

In [24]:
get_specs_for_spec_id('buy.net//5641')

[0.0, 0.293069452047348, 0.3697288930416107, 0.3697288930416107, 0.37841418385505676, 0.43298256397247314, 0.43298256397247314, 0.4787573516368866, 0.48584672808647156, 0.49113836884498596, 0.49113836884498596]


['buy.net//5641',
 'buy.net//5648',
 'buy.net//6229',
 'buy.net//5628',
 'buy.net//6536',
 'buy.net//5372',
 'buy.net//5937',
 'buy.net//6335',
 'buy.net//5635',
 'buy.net//6478',
 'buy.net//6228']

In [None]:
def predict_label(left_spec_id, right_spec_id):
    specs = get_specs_for_spec_id(left_spec_id)
    if right_spec_id in specs:
        return 1
    return 0

In [None]:
pred_labels = []
for row in tqdm(labels_df.itertuples()):
    pred_labels.append(predict_label(row.left_spec_id, row.right_spec_id))

In [134]:
from sklearn.metrics import f1_score, plot_confusion_matrix, classification_report

In [135]:
print('Train F1', f1_score(labels_df.label, pred_labels))
print(classification_report(labels_df.label, pred_labels))

Train F1 0.04385489983757444
              precision    recall  f1-score   support

           0       0.92      1.00      0.96     43083
           1       0.72      0.02      0.04      3582

    accuracy                           0.92     46665
   macro avg       0.82      0.51      0.50     46665
weighted avg       0.91      0.92      0.89     46665



# Save index

In [138]:
t.save('../data/interim/index.ann')

True