In [1]:
from rank_bm25 import BM25Okapi

In [101]:
import os
import sys
import random
import math
import json
import collections
import itertools

import pandas as pd
import numpy as np
import scipy        
import statsmodels


from tqdm import trange, tqdm_notebook as tqdm
from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline

pd.options.display.max_columns = 999

sns.set()

plt.rcParams["figure.figsize"] = (10,7)

In [102]:
os.getcwd()

'/home/boris/study/UT study/ACM Sigmod/sigmod/notebooks'

In [103]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [108]:
from sigmod_src.utils import read_json, pprint_json, path_from_spec_id, get_vector_for_spec_id

# Load labels data

In [109]:
LABELS_PATH = '../data/raw/sigmod_medium_labelled_dataset.csv'

In [110]:
labels_df = pd.read_csv(LABELS_PATH)
labels_df.shape

(46665, 3)

In [111]:
labels_df.head()

Unnamed: 0,left_spec_id,right_spec_id,label
0,www.garricks.com.au//31,www.ebay.com//53278,1
1,www.ebay.com//58782,www.ebay.com//24817,0
2,www.ebay.com//58782,www.ebay.com//43019,0
3,www.ebay.com//42055,www.ebay.com//54403,0
4,www.ebay.com//44280,buy.net//6145,0


# Load specs data

In [133]:
specs_df = pd.read_csv('../data/processed/specs_preprocessed.csv')
specs_df.index=  specs_df.spec_id
specs_df.shape

(29771, 7)

In [134]:
specs_df.head()

Unnamed: 0_level_0,spec_id,page_title,all_text,page_title_stem,all_text_stem,brand,site
spec_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
www.ebay.com//57656,www.ebay.com//57656,canon powershot elph 110 hs 16 1 mp,canon powershot elph 110 hs 16 1 mp canon manu...,canon powershot elph 110 hs 16 1 mp,canon powershot elph 110 hs 16 1 mp canon manu...,canon,www.ebay.com
www.ebay.com//60583,www.ebay.com//60583,canon rebel 2000 35 mm great case instruction ...,canon rebel 2000 35 mm great case instruction ...,canon rebel 2000 35 mm great case instruct boo...,canon rebel 2000 35 mm great case instruct boo...,canon,www.ebay.com
www.ebay.com//60440,www.ebay.com//60440,canon eos rebel t3i 18 55mm 75 300mm iii lens ...,canon eos rebel t3i 18 55mm 75 300mm iii lens ...,canon eo rebel t3i 18 55mm 75 300mm iii len ki...,canon eo rebel t3i 18 55mm 75 300mm iii len ki...,canon,www.ebay.com
www.ebay.com//24139,www.ebay.com//24139,ge c1033 10 1 mp 3x zoom 2 4 lcd,ge c1033 10 1 mp 3x zoom 2 4 lcd ge unused uno...,ge c1033 10 1 mp 3x zoom 2 4 lcd,ge c1033 10 1 mp 3x zoom 2 4 lcd ge unus unope...,,www.ebay.com
www.ebay.com//54903,www.ebay.com//54903,vivitar clip shot 1 1 mp,vivitar clip shot 1 1 mp vivitar unused unopen...,vivitar clip shot 1 1 mp,vivitar clip shot 1 1 mp vivitar unus unopen u...,vivitar,www.ebay.com


In [135]:
specs_df.dropna(inplace=True)

# Make index

In [137]:
text_field = 'page_title_stem'

In [138]:
corpus = specs_df[text_field].values
tokenized_corpus = specs_df[text_field].apply(lambda x: x.split(' ')).values
tokenized_corpus[0]

In [139]:
bm25 = BM25Okapi(tokenized_corpus)

In [140]:
query = specs_df[text_field].sample(1).values[0]

tokenized_query = query.split(' ')

doc_scores = bm25.get_scores(tokenized_query)

n = 10
ind = np.argsort(doc_scores)[::-1][:n]
closest_texts = specs_df.iloc[ind][text_field].values

print('Query:\n\t', query)
print()
print('Top 10 neighboors:\n\t', '\n\t'.join(closest_texts))

Query:
	 canon powershot a4000 16 0 mp blue

Top 10 neighboors:
	 canon powershot a4000 16 0 mp blue
	canon powershot a4000 16 0 mp blue
	canon powershot a4000 16 0 mp blue
	canon powershot a4000 16 0 mp blue
	canon powershot a4000 16 0 mp blue
	canon powershot a4000 16 0 mp
	canon powershot a4000 16 0 mp
	canon powershot a4000 16 0 mp silver
	canon powershot a4000 16 0 mp silver
	canon powershot a4000 16 0 mp silver


# Evaluate index

In [141]:
def tokenize(text):
    return text.split(' ')

class Bm25Indexer:
    def __init__(self, 
                 ids=None,
                 texts=None):
        self.ids = np.array(ids)
        self.texts = np.array(texts)
        self.bm25 = None

    def fit(self):
        tokenized_corpus = [tokenize(t) for t in self.texts]
        self.bm25 = BM25Okapi(tokenized_corpus)

    def lookup_idx(self, query, n=10):
        tokenized_query = tokenize(query)
        doc_scores = self.bm25.get_scores(tokenized_query)
        ind = np.argsort(doc_scores)[::-1][:n]
        return ind
    
    def lookup(self, text, n=10):
        idxs = self.lookup_idx(query, n=n)
        texts = self.texts[idxs]
        ids = self.ids[idxs]
        return ids, texts

In [142]:
indexer = Bm25Indexer(specs_df.spec_id, specs_df[text_field])
indexer.fit()

In [143]:
test_spec = specs_df.sample(1)
test_id, text =  test_spec.spec_id.values[0], test_spec[text_field].values[0]

print('Query', test_id, text)
print()
print('Results: ', indexer.lookup(text)[1])

Query www.ebay.com//44270 kodak easyshar z1485 14 0 mp batteri charger

Results:  ['canon powershot a4000 16 0 mp blue' 'canon powershot a4000 16 0 mp blue'
 'canon powershot a4000 16 0 mp blue' 'canon powershot a4000 16 0 mp blue'
 'canon powershot a4000 16 0 mp blue' 'canon powershot a4000 16 0 mp'
 'canon powershot a4000 16 0 mp' 'canon powershot a4000 16 0 mp silver'
 'canon powershot a4000 16 0 mp silver'
 'canon powershot a4000 16 0 mp silver']


## Obtian labelled dataset recall

In [144]:
dups_df = labels_df[labels_df.label==1]
results = []
print('Evaluating on a sample of duplicates:', int(0.5*len(dups_df)))
for row in tqdm(dups_df.sample(int(0.5*len(dups_df))).itertuples()):
    left_spec = specs_df.loc[row.left_spec_id]
    query_text = left_spec[text_field]
    cand_id, cand_text = indexer.lookup(query_text, n=1000)
    target_index = None
    if row.right_spec_id in cand_id:
        target_index = list(cand_id).index(row.right_spec_id)
    results.append((row.left_spec_id, row.right_spec_id, target_index))

Evaluating on a sample of duplicates: 1791


In [145]:
results_df = pd.DataFrame(results)
results_df.columns = ['left_spec_id', 'right_spec_id', 'right_spec_rank']
results_df.head()

Unnamed: 0,left_spec_id,right_spec_id,right_spec_rank
0,www.ebay.com//53706,www.ebay.com//44643,
1,www.ebay.com//56576,www.garricks.com.au//66,
2,www.ebay.com//24625,www.ebay.com//55912,
3,www.ebay.com//47191,www.ebay.com//43020,
4,www.ebay.com//58782,www.ebay.com//55745,


In [146]:
def compute_recall_k(ranks, k):
    total_number = len(ranks)
    matched_number = ranks[(~ranks.isnull()) & (ranks <= k)]
    return len(matched_number)/total_number

In [148]:
ks = [1, 10, 50, 100, 500, 1000]
for k in ks:
    print(f'Recall-{k}: ', round(compute_recall_k(results_df.right_spec_rank, k), 4))

Recall-1:  0.0
Recall-10:  0.0
Recall-50:  0.0
Recall-100:  0.0
Recall-500:  0.0039
Recall-1000:  0.0045
