In [1]:
import sys
# Make the parent directory importable so the project modules living there
# can be imported from this notebook. The membership check keeps the cell
# idempotent: re-running it does not pile up duplicate ".." entries.
if ".." not in sys.path:
    sys.path.append("..")

In [2]:
import utils
import mydatasets
import os
import numpy as np
import torch
import mymodels
from sklearn.manifold import TSNE
import ipywidgets as widgets
from ipywidgets import interact,fixed,interact_manual
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors
import pandas as pd
from sklearn import metrics as skmet

In [3]:
# Dataset to load. "mediamill", "delicious" and "bibtex" go through the
# small-dataset loader; "eurlex" goes through the large-dataset loader.
DATASET="bibtex"
# Index of the train/test split to use; only read for the small datasets.
SPLIT=0

In [4]:
# change dirs because paths are hardcoded in mydatasets
curr_dir=os.getcwd()
os.chdir("..")
if DATASET in ["mediamill","delicious","bibtex"]:
    full_dataset,trn_splits,tst_splits=mydatasets.load_small_dataset(DATASET)
    trn_data,tst_data=mydatasets.get_small_dataset_split(full_dataset,trn_splits,tst_splits,SPLIT)
elif DATASET in ["eurlex"]:
    trn_data,tst_data=mydatasets.load_large_dataset(DATASET)
x_mat,y_mat,x_tst,y_tst=mydatasets.get_arrays(trn_data,tst_data)
# change back
os.chdir(curr_dir)

Loading datasets
../data/Bibtex/Bibtex_data.txt
../data/Bibtex/bibtex_trSplit.txt
../data/Bibtex/bibtex_tstSplit.txt
Number of splits : 10
## HEADER ##
#Point : 7395 , #Features : 1836 , #Labels : 159


In [5]:
# Directory holding the saved models of the run to evaluate.
load_model_dir="../runs/bibtex_40/"
# Suffix of the saved model file: the file loaded is "model_<model_num>" inside load_model_dir.
# NOTE(review): presumably a checkpoint/epoch counter — confirm against the training script.
model_num=160
# File handed to mydatasets.get_validation_split.
# NOTE(review): presumably stores the train/validation partition — confirm.
val_file_name="../runs/bibtex_datadict.p"

In [6]:
# Derive training and validation arrays from the loaded training data.
# NOTE(review): the meaning of the trailing None argument is defined by
# mydatasets.get_validation_split and is not visible here — confirm.
x_trn,y_trn,x_val,y_val=mydatasets.get_validation_split(x_mat,y_mat,val_file_name,None)

In [7]:
# SECURITY: torch.load unpickles the file, which can execute arbitrary code —
# only load checkpoints from a trusted source.
# map_location="cpu" places the model on the CPU regardless of the device it
# was saved from; the cells below feed it CPU tensors and call .numpy().
# NOTE(review): recent PyTorch releases default to weights_only=True, which
# rejects fully pickled models; pass weights_only=False there if needed.
model=torch.load(load_model_dir+"model_"+str(model_num),map_location="cpu")

In [8]:
# Embed the training and validation features with the loaded model.
# Inference only: no_grad() avoids building an autograd graph for these
# forward passes, which saves memory on the full training matrix.
# NOTE(review): if the model contains dropout/batch-norm layers it should be
# switched to eval mode before this cell — confirm how it was saved.
with torch.no_grad():
    emb_trn=model(torch.from_numpy(x_trn.astype('float32'))).detach().numpy()
    emb_val=model(torch.from_numpy(x_val.astype('float32'))).detach().numpy()

### Simple neighbour model

In [9]:
# Sweep the neighbourhood size for the unweighted neighbour model.
# Per-configuration frames are collected in a list and concatenated once:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every call.
metrics_frames=[]
for num_neighbours in [5,10,15,20,50,100,150]:
    nbrs = NearestNeighbors(n_neighbors=num_neighbours, algorithm='ball_tree').fit(emb_trn)
    # The index is fit on emb_trn, so when querying emb_trn each point is
    # among its own neighbours; training numbers are therefore optimistic.
    trn_metrics=utils.compute_mlr_metrics(nbrs,num_neighbours,y_trn,emb_trn,y_trn,"")
    trn_metrics["trn/val"]="trn"
    trn_metrics["num_nbr"]=num_neighbours
    metrics_frames.append(trn_metrics)
    val_metrics=utils.compute_mlr_metrics(nbrs,num_neighbours,y_trn,emb_val,y_val,"")
    val_metrics["trn/val"]="val"
    val_metrics["num_nbr"]=num_neighbours
    metrics_frames.append(val_metrics)
# Original row indices are kept (no ignore_index), as with the former append calls.
metrics_df=pd.concat(metrics_frames)

### Distance weighted neighbour model

In [10]:
def weighted_mlr_metrics(nbrs,num_neighbours,y_trn,emb_tst,y_tst,prefix):
    """Score a distance-weighted nearest-neighbour label predictor.

    For every row of ``emb_tst`` the ``num_neighbours`` nearest training
    points are looked up in ``nbrs``; their label rows from ``y_trn`` are
    weighted by ``exp(-distance)`` and averaged over the neighbours to give
    a per-label score. The average divides by the number of neighbours, not
    by the sum of the weights; this only rescales each row by a positive
    constant and does not change the per-sample ordering of labels.

    Parameters
    ----------
    nbrs : fitted neighbour index exposing ``kneighbors`` (the notebook
        passes a fitted ``sklearn.neighbors.NearestNeighbors``).
    num_neighbours : number of neighbours to use per query point.
    y_trn : label matrix of the points ``nbrs`` was fitted on; indexed by
        the neighbour indices, so it must be row-aligned with that data.
    emb_tst : embeddings of the points to score, one row per point.
    y_tst : ground-truth label matrix for ``emb_tst``.
    prefix : string prepended to every metric column name.

    Returns
    -------
    pandas.DataFrame with a single row (index 0) and the columns
    ``p@1``, ``p@3``, ``p@5``, ``ranking_loss``, ``coverage_error`` and
    ``avg_prec_score``, each prefixed with ``prefix``.
    """
    # Request num_neighbours explicitly so the result does not silently depend
    # on nbrs having been constructed with the same n_neighbors value.
    nbr_distances, nbr_indices = nbrs.kneighbors(emb_tst, n_neighbors=num_neighbours)
    # Closer neighbours get larger weights; a distance of 0 gives weight 1.
    weights=np.exp(-nbr_distances)
    # Broadcast the (points, neighbours) weights over the label axis.
    y_nbr_weighted=y_trn[nbr_indices,:]*weights[:,:,np.newaxis]
    assert(y_nbr_weighted.shape==(emb_tst.shape[0],num_neighbours,y_tst.shape[1]))
    y_pred=np.mean(y_nbr_weighted,axis=1)
    metrics_df=pd.DataFrame(index=[0])
    for k in (1,3,5):
        metrics_df.loc[0,prefix+"p@"+str(k)]=utils.precision_at_k(y_tst,y_pred,k)
    metrics_df.loc[0,prefix+"ranking_loss"]=skmet.label_ranking_loss(y_tst,y_pred)
    metrics_df.loc[0,prefix+"coverage_error"]=skmet.coverage_error(y_tst,y_pred)
    metrics_df.loc[0,prefix+"avg_prec_score"]=skmet.label_ranking_average_precision_score(y_tst,y_pred)
    return metrics_df

In [11]:
# Sweep the neighbourhood size for the distance-weighted neighbour model.
# Per-configuration frames are collected in a list and concatenated once:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every call.
wmetrics_frames=[]
for num_neighbours in [5,10,15,20,50,100,150]:
    nbrs = NearestNeighbors(n_neighbors=num_neighbours, algorithm='ball_tree').fit(emb_trn)
    # The index is fit on emb_trn, so when querying emb_trn each point is
    # among its own neighbours; training numbers are therefore optimistic.
    trn_metrics=weighted_mlr_metrics(nbrs,num_neighbours,y_trn,emb_trn,y_trn,"")
    trn_metrics["trn/val"]="trn"
    trn_metrics["num_nbr"]=num_neighbours
    wmetrics_frames.append(trn_metrics)
    val_metrics=weighted_mlr_metrics(nbrs,num_neighbours,y_trn,emb_val,y_val,"")
    val_metrics["trn/val"]="val"
    val_metrics["num_nbr"]=num_neighbours
    wmetrics_frames.append(val_metrics)
# Original row indices are kept (no ignore_index), as with the former append calls.
wmetrics_df=pd.concat(wmetrics_frames)

In [12]:
# Show the validation rows of the unweighted table first, then of the
# distance-weighted table, so the two can be compared side by side.
for results_table in (metrics_df,wmetrics_df):
    is_validation_row=results_table["trn/val"]=="val"
    display(results_table[is_validation_row])

Unnamed: 0,p@1,p@3,p@5,ranking_loss,coverage_error,avg_prec_score,trn/val,num_nbr
0,0.506148,0.331967,0.238934,0.333764,82.616803,0.429254,val,5
0,0.526639,0.338798,0.25082,0.275829,72.040984,0.464776,val,10
0,0.526639,0.344262,0.24959,0.243644,64.97541,0.477141,val,15
0,0.536885,0.343579,0.254098,0.217999,58.944672,0.485629,val,20
0,0.54918,0.338115,0.251639,0.14074,39.571721,0.501189,val,50
0,0.534836,0.336749,0.247131,0.113617,31.67418,0.495317,val,100
0,0.5,0.325137,0.244672,0.102957,28.784836,0.478201,val,150


Unnamed: 0,p@1,p@3,p@5,ranking_loss,coverage_error,avg_prec_score,trn/val,num_nbr
0,0.508197,0.331967,0.239754,0.328794,81.942623,0.444755,val,5
0,0.528689,0.340164,0.25,0.269791,71.102459,0.482935,val,10
0,0.530738,0.347678,0.24877,0.236934,63.854508,0.491434,val,15
0,0.543033,0.343579,0.25082,0.210423,57.584016,0.499,val,20
0,0.553279,0.339481,0.252459,0.130939,37.495902,0.511364,val,50
0,0.534836,0.337432,0.245902,0.105031,29.717213,0.499905,val,100
0,0.508197,0.32582,0.244672,0.095503,26.997951,0.485763,val,150


**Clearly a distance weighted scheme is beneficial.** 

This reflects favourably on the model: similarity and distance in the embedding space are correctly related, which is the objective the model was trained with.