In [None]:
%matplotlib inline
import pandas as pd
from IPython.display import display
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import math
import os

from datasets import get_dataset, bruteforce
from distance import jaccard, l2
from lsh import LSHBuilder, LSH

# Global display / plotting configuration for the whole notebook.
sns.set_style("whitegrid")
pd.set_option("display.max_rows", 2000)

# Math text uses the 'cm' font set; the regular / italic / bold math
# faces are mapped to Bitstream Vera Sans.
plt.rcParams.update({
    'mathtext.fontset': 'cm',
    'mathtext.rm': 'Bitstream Vera Sans',
    'mathtext.it': 'Bitstream Vera Sans:italic',
    'mathtext.bf': 'Bitstream Vera Sans:bold',
})

# Disabled alternatives, kept for reference:
#plt.rcParams["figure.figsize"] =(20,10)
#plt.rcParams['font.family'] = 'cmu serif'
#sns.set_context('paper', font_scale=5)

# Experiment results; the plotting cells below read the columns
# dataset, k, L, tvd and method from this frame.
df = pd.read_csv("exp.csv")

In [None]:
# One figure per (dataset, k) pair: column "tvd" against "L", one line per method.
# Values are iterated in sorted order so the figures come out in the same
# order on every run; iterating a bare set of strings does not guarantee that.
for ds in sorted(set(df.dataset)):
    ds_df = df[df.dataset == ds]
    for k in sorted(set(ds_df.k)):
        # Explicit figure/axes instead of relying on pyplot's implicit current axes.
        fig, ax = plt.subplots()
        sns.lineplot(data=ds_df[ds_df.k == k], x="L", y="tvd", hue="method", ax=ax)
        ax.set_title(f"{ds}, k={k}")
        plt.show()

In [None]:
# One figure per (dataset, L) pair: column "tvd" against "k", one line per method.
# Values are iterated in sorted order so the figures come out in the same
# order on every run; iterating a bare set of strings does not guarantee that.
for ds in sorted(set(df.dataset)):
    ds_df = df[df.dataset == ds]
    for L in sorted(set(ds_df.L)):
        # Explicit figure/axes instead of relying on pyplot's implicit current axes.
        fig, ax = plt.subplots()
        sns.lineplot(data=ds_df[ds_df.L == L], x="k", y="tvd", hue="method", ax=ax)
        ax.set_title(f"{ds}, L={L}")
        plt.show()

# Discussion of the $b_{cr} / b_r$ term on real-world datasets

In [None]:
def ratio_plot(ds_name, distance_fn, rs, cs):
    """Draw bar charts of the ratio b_cr / b_r for every r in `rs` and c in `cs`.

    For each query, the points returned by `bruteforce` are compared against
    two thresholds, `r` and `r * c`:

    * with `l2`, a point is counted when its value is ``<=`` the threshold;
    * with `jaccard`, a point is counted when its value is ``>=`` the threshold.

    ``b_r`` and ``b_cr`` are those counts summed over all queries. The plot
    shows ``b_cr / b_r`` (rounded to two decimals) with `c` on the x-axis and
    one facet per value of `r`.

    Parameters
    ----------
    ds_name : name passed to `get_dataset`.
    distance_fn : either `l2` or `jaccard`.
    rs : iterable of base thresholds r.
    cs : iterable of factors c; the second threshold is ``r * c``.

    Raises
    ------
    ValueError
        If `distance_fn` is neither `l2` nor `jaccard`.

    Notes
    -----
    If ``b_r`` is zero for some r, the ratio is inf/NaN rather than an error.
    """
    import operator

    # Pick the comparison up front so an unsupported function fails with a
    # clear message instead of an unbound-variable error inside the loops.
    if distance_fn == l2:
        in_ball = operator.le
    elif distance_fn == jaccard:
        in_ball = operator.ge
    else:
        raise ValueError(
            f"ratio_plot supports only l2 and jaccard, got {distance_fn!r}")

    data, queries, _, _ = get_dataset(ds_name)
    # groundtruth[i] is iterated as indices into `data` for query i.
    groundtruth = bruteforce(data, queries, distance_fn)

    # The values depend only on the query, not on r or c: compute them once
    # instead of once per (r, c) combination.
    values_per_query = [
        [distance_fn(data[p], queries[i]) for p in groundtruth[i]]
        for i in range(len(groundtruth))
    ]

    def count_in_ball(threshold):
        return sum(1 for values in values_per_query
                   for value in values if in_ball(value, threshold))

    # Collect plain records and build the frame once; growing a DataFrame
    # row by row is quadratic and DataFrame.append no longer exists in pandas 2.
    records = []
    for r in rs:
        b_r = count_in_ball(r)
        for c in cs:
            records.append({'r': r, 'c': c, 'br': b_r, 'bcr': count_in_ball(r * c)})

    ball_df = pd.DataFrame(records, columns=['r', 'c', 'br', 'bcr'])
    ball_grouped_df = ball_df.groupby(['r', 'c'])[['br', 'bcr']].sum().reset_index()
    ball_grouped_df['rel'] = ball_grouped_df['bcr'] / ball_grouped_df['br']

    ratio_label = '$b_{cr} / b_r$'
    plot_df = ball_grouped_df[['r', 'c', 'rel']].round(2).rename(columns={'rel': ratio_label})
    sns.catplot(data=plot_df, x='c', y=ratio_label, kind='bar', col='r')

In [None]:
# Ratio plot for the "lastfm" dataset using the jaccard measure.
ratio_plot(
    ds_name="lastfm",
    distance_fn=jaccard,
    rs=(0.15, 0.2, 0.25),
    cs=(2/3, 1/2, 1/3, 1/4, 1/5),
)

In [None]:
# Ratio plot for the "movielens" dataset using the jaccard measure.
ratio_plot(
    ds_name="movielens",
    distance_fn=jaccard,
    rs=(0.15, 0.2, 0.25),
    cs=(2/3, 1/2, 1/3, 1/4, 1/5),
)

In [None]:
# Ratio plot for the "sift-128-euclidean" dataset using the l2 distance.
ratio_plot(
    ds_name="sift-128-euclidean",
    distance_fn=l2,
    rs=(250, 300, 325, 350),
    cs=(1.25, 1.5, 2, 3),
)

In [None]:
## TODO: missing glove, mnist

# Discussion: Is the approximate neighborhood unfair?

In [None]:
# Run the experiment only if its result file does not exist yet, then plot
# the sampling probability (count / runs) per similarity value.
# `os`, `pd`, `sns` and `plt` come from the import cell at the top of the notebook.
if not os.path.isfile("approx.csv"):
    # Imported lazily: only needed when approx.csv has to be generated.
    import approx_exp
    approx_exp.run("approx.csv")

# NOTE: this rebinds `df`, which the earlier cells bound to the contents of
# exp.csv; re-run the loading cell before re-running those plots.
df = pd.read_csv("approx.csv")
# Hard-coded similarities, one per row: this raises unless approx.csv has
# exactly three rows. Presumably in the order the experiment writes them -- TODO confirm.
df['sim'] = [0.9, 0.6, 0.5]
# Column access by name; `df.count` would resolve to the DataFrame method.
df['prob'] = df['count'] / df['runs']

ax = sns.barplot(data=df.rename(columns={'sim': 'Similarity', 'prob' : 'Sampling Prob.'}), x='Similarity', y='Sampling Prob.')
# Hand-placed labels on the three bars. Raw strings keep the backslash in
# \mathbf literal instead of relying on an invalid escape sequence.
ax.text(-0.1, 0.005, r"$\mathbf{X}$", horizontalalignment='left', size='medium', color='white', weight='semibold')
ax.text(1.9, 0.015, r"$\mathbf{Z}$", horizontalalignment='left', size='medium', color='white', weight='semibold')
ax.text(0.9, 0.002, r"$\mathbf{Y}$", horizontalalignment='left', size='medium', color='black', weight='semibold')
plt.tight_layout()


