In [1]:
import numpy as np
import pandas as pd 

import scipy.sparse as sp

from tqdm import tqdm_notebook as tqdm

from time import time

from annoy import AnnoyIndex

In [2]:
def ap_at_n(predictions, actuals, n=20):
    """Average precision over the top-``n`` ranked predictions of one example.

    Parameters
    ----------
    predictions : numpy.ndarray
        1-D array of scores; higher means ranked earlier.
    actuals : numpy.ndarray
        1-D array aligned with ``predictions``; entries ``> 0`` count as
        positives.
    n : int, optional
        Number of top-ranked entries to evaluate (default 20).

    Returns
    -------
    float
        Sum over positive hits in the top ``n`` of ``precision@rank``,
        divided by ``min(number_of_positives, n)``. Returns ``0.0`` when
        there are no positives or when ``n <= 0``.
    """
    if n <= 0:
        # Nothing to rank; also avoids dividing by min(numpos, 0) == 0.
        return 0.0

    numpos = (actuals > 0).sum()
    if numpos == 0:
        return 0.0

    sortidx = (-predictions).argsort()
    delta_recall = 1.0 / min(numpos, n)

    ap = 0.0
    poscount = 0.0

    # Slicing (instead of range(n)) keeps this safe when fewer than n
    # predictions are available.
    for rank, item in enumerate(sortidx[:n], start=1):
        if actuals[item] > 0:
            poscount += 1
            ap += poscount / rank * delta_recall
    return ap

def gap(pred, actual):
    """Mean of ``ap_at_n(..., n=20)`` over paired rows of ``pred``/``actual``.

    ``pred`` must support ``len()`` and both arguments must support integer
    indexing. An empty ``pred`` raises ``ZeroDivisionError``.
    """
    n_rows = len(pred)
    per_row = [ap_at_n(pred[k], actual[k], n=20) for k in range(n_rows)]
    # Start the sum at 0.0 so accumulation matches a float running total.
    return sum(per_row, 0.0) / n_rows

def gap_sparse(pred, actual):
    """Mean of ``ap_at_n(..., n=20)`` over rows stored as sparse matrices.

    Parameters
    ----------
    pred : scipy sparse matrix or sequence of one-row sparse matrices
        Predicted scores, one row per example.
    actual : scipy sparse matrix or sequence of one-row sparse matrices
        Ground-truth labels aligned with ``pred``.

    Returns
    -------
    float
        Average of the per-row ``ap_at_n`` values. An empty ``pred`` raises
        ``ZeroDivisionError``.
    """
    # scipy sparse matrices refuse len() ("length is ambiguous"), so take the
    # row count from shape[0] for them and fall back to len() for lists.
    n_rows = pred.shape[0] if sp.issparse(pred) else len(pred)

    total = 0.0
    for i in range(n_rows):
        # Densify one row at a time to keep memory bounded.
        p = pred[i].toarray().reshape(-1)
        a = actual[i].toarray().reshape(-1)
        total += ap_at_n(p, a, n=20)
    return total / n_rows

In [3]:
# Load the three dense arrays used below: two training splits and the
# validation set (presumably one feature row per example -- confirm).
X_0_train, X_1_train, X_val = (
    np.load(path)
    for path in ('X_train_0.npy', 'X_train_1.npy', 'X_val.npy')
)

In [4]:
# from http://stackoverflow.com/a/8980156/861423

def save_sparse_csr(filename, array):
    """Store the CSR components of ``array`` in an ``.npz`` archive.

    The archive holds four entries -- ``data``, ``indices``, ``indptr`` and
    ``shape`` -- taken from the attributes of the same name on ``array``.
    """
    components = {
        'data': array.data,
        'indices': array.indices,
        'indptr': array.indptr,
        'shape': array.shape,
    }
    np.savez(filename, **components)

def load_sparse_csr(filename):
    """Rebuild a CSR matrix from an ``.npz`` archive written by ``save_sparse_csr``.

    The archive must contain the entries ``data``, ``indices``, ``indptr``
    and ``shape``.

    Returns
    -------
    scipy.sparse.csr_matrix
    """
    # np.load on an .npz returns an NpzFile that keeps the archive open;
    # the context manager closes it once the arrays have been read.
    with np.load(filename) as loader:
        shape = tuple(int(dim) for dim in loader['shape'])
        return sp.csr_matrix(
            (loader['data'], loader['indices'], loader['indptr']),
            shape=shape,
        )

In [5]:
# Sparse label matrices matching X_0_train, X_1_train and X_val
# (pairing inferred from the names -- confirm against the data prep step).
y_0, y_1, y_val = (
    load_sparse_csr(path)
    for path in ('y_0.npz', 'y_1.npz', 'y_val.npz')
)

In [None]:
# Annoy index over the first training split. Vectors passed to add_item must
# have the length given here (1152). The metric is spelled out because
# dist_to_cos() in this notebook converts distances with 1 - d^2/2, which is
# only valid for Annoy's 'angular' distance; relying on the implicit default
# is deprecated in newer Annoy releases.
index_0 = AnnoyIndex(1152, metric='angular')

for i in tqdm(range(len(X_0_train))):
    # Item ids are row positions, so they can be used to index y_0 later.
    index_0.add_item(i, X_0_train[i])




KeyboardInterrupt: 

In [None]:
# Build the forest for index_0 with 50 trees and report wall-clock time.
t0 = time()

index_0.build(50)

# '%.3fs' prints seconds with three decimals; the previous '%.3s' treated the
# number as a string and cut it to its first three characters.
print('building index took %.3fs' % (time() - t0))

In [11]:
# Write the built index to disk; a later cell reloads it from this file.
index_0.save('index_0.ann')

True

In [6]:
# Reload the saved index instead of rebuilding it. The dimension and metric
# must match the ones the index was built with; 'angular' is stated
# explicitly because dist_to_cos() assumes it and the implicit default is
# deprecated in newer Annoy releases.
index_0 = AnnoyIndex(1152, metric='angular')
index_0.load('index_0.ann')

True

In [9]:
# Annoy index over the second training split. Vectors passed to add_item must
# have the length given here (1152). The metric is spelled out because
# dist_to_cos() in this notebook converts distances with 1 - d^2/2, which is
# only valid for Annoy's 'angular' distance; relying on the implicit default
# is deprecated in newer Annoy releases.
index_1 = AnnoyIndex(1152, metric='angular')

for i in tqdm(range(len(X_1_train))):
    # Item ids are row positions, so they can be used to index y_1 later.
    index_1.add_item(i, X_1_train[i])




In [10]:
# Build the forest for index_1 with 50 trees and report how long it took.
t0 = time()
index_1.build(50)
elapsed = time() - t0

print('building index took %.3fs' % elapsed)

building index took 788


In [12]:
# Write the built index to disk; a later cell reloads it from this file.
index_1.save('index_1.ann')

True

In [7]:
# Reload the saved index instead of rebuilding it. The dimension and metric
# must match the ones the index was built with; 'angular' is stated
# explicitly because dist_to_cos() assumes it and the implicit default is
# deprecated in newer Annoy releases.
index_1 = AnnoyIndex(1152, metric='angular')
index_1.load('index_1.ann')

True

In [8]:
def dist_to_cos(d):
    """Map a distance ``d`` to ``1 - d**2 / 2``.

    For unit vectors the Euclidean distance satisfies ``d**2 = 2 - 2*cos``,
    so this recovers the cosine similarity. Works element-wise on arrays.
    Presumably fed Annoy 'angular' distances -- confirm the index metric.
    """
    half_square = 0.5 * d * d
    return 1 - half_square

def knn_pred(y, idx, dst):
    """Similarity-weighted sum of the neighbours' label rows.

    Parameters
    ----------
    y : scipy sparse matrix
        Label matrix; rows are selected with ``idx``.
    idx : sequence of int
        Row positions of the neighbours in ``y``.
    dst : numpy.ndarray
        Distance to each neighbour, aligned with ``idx``; converted to a
        weight through ``dist_to_cos``.

    Returns
    -------
    numpy.ndarray
        1-D ``float32`` array with one score per column of ``y``.
    """
    # Column vector of weights so each neighbour row is scaled by its own weight.
    weights = dist_to_cos(dst).reshape(-1, 1)
    weighted_rows = y[idx].multiply(weights)

    column_totals = weighted_rows.sum(axis=0)
    flat = np.asarray(column_totals).reshape(-1)
    return flat.astype('float32')

In [9]:
# Quick check on the first 5000 rows of split 1: predict each row's labels
# from its 50 approximate nearest neighbours in split 0.
knn_1 = []

for i in tqdm(range(5000)):
    neighbours, distances = index_0.get_nns_by_vector(
        X_1_train[i], n=50, include_distances=True
    )
    scores = knn_pred(y_0, neighbours, np.array(distances))

    # Keep each prediction as a one-row sparse matrix.
    knn_1.append(sp.csr_matrix(scores))




In [10]:
# Mean average precision of the sample predictions against the first
# 5000 label rows of split 1.
gap_sparse(knn_1, y_1[:5000])

0.66393109762410263

In [None]:
# Full pass over split 1: predict every row's labels from its 50 approximate
# nearest neighbours in split 0. Replaces any earlier contents of knn_1.
knn_1 = []

for i in tqdm(range(len(X_1_train))):
    neighbours, distances = index_0.get_nns_by_vector(
        X_1_train[i], n=50, include_distances=True
    )
    scores = knn_pred(y_0, neighbours, np.array(distances))

    # Keep each prediction as a one-row sparse matrix.
    knn_1.append(sp.csr_matrix(scores))




Exception in thread Thread-5:
Traceback (most recent call last):
  File "/home/agrigorev/anaconda3/lib/python3.5/threading.py", line 914, in _bootstrap_inner
    self.run()
  File "/home/agrigorev/anaconda3/lib/python3.5/site-packages/tqdm/_tqdm.py", line 103, in run
    for instance in self.tqdm_cls._instances:
  File "/home/agrigorev/anaconda3/lib/python3.5/_weakrefset.py", line 60, in __iter__
    for itemref in self.data:
RuntimeError: Set changed size during iteration






In [36]:
# Stack the per-row sparse predictions into a single sparse matrix.
# This rebinds knn_1 from a list to a matrix, so the cell is meant to be
# run once after the list has been (re)built.
knn_1 = sp.vstack(knn_1)

In [None]:
# Mirror of the split-1 pass: predict every row of split 0 from its 50
# approximate nearest neighbours in split 1.
knn_0 = []

for i in tqdm(range(len(X_0_train))):
    neighbours, distances = index_1.get_nns_by_vector(
        X_0_train[i], n=50, include_distances=True
    )
    scores = knn_pred(y_1, neighbours, np.array(distances))

    # Keep each prediction as a one-row sparse matrix.
    knn_0.append(sp.csr_matrix(scores))

In [35]:
# Stack the per-row sparse predictions into a single sparse matrix.
# This rebinds knn_0 from a list to a matrix, so the cell is meant to be
# run once after the list has been (re)built.
knn_0 = sp.vstack(knn_0)

In [None]:
# Validation predictions: query both training-split indices for each row and
# average the two similarity-weighted label scores.
knn_val = []

for i in tqdm(range(len(X_val))):
    query = X_val[i]

    nn_0, dist_0 = index_0.get_nns_by_vector(query, n=50, include_distances=True)
    scores_0 = knn_pred(y_0, nn_0, np.array(dist_0))

    nn_1, dist_1 = index_1.get_nns_by_vector(query, n=50, include_distances=True)
    scores_1 = knn_pred(y_1, nn_1, np.array(dist_1))

    blended = (scores_0 + scores_1) / 2

    # Keep each prediction as a one-row sparse matrix.
    knn_val.append(sp.csr_matrix(blended))

In [37]:
# Stack the per-row sparse predictions into a single sparse matrix.
# This rebinds knn_val from a list to a matrix, so the cell is meant to be
# run once after the list has been (re)built.
knn_val = sp.vstack(knn_val)

In [25]:
import gc
# Force a garbage-collection pass; the displayed value is the return of
# gc.collect().
gc.collect()

0

In [26]:
# Mean average precision on the first 5000 validation rows.
# NOTE(review): in file order knn_val has already been stacked into a single
# sparse matrix by this point, so gap_sparse has to accept a sparse matrix
# (not only a list of rows) as its first argument here.
gap_sparse(knn_val[:5000], y_val[:5000])

0.67894650999682049

In [27]:
import pickle

In [28]:
# Load the pickled test ids.
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only use it on a file that comes from a trusted source.
with open('test_ids.bin', 'rb') as f:
    test_ids = pickle.load(f)


In [30]:
# Test-set array; its rows are the queries for the test kNN predictions below.
X_test = np.load('X_test.npy')

In [32]:
# Test predictions: query both training-split indices for each test row and
# average the two similarity-weighted label scores.
knn_test = []

for i in tqdm(range(len(X_test))):
    # Query with the TEST row. This previously read X_val[i], which produced
    # predictions for validation rows (and could index out of range when the
    # test set is larger than the validation set).
    row = X_test[i]

    idx, dst = index_0.get_nns_by_vector(row, n=50, include_distances=True)
    pred_0 = knn_pred(y_0, idx, np.array(dst))

    idx, dst = index_1.get_nns_by_vector(row, n=50, include_distances=True)
    pred_1 = knn_pred(y_1, idx, np.array(dst))

    pred = (pred_0 + pred_1) / 2
    # Keep each prediction as a one-row sparse matrix.
    pred = sp.csr_matrix(pred)

    knn_test.append(pred)




In [38]:
# Stack the per-row sparse predictions into a single sparse matrix.
# This rebinds knn_test from a list to a matrix, so the cell is meant to be
# run once after the list has been (re)built.
knn_test = sp.vstack(knn_test)

In [41]:
# Persist all four kNN prediction matrices, in the same order as before.
for out_name, matrix in (
    ('knn_pred_0.npz', knn_0),
    ('knn_pred_1.npz', knn_1),
    ('knn_pred_val.npz', knn_val),
    ('knn_pred_test.npz', knn_test),
):
    save_sparse_csr(out_name, matrix)

In [31]:
def prepare_pred_row(prow):
    """Format the 20 highest-scoring entries of ``prow`` as one string.

    ``prow`` is a 1-D numpy array of scores. The result is a space-separated
    sequence of ``"<index> <score>"`` pairs, highest score first, with scores
    printed to three decimals.
    """
    top_classes = (-prow).argsort()[:20]

    pieces = []
    for cls in top_classes:
        pieces.append('%d %0.3f' % (cls, prow[cls]))
    return ' '.join(pieces)