In [1]:
import pandas as pd
import numpy as np
import pyarrow 
import os
import sys
import subprocess
import dask.dataframe as dd
import time
import datetime
from annoy import AnnoyIndex
import time

In [2]:
rank = 1
n_trees = [10, 20]

In [3]:
items = "asl-matrix_i.parquet"
users = "asl-matrix_u.parquet"
train_gt = "asl-train_gt_small.parquet"
val_gt = "asl-val_gt_small.parquet"
test_gt = "asl-test_gt.parquet"
items_dd = dd.read_parquet(items, engine = 'pyarrow')
users_dd = dd.read_parquet(users, engine='pyarrow')
train_gt_dd = dd.read_parquet(train_gt, engine='pyarrow')
val_gt_dd = dd.read_parquet(val_gt, engine='pyarrow')
test_gt_dd = dd.read_parquet(test_gt, engine='pyarrow')
print('read success')
items = items_dd.compute()
users = users_dd.compute()
train_gt = train_gt_dd.compute()
val_gt = val_gt_dd.compute()
test_gt = test_gt_dd.compute()

read success


In [4]:
def average_precision(rec, gt):
    '''
    rec : recommendation list for one user
    gt : ground truth for one user
    '''
    score = 0.0
    num_hits = 0.0

    for i,item in enumerate(rec):
        if item in gt:
            num_hits += 1.0
            score += num_hits / (i+1.0)
    
    return score / num_hits if num_hits != 0.0 else 0.0

def evaluation(rec, gt):
    '''
    rec : dataframe
    gt : dataframe
    '''
    df = pd.merge(rec, gt, how='inner', on='user_id')
    score = [average_precision(df['pred'][i], df['gt'][i]) for i in range(len(df))]
    return np.array(score).mean()

In [6]:
for tree in n_trees:
    for k in [tree * 100, tree * 100 * 5]:
        pred = {}
        f = rank  # Length of item vector that will be indexed
        t = AnnoyIndex(f, 'dot')
        for index, row in items.iterrows():
            t.add_item(row['id'], row['features'])
        
        s = time.time()
        t.build(tree) # 10 trees
        t.save('test.ann')
        u = AnnoyIndex(f, 'dot')
        u.load('test.ann') # super fast, will just mmap the file
        for index, row in users.iterrows():
            pred[row['id']] = u.get_nns_by_vector(row['features'], 100, search_k=k, include_distances=False)
        e = time.time()
        print("time: {}".format(e-s))
        
        pred_df = pd.DataFrame(list(pred.items()), columns=['user_id', 'pred'])
        train_map = evaluation(pred_df,train_gt) 
        val_map = evaluation(pred_df,val_gt)
        test_map = evaluation(pred_df,test_gt)
        print("n_tree: {} search_k: {}".format(tree, k))
        print("train map: {} val map: {} test map: {}".format(round(train_map,6), round(val_map,6), round(test_map,6)))

time: 16.583955764770508
n_tree: 10 search_k: 1000
train map: 0.000767 val map: 0.000496 test map: 0.004759
time: 27.227941036224365
n_tree: 10 search_k: 5000
train map: 0.002366 val map: 0.001513 test map: 0.004627
time: 31.41845417022705
n_tree: 20 search_k: 2000
train map: 0.001256 val map: 0.000664 test map: 0.001274
time: 50.62485313415527
n_tree: 20 search_k: 10000
train map: 0.001496 val map: 0.001037 test map: 0.000822
