In [1]:
from collections import defaultdict

import pandas as pd
import numpy as np
from tqdm import tqdm
# Collaborative filtering with the Surprise library (SVD matrix factorisation)
from surprise import Reader, Dataset, SVD, accuracy
from surprise.model_selection import GridSearchCV

In [2]:
# Load the interaction log and order it by time, then split positionally:
# the earliest 80000 rows are the training part, everything after is the test part.
data = pd.read_csv('dataset.csv').sort_values(['timestamp'])
X_train, X_test = data.iloc[:80000], data.iloc[80000:]

In [3]:
# Wrap both splits in Surprise Dataset objects.
# Each dataset is built from the (user_id, item_id, rating) columns, in that order,
# together with a Reader declaring the 1..5 rating scale.
rating_columns = ['user_id', 'item_id', 'rating']
reader = Reader(rating_scale=(1, 5))
train_Data = Dataset.load_from_df(X_train[rating_columns], reader)
test_Data = Dataset.load_from_df(X_test[rating_columns], reader)

In [4]:
def average_precision(actual, recommended, k=30):
    """Score the first ``k`` recommendations against the relevant items.

    Walks the top-``k`` positions of ``recommended``; every position holding an
    item found in ``actual`` contributes "hits so far / position". The sum of
    those contributions is divided by ``k`` (not by the number of hits).

    Args:
        actual: container of relevant item ids; only ``in`` is used on it.
        recommended: ordered sequence of recommended item ids. It may be
            shorter than ``k``; missing positions simply never count as hits.
            Entries equal to ``None`` are skipped.
        k: number of leading positions to inspect.

    Returns:
        The accumulated precision divided by ``k``.

    Raises:
        ZeroDivisionError: if ``k`` is 0.
    """
    precision_total = 0
    hit_count = 0
    # zip stops at the shorter of "k positions" and the recommendation list.
    for position, candidate in zip(range(1, k + 1), recommended):
        if candidate is None or candidate not in actual:
            continue
        hit_count += 1
        precision_total += hit_count / position
    return precision_total / k

def normalized_average_precision(actual, recommended, k=30):
    """Average precision of ``recommended`` relative to the best achievable one.

    Args:
        actual: iterable of relevant item ids; duplicates are collapsed.
        recommended: ordered sequence of recommended item ids.
        k: cut-off forwarded to ``average_precision``.

    Returns:
        ``0.0`` when there are no relevant items. Otherwise the score of
        ``recommended`` divided by the score of a ranking made only of
        relevant items (at most ``k`` of them).
    """
    relevant = set(actual)
    if not relevant:
        return 0.0

    # Reference score: a list consisting purely of relevant items.
    best_possible = average_precision(relevant, list(relevant)[:k], k=k)
    achieved = average_precision(relevant, recommended, k=k)
    return achieved / best_possible

In [5]:
# Hyper-parameter search for SVD: every (n_epochs, lr_all) combination is scored
# by RMSE with 5-fold cross-validation on the training dataset.

# The fold assignment and the SVD initialisation are random. Seeding the global
# NumPy RNG (as the Surprise FAQ recommends) makes a fresh "Restart & Run All"
# reproduce the same best parameters and the same downstream score.
np.random.seed(0)

# NOTE(review): np.arange with a float step can include or exclude the 0.01
# endpoint depending on rounding -- confirm the intended lr_all grid.
param_grid = {'n_epochs': np.arange(5, 20, 5), 'lr_all': np.arange(0.001, 0.01, 0.003)}
grid_search = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=5)
grid_search.fit(train_Data)

In [6]:
# Take the algorithm instance that the grid search ranked best on RMSE.
best_by_measure = grid_search.best_estimator
algo = best_by_measure['rmse']

In [7]:
# Fit the selected algorithm on the complete training data.
# The Trainset gets its own name so `train_Data` stays a Dataset: rebinding it
# made this cell fail on a second run, because the rebound object no longer
# has `.build_full_trainset()`.
trainset = train_Data.build_full_trainset()
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x24b2dcca610>

In [8]:
# The test set is a list of (user id, item id, actual rating) triples, one per
# row of the test dataframe, in its column order.
# itertuples keeps each column's own dtype. Building a Series per row with
# .iloc[i] is slow and upcasts integer ids to float whenever ratings are float.
testset = list(test_Data.df.itertuples(index=False, name=None))

In [9]:
# Estimate a rating for every (user, item) pair of the test set.
preds = algo.test(testset, verbose=False)

In [13]:
def get_top_n(preds, n=10):
    """Group predictions by user and keep each user's ``n`` best-scored items.

    Args:
        preds: iterable of 5-element records
            ``(user id, item id, true rating, estimated rating, details)``.
        n: maximum number of items kept per user.

    Returns:
        A ``defaultdict(list)`` mapping each user id to a list of
        ``[item id, estimated rating]`` pairs, ordered by estimated rating
        from highest to lowest and truncated to ``n`` entries. Items with
        equal estimates keep their original relative order.
    """
    per_user = defaultdict(list)
    for prediction in preds:
        user_id, item_id, _true_rating, estimate, _details = prediction
        per_user[user_id].append([item_id, estimate])

    # Only existing keys are reassigned, so iterating the dict here is safe.
    for user_id in per_user:
        ranked = sorted(per_user[user_id], key=lambda pair: pair[1], reverse=True)
        per_user[user_id] = ranked[:n]

    return per_user

In [14]:
# Keep the 5 highest-estimated test items for every user.
TOP_N = 5
top_n = get_top_n(preds, n=TOP_N)

In [18]:
# Mean normalized average precision (default cut-off k=30) over the users that
# appear in the test split.
# NOTE(review): `top_n` is built from predictions on the test pairs themselves,
# so every recommended item is already in the user's `actual` list. The score
# then mostly reflects how many test items each user has rather than ranking
# quality -- consider ranking candidate items the user has not interacted with.

# One pass over the frame instead of a boolean filter per user. sort=False keeps
# the users in order of first appearance, matching X_test['user_id'].unique().
actual_by_user = X_test.groupby('user_id', sort=False)['item_id'].agg(list)

scores = []
for user, actual in tqdm(actual_by_user.items(), total=len(actual_by_user)):
    # A plain list keeps the item ids untouched. np.array(...)[:, 0] forced ids
    # and estimates into one dtype and raised IndexError for an empty list.
    recommended = [item_id for item_id, _ in top_n[user]]
    scores.append(normalized_average_precision(actual, recommended))

np.mean(scores)

100%|██████████████████████████████████████████████████████████████████████████████| 301/301 [00:00<00:00, 1251.25it/s]


0.3268460228358353