In [5]:
%load_ext autoreload
%autoreload 2

import numpy as np
import sys
import pandas as pd

import run_utils

sys.path.append('../') 
import reclab

from reclab.recommenders import SLIM, EASE
from reclab.recommenders import KNNRecommender
from reclab.recommenders.cfnade import Cfnade
from reclab import data_utils

sys.path.append('../tests') 
import utils
import collections


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# helper functions for metrics

def compute_PREC_REC_MAP_NDCG_MRR(N, users, recs, test_ratings):
    """Average top-N ranking metrics over all users.

    Args:
        N: Cutoff; only the first N entries of each recommendation list are scored.
        users: Iterable of user ids, in the same order as the rows of ``recs``.
        recs: 2-D array with one row of recommended item ids per user and at
            least N columns.
        test_ratings: Mapping of held-out ratings keyed by ``(user_id, item_id)``.
            Only the keys are used.
    Returns:
        Dict with the mean of 'PREC', 'REC', 'MAP', 'NDCG' and 'MRR' over ``users``.
        Users without held-out items contribute 0 to every metric.
    """
    assert recs.shape[1] >= N
    metrics = ['PREC', 'REC', 'MAP', 'NDCG', 'MRR']
    res = {key: [] for key in metrics}

    # Group the held-out items by user once, instead of masking the full key
    # array for every user. This also copes with an empty ``test_ratings``.
    test_items = {}
    for uid, iid in test_ratings.keys():
        test_items.setdefault(uid, []).append(iid)

    for user_id, rec in zip(users, recs):
        held_out = test_items.get(user_id, [])
        top_n = rec[:N]
        prec, recall, _ = precision_recall_ndcg_at_k(N, top_n, held_out)
        # NOTE(review): the reported NDCG is the one from map_mrr_ndcg (ideal DCG
        # taken over all held-out items); the value returned by
        # precision_recall_ndcg_at_k has always been discarded here. Confirm
        # this is the intended normalisation.
        MAP, mrr, ndcg = map_mrr_ndcg(top_n, held_out)
        res['PREC'].append(prec)
        res['REC'].append(recall)
        res['NDCG'].append(ndcg)
        res['MAP'].append(MAP)
        res['MRR'].append(mrr)
    return {key: np.mean(res[key]) for key in metrics}

## From "A troubling analysis"... 
### https://github.com/MaurizioFD/RecSys2019_DeepLearning_Evaluation/blob/861eafeaba2943458adec22469b147ec492784b6/Conferences/IJCAI/NeuRec_github/eval.py

def precision_recall_ndcg_at_k(k, rankedlist, test_matrix):
    """Precision@k, recall@k and NDCG@k for a single user's ranked list.

    Args:
        k: Cutoff. Only the first k entries of ``rankedlist`` are scored.
        rankedlist: Recommended item ids, best first.
        test_matrix: Collection of the user's held-out (relevant) item ids.
    Returns:
        Tuple ``(precision, recall, ndcg)`` of floats. All three are 0 when
        there are no held-out items or ``k`` is not positive.
    """
    n_relevant = len(test_matrix)
    n_k = min(k, n_relevant)
    if n_k <= 0:
        return 0.0, 0.0, 0.0

    # Ideal DCG: all of the first n_k positions are hits.
    idcg_k = sum(1 / np.log2(i + 2) for i in range(n_k))

    relevant = set(test_matrix)
    # Ignore anything ranked beyond position k so that a caller passing a longer
    # list cannot push precision or NDCG above 1.
    hit_positions = [idx for idx, val in enumerate(rankedlist)
                     if idx < k and val in relevant]
    count = len(hit_positions)

    dcg_k = sum(1 / np.log2(pos + 2) for pos in hit_positions)

    return float(count / k), float(count / n_relevant), float(dcg_k / idcg_k)


def map_mrr_ndcg(rankedlist, test_matrix):
    """Average precision, reciprocal rank and NDCG for one user's ranked list.

    The ideal DCG is taken over *all* held-out items, not over the length of
    ``rankedlist``.

    Args:
        rankedlist: Recommended item ids, best first.
        test_matrix: Collection of the user's held-out (relevant) item ids.
    Returns:
        Tuple ``(average_precision, reciprocal_rank, ndcg)``. All three are 0
        when there are no held-out items.
    """
    n_relevant = len(test_matrix)
    if n_relevant == 0:
        return 0, 0, 0

    relevant = set(test_matrix)
    # Zero-based positions in the ranking that hold a relevant item.
    hit_ranks = [pos for pos, item in enumerate(rankedlist) if item in relevant]

    idcg = sum(1 / np.log2(i + 2) for i in range(n_relevant))
    dcg = sum(1 / np.log2(pos + 2) for pos in hit_ranks)

    if hit_ranks:
        # Precision at each hit: (hits so far) / (one-based rank of the hit).
        precision_sum = sum((seen + 1) / (pos + 1) for seen, pos in enumerate(hit_ranks))
        avg_precision = precision_sum / len(hit_ranks)
        reciprocal_rank = 1 / (hit_ranks[0] + 1)
    else:
        avg_precision = 0
        reciprocal_rank = 0

    return avg_precision, reciprocal_rank, float(dcg / idcg)

In [3]:
def precision_at_k(r, k):
    """Score is precision @ k
    Relevance is binary (nonzero is relevant).
    Args:
        r: Relevance scores in rank order (first element is the top-ranked item).
        k: Number of leading results to consider; must be >= 1.
    Returns:
        Precision @ k
    Raises:
        ValueError: len(r) must be >= k
    """
    assert k >= 1
    # Binarise so that graded relevance values cannot inflate the mean.
    r = np.asarray(r)[:k] != 0
    if r.size != k:
        raise ValueError('Relevance score length < k')
    return np.mean(r)



def recall_at_k(r, k, all_pos_num):
    """Score is recall @ k
    Args:
        r: Relevance scores in rank order (1 for a hit, 0 otherwise).
        k: Number of leading results to consider.
        all_pos_num: Total number of relevant items; the caller must ensure it
            is non-zero.
    Returns:
        Sum of the first k relevance scores divided by ``all_pos_num``.
    """
    # np.asfarray was removed in NumPy 2.0; asarray with dtype=float is the
    # documented replacement and behaves the same on older versions.
    r = np.asarray(r, dtype=float)[:k]
    return np.sum(r) / all_pos_num

def average_precision(r,cut):
    """Score is average precision (area under PR curve)
    Relevance is binary (nonzero is relevant).
    Args:
        r: Relevance scores in rank order.
        cut: Only the first ``cut`` ranks are considered.
    Returns:
        Average precision: the precision at each relevant rank below ``cut``,
        summed and divided by ``min(cut, sum(r))``. 0 when no relevant item is
        ranked before ``cut``.
    """
    r = np.asarray(r)
    # A list shorter than ``cut`` simply has no results at the missing ranks;
    # stop at its end instead of indexing past it.
    last_rank = min(cut, len(r))
    out = [precision_at_k(r, k + 1) for k in range(last_rank) if r[k]]
    if not out:
        return 0.
    return np.sum(out)/float(min(cut, np.sum(r)))

def test_one_user(recs, training_items, user_pos_test):
    """Recall and average precision at cutoffs 20/40/60/80/100 for one user.

    Args:
        recs: Recommended item ids for the user, best first.
        training_items: Items the user rated in training (not used here).
        user_pos_test: The user's held-out item ids.
    Returns:
        None when the user has no held-out items; otherwise an array of ten
        values: recall at the five cutoffs followed by average precision at
        the same five cutoffs.
    """
    n_pos = len(user_pos_test)
    if n_pos == 0:
        return None

    # Binary relevance of each recommendation, in rank order.
    relevance = [1 if item in user_pos_test else 0 for item in recs]

    cutoffs = (20, 40, 60, 80, 100)
    recalls = [recall_at_k(relevance, cut, n_pos) for cut in cutoffs]
    avg_precisions = [average_precision(relevance, cut) for cut in cutoffs]

    return np.array(recalls + avg_precisions)


def test(recs, train_ratings, test_ratings, users):
    """Average the per-user scores from test_one_user over all scorable users.

    Args:
        recs: One recommendation list per user, aligned with ``users``.
        train_ratings: Iterable of ``(user_id, item_id)`` pairs used for training.
        test_ratings: Iterable of ``(user_id, item_id)`` held-out pairs.
        users: User ids, in the same order as ``recs``.
    Returns:
        List of ten averaged values, in the order produced by test_one_user.
        Users for which test_one_user returns None are left out of the average.
    """
    def group_by_user(pairs):
        # Collect the item ids of each user into a list.
        grouped = collections.defaultdict(list)
        for uid, iid in pairs:
            grouped[uid].append(iid)
        return grouped

    train_items = group_by_user(train_ratings)
    test_items = group_by_user(test_ratings)

    totals = np.zeros(10)
    n_scored = 0
    for user_id, rec in zip(users, recs):
        scores = test_one_user(rec, train_items[user_id], test_items[user_id])
        if scores is None:
            continue
        totals += scores
        n_scored += 1

    return list(totals / n_scored)

# SLIM

In "A troubling analysis" (https://arxiv.org/pdf/1911.07698.pdf) Table 12, SLIM achieves the following results on ML 1M.


| PREC@5   | REC@5   | MAP@5   | NDCG@5   | MRR@5   | PREC@10   | REC@10   | MAP@10   | NDCG@10   |  MRR@10 |
|------:|------:|------:|------:|------:|------:|------:|------:|------:|------:|
| 0.4437 |  0.1106 |  0.3692 |  0.1749 |  0.6578 | 0.3813 |  0.1770 |  0.3003 |  0.2321 |  0.667 |


In this paper, the dataset is converted into an implicit dataset, so ratings are either 1 or 0. Evaluation was performed by averaging over five different 80/20 train/test splits. (We will just look at a single split below.)

 The [hyperparameters](https://github.com/MaurizioFD/RecSys2019_DeepLearning_Evaluation/blob/861eafeaba2943458adec22469b147ec492784b6/DL_Evaluation_TOIS_Additional_material.pdf) are set as `l1_ratio=1.89e-5` and `alpha=0.049`.
 

In [73]:
users, items, ratings = data_utils.read_dataset('ml-1m')

In [74]:
# Turn the explicit ratings into implicit feedback: the first element of every
# rating tuple becomes 1, the second element is carried over unchanged.
for key in ratings.keys():
    ratings[key] = (1, ratings[key][1])

In [75]:
all_contexts = collections.OrderedDict([(user_id, np.zeros(0)) for user_id in users])

In [76]:
train_ratings, test_ratings = data_utils.split_ratings(ratings, 0.8, shuffle=True, seed=0)

In [77]:
recommender = SLIM(alpha=0.049, l1_ratio=1.89e-5, seed=0)

In [78]:
recommender.reset(users, items, train_ratings)

In [79]:
recs, _ = recommender.recommend(all_contexts, 10)

In [83]:
# Score the same recommendation lists at the two cutoffs shown in the
# reference table (@5 and @10).
for N in [5, 10]:
    res = compute_PREC_REC_MAP_NDCG_MRR(N, users, recs, test_ratings)
    print('@{}:'.format(N), res)

@5: {'PREC': 0.3241721854304636, 'REC': 0.06738210300791221, 'MAP': 0.43651995952906547, 'NDCG': 0.10395269380895251, 'MRR': 0.45423013245033117}
@10: {'PREC': 0.3814238410596027, 'REC': 0.17422638615796424, 'MAP': 0.43883045470150284, 'NDCG': 0.18934677631282193, 'MRR': 0.47939562966466936}


# EASE

In "A troubling analysis" (https://arxiv.org/pdf/1911.07698.pdf), EASE achieves the following results on ML 1M



| PREC@5   | REC@5   | MAP@5   | NDCG@5   | MRR@5   | PREC@10   | REC@10   | MAP@10   | NDCG@10   |  MRR@10 |
|------:|------:|------:|------:|------:|------:|------:|------:|------:|------:|
| 0.4360  | 0.1073  | 0.3608  | 0.1697  | 0.6475 | 0.3745  | 0.1731  | 0.2923  | 0.2259  | 0.65| 
 
In this paper, the dataset is converted into an implicit dataset, so ratings are either 1 or 0. Evaluation was performed by averaging over five different 80/20 train/test splits. (We will just look at a single split below.)

The [hyperparameters](https://github.com/MaurizioFD/RecSys2019_DeepLearning_Evaluation/blob/861eafeaba2943458adec22469b147ec492784b6/DL_Evaluation_TOIS_Additional_material.pdf) are set as `lam=1.25e3`


In [84]:
# Reload the dataset so this section does not depend on state left by the SLIM cells.
users, items, ratings = data_utils.read_dataset('ml-1m')
# Implicit feedback: the first element of every rating tuple becomes 1.
for key in ratings.keys():
    ratings[key] = (1, ratings[key][1])
# Every user gets an empty context vector.
all_contexts = collections.OrderedDict([(user_id, np.zeros(0)) for user_id in users])
# Single shuffled split with a fixed seed (0.8 corresponds to the 80/20 split described above).
train_ratings, test_ratings = data_utils.split_ratings(ratings, 0.8, shuffle=True, seed=0)

In [85]:
recommender = EASE(lam=1.25e3)

In [86]:
recommender.reset(users, items, train_ratings)

  self._set_arrayXarray(i, j, x)


In [87]:
recs, _ = recommender.recommend(all_contexts, 10)

In [88]:
# Score the same recommendation lists at the two cutoffs shown in the
# reference table (@5 and @10).
for N in [5, 10]:
    res = compute_PREC_REC_MAP_NDCG_MRR(N, users, recs, test_ratings)
    print('@{}:'.format(N), res)

@5: {'PREC': 0.32466887417218543, 'REC': 0.06821147096556184, 'MAP': 0.4329348325974981, 'NDCG': 0.10444392654073191, 'MRR': 0.4466197571743929}
@10: {'PREC': 0.38415562913907286, 'REC': 0.17628998806236673, 'MAP': 0.4373985546772671, 'NDCG': 0.19097229149913342, 'MRR': 0.4719510801009146}


## UserKNN cosine

Changing the Surprise benchmarking script (https://github.com/NicolasHug/Surprise/blob/master/examples/benchmark.py) so that KNNWithMeans uses cosine similarity gives an RMSE of 0.942 on MovieLens 1M.


The [hyperparameters](https://surprise.readthedocs.io/en/stable/knn_inspired.html#surprise.prediction_algorithms.knns.KNNWithMeans) (Table 42) are set as `topK=40`, `shrink=0`


In [85]:
# Ratings are used exactly as read_dataset returns them (no binarisation step),
# since this section is evaluated with RMSE.
users, items, ratings = data_utils.read_dataset('ml-1m')
# Every user gets an empty context vector.
all_contexts = collections.OrderedDict([(user_id, np.zeros(0)) for user_id in users])
# Single shuffled split with a fixed seed.
train_ratings, test_ratings = data_utils.split_ratings(ratings, 0.8, shuffle=True, seed=0)

Using TensorFlow backend.
[autoreload of reclab.recommenders.top_pop failed: Traceback (most recent call last):
  File "/home/ec2-user/anaconda3/lib/python3.7/site-packages/IPython/extensions/autoreload.py", line 245, in check
    superreload(m, reload, self.old_objects)
  File "/home/ec2-user/anaconda3/lib/python3.7/site-packages/IPython/extensions/autoreload.py", line 410, in superreload
    update_generic(old_obj, new_obj)
  File "/home/ec2-user/anaconda3/lib/python3.7/site-packages/IPython/extensions/autoreload.py", line 347, in update_generic
    update(a, b)
  File "/home/ec2-user/anaconda3/lib/python3.7/site-packages/IPython/extensions/autoreload.py", line 302, in update_class
    if update_generic(old_obj, new_obj): continue
  File "/home/ec2-user/anaconda3/lib/python3.7/site-packages/IPython/extensions/autoreload.py", line 347, in update_generic
    update(a, b)
  File "/home/ec2-user/anaconda3/lib/python3.7/site-packages/IPython/extensions/autoreload.py", line 266, in update_

In [86]:
recommender = KNNRecommender(shrinkage=0, neighborhood_size=40, user_based=True, use_means=True, use_content=False)

In [87]:
recommender.reset(users, items, train_ratings)

In [88]:
# NOTE(review): the value read here is overwritten on the next line; presumably
# the attribute access is only meant to trigger the dense prediction
# computation before predict() is called — confirm against KNNRecommender.
t = recommender.dense_predictions
# One (user, item, context) triple per held-out rating, with an empty context.
t = [(uid, iid, np.zeros(0)) for uid, iid in test_ratings]
preds = recommender.predict(t)

In [89]:
# Root mean squared error between the held-out rating (first element of the
# rating tuple) and the corresponding prediction.
tot = 0.0
num = 0.0
for (uid, iid, _), pred in zip(t, preds):
    tot += (test_ratings[uid, iid][0] - pred) **2
    num += 1
print("RMSE is", np.sqrt(tot / num))

RMSE is 0.9458989545474523


# CF-NADE

In "A Neural Autoregressive Approach to Collaborative Filtering" (https://arxiv.org/pdf/1605.09477.pdf) Figure 1, CF-NADE (item-based, with the ordinal cost) achieves the following results on ML 1M:

with $\lambda = 1$, CF-NADE RMSE = 0.836$\sim$0.837 


10 percent of the ratings in each of these datasets are randomly selected as the test set,
leaving the remaining 90 percent of the ratings as the training set. Among the ratings in the training set, 5 percent are used as validation set. We use a default rating of 3 for items without training observations. Prediction error is measured by Root Mean Squared Error (RMSE).
The authors report the average RMSE on test set over 5 different splits.

The hyperparameters: 

The configuration of the experiments is as follows. We use
a single hidden layer architecture and the number of hidden units is set to 500. Adam with default parameters ($b_1 = 0.1$, $b_2 = 0.001$ and $\epsilon = 10^{-8}$) is utilized to optimize the cost function in Equation 19. The learning rate is set to 0.001, the weight decay is set to 0.015 and we use the tanh activation function. Batch size was set to 512.
(Hyperparameters from the README of https://github.com/Ian09/CF-NADE:)

    batch_size = 512
    n_iter = 10
    look_ahead = 60
    lr =0.001  # lr in Adam and SGD, decay in Adadelta
    b1 = 0.1 # b1 in Adam, mu in SGD
    b2 = 0.001
    epsilon = 1e-8
    hidden_size_split = 500
    hidden_size = [int(x) for x in hidden_size_split]
    activation_function = tanh
    drop_rate = 0
    weight_decay = 0.02
    Optimizer = Adam
    std = 0
    alpha = 1
    polyak_mu = 0.995

In [3]:
users, items, ratings = data_utils.read_dataset('ml-1m')
# Every user gets an empty context vector.
all_contexts = collections.OrderedDict([(user_id, np.zeros(0)) for user_id in users])
# 0.9 here (instead of the 0.8 used in the other sections) corresponds to the
# 90/10 train/test protocol quoted above. No separate validation set is carved out.
train_ratings, test_ratings = data_utils.split_ratings(ratings, 0.9, shuffle=True, seed=0)

In [4]:
# Dataset dimensions, passed to the Cfnade constructor as num_users / num_items.
users_num = len(users)
items_num = len(items)

In [27]:
# batch_size, hidden_dim and learning_rate follow the settings quoted above.
# NOTE(review): no weight decay is passed here, so whatever Cfnade uses by
# default applies — confirm it matches the reference configuration.
recommender = Cfnade(num_users=users_num, num_items=items_num, 
                     batch_size=512, train_epoch=30, hidden_dim=500, 
                     learning_rate=0.001)


  optimizer=optimizer)


In [28]:
import time
# Start the timer in this cell: previously `start_time` was only defined by a
# later cell, so this cell raised NameError on a fresh kernel (or reported a
# stale interval when run out of order).
start_time = time.time()
recommender.reset(users, items, train_ratings)
print('Reset takes time: ', time.time()-start_time)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Reset takes time:  3886.327176809311


In [29]:
import time
start_time = time.time()
# NOTE(review): the value read here is overwritten on the next line; presumably
# the attribute access is only meant to trigger the dense prediction
# computation so that it is included in the timing — confirm against Cfnade.
t = recommender.dense_predictions
# One (user, item, context) triple per held-out rating, with an empty context.
t = [(uid, iid, np.zeros(0)) for uid, iid in test_ratings]
preds = recommender.predict(t)
print('Dense Prediction takes time: ', time.time()-start_time)

Dense Prediction takes time:  86.38061046600342


In [30]:
# Root mean squared error over the held-out ratings.
tot = 0.0
num = 0.0
for (uid, iid, _), pred in zip(t, preds):
    tot += (test_ratings[uid, iid][0] - pred) **2
    # test_ratings[uid, iid][0] is the held-out rating value; pred is the model's estimate.
    num += 1
print("RMSE is", np.sqrt(tot / num))

RMSE is 1.0955738208372945
