In [1]:
%load_ext autoreload
%autoreload 2

import numpy as np
import sys
import pandas as pd

import run_utils

sys.path.append('../') 
import reclab

from reclab.recommenders import SLIM, EASE
from reclab.recommenders import KNNRecommender
from reclab.recommenders import LibFM
from reclab.recommenders import Llorma
from reclab import data_utils

sys.path.append('../tests') 
import utils
import collections


  import tqdm.autonotebook
  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [40]:
# helper functions for metrics

def compute_PREC_REC_MAP_NDCG_MRR(N, users, recs, test_ratings):
    assert recs.shape[1] >= N
    metrics = ['PREC', 'REC', 'MAP', 'NDCG', 'MRR']
    res = {key:[] for key in metrics}
    test_rating_matrix = np.array(list(test_ratings.keys()))
    for user_id, rec in zip(users, recs):
        test_matrix = test_rating_matrix[test_rating_matrix[:,0]==user_id,1]
        prec, recall, ncdg = precision_recall_ndcg_at_k(N, rec[:N], test_matrix)
        MAP, mrr, ncdg = map_mrr_ndcg(rec[:N], test_matrix)
        res['PREC'].append(prec)
        res['REC'].append(recall)
        res['NDCG'].append(ncdg)
        res['MAP'].append(MAP)
        res['MRR'].append(mrr)
    return {key:np.mean(res[key]) for key in metrics}

## From "A troubling analysis"... 
### https://github.com/MaurizioFD/RecSys2019_DeepLearning_Evaluation/blob/861eafeaba2943458adec22469b147ec492784b6/Conferences/IJCAI/NeuRec_github/eval.py

def precision_recall_ndcg_at_k(k, rankedlist, test_matrix):
    idcg_k = 0
    dcg_k = 0
    n_k = k if len(test_matrix) > k else len(test_matrix)
    if n_k == 0:
        return 0, 0, 0
    for i in range(n_k):
        idcg_k += 1 / np.log2(i + 2)

    b1 = rankedlist
    b2 = test_matrix
    s2 = set(b2)
    hits = [(idx, val) for idx, val in enumerate(b1) if val in s2]
    count = len(hits)

    for c in range(count):
        dcg_k += 1 / np.log2(hits[c][0] + 2)

    return float(count / k), float(count / len(test_matrix)), float(dcg_k / idcg_k)


def map_mrr_ndcg(rankedlist, test_matrix):
    ap = 0
    map = 0
    dcg = 0
    idcg = 0
    mrr = 0
    if len(test_matrix) == 0:
        return 0, 0, 0
    for i in range(len(test_matrix)):
        idcg += 1 / np.log2(i + 2)

    b1 = rankedlist
    b2 = test_matrix
    s2 = set(b2)
    hits = [(idx, val) for idx, val in enumerate(b1) if val in s2]
    count = len(hits)

    for c in range(count):
        ap += (c + 1) / (hits[c][0] + 1)
        dcg += 1 / np.log2(hits[c][0] + 2)

    if count != 0:
        mrr = 1 / (hits[0][0] + 1)

    if count != 0:
        map = ap / count

    return map, mrr, float(dcg / idcg)

In [41]:
def precision_at_k(r, k):
    """Score is precision @ k
    Relevance is binary (nonzero is relevant).
    Returns:
        Precision @ k
    Raises:
        ValueError: len(r) must be >= k
    """
    assert k >= 1
    r = np.asarray(r)[:k]
    return np.mean(r)



def recall_at_k(r, k, all_pos_num):
    r = np.asfarray(r)[:k]
    return np.sum(r) / all_pos_num

def average_precision(r,cut):
    """Score is average precision (area under PR curve)
    Relevance is binary (nonzero is relevant).
    Returns:
        Average precision
    """
    r = np.asarray(r)
    out = [precision_at_k(r, k + 1) for k in range(cut) if r[k]]
    if not out:
        return 0.
    return np.sum(out)/float(min(cut, np.sum(r)))

def test_one_user(recs, training_items, user_pos_test):
    if len(user_pos_test) == 0:
        return None
    r = []
    for i in recs:
        if i in user_pos_test:
            r.append(1)
        else:
            r.append(0)
    
    recall_20 = recall_at_k(r, 20, len(user_pos_test))
    recall_40 = recall_at_k(r, 40, len(user_pos_test))
    recall_60 = recall_at_k(r, 60, len(user_pos_test))
    recall_80 = recall_at_k(r, 80, len(user_pos_test))
    recall_100 = recall_at_k(r, 100, len(user_pos_test))

    ap_20 = average_precision(r,20)
    ap_40 = average_precision(r, 40)
    ap_60 = average_precision(r, 60)
    ap_80 = average_precision(r, 80)
    ap_100 = average_precision(r, 100)


    return np.array([recall_20,recall_40,recall_60,recall_80,recall_100, ap_20,ap_40,ap_60,ap_80,ap_100])


def test(recs, train_ratings, test_ratings, users):
    train_items = collections.defaultdict(list)
    for uid, iid in train_ratings:
        train_items[uid].append(iid)

    test_items = collections.defaultdict(list) 
    for uid, iid in test_ratings:
        test_items[uid].append(iid)
    result = np.array([0.] * 10)
    tot_num = 0
    for user_id, rec in zip(users, recs):
        res = test_one_user(rec, train_items[user_id], test_items[user_id])
        if res is not None:
            result += res
            tot_num += 1

    ret = result / tot_num
    return list(ret)

# SLIM

In "A troubling analysis" (https://arxiv.org/pdf/1911.07698.pdf) Table 12, SLIM achieves the following results on ML 1M.


| PREC@5   | REC@5   | MAP@5   | NDCG@5   | MRR@5   | PREC@10   | REC@10   | MAP@10   | NDCG@10   |  MRR@10 |
|------:|------:|------:|------:|------:|------:|------:|------:|------:|------:|
| 0.4437 |  0.1106 |  0.3692 |  0.1749 |  0.6578 | 0.3813 |  0.1770 |  0.3003 |  0.2321 |  0.667 |


In this paper, the dataset is converted into a implicit dataset, so ratings are either 1 or 0. Evaulation was performed by averaging over five different 80/20 train/test splits. (We will just look at a single split below).

 The [hyperparameters](https://github.com/MaurizioFD/RecSys2019_DeepLearning_Evaluation/blob/861eafeaba2943458adec22469b147ec492784b6/DL_Evaluation_TOIS_Additional_material.pdf) are set as `l1_ratio=1.89e-5` and `alpha=0.049`.
 

In [73]:
users, items, ratings = data_utils.read_dataset('ml-1m')

In [74]:
for key in ratings.keys():
    ratings[key] = (1, ratings[key][1])

In [75]:
all_contexts = collections.OrderedDict([(user_id, np.zeros(0)) for user_id in users])

In [76]:
train_ratings, test_ratings = data_utils.split_ratings(ratings, 0.8, shuffle=True, seed=0)

In [77]:
recommender = SLIM(alpha=0.049, l1_ratio=1.89e-5, seed=0)

In [78]:
recommender.reset(users, items, train_ratings)

In [79]:
recs, _ = recommender.recommend(all_contexts, 10)

In [83]:
for N in [5, 10]:
    res = compute_PREC_REC_MAP_NDCG_MRR(N, users, recs, test_ratings)
    print('@{}:'.format(N), res)

@5: {'PREC': 0.3241721854304636, 'REC': 0.06738210300791221, 'MAP': 0.43651995952906547, 'NDCG': 0.10395269380895251, 'MRR': 0.45423013245033117}
@10: {'PREC': 0.3814238410596027, 'REC': 0.17422638615796424, 'MAP': 0.43883045470150284, 'NDCG': 0.18934677631282193, 'MRR': 0.47939562966466936}


# EASE

In "A troubling analysis" (https://arxiv.org/pdf/1911.07698.pdf), EASE achieves the following results on ML 1M



| PREC@5   | REC@5   | MAP@5   | NDCG@5   | MRR@5   | PREC@10   | REC@10   | MAP@10   | NDCG@10   |  MRR@10 |
|------:|------:|------:|------:|------:|------:|------:|------:|------:|------:|
| 0.4360  | 0.1073  | 0.3608  | 0.1697  | 0.6475 | 0.3745  | 0.1731  | 0.2923  | 0.2259  | 0.65| 
 
In this paper, the dataset is converted into a implicit dataset, so ratings are either 1 or 0. Evaulation was performed by averaging over five different 80/20 train/test splits. (We will just look at a single split below).

The [hyperparameters](https://github.com/MaurizioFD/RecSys2019_DeepLearning_Evaluation/blob/861eafeaba2943458adec22469b147ec492784b6/DL_Evaluation_TOIS_Additional_material.pdf) are set as `lam=1.25e3`


In [84]:
users, items, ratings = data_utils.read_dataset('ml-1m')
for key in ratings.keys():
    ratings[key] = (1, ratings[key][1])
all_contexts = collections.OrderedDict([(user_id, np.zeros(0)) for user_id in users])
train_ratings, test_ratings = data_utils.split_ratings(ratings, 0.8, shuffle=True, seed=0)

In [85]:
recommender = EASE(lam=1.25e3)

In [86]:
recommender.reset(users, items, train_ratings)

  self._set_arrayXarray(i, j, x)


In [87]:
recs, _ = recommender.recommend(all_contexts, 10)

In [88]:
for N in [5, 10]:
    res = compute_PREC_REC_MAP_NDCG_MRR(N, users, recs, test_ratings)
    print('@{}:'.format(N), res)

@5: {'PREC': 0.32466887417218543, 'REC': 0.06821147096556184, 'MAP': 0.4329348325974981, 'NDCG': 0.10444392654073191, 'MRR': 0.4466197571743929}
@10: {'PREC': 0.38415562913907286, 'REC': 0.17628998806236673, 'MAP': 0.4373985546772671, 'NDCG': 0.19097229149913342, 'MRR': 0.4719510801009146}


## UserKNN cosine

In the Surprise repo changing the benchmarking script (https://github.com/NicolasHug/Surprise/blob/master/examples/benchmark.py) on KNNWithMeans to use cosine similarity leads to an RMSE of 0.942 on MovieLens 1M.


The [hyperparameters](https://surprise.readthedocs.io/en/stable/knn_inspired.html#surprise.prediction_algorithms.knns.KNNWithMeans) are set as `topK=40`, `shrink=0`


In [85]:
users, items, ratings = data_utils.read_dataset('ml-1m')
all_contexts = collections.OrderedDict([(user_id, np.zeros(0)) for user_id in users])
train_ratings, test_ratings = data_utils.split_ratings(ratings, 0.8, shuffle=True, seed=0)

Using TensorFlow backend.
[autoreload of reclab.recommenders.top_pop failed: Traceback (most recent call last):
  File "/home/ec2-user/anaconda3/lib/python3.7/site-packages/IPython/extensions/autoreload.py", line 245, in check
    superreload(m, reload, self.old_objects)
  File "/home/ec2-user/anaconda3/lib/python3.7/site-packages/IPython/extensions/autoreload.py", line 410, in superreload
    update_generic(old_obj, new_obj)
  File "/home/ec2-user/anaconda3/lib/python3.7/site-packages/IPython/extensions/autoreload.py", line 347, in update_generic
    update(a, b)
  File "/home/ec2-user/anaconda3/lib/python3.7/site-packages/IPython/extensions/autoreload.py", line 302, in update_class
    if update_generic(old_obj, new_obj): continue
  File "/home/ec2-user/anaconda3/lib/python3.7/site-packages/IPython/extensions/autoreload.py", line 347, in update_generic
    update(a, b)
  File "/home/ec2-user/anaconda3/lib/python3.7/site-packages/IPython/extensions/autoreload.py", line 266, in update_

In [86]:
recommender = KNNRecommender(shrinkage=0, neighborhood_size=40, user_based=True, use_means=True, use_content=False)

In [87]:
recommender.reset(users, items, train_ratings)

In [88]:
t = recommender.dense_predictions
t = [(uid, iid, np.zeros(0)) for uid, iid in test_ratings]
preds = recommender.predict(t)

In [89]:
tot = 0.0
num = 0.0
for (uid, iid, _), pred in zip(t, preds):
    tot += (test_ratings[uid, iid][0] - pred) **2
    num += 1
print("RMSE is", np.sqrt(tot / num))

RMSE is 0.9458989545474523


## ItemKNN cosine
In the Surprise repo changing the benchmarking script (https://github.com/NicolasHug/Surprise/blob/master/examples/benchmark.py) on KNNWithMeans to use cosine similarity leads to an RMSE of 0.993 on MovieLens 1M.


The [hyperparameters](https://surprise.readthedocs.io/en/stable/knn_inspired.html#surprise.prediction_algorithms.knns.KNNWithMeans) are set as `topK=40`, `shrink=0`


In [None]:
users, items, ratings = data_utils.read_dataset('ml-1m')
all_contexts = collections.OrderedDict([(user_id, np.zeros(0)) for user_id in users])
train_ratings, test_ratings = data_utils.split_ratings(ratings, 0.8, shuffle=True, seed=0)

In [5]:
recommender = KNNRecommender(shrinkage=0, neighborhood_size=40, user_based=False, use_means=True, use_content=False)

In [6]:
recommender.reset(users, items, train_ratings)

In [7]:
t = recommender.dense_predictions
t = [(uid, iid, np.zeros(0)) for uid, iid in test_ratings]
preds = recommender.predict(t)

In [8]:
tot = 0.0
num = 0.0
for (uid, iid, _), pred in zip(t, preds):
    tot += (test_ratings[uid, iid][0] - pred) **2
    num += 1
print("RMSE is", np.sqrt(tot / num))

RMSE is 0.8874714464182688


## LibFM MCMC
In the baselines paper (https://arxiv.org/pdf/1905.01395.pdf) they report an RMSE of around 0.765 on MovieLens 10M.


The [hyperparameters](https://arxiv.org/pdf/1905.01395.pdf) (Section 5.2) are set as `num_two_way_factors=128`, `init_stdev=0.1`, `num_iter=100`.

In [None]:
users, items, ratings = data_utils.read_dataset('ml-10m')
all_contexts = collections.OrderedDict([(user_id, np.zeros(0)) for user_id in users])
train_ratings, test_ratings = data_utils.split_ratings(ratings, 0.9, shuffle=True, seed=0)

In [6]:
recommender = LibFM(num_user_features=0,
                    num_item_features=0,
                    num_rating_features=0,
                    max_num_users=len(users),
                    max_num_items=len(items),
                    num_two_way_factors=128,
                    init_stdev=0.1,
                    num_iter=100,
                    method='mcmc')

In [7]:
recommender.reset(users, items, train_ratings)

In [8]:
t = [(uid, iid, np.zeros(0)) for uid, iid in test_ratings]
preds = recommender.predict(t)

In [9]:
tot = 0.0
num = 0.0
for (uid, iid, _), pred in zip(t, preds):
    tot += (test_ratings[uid, iid][0] - pred) **2
    num += 1
print("RMSE is", np.sqrt(tot / num))

RMSE is 0.7659974180103403


## Global LLORMA

LLORMA (Local Low-Rank Matrix Approximation) http://jmlr.org/papers/volume17/14-301/14-301.pdf is Matrix Factorization ispired method that fits many weighted-MF models and averages over them. Our implementation is based on
https://github.com/JoonyoungYi/LLORMA-tensorflow implementation.

The hyper-parameter values are
    
    N_ANCHOR = 10

    PRE_RANK = 5
    PRE_LEARNING_RATE = 2e-4
    PRE_LAMBDA = 10

    RANK = 10
    LEARNING_RATE = 1e-2
    LAMBDA = 1e-3
    BATCH_SIZE = 128
    
They report a performance of 0.930 on ML-100K, we achieve a performance of 0.940
(evaluating on ML-1M or ML-10M is prohibitively slow) 
The original paper for a larger mode (rank=20, n_anchors=100) achieves a loss of 0.899, we get 0.927

In [3]:
users, items, ratings = data_utils.read_dataset('ml-100k')
all_contexts = collections.OrderedDict([(user_id, np.zeros(0)) for user_id in users])
train_ratings, test_ratings = data_utils.split_ratings(ratings, 0.8, shuffle=True, seed=0)

In [94]:
params = {'n_anchor': 10, 
          'pre_rank': 5, 
          'pre_learning_rate': 2e-4,
          'pre_lambda_val': 10,
          'pre_train_steps': 100,
          'rank': 10,
          'learning_rate': 1e-2,
          'lambda_val': 1e-3,
          'batch_size': 128,
          'train_steps': 50,
          'use_cache': True}

recommender = Llorma(len(users), len(items), **params)

In [None]:
recommender.reset(users, items, train_ratings)

In [99]:
t = [(uid, iid, np.zeros(0)) for uid, iid in test_ratings]
preds = recommender.predict(t)
tot = 0.0
num = 0.0
for (uid, iid, _), pred in zip(t, preds):
    tot += (test_ratings[uid, iid][0] - pred) **2
    num += 1
print("RMSE is", np.sqrt(tot / num))

Low: 2.998, Mean: 3.517, High: 3.974
RMSE is 0.9406582762937047


  predict_k[np.isnan(predict_k)] = 0


In [4]:
# Model with 100 local models
params = {'n_anchor': 100, 
          'pre_rank': 5, 
          'pre_learning_rate': 2e-4,
          'pre_lambda_val': 10,
          'pre_train_steps': 100,
          'rank': 20,
          'learning_rate': 1e-2,
          'lambda_val': 1e-3,
          'batch_size': 1000,
          'train_steps': 100,
          'use_cache': True}

recommender = Llorma(len(users), len(items), **params)
recommender.reset(users, items, train_ratings)



Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where





Pre-train step: 0, train_error:1.1149220382440692
Pre-train step: 1, train_error:1.040608468333719
Pre-train step: 2, train_error:1.02886344730357
Pre-train step: 3, train_error:1.0261757078346831
Pre-train step: 4, train_error:1.0203726659858003
Pre-train step: 5, train_error:1.0144814043905237
Pre-train step: 6, train_error:1.0055757959777942
Pre-train step: 7, train_error:1.0035250895520307
Pre-train step: 8, train_error:1.0073046965451715
Pre-train step: 9, train_error:1.003998059966839
Pre-train step: 10, train_error:0.9978783185620222
Pre-train step: 11, train_error:0.9993548044112733
Pre-train step: 12, train_error:1.0020340215033854
Pre-train step: 13, train_error:0.9983674234896629
Pre-train step: 14, train_error:0.9942409839462831
Pre-train step: 15, train_error:0.9945044801009275
Pre-train step: 16, train_error:0.9945314167835009
Pre-train step: 17, train_error:0.991106429034

INFO:tensorflow:Restoring parameters from results/model-6.ckpt
Train step:7
Train step:7, train error: 0.9850480174413682, test error: 0.9850480174413682
INFO:tensorflow:Restoring parameters from results/model-7.ckpt
Train step:8
Train step:8, train error: 0.9791037480869652, test error: 0.9791037480869652
INFO:tensorflow:Restoring parameters from results/model-8.ckpt
Train step:9
Train step:9, train error: 0.9739520627014345, test error: 0.9739520627014345
INFO:tensorflow:Restoring parameters from results/model-9.ckpt
Train step:10
Train step:10, train error: 0.9694326977659745, test error: 0.9694326977659745
INFO:tensorflow:Restoring parameters from results/model-10.ckpt
Train step:11
Train step:11, train error: 0.9654254628305714, test error: 0.9654254628305714
INFO:tensorflow:Restoring parameters from results/model-11.ckpt
Train step:12
Train step:12, train error: 0.9618386906289095, test error: 0.9618386906289095
INFO:tensorflow:Restoring parameters from results/model-12.ckpt
Trai

Train step:59
Train step:59, train error: 0.906805570491514, test error: 0.906805570491514
INFO:tensorflow:Restoring parameters from results/model-59.ckpt
Train step:60
Train step:60, train error: 0.9062884603846499, test error: 0.9062884603846499
INFO:tensorflow:Restoring parameters from results/model-60.ckpt
Train step:61
Train step:61, train error: 0.9057796259117463, test error: 0.9057796259117463
INFO:tensorflow:Restoring parameters from results/model-61.ckpt
Train step:62
Train step:62, train error: 0.9052787252349825, test error: 0.9052787252349825
INFO:tensorflow:Restoring parameters from results/model-62.ckpt
Train step:63
Train step:63, train error: 0.9047854354656772, test error: 0.9047854354656772
INFO:tensorflow:Restoring parameters from results/model-63.ckpt
Train step:64
Train step:64, train error: 0.9042994514235809, test error: 0.9042994514235809
INFO:tensorflow:Restoring parameters from results/model-64.ckpt
Train step:65
Train step:65, train error: 0.903820484491288,

In [5]:
t = [(uid, iid, np.zeros(0)) for uid, iid in test_ratings]
preds = recommender.predict(t)
tot = 0.0
num = 0.0
for (uid, iid, _), pred in zip(t, preds):
    tot += (test_ratings[uid, iid][0] - pred) **2
    num += 1
print("RMSE is", np.sqrt(tot / num))

Low: 3.032, Mean: 3.489, High: 3.902
RMSE is 0.9274755234532681
