In [16]:
from collections import defaultdict
import pandas as pd

from surprise.model_selection import KFold, cross_validate

from surprise import KNNBasic

from surprise import SVD

from surprise.accuracy import rmse
from surprise import accuracy
from surprise.model_selection import train_test_split
from surprise.model_selection import GridSearchCV


from model.data_interface.data import *

In [4]:
data = load_data(data_path="../data/dicoding_user_item_rating.gzip")
print(data.head())

data_input = data.loc[:, ["user_id", "course_id", "rating"]]
surprise_input = reader_data(data=data_input,
                             cols=["user_id", "course_id", "rating"],
                             scale=True,
                             model="surprise")

Index(['user_id', 'course_id', 'graduated_at', 'rating'], dtype='object')
   user_id  course_id        graduated_at  rating
0   623699         14 2020-03-10 11:45:50       5
1   406371         14 2020-03-10 08:44:09       4
2     1946         14 2020-03-11 13:24:10       5
3   186713         14 2020-03-11 17:36:04       4
4   462580         14 2020-03-11 08:43:36       4


In [8]:
def get_top_n(predictions, n=10):
    """Return the top-N recommendation for each user from a set of predictions.

    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm.
        n(int): The number of recommendation to output for each user. Default
            is 10.

    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] of size n.
    """

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

In [19]:
def precision_recall_at_k(predictions, k=10, threshold=4.5):
    """Return precision and recall at k metrics for each user"""

    # First map the predictions to each user.
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    precisions = dict()
    recalls = dict()
    for uid, user_ratings in user_est_true.items():

        # Sort user ratings by estimated value
        user_ratings.sort(key=lambda x: x[0], reverse=True)

        # Number of relevant items
        n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)

        # Number of recommended items in top k
        n_rec_k = sum((est >= threshold) for (est, _) in user_ratings[:k])

        # Number of relevant and recommended items in top k
        n_rel_and_rec_k = sum(
            ((true_r >= threshold) and (est >= threshold))
            for (est, true_r) in user_ratings[:k]
        )

        # Precision@K: Proportion of recommended items that are relevant
        # When n_rec_k is 0, Precision is undefined. We here set it to 0.

        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 0

        # Recall@K: Proportion of relevant items that are recommended
        # When n_rel is 0, Recall is undefined. We here set it to 0.

        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 0

    return precisions, recalls

In [20]:
trainset = surprise_input.build_full_trainset()

testset  = trainset.build_anti_testset()

In [21]:
algo = SVD()
algo.fit(trainset)

# Than predict ratings for all pairs (u, i) that are NOT in the training set.

predictions = algo.test(testset)

top_n = get_top_n(predictions, n=10)

In [22]:
# Print the recommended items for each user
for uid, user_ratings in top_n.items():
    print(uid, [iid for (iid, _) in user_ratings])

623699 [413, 615, 510, 209, 555, 403, 423, 302, 570, 159]
406371 [276, 418, 510, 428, 570, 251, 159, 237, 271, 387]
1946 [423, 445, 171, 605, 555, 32, 480, 271, 159, 219]
186713 [418, 202, 332, 297, 120, 292, 60, 251, 271, 237]
462580 [153, 423, 418, 615, 605, 171, 199, 123, 271, 237]
275777 [319, 555, 510, 209, 271, 199, 413, 605, 445, 60]
486772 [153, 202, 428, 445, 480, 555, 615, 171, 251, 413]
150018 [153, 159, 276, 313, 555, 605, 418, 387, 177, 570]
438721 [615, 332, 159, 123, 32, 60, 302, 413, 297, 605]
530845 [32, 615, 145, 153, 209, 387, 276, 433, 302, 332]
263610 [32, 615, 153, 195, 418, 302, 123, 510, 423, 387]
79913 [510, 195, 555, 153, 413, 418, 199, 445, 271, 159]
15181 [159, 209, 271, 276, 423, 510, 195, 145, 480, 413]
471465 [153, 209, 403, 413, 555, 605, 302, 418, 159, 445]
331970 [153, 209, 276, 387, 445, 480, 555, 123, 605, 271]
447463 [133, 297, 413, 445, 615, 423, 418, 292, 403, 387]
578624 [428, 510, 605, 153, 403, 615, 319, 313, 433, 302]
749524 [555, 413, 615, 41

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [23]:
kf = KFold(n_splits=5)
algo = SVD()

data = surprise_input

for trainset, testset in kf.split(data):
    algo.fit(trainset)
    predictions = algo.test(testset)
    precisions, recalls = precision_recall_at_k(predictions, k=5, threshold=4)

    # Precision and recall can then be averaged over all users
    print(sum(prec for prec in precisions.values()) / len(precisions))
    print(sum(rec for rec in recalls.values()) / len(recalls))

0.9871571362157141
0.9887248804594789
0.9872517072001918
0.9885182949313777
0.9871565213911233
0.9884892187227401
0.9875221871511294
0.9890356506504738
0.9877909222597776
0.9892581458859827
