# Using the Surprise library

[Surprise](http://surpriselib.com/) is a Python scikit for building and analyzing recommender systems that deal with explicit rating data. Its name stands for Simple Python RecommendatIon System Engine.

In [None]:
from surprise import KNNBasic
from surprise import SVD
from surprise import Dataset
from collections import defaultdict
from surprise import get_dataset_dir
from surprise.model_selection import train_test_split
import io
import time
import matplotlib.pyplot as plt
%matplotlib inline

Below we will create an SVD model using the 100k dataset from MovieLens. This takes a few seconds to run so be patient!

In [None]:

# First train an SVD algorithm on the movielens dataset.
# The model is fit on the full trainset, i.e. no ratings are held out here.
data = Dataset.load_builtin('ml-100k') # there are a couple of famous Rec System datasets available in this library
trainset = data.build_full_trainset()
algo = SVD()
algo.fit(trainset)

# Then predict ratings for all pairs (u, i) that are NOT in the training set.
# These predictions are what get_top_n ranks later to produce recommendations.
testset = trainset.build_anti_testset() # Return a list of ratings that can be used as a testset
predictions = algo.test(testset)

In [None]:
# Some examples of predictions: print the first five entries of the list
# returned by algo.test as a quick sanity check.
print(predictions[:5])

## Some helper functions

We have built the predictions. Now we can visualize them. We first write these helper functions.

In [None]:
from tmdb_class import TMDB # The class to retrieve the movie poster images

def read_item_names():
    '''Build lookup tables between MovieLens raw item ids and movie names.

    Parses the pipe-separated u.item file of the MovieLens 100-k dataset,
    located under the directory returned by get_dataset_dir(). The first
    field of each line is used as the raw id and the second as the name.

    Returns:
        A pair of dicts: (raw id -> movie name, movie name -> raw id).
        When several lines share a name, the last one read wins in the
        name -> raw id mapping.
    '''

    id_to_title = {}
    title_to_id = {}
    item_path = get_dataset_dir() + '/ml-100k/ml-100k/u.item'

    with io.open(item_path, 'r', encoding='ISO-8859-1') as item_file:
        for record in item_file:
            fields = record.split('|')
            raw_id = fields[0]
            title = fields[1]
            id_to_title[raw_id] = title
            title_to_id[title] = raw_id

    return id_to_title, title_to_id


def get_top_n(predictions, n=10):
    '''Pick the n best-rated items per user out of a list of predictions.

    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm. Each entry must
            unpack into exactly five fields (user id, item id, true rating,
            estimate, details).
        n(int): How many recommendations to keep for each user. Default
            is 10.

    Returns:
    A dict where keys are user (raw) ids and values are lists of at most n
        tuples (raw item id, rating estimation), highest estimate first.
        Items with equal estimates keep their order from `predictions`.
    '''

    # Bucket every (item, estimate) pair under the user it belongs to.
    by_user = defaultdict(list)
    for user, item, _true_rating, estimate, _details in predictions:
        by_user[user].append((item, estimate))

    # Rank each bucket by estimate, best first, and keep only the first n.
    # Only existing keys are reassigned, so iterating the dict here is safe.
    for user in by_user:
        ranked = sorted(by_user[user], key=lambda pair: pair[1], reverse=True)
        by_user[user] = ranked[:n]

    return by_user


# Visualizing the recommendations

For each user, we can see which movies are the top recommendations.

In [None]:
# Rank the predictions per user (get_top_n keeps 10 items per user by default),
# then display the recommendations stored under the raw user id '943'.
top_n = get_top_n(predictions)
top_n['943']

In [None]:
import re
from IPython.display import Image
from IPython.display import display, HTML

top_n = get_top_n(predictions, n=10)

# Read the mappings raw id <-> movie name
rid_to_name, name_to_rid = read_item_names()

# Print the recommended items for user id 1
uid = '1'
user_ratings = top_n[uid]
recommended_items = [iid for (iid, _) in user_ratings]
print(uid, recommended_items)

# Convert ids into names.
item_names = [rid_to_name[rid] for rid in recommended_items]

print(uid, item_names)

# Create the TMDB helper and compile the pattern once, instead of once per
# movie inside the loop.
# NOTE(review): assumes TMDB() keeps no per-lookup state - confirm in tmdb_class.
tmdb = TMDB()
parenthesized = re.compile(r'\([^)]*\)')

for name in item_names:
    print(name)
    # Remove every parenthesized part of the title (this is where the year
    # of the movie sits), then strip the whitespace the removal leaves behind
    # so the lookup is not done with a trailing space.
    clean_name = parenthesized.sub('', name).strip()
    url = tmdb.get_poster_path_by_name(clean_name)
    print(url)
    # Only display a poster when the lookup returned something truthy.
    if url:
        display(Image(url=url))

## Cross-validation using surprise
This package also provides built-in cross-validation, which splits the data into multiple folds for you.

In [None]:
from surprise.model_selection import cross_validate

# Fresh SVD instance with default hyper-parameters.
algo = SVD()

# Run 5-fold cross-validation and print results.
# The trailing semicolon keeps the notebook from echoing the returned value.
cross_validate(algo, data, measures=['RMSE'], cv=5, verbose=True);

### User-based collaborative filtering with surprise!

In [None]:
from surprise import KNNWithMeans

# Similarity settings; later KNN cells reuse this `sim_options` global.
sim_options = dict(
    name='pearson',   # pearson similarity can be seen as mean-centered cosine similarity
    user_based=True,  # user-based CF
)
knn_means = KNNWithMeans(
    k=40,
    min_k=1,
    sim_options=sim_options,
    verbose=False,
)

# Run 5-fold cross-validation and print results.
cross_validate(knn_means, data, measures=['RMSE'], cv=5, verbose=True);

## Precision-Recall @k

In [None]:
def precision_recall_at_k(predictions, k=10, threshold=3.5):
    '''Compute precision@k and recall@k separately for every user.

    An item is "relevant" when its true rating is >= threshold. It is
    "recommended" when it is among the user's k highest estimates and that
    estimate is >= threshold.

    Args:
        predictions: Iterable of 5-field entries
            (user id, item id, true rating, estimate, details).
        k(int): Size of the top-k list per user. Default is 10.
        threshold(float): Rating cut-off for relevant / recommended.
            Default is 3.5.

    Returns:
        A pair of dicts (precisions, recalls) keyed by user id. A user with
        no recommended item gets precision 1; a user with no relevant item
        gets recall 1.
    '''

    # Collect (estimate, true rating) pairs per user.
    by_user = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        by_user[uid].append((est, true_r))

    precisions = {}
    recalls = {}
    for uid, pairs in by_user.items():
        # Highest estimates first; the head of this ranking is the top-k list.
        ranked = sorted(pairs, key=lambda pair: pair[0], reverse=True)
        top_k = ranked[:k]

        # Relevant items over the user's whole list.
        relevant_total = sum((actual >= threshold) for (_, actual) in ranked)

        # Recommended items within the top k.
        recommended = sum((guess >= threshold) for (guess, _) in top_k)

        # Items within the top k that are both relevant and recommended.
        hits = sum(((actual >= threshold) and (guess >= threshold))
                   for (guess, actual) in top_k)

        # Precision@K: share of recommended items that are relevant.
        if recommended != 0:
            precisions[uid] = hits / recommended
        else:
            precisions[uid] = 1

        # Recall@K: share of relevant items that are recommended.
        if relevant_total != 0:
            recalls[uid] = hits / relevant_total
        else:
            recalls[uid] = 1

    return precisions, recalls


In [None]:
# Hold out 20% of the ratings for evaluation and fit SVD on the rest.
trainset, testset = train_test_split(data, test_size=0.2)
algo = SVD()

algo.fit(trainset)
predictions = algo.test(testset)

# Evaluate k = 1..10. k = 0 is skipped on purpose: with an empty top-k list
# precision_recall_at_k falls back to precision 1 for every user, which would
# plot a meaningless first point.
k_values = range(1, 11)
precision = []
recall = []
for k in k_values:
    # A rating counts as relevant when it is >= 3.5.
    precisions, recalls = precision_recall_at_k(predictions, k=k, threshold=3.5)

    # Precision and recall can then be averaged over all users
    precision.append(sum(precisions.values()) / len(precisions))
    recall.append(sum(recalls.values()) / len(recalls))

plt.plot(k_values, recall, 'ro-', label="recall")
plt.plot(k_values, precision, 'go-', label="precision")
plt.title("precision and recall for SVD")
plt.xlabel("k")
plt.legend()
plt.show()

precisions, recalls = precision_recall_at_k(predictions, k=20, threshold=3.5)

print("precision @ 20 for SVD", sum(precisions.values()) / len(precisions))
print("recall @ 20 for SVD", sum(recalls.values()) / len(recalls))


In [None]:
# User-based KNN configured through the `sim_options` dict defined earlier in
# the notebook, fit on the train split already held in `trainset`.
algo = KNNWithMeans(k=40, min_k=1, sim_options=sim_options, verbose=False)

algo.fit(trainset)
predictions = algo.test(testset)

# Evaluate k = 1..10. k = 0 is skipped on purpose: with an empty top-k list
# precision_recall_at_k falls back to precision 1 for every user, which would
# plot a meaningless first point.
k_values = range(1, 11)
precision = []
recall = []
for k in k_values:
    # A rating counts as relevant when it is >= 3.5.
    precisions, recalls = precision_recall_at_k(predictions, k=k, threshold=3.5)

    # Precision and recall can then be averaged over all users
    precision.append(sum(precisions.values()) / len(precisions))
    recall.append(sum(recalls.values()) / len(recalls))


plt.plot(k_values, recall, 'ro-', label="recall")
plt.plot(k_values, precision, 'go-', label="precision")
plt.legend()
plt.title("precision and recall for user-based knn")
plt.xlabel("k")
plt.show()

precisions, recalls = precision_recall_at_k(predictions, k=20, threshold=3.5)

print("precision @ 20 for user-based knn", sum(precisions.values()) / len(precisions))
print("recall @ 20 for user-based knn", sum(recalls.values()) / len(recalls))

## precision-recall curve
We will now observe the area under the precision-recall curve for two methods: SVD and KNN.

In [None]:
from inspect import signature

# Refit a default SVD on the existing train split and score the test split.
algo = SVD()

algo.fit(trainset)
predictions = algo.test(testset)

# One (recall, precision) point per k in 0..19, each averaged over all users.
precision = []
recall = []
for k in range(20):
    precisions, recalls = precision_recall_at_k(predictions, k=k, threshold=3.5)
    precision.append(sum(precisions.values()) / len(precisions))
    recall.append(sum(recalls.values()) / len(recalls))

# In matplotlib < 1.5, plt.fill_between does not have a 'step' argument,
# so only pass it when the installed version accepts it.
if 'step' in signature(plt.fill_between).parameters:
    step_kwargs = {'step': 'post'}
else:
    step_kwargs = {}

plt.step(recall, precision, where='post', color='b', alpha=0.2)
plt.fill_between(recall, precision, color='b', alpha=0.2, **step_kwargs)

plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall curve for SVD');

In [None]:
# User-based KNN (settings come from the `sim_options` global), fit on the
# existing train split and scored on the test split.
algo = KNNWithMeans(k=40, min_k=1, sim_options=sim_options, verbose=False)

algo.fit(trainset)
predictions = algo.test(testset)

# One (recall, precision) point per k in 0..19, each averaged over all users.
precision = []
recall = []
for k in range(20):
    precisions, recalls = precision_recall_at_k(predictions, k=k, threshold=3.5)
    precision.append(sum(precisions.values()) / len(precisions))
    recall.append(sum(recalls.values()) / len(recalls))

# In matplotlib < 1.5, plt.fill_between does not have a 'step' argument,
# so only pass it when the installed version accepts it.
if 'step' in signature(plt.fill_between).parameters:
    step_kwargs = {'step': 'post'}
else:
    step_kwargs = {}

plt.step(recall, precision, where='post', color='b', alpha=0.2)
plt.fill_between(recall, precision, color='b', alpha=0.2, **step_kwargs)

plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall curve for user-based KNN');

## Tuning hyper-parameters with surprise
As we saw, this package has built-in cross-validation. We can use it to tune the hyper-parameters of our recommender system, e.g. the number of neighbours in KNN or the number of factors (i.e. the reduced dimensionality) in SVD.

In [None]:
from surprise.model_selection import GridSearchCV

In [None]:
# Candidate numbers of latent factors to compare.
svd_param_grid = {'n_factors': [50, 100, 200, 300]}

# 5-fold grid search scored by RMSE, with refit enabled and all cores used.
SVD_grid_search = GridSearchCV(
    SVD,
    param_grid=svd_param_grid,
    measures=['RMSE'],
    cv=5,
    refit=True,
    joblib_verbose=2,
    n_jobs=-1,
)
SVD_grid_search.fit(data)

In [None]:
# Report the best hyper-parameters and the best score found by the SVD grid search.
print("best parameter:", SVD_grid_search.best_params)
print("best rmse: ", SVD_grid_search.best_score)
# you can even see the whole cv results
print("\n")
# Last expression of the cell, so the notebook displays it.
SVD_grid_search.cv_results

Now for KNN

In [None]:
# Reminder of the similarity settings used for the KNN models so far.
print(sim_options)

In [None]:
# Candidate neighbourhood sizes; the similarity settings are held fixed
# (each sim_options value is a one-element list).
knn_param_grid = {
    'k': [20, 30, 40, 50],
    'sim_options': {'name': ['pearson'], 'user_based': [True]},
}

# 5-fold grid search scored by RMSE, with refit enabled and all cores used.
KNN_grid_search = GridSearchCV(
    KNNWithMeans,
    param_grid=knn_param_grid,
    measures=['RMSE'],
    cv=5,
    refit=True,
    joblib_verbose=2,
    n_jobs=-1,
)
KNN_grid_search.fit(data)

In [None]:
# Report the best hyper-parameters and the best score found by the KNN grid search.
print("best parameter:", KNN_grid_search.best_params)
print("best rmse: ", KNN_grid_search.best_score)
# you can even see the whole cv results
print("\n")
# Last expression of the cell, so the notebook displays it.
KNN_grid_search.cv_results

Let's save the best SVD model. We will use it later.

In [None]:
# Pick the estimator stored under the 'rmse' key of the SVD grid search.
# NOTE(review): the search was created with refit=True, so this estimator is
# presumably already fitted - confirm against the Surprise GridSearchCV docs.
best_model_svd = SVD_grid_search.best_estimator['rmse']

In [None]:
import pickle

# Serialize the selected SVD model into the current working directory.
file_name = "best_model_svd"
# The context manager closes (and thereby flushes) the file even if pickling
# fails; the previous bare open() left the handle open.
with open(file_name, 'wb') as model_file:
    pickle.dump(best_model_svd, model_file)

In [None]:
# Reload the model saved above. The context manager closes the file handle
# that the previous bare open() leaked.
# NOTE: pickle.load can execute arbitrary code - only load files you created
# yourself or otherwise trust.
with open("best_model_svd", 'rb') as model_file:
    m = pickle.load(model_file)

# Estimate the rating of raw user id '6' for raw item id '908'; as the last
# expression of the cell, the result is displayed by the notebook.
m.predict('6', '908')