# Surprise Homework

In [5]:
import io
import os
import numpy as np
import pandas as pd
from collections import defaultdict
from surprise import Dataset
from surprise.model_selection import train_test_split
from surprise.model_selection import cross_validate
from surprise.model_selection import KFold
from surprise import SVD
from surprise import get_dataset_dir
from surprise import KNNWithMeans
from surprise import NormalPredictor

### Load Data

In [7]:
data = Dataset.load_builtin('ml-100k')
data

<surprise.dataset.DatasetAutoFolds at 0x2a8d3f5fa90>

In [4]:
trainset, testset = train_test_split(data, test_size=.25)

### Define Algorithms

##### прогнозирование случайного рейтинга на основе распределения всех рейтингов в наборе

In [8]:
NP = NormalPredictor()
cross_val_NP = cross_validate(NP, data, measures = ['RMSE'], verbose = True)

Evaluating RMSE of algorithm NormalPredictor on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.5367  1.5148  1.5199  1.5061  1.5166  1.5188  0.0101  
Fit time          0.16    0.21    0.23    0.21    0.22    0.21    0.03    
Test time         0.27    0.21    0.27    0.24    0.25    0.25    0.02    


##### user-based коллаборативную фильтрацию, метод kNN, k = 30, метрика косинуса

In [9]:
KNNcos = KNNWithMeans(k = 30, sim_options = { 'name': 'cosine' })
cross_val_KNNcos = cross_validate(KNNcos, data, measures = ['RMSE'], verbose = True)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE of algorithm KNNWithMeans on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9604  0.9581  0.9656  0.9535  0.9581  0.9591  0.0040  
Fit time          2.14    2.28    2.45    2.23    2.26    2.27    0.10    
Test time         4.52    5.19    4.56    4.81    4.74    4.77    0.24    


##### user-based коллаборативную фильтрацию, метод kNN, k = 30, метрика Mean Squared Difference

In [11]:
KNNmsd = KNNWithMeans(k = 30, sim_options = {'name': 'msd'})
cross_val_KNNmsd = cross_validate(KNNmsd, data, measures = ['RMSE'], verbose = True)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE of algorithm KNNWithMeans on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9509  0.9457  0.9631  0.9573  0.9447  0.9523  0.0070  
Fit time          0.68    0.77    0.77    0.75    0.72    0.74    0.03    
Test time         4.38    4.94    5.14    4.94    4.67    4.81    0.26    


##### user-based коллаборативную фильтрацию, метод kNN, k = 30, метрика корреляция Пирсона

In [12]:
KNNpearson = KNNWithMeans(k = 30, sim_options = { 'name': 'pearson' })
cross_val_KNNpearson = cross_validate(KNNpearson, data, measures = ['RMSE'], verbose = True)

Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Evaluating RMSE of algorithm KNNWithMeans on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9509  0.9475  0.9534  0.9631  0.9477  0.9525  0.0057  
Fit time          3.12    3.12    2.94    3.22    3.20    3.12    0.10    
Test time         5.18    4.50    4.77    5.33    5.13    4.98    0.30    


##### SVD алгоритм

In [13]:
SVDalg = SVD()
cross_val_SVD = cross_validate(SVDalg, data, measures = ['RMSE'], verbose = True)

Evaluating RMSE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9308  0.9463  0.9365  0.9351  0.9360  0.9369  0.0051  
Fit time          6.50    6.60    6.10    6.68    6.39    6.45    0.20    
Test time         0.30    0.30    0.19    0.22    0.28    0.26    0.05    


In [14]:
RSMA = {}
RSMA['NP'] = round(cross_val_NP['test_rmse'].mean(), 3)
RSMA['KNNcos'] = round(cross_val_KNNcos['test_rmse'].mean(), 3)
RSMA['KNNmsd'] = round(cross_val_KNNmsd['test_rmse'].mean(), 3)
RSMA['KNNpearson'] = round(cross_val_KNNpearson['test_rmse'].mean(), 3)
RSMA['SVD'] = round(cross_val_SVD['test_rmse'].mean(), 3)

RSMA

{'NP': 1.519,
 'KNNcos': 0.959,
 'KNNmsd': 0.952,
 'KNNpearson': 0.953,
 'SVD': 0.937}

### Best Algorithm

In [15]:
algorithm = min(RSMA.items(), key=lambda x: x[1])[0]
algorithm

'SVD'

In [16]:
best_algorithm = SVDalg

### Calculate precision@k and recall@k

In [17]:
def precision_recall_at_k(predictions, k = 10, threshold = 3.5):
    """Return precision and recall at k metrics for each user"""

    # First map the predictions to each user.
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    precisions = dict()
    recalls = dict()
    for uid, user_ratings in user_est_true.items():

        # Sort user ratings by estimated value
        user_ratings.sort(key = lambda x: x[0], reverse=True)

        # Number of relevant items
        n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)

        # Number of recommended items in top k
        n_rec_k = sum((est >= threshold) for (est, _) in user_ratings[:k])

        # Number of relevant and recommended items in top k
        n_rel_and_rec_k = sum(((true_r >= threshold) and (est >= threshold))
                              for (est, true_r) in user_ratings[:k])

        # Precision@K: Proportion of recommended items that are relevant
        # When n_rec_k is 0, Precision is undefined. We here set it to 0.

        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 0

        # Recall@K: Proportion of relevant items that are recommended
        # When n_rel is 0, Recall is undefined. We here set it to 0.

        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 0

    return precisions, recalls

algo = best_algorithm
kf = KFold(n_splits = 5)

for trainset, testset in kf.split(data):
    algo.fit(trainset)
    predictions = algo.test(testset)
    precisions, recalls = precision_recall_at_k(predictions, k = 5, threshold = 3.52)

In [20]:
print('precision@k = ', sum(prec for prec in precisions.values()) / len(precisions))
print('recall@k = ', sum(rec for rec in recalls.values()) / len(recalls))

precision@k =  0.7249911629551079
recall@k =  0.4106242211830241


### Predict

In [21]:
def get_top_n(predictions, n = 10):
    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

In [27]:
algo = best_algorithm
algo.fit(trainset)

# Than predict ratings for all pairs (u, i) that are NOT in the training set.
testset = trainset.build_anti_testset()
predictions = algo.test(testset)

# Than predict ratings for all pairs (u, i) that are NOT in the training set.
testset = trainset.build_anti_testset()
predictions = algo.test(testset)

top_n = get_top_n(predictions, n = 5)

for uid, user_ratings in top_n.items():
    if uid == '38':
        movie_list = [iid for (iid, _) in user_ratings]
        ratings = user_ratings

In [31]:
movie_list

[('515', 5),
 ('408', 5),
 ('923', 5),
 ('474', 4.79817050031963),
 ('316', 4.7914227464788794)]

In [28]:
def get_info(movie_list):
    path = get_dataset_dir() + '/ml-100k/ml-100k/u.item'
    info = {}
    data = pd.read_csv(path, sep='|',encoding='ISO-8859-1', header = None) 
    for movie in movie_list:
        row = data.iloc[int(movie)]
        info[movie] = (row[1], row[2])
    return info
movies_info = get_info(movie_list)
movies_info

{'515': ('Local Hero (1983)', '01-Jan-1983'),
 '408': ('Jack (1996)', '07-Aug-1996'),
 '923': ('White Squall (1996)', '01-Jan-1996'),
 '474': ('Trainspotting (1996)', '19-Jul-1996'),
 '316': ('In the Name of the Father (1993)', '01-Jan-1993')}

In [38]:
ratings = dict(ratings)
file = open('results.txt','w')
file.write('User 38 ' + '\n')
for movie in movies_info:
    info= str(movie) + ' ' + str(movies_info[movie]) + ' ' + str(round(ratings[movie],3))
    file.write(info + '\n')
file.close()