# Surprise homework

In [1]:
import io
import pandas as pd
import numpy as np
from collections import defaultdict

### Functions

In [2]:
from surprise import Dataset
from surprise import NormalPredictor
from surprise import SVD
from surprise import KNNBasic
from surprise import get_dataset_dir
from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split

### Constants

In [3]:
k = 30
k_5 = 5
n = 5
threshold = 3.52
user = '15'
measure = ['RMSE']
NP = 'NP'
KNN_cos = 'KNN_cos'
KNN_MSD = 'KNN_MSD'
KNN_Pearson = 'KNN_Pearson'
svd = 'SVD'

### Load data

In [4]:
data = Dataset.load_builtin('ml-100k')
data

<surprise.dataset.DatasetAutoFolds at 0x5cce568>

In [5]:
trainset, testset = train_test_split(data, test_size=.25)
testset

[('721', '300', 5.0),
 ('537', '506', 3.0),
 ('935', '237', 5.0),
 ('805', '1002', 1.0),
 ('271', '182', 3.0),
 ('605', '754', 3.0),
 ('925', '56', 3.0),
 ('413', '283', 5.0),
 ('82', '495', 3.0),
 ('256', '274', 5.0),
 ('256', '662', 2.0),
 ('129', '303', 3.0),
 ('479', '1444', 1.0),
 ('456', '182', 3.0),
 ('270', '295', 5.0),
 ('748', '137', 3.0),
 ('933', '195', 4.0),
 ('324', '763', 5.0),
 ('374', '181', 3.0),
 ('458', '9', 5.0),
 ('670', '15', 4.0),
 ('311', '399', 4.0),
 ('840', '14', 5.0),
 ('106', '647', 3.0),
 ('279', '203', 2.0),
 ('262', '172', 2.0),
 ('334', '1073', 4.0),
 ('344', '172', 4.0),
 ('385', '523', 4.0),
 ('585', '18', 2.0),
 ('213', '229', 4.0),
 ('315', '187', 4.0),
 ('757', '895', 4.0),
 ('291', '741', 5.0),
 ('144', '22', 5.0),
 ('654', '742', 4.0),
 ('684', '204', 4.0),
 ('332', '978', 4.0),
 ('328', '1439', 3.0),
 ('246', '254', 1.0),
 ('428', '326', 3.0),
 ('655', '1549', 2.0),
 ('16', '476', 3.0),
 ('90', '650', 5.0),
 ('299', '1018', 3.0),
 ('327', '658'

### Define algorithms

<b>1. Прогнозирование случайного рейтинга на основе распределения всех рейтингов в
наборе</b>

In [6]:
algorithm_NP = NormalPredictor()
crv_NP = cross_validate(algorithm_NP, data, measures = measure, verbose = True)

Evaluating RMSE of algorithm NormalPredictor on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.5184  1.5101  1.5258  1.5085  1.5137  1.5153  0.0062  
Fit time          0.13    0.15    0.18    0.17    0.17    0.16    0.02    
Test time         0.20    0.14    0.19    0.22    0.19    0.19    0.03    


<b>2. User-based коллаборативная фильтрация, метод kNN, k = 30, метрика
косинуса</b>

In [7]:
algorithm_KNNcos = KNNBasic(k = k, sim_options = { 'name': 'cosine' })
crv_KNNcos = cross_validate(algorithm_KNNcos, data, measures = measure, verbose = True)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.0151  1.0256  1.0192  1.0215  1.0135  1.0190  0.0044  
Fit time          1.85    1.86    1.85    1.89    1.87    1.86    0.01    
Test time         3.39    3.45    3.63    3.62    3.28    3.47    0.13    


<b>3. User-based коллаборативная фильтрация, метод kNN, k = 30, метрика Mean
Squared Difference</b>

In [8]:
algorithm_KNNmsd = KNNBasic(k = k)
crv_KNNmsd = cross_validate(algorithm_KNNmsd, data, measures = measure, verbose = True)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9785  0.9767  0.9743  0.9786  0.9731  0.9762  0.0022  
Fit time          0.61    0.64    0.82    0.63    0.63    0.67    0.08    
Test time         3.45    3.31    4.15    3.36    3.85    3.62    0.33    


<b>4. User-based коллаборативная фильтрация, метод kNN, k = 30, метрика
корреляция Пирсона</b>

In [9]:
algorithm_KNNpearson = KNNBasic(k = k, sim_options = { 'name': 'pearson' })
crv_KNNpearson = cross_validate(algorithm_KNNpearson, data, measures = measure, verbose = True)

Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Evaluating RMSE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.0140  1.0118  1.0114  1.0135  1.0174  1.0136  0.0021  
Fit time          2.36    2.34    2.50    2.40    2.37    2.39    0.05    
Test time         3.63    3.81    3.39    3.73    3.36    3.58    0.18    


<b>5. SVD алгоритм</b>

In [10]:
algorithm_SVD = SVD()
crv_SVD = cross_validate(algorithm_SVD, data, measures = measure, verbose = True)

Evaluating RMSE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9368  0.9340  0.9303  0.9412  0.9369  0.9358  0.0036  
Fit time          5.35    5.81    5.50    5.52    5.49    5.53    0.15    
Test time         0.27    0.17    0.22    0.23    0.19    0.22    0.04    


In [11]:
RSMA = {}
RSMA[NP] = round(crv_NP['test_rmse'].mean(), 3)
RSMA[KNN_cos] = round(crv_KNNcos['test_rmse'].mean(), 3)
RSMA[KNN_MSD] = round(crv_KNNmsd['test_rmse'].mean(), 3)
RSMA[KNN_Pearson] = round(crv_KNNpearson['test_rmse'].mean(), 3)
RSMA[svd] = round(crv_SVD['test_rmse'].mean(), 3)

RSMA

{'NP': 1.515,
 'KNN_cos': 1.019,
 'KNN_MSD': 0.976,
 'KNN_Pearson': 1.014,
 'SVD': 0.936}

In [12]:
best_algorithm_name = min(RSMA.items(), key=lambda x: x[1])[0]
if (best_algorithm_name == NP):
    best_algorithm = algorithm_NP
elif (best_algorithm_name == KNN_cos):
    best_algorithm = algorithm_KNNcos
elif (best_algorithm_name == KNN_MSD):
    best_algorithm = algorithm_KNNmsd
elif (best_algorithm_name == KNN_Pearson):
    best_algorithm = algorithm_KNNpearson
elif (best_algorithm_name == svd):
    best_algorithm = algorithm_SVD
    
best_algorithm_name

'SVD'

### Calculate precision@k and recall@k

In [13]:
def precision_recall_at_k(predictions, k = 10, threshold = 3.5):
    """Return precision and recall at k metrics for each user"""

    # First map the predictions to each user.
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    precisions = dict()
    recalls = dict()
    for uid, user_ratings in user_est_true.items():

        # Sort user ratings by estimated value
        user_ratings.sort(key=lambda x: x[0], reverse=True)

        # Number of relevant items
        n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)

        # Number of recommended items in top k
        n_rec_k = sum((est >= threshold) for (est, _) in user_ratings[:k])

        # Number of relevant and recommended items in top k
        n_rel_and_rec_k = sum(((true_r >= threshold) and (est >= threshold))
                              for (est, true_r) in user_ratings[:k])

        # Precision@K: Proportion of recommended items that are relevant
        # When n_rec_k is 0, Precision is undefined. We here set it to 0.

        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 0

        # Recall@K: Proportion of relevant items that are recommended
        # When n_rel is 0, Recall is undefined. We here set it to 0.

        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 0

    return precisions, recalls

In [14]:
best_algorithm.fit(trainset)
predictions = best_algorithm.test(testset)
precisions, recalls = precision_recall_at_k(predictions, k = k_5, threshold = threshold)

In [15]:
precisionak = round(sum(precision for precision in precisions.values()) / len(precisions), 3)
precisionak

0.739

In [16]:
recallak = round(sum(recall for recall in recalls.values()) / len(recalls), 3)
recallak

0.362

### Predict

In [17]:
def get_top_n(predictions, n=10):
    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

In [20]:
testset = trainset.build_anti_testset()
predictions = best_algorithm.test(testset)
top_n = get_top_n(predictions, n)

# Print the recommended items for each user
for uid, user_ratings in top_n.items():
    if uid == user: 
        films = [iid for (iid, _) in user_ratings]
        rate = user_ratings
        break

films

['199', '169', '1203', '173', '22']

In [25]:
def films_info(films):
    data_path = get_dataset_dir() + '/ml-100k/ml-100k/u.item'
    info = {}
    data = pd.read_csv(data_path, sep='|',encoding='ISO-8859-1', header = None) 
    for film in films:
        row = data.iloc[int(film)]
        info[film] = (row[1], row[2])
    return info

In [27]:
info = films_info(films)
info

{'199': ('Shining, The (1980)', '01-Jan-1980'),
 '169': ('Cinema Paradiso (1988)', '01-Jan-1988'),
 '1203': ('To Be or Not to Be (1942)', '01-Jan-1942'),
 '173': ('Raiders of the Lost Ark (1981)', '01-Jan-1981'),
 '22': ('Taxi Driver (1976)', '16-Feb-1996')}

In [46]:
rate=dict(rate)

file = open('result.txt', 'w')
print('User '+ user)
file.write('User '+ user + '\n')
for film in info:
    info_film = str(film) + ' ' + str(info[film]) + ' ' + str(round(rate[film],3))
    print(info_film)
    file.write(info_film + '\n')
file.close()

User 15
199 ('Shining, The (1980)', '01-Jan-1980') 4.218
169 ('Cinema Paradiso (1988)', '01-Jan-1988') 4.194
1203 ('To Be or Not to Be (1942)', '01-Jan-1942') 4.179
173 ('Raiders of the Lost Ark (1981)', '01-Jan-1981') 4.171
22 ('Taxi Driver (1976)', '16-Feb-1996') 4.142
