# Домашнее задание по теме «Коллаборативная фильтрация»


Задание:\
ПАКЕТ SURPRISE:
 - используйте данные MovieLens 1M
 - можно использовать любые модели из пакета
 - получите RMSE на тестовом сете 0.87 и ниже

Комментарий преподавателя : \
    В ДЗ на датасет 1М может не хватить RAM.
    Можно сделать на 100K.
    Качество RMSE предлагаю считать на основе CrossValidation (5 фолдов), а не отложенном датасете.

In [1]:
from surprise import KNNWithMeans, KNNBasic
from surprise import Dataset
from surprise import accuracy
from surprise import Reader
from surprise.model_selection import train_test_split, cross_validate

import pandas as pd


In [2]:
links_df = pd.read_csv('data/links.csv')
movies_df = pd.read_csv('data/movies.csv')
ratings_df = pd.read_csv('data/ratings.csv')
tags_df = pd.read_csv('data/tags.csv')

In [3]:
movies_with_ratings = movies_df.join(ratings_df.set_index('movieId'), on='movieId').reset_index(drop=True)
movies_with_ratings.dropna(inplace=True)


In [4]:
movies_with_ratings.head()


Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1.0,4.0,964982700.0
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5.0,4.0,847435000.0
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7.0,4.5,1106636000.0
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15.0,2.5,1510578000.0
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17.0,4.5,1305696000.0


In [5]:
dataset = pd.DataFrame({
    'uid': movies_with_ratings.userId,
    'iid': movies_with_ratings.title,
    'rating': movies_with_ratings.rating
})


In [6]:
dataset.head()


Unnamed: 0,uid,iid,rating
0,1.0,Toy Story (1995),4.0
1,5.0,Toy Story (1995),4.0
2,7.0,Toy Story (1995),4.5
3,15.0,Toy Story (1995),2.5
4,17.0,Toy Story (1995),4.5


In [7]:
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(dataset, reader)


In [8]:
trainset, testset = train_test_split(data, test_size=.15)


In [9]:
algo = KNNWithMeans(k=50, sim_options={'name': 'pearson_baseline', 'user_based': True})
algo.fit(trainset)


Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x175b6043820>

In [10]:
test_pred = algo.test(testset)


In [11]:
accuracy.rmse(test_pred, verbose=True)


RMSE: 0.9008


0.9008262679814459

In [12]:
algo.predict(uid=2, iid='Fight Club (1999)')


Prediction(uid=2, iid='Fight Club (1999)', r_ui=None, est=4.52198658986445, details={'actual_k': 50, 'was_impossible': False})

In [13]:
algo = KNNWithMeans(k=50, sim_options={'name': 'pearson_baseline', 'user_based': False})
algo.fit(trainset)


Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x175b6043370>

In [14]:
test_pred = algo.test(testset)


In [15]:
accuracy.rmse(test_pred, verbose=True)


RMSE: 0.8862


0.8862131457010708

In [16]:
new_pred = algo.predict(uid=2, iid='Fight Club (1999)')
new_pred


Prediction(uid=2, iid='Fight Club (1999)', r_ui=None, est=4.089825182625259, details={'actual_k': 20, 'was_impossible': False})

In [17]:
# 5-fold cross-validation
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)


Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNWithMeans on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8814  0.8876  0.8888  0.8857  0.8723  0.8832  0.0060  
MAE (testset)     0.6699  0.6750  0.6745  0.6745  0.6611  0.6710  0.0053  
Fit time          16.17   14.01   14.84   13.99   17.34   15.27   1.30    
Test time         8.66    10.49   9.54    10.25   11

{'test_rmse': array([0.88138338, 0.88760108, 0.8888158 , 0.88569486, 0.87231501]),
 'test_mae': array([0.66985567, 0.67498728, 0.67445266, 0.67451161, 0.66109384]),
 'fit_time': (16.171680450439453,
  14.01494312286377,
  14.844947099685669,
  13.988110303878784,
  17.33972692489624),
 'test_time': (8.662915468215942,
  10.486442565917969,
  9.542521476745605,
  10.250149011611938,
  11.679105520248413)}