# Задание

ПАКЕТ SURPRISE

1) используйте данные MovieLens 1M

2) можно использовать любые модели из пакета

3) получите RMSE на тестовом сете 0.87 и ниже


# Подготовка данных

In [5]:
import pandas as pd
import pandas.compat
import io

In [6]:
import numpy as np
from tqdm import tqdm_notebook

In [7]:
df_ratings = pd.read_csv('ratings.dat', header=None, sep='::', engine='python')
df_movies = pd.read_csv('movies.dat', header=None, sep='::', engine='python')

In [8]:
df_movies.columns=['movieId', 'title', 'genres']
df_movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [9]:
df_ratings.columns=['userId','movieId','rating','timestamp']
df_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [10]:
df = pd.merge(df_ratings, df_movies, on='movieId')

In [11]:
del df['timestamp']
del df['genres']

# Построение модели

In [12]:
from surprise import Dataset, Reader, KNNBasic
from surprise.model_selection import train_test_split
from surprise import accuracy

In [13]:
from surprise.model_selection import KFold

In [14]:
kfold = KFold(5)

In [15]:
df_for_surpise = df_ratings[['userId', 'movieId', 'rating']]

In [16]:
df_for_surpise.columns = ['uid', 'iid', 'rating']

In [17]:
df_for_surpise.head()

Unnamed: 0,uid,iid,rating
0,1,1193,5
1,1,661,3
2,1,914,3
3,1,3408,4
4,1,2355,5


In [18]:
reader = Reader(rating_scale=(0.5, 5))

In [19]:
dataset = Dataset.load_from_df(df_for_surpise, reader)

In [20]:
from surprise import KNNWithMeans

In [21]:
algo = KNNWithMeans(k=40, sim_options={'name': 'pearson_baseline', 'user_based': False})

In [22]:
from surprise.model_selection import cross_validate

In [23]:
cross_validate(algo, dataset, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNWithMeans on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8577  0.8589  0.8599  0.8565  0.8600  0.8586  0.0013  
MAE (testset)     0.6720  0.6726  0.6727  0.6708  0.6734  0.6723  0.0009  
Fit time          23.93   24.52   22.36   22.49   25.96   23.85   1.34    
Test time         76.77   74.07   71.49   77.50   10

{'test_rmse': array([0.85770955, 0.8588965 , 0.85987329, 0.85647481, 0.85995654]),
 'test_mae': array([0.67196828, 0.67257379, 0.67268234, 0.67080609, 0.67344851]),
 'fit_time': (23.930315017700195,
  24.519879817962646,
  22.355286121368408,
  22.49003291130066,
  25.96183705329895),
 'test_time': (76.76503300666809,
  74.07050085067749,
  71.48867011070251,
  77.49739003181458,
  100.82974886894226)}

Нужный диапазон значений RMSE получен (< 0.87), но сделаем проверку и на других моделях

# Исследование с использованием дополнительных методов

In [25]:
from surprise import SVD
algo_SVD = SVD()
cross_validate(algo_SVD, dataset, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8738  0.8748  0.8727  0.8736  0.8736  0.8737  0.0007  
MAE (testset)     0.6865  0.6871  0.6860  0.6864  0.6853  0.6863  0.0006  
Fit time          48.78   53.99   54.77   53.21   59.39   54.03   3.39    
Test time         2.59    2.40    2.65    2.31    2.60    2.51    0.13    


{'test_rmse': array([0.8737836 , 0.87483774, 0.87274034, 0.87362809, 0.87357737]),
 'test_mae': array([0.68654825, 0.68711422, 0.68603251, 0.68635947, 0.68528672]),
 'fit_time': (48.784323930740356,
  53.992595911026,
  54.76615881919861,
  53.206093072891235,
  59.3909330368042),
 'test_time': (2.591013193130493,
  2.4009430408477783,
  2.648621082305908,
  2.305108070373535,
  2.6044681072235107)}

In [26]:
from surprise import NMF
algo_NMF = NMF()
cross_validate(algo_NMF, dataset, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm NMF on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9153  0.9172  0.9137  0.9170  0.9170  0.9160  0.0014  
MAE (testset)     0.7225  0.7255  0.7222  0.7244  0.7246  0.7238  0.0013  
Fit time          60.94   103.44  78.19   63.65   57.67   72.78   16.86   
Test time         4.39    5.37    2.92    2.86    2.52    3.61    1.09    


{'test_rmse': array([0.91530897, 0.91720981, 0.91365833, 0.91701378, 0.91698258]),
 'test_mae': array([0.72248924, 0.7255436 , 0.72216356, 0.72440275, 0.72459024]),
 'fit_time': (60.9437518119812,
  103.44148302078247,
  78.18986988067627,
  63.648370027542114,
  57.66501593589783),
 'test_time': (4.391932964324951,
  5.371487140655518,
  2.916332960128784,
  2.860474109649658,
  2.519414186477661)}

In [27]:
from surprise import NormalPredictor
algo_NormalPredictor = NormalPredictor()
cross_validate(algo_NormalPredictor, dataset, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm NormalPredictor on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.5142  1.5096  1.5091  1.5085  1.5096  1.5102  0.0020  
MAE (testset)     1.2127  1.2094  1.2088  1.2084  1.2085  1.2096  0.0016  
Fit time          1.04    1.20    1.23    1.28    1.32    1.22    0.10    
Test time         2.27    2.29    2.64    2.36    2.65    2.44    0.17    


{'test_rmse': array([1.51418796, 1.50964186, 1.50905508, 1.50847517, 1.50963653]),
 'test_mae': array([1.21272273, 1.20943121, 1.20881641, 1.20840908, 1.20853127]),
 'fit_time': (1.044506311416626,
  1.196470022201538,
  1.2271511554718018,
  1.2840723991394043,
  1.3248209953308105),
 'test_time': (2.265394926071167,
  2.286365032196045,
  2.6395490169525146,
  2.357621908187866,
  2.6482579708099365)}

In [28]:
from surprise import BaselineOnly
algo_BaselineOnly = BaselineOnly()
cross_validate(algo_BaselineOnly, dataset, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Evaluating RMSE, MAE of algorithm BaselineOnly on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9094  0.9086  0.9098  0.9086  0.9068  0.9086  0.0010  
MAE (testset)     0.7205  0.7193  0.7199  0.7198  0.7178  0.7195  0.0009  
Fit time          1.47    1.58    1.61    1.70    1.59    1.59    0.07    
Test time         2.15    2.51    2.23    2.20    2.22    2.26    0.12    


{'test_rmse': array([0.90942539, 0.9085991 , 0.90980872, 0.90858822, 0.90676428]),
 'test_mae': array([0.7204932 , 0.71928212, 0.71985833, 0.71981814, 0.7178152 ]),
 'fit_time': (1.4685819149017334,
  1.5788931846618652,
  1.6097469329833984,
  1.6961469650268555,
  1.5928809642791748),
 'test_time': (2.154721975326538,
  2.5060369968414307,
  2.2257139682769775,
  2.2033257484436035,
  2.2225630283355713)}

In [29]:
from surprise import KNNBasic
algo_KNNBasic = KNNBasic()
cross_validate(algo_KNNBasic, dataset, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9230  0.9241  0.9232  0.9228  0.9212  0.9229  0.0010  
MAE (testset)     0.7282  0.7282  0.7276  0.7271  0.7263  0.7275  0.0007  
Fit time          23.16   30.79   32.90   24.10   46.12   31.41   8.25    
Test time         163.62  162.17  134.74  139.09  163.35  152.59  12.88   


{'test_rmse': array([0.92297307, 0.92413593, 0.92324575, 0.92284917, 0.92118136]),
 'test_mae': array([0.72824666, 0.7282099 , 0.72763346, 0.72714845, 0.72630396]),
 'fit_time': (23.15976905822754,
  30.79008173942566,
  32.89593315124512,
  24.095762968063354,
  46.11776876449585),
 'test_time': (163.62284207344055,
  162.1659541130066,
  134.7415370941162,
  139.0866641998291,
  163.34989500045776)}

In [30]:
from surprise import KNNBaseline
algo_KNNBaseline = KNNBaseline()
cross_validate(algo_KNNBaseline, dataset, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBaseline on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8966  0.8945  0.8930  0.8935  0.8983  0.8952  0.0020  
MAE (testset)     0.7065  0.7059  0.7054  0.7048  0.7099  0.7065  0.0018  
Fit time          23.30   27.95   37.53   40.72   25.75   31.05   6.83    
Test time         147.55  168.94  175.59  140.39  177.24  161.94  15.11   


{'test_rmse': array([0.89659327, 0.8945231 , 0.893032  , 0.89348825, 0.89826936]),
 'test_mae': array([0.70647612, 0.70590723, 0.70543047, 0.70476492, 0.70991248]),
 'fit_time': (23.3039608001709,
  27.949639081954956,
  37.53399610519409,
  40.71524524688721,
  25.752312898635864),
 'test_time': (147.54622793197632,
  168.94113898277283,
  175.5856750011444,
  140.38802790641785,
  177.23597812652588)}

In [31]:
from surprise import SlopeOne
algo_SlopeOne = SlopeOne()
cross_validate(algo_SlopeOne, dataset, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SlopeOne on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9062  0.9067  0.9054  0.9078  0.9065  0.9065  0.0008  
MAE (testset)     0.7140  0.7151  0.7136  0.7147  0.7153  0.7145  0.0006  
Fit time          13.57   14.53   15.74   15.81   17.79   15.49   1.42    
Test time         64.16   69.32   69.77   66.89   112.60  76.55   18.14   


{'test_rmse': array([0.90615955, 0.90665377, 0.90542248, 0.90782533, 0.90654278]),
 'test_mae': array([0.71404473, 0.71507334, 0.71363334, 0.71471831, 0.71526984]),
 'fit_time': (13.569154977798462,
  14.52989912033081,
  15.744428157806396,
  15.810383796691895,
  17.78842806816101),
 'test_time': (64.16039490699768,
  69.31711316108704,
  69.7667498588562,
  66.8853440284729,
  112.59567999839783)}

In [32]:
from surprise import CoClustering
algo_CoClustering = CoClustering()
cross_validate(algo_CoClustering, dataset, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm CoClustering on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9166  0.9184  0.9155  0.9159  0.9149  0.9163  0.0012  
MAE (testset)     0.7183  0.7207  0.7180  0.7183  0.7166  0.7184  0.0013  
Fit time          27.59   26.47   21.28   20.70   23.01   23.81   2.76    
Test time         5.26    4.29    3.84    3.79    4.65    4.36    0.54    


{'test_rmse': array([0.91664481, 0.91839389, 0.91552217, 0.91591495, 0.91488871]),
 'test_mae': array([0.71827459, 0.72070758, 0.7179508 , 0.71827473, 0.71662566]),
 'fit_time': (27.592801094055176,
  26.467613220214844,
  21.282800912857056,
  20.700852870941162,
  23.01062774658203),
 'test_time': (5.256424903869629,
  4.28609824180603,
  3.8384299278259277,
  3.7934608459472656,
  4.646324872970581)}

# ВЫВОДЫ

Лучшим результатом является вариант полученный в самом начале работы с методом KNNWithMeans(k=40, sim_options={'name': 'pearson_baseline', 'user_based': False}). Остальные испробованные методы показали результаты сильно хуже.

Интерес вызвал алгоритм BaselineOnly, тк у него RMSE получилось не самое низкое, но время работы значительно ниже, чем для других методов. Возможно, в кейсах, где критично время, это свойство может быть важным.