### Импортируем все нужные библиотеки

In [145]:
from surprise import KNNWithMeans, KNNBasic, SVD, NMF, NormalPredictor, CoClustering
from surprise import Dataset
from surprise import accuracy
from surprise import Reader
from surprise.model_selection import train_test_split
import pandas as pd

from warnings import filterwarnings
filterwarnings('ignore')

In [146]:
# Load the "::"-delimited data files into DataFrames with explicit column names.
# A separator longer than one character is only handled by pandas' Python
# parsing engine, so request it explicitly instead of relying on the implicit
# fallback from the C engine (which emits a ParserWarning that the
# filterwarnings('ignore') call in the import cell would hide).
movies = pd.read_csv(
    "movies.dat",
    sep="::",
    engine="python",
    names=['movieId', 'title', 'genres'],
)
ratings = pd.read_csv(
    'ratings.dat',
    sep="::",
    engine="python",
    names=['userId', 'movieId', 'rating', 'timestamp'],
)

In [147]:
# Preview the first five rating rows to sanity-check the parsed columns.
ratings.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


### Соединим movies и ratings и удалим пустые строки.

In [148]:
# Attach every rating to its movie row (left join on movieId), renumber the
# index, and then discard any row that still contains a missing value.
movies_with_ratings = (
    movies
    .join(ratings.set_index('movieId'), on='movieId')
    .reset_index(drop=True)
    .dropna()
)

### Выведем уникальные фильмы, оценённые пользователем 2

In [149]:
# Distinct titles among the rows whose userId equals 2.
movies_with_ratings.loc[movies_with_ratings['userId'] == 2.0, 'title'].unique()

array(['Get Shorty (1995)', 'Broken Arrow (1996)', 'Braveheart (1995)',
       'Desperado (1995)', 'Die Hard: With a Vengeance (1995)',
       'Ed Wood (1994)',
       'Like Water for Chocolate (Como agua para chocolate) (1992)',
       'Outbreak (1995)', 'Shawshank Redemption, The (1994)',
       'Clear and Present Danger (1994)', 'Forrest Gump (1994)',
       'Maverick (1994)', 'True Lies (1994)', 'Cliffhanger (1993)',
       'Demolition Man (1993)', 'Fugitive, The (1993)',
       'Getaway, The (1994)', 'Jurassic Park (1993)', 'Mr. Jones (1993)',
       'Remains of the Day, The (1993)',
       'Terminator 2: Judgment Day (1991)', 'Dances with Wolves (1990)',
       'Silence of the Lambs, The (1991)', 'Courage Under Fire (1996)',
       'Mission: Impossible (1996)', 'Twister (1996)',
       'Independence Day (ID4) (1996)', "Breakfast at Tiffany's (1961)",
       'Gone with the Wind (1939)', 'Picnic (1955)',
       'Bonnie and Clyde (1967)', 'Platoon (1986)',
       "Sophie's Choice (1

 ### Создадим датасет в формате surprise

In [150]:
# Build the three-column frame (user id, item id, rating — in that order) that
# is later passed to Dataset.load_from_df. The movie title is used as the item id.
dataset = (
    movies_with_ratings[['userId', 'title', 'rating']]
    .rename(columns={'userId': 'uid', 'title': 'iid'})
)

In [151]:
# Preview the first five rows of the (uid, iid, rating) frame.
dataset.head(n=5)

Unnamed: 0,uid,iid,rating
0,1.0,Toy Story (1995),5.0
1,6.0,Toy Story (1995),4.0
2,8.0,Toy Story (1995),4.0
3,9.0,Toy Story (1995),5.0
4,10.0,Toy Story (1995),5.0


In [152]:
# The Reader's rating_scale must describe the real range of the ratings, since
# surprise clips predicted ratings to it. The previously hard-coded lower bound
# of 0.5 does not match the integer ratings seen in the data, so take both
# bounds from the data itself.
rating_bounds = (dataset['rating'].min(), dataset['rating'].max())
reader = Reader(rating_scale=rating_bounds)

# Wrap the (uid, iid, rating) frame as a surprise Dataset.
data = Dataset.load_from_df(dataset, reader)

### Разобьем наш dataset на тренировочную и тестовую выборки в соотношении 85 / 15

In [153]:
# Hold out 15% of the ratings for testing. The split is random, so pin
# random_state to make the partition (and every metric computed from it)
# repeatable across Restart & Run All.
trainset, testset = train_test_split(data, test_size=.15, random_state=42)

### Создадим модель, используя SVD алгоритм и параметры n_factors = 10, n_epochs = 40

In [154]:
# SVD matrix factorisation with 10 latent factors trained for 40 epochs.
# Factor initialisation is random, so fix random_state to keep the fitted
# model reproducible between runs.
algo = SVD(n_factors=10, n_epochs=40, random_state=42)
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7ffb05ff36d8>

### Выведем значение ошибки

In [155]:
# Predict the held-out ratings with the fitted model. This has to happen here:
# no earlier cell defines test_pred, so on a fresh kernel the original cell
# raised NameError.
test_pred = algo.test(testset)

# Print and return the root-mean-squared error of those predictions.
accuracy.rmse(test_pred, verbose=True)

RMSE: 0.8551


0.8550854258761379

### Проведем проверку, используя кросс-валидацию

In [156]:
from surprise.model_selection import cross_validate

In [157]:
# 5-fold cross-validation of the same algorithm on the full dataset, reporting
# RMSE and MAE per fold; the returned dict of results is shown as cell output.
cross_validate(
    algo,
    data,
    measures=['RMSE', 'MAE'],
    cv=5,
    verbose=True,
)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8610  0.8608  0.8598  0.8622  0.8634  0.8614  0.0013  
MAE (testset)     0.6745  0.6741  0.6726  0.6752  0.6756  0.6744  0.0010  
Fit time          46.87   48.44   48.15   48.50   47.20   47.83   0.67    
Test time         3.92    3.93    4.04    5.82    2.87    4.12    0.95    


{'test_rmse': array([0.86097794, 0.86080581, 0.85981006, 0.86218445, 0.86344224]),
 'test_mae': array([0.6745233 , 0.67407484, 0.67257485, 0.67515646, 0.6755602 ]),
 'fit_time': (46.86782765388489,
  48.438090085983276,
  48.15355086326599,
  48.502896785736084,
  47.20002579689026),
 'test_time': (3.9207067489624023,
  3.932687759399414,
  4.0411388874053955,
  5.822185277938843,
  2.8686532974243164)}

In [158]:
# Estimate the rating that raw user id 2 would give to the item with this title.
# NOTE(review): confirm this exact title exists in the dataset — verify how
# surprise treats an unknown item id before trusting the estimate.
algo.predict(2, 'Bad boy (1999)')

Prediction(uid=2, iid='Bad boy (1999)', r_ui=None, est=3.378709156873783, details={'was_impossible': False})

### Другие алгоритмы с параметрами по умолчанию показали худший результат.

- NMF (RMSE: 0.9166)
- NormalPredictor (RMSE: 1.5093)
- CoClustering (RMSE: 0.9125)