# Задание

1.Датасет ml-latest

2.Вспомнить подходы, которые разбирали

3.Выбрать понравившийся подход к гибридным системам

4.Написать свою рекомендательную систему

# Подготовка датасета

In [36]:
from surprise import SVD, SVDpp
from surprise import Dataset
from surprise import accuracy
from surprise import Reader
from surprise.model_selection import train_test_split
from surprise.model_selection import cross_validate
from surprise.model_selection import GridSearchCV

import matplotlib.pyplot as plt

from tqdm import tqdm_notebook

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.neighbors import NearestNeighbors

import pandas as pd
import numpy as np

In [2]:
ratings = pd.read_csv('ratings.dat', header=None, sep='::', engine='python')
movies = pd.read_csv('movies.dat', header=None, sep='::', engine='python')

In [6]:
movies.columns=['movieId', 'title', 'genres']
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [7]:
ratings.columns=['userId','movieId','rating','timestamp']
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [9]:
movies_with_ratings = movies.join(ratings.set_index('movieId'), on='movieId').reset_index(drop=True)
movies_with_ratings.dropna(inplace=True)
movies_with_ratings.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Animation|Children's|Comedy,1.0,5.0,978824268.0
1,1,Toy Story (1995),Animation|Children's|Comedy,6.0,4.0,978237008.0
2,1,Toy Story (1995),Animation|Children's|Comedy,8.0,4.0,978233496.0
3,1,Toy Story (1995),Animation|Children's|Comedy,9.0,5.0,978225952.0
4,1,Toy Story (1995),Animation|Children's|Comedy,10.0,5.0,978226474.0


# Модель для предсказания рейтинга пользователя

In [10]:
dataset = pd.DataFrame({
    'uid': movies_with_ratings.userId,
    'iid': movies_with_ratings.title,
    'rating': movies_with_ratings.rating
})
dataset.head()

Unnamed: 0,uid,iid,rating
0,1.0,Toy Story (1995),5.0
1,6.0,Toy Story (1995),4.0
2,8.0,Toy Story (1995),4.0
3,9.0,Toy Story (1995),5.0
4,10.0,Toy Story (1995),5.0


In [11]:
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(dataset, reader)

In [12]:
trainset, testset = train_test_split(data, test_size=.15, random_state=42)

Обучаем модель 

In [13]:
%%time
algo = SVD(n_factors=20, n_epochs=20)
algo.fit(trainset)

CPU times: user 17.8 s, sys: 24.3 ms, total: 17.8 s
Wall time: 17.8 s


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1a36e93350>

In [14]:
test_pred = algo.test(testset)

Оценка качества модели на тесте:

In [15]:
accuracy.rmse(test_pred, verbose=True)

RMSE: 0.8664


0.8664385133661735

Оценка на кросс валидации

In [38]:
cros_val = cross_validate(algo, data, measures=['RMSE'], cv=5, verbose=True)
RMSE_mean = cros_val['test_rmse'].mean()

Evaluating RMSE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8716  0.8721  0.8715  0.8722  0.8730  0.8721  0.0005  
Fit time          26.20   22.51   30.48   23.66   24.88   25.55   2.75    
Test time         2.23    2.21    2.15    2.56    1.94    2.22    0.20    


In [39]:
print(RMSE_mean)

0.8720854758172342


In [40]:
algo.predict(uid=2.0, iid='Mortal Kombat (1995)').est

2.896028163108692

Определим пользователя и фильмы, которые он посмотрел. Сформируем списки по оценке и наименовнию фильма:

In [41]:
current_user_id = 2.0
user_movies = movies_with_ratings[movies_with_ratings.userId == current_user_id].title.unique()

scores = []
titles = []

for movie in movies_with_ratings.title.unique():
    if movie in user_movies:
        continue
        
    scores.append(algo.predict(uid=current_user_id, iid=movie).est)
    titles.append(movie)

Топ 10 фильмов из прогноза для пользователя

In [42]:
sorted(scores)[-10:]

[4.216392627455411,
 4.220569405784239,
 4.232005536594473,
 4.310596730628545,
 4.317310962435108,
 4.371442335171839,
 4.381401243501826,
 4.41271934997772,
 4.5087358593990166,
 4.525838412733957]

Любимые жанры пользователя:

In [43]:
def change_string(s):
    return ' '.join(s.replace(' ', '').replace('-', '').split('|'))
movie_genres = [change_string(g) for g in movies.genres.values]
movie_genres[0]

"Animation Children's Comedy"

Создадим словари для тестовой и тренировочной выборки и обучим модель:

In [44]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(movie_genres)

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

neigh = NearestNeighbors(n_neighbors=20, n_jobs=-1, metric='euclidean') 
neigh.fit(X_train_tfidf)

test = change_string("Adventure|Comedy|Fantasy|Crime")

predict = count_vect.transform([test])
X_tfidf2 = tfidf_transformer.transform(predict)

res = neigh.kneighbors(X_tfidf2, return_distance=True)
res

(array([[0.48797024, 0.6151026 , 0.71151155, 0.71151155, 0.7378178 ,
         0.7378178 , 0.7378178 , 0.7378178 , 0.7378178 , 0.7419569 ,
         0.7419569 , 0.75200801, 0.75200801, 0.75200801, 0.75200801,
         0.75200801, 0.75200801, 0.7758186 , 0.7758186 , 0.78182187]]),
 array([[ 363, 3420, 1779, 1058, 3807,  647, 2124, 3819, 3084, 3410, 2074,
         3397, 2728, 2553, 3324, 2105, 1110, 2104, 2899, 1898]]))

Результаты предсказания для пользователя:

In [67]:
movies.iloc[res[1][0]]

Unnamed: 0,movieId,title,genres
363,367,"Mask, The (1994)",Comedy|Crime|Fantasy
3420,3489,Hook (1991),Adventure|Fantasy
1779,1848,"Borrowers, The (1997)",Adventure|Children's|Comedy|Fantasy
1058,1073,Willy Wonka and the Chocolate Factory (1971),Adventure|Children's|Comedy|Fantasy
3807,3877,Supergirl (1984),Action|Adventure|Fantasy
647,653,Dragonheart (1996),Action|Adventure|Fantasy
2124,2193,Willow (1988),Action|Adventure|Fantasy
3819,3889,Highlander: Endgame (2000),Action|Adventure|Fantasy
3084,3153,"7th Voyage of Sinbad, The (1958)",Action|Adventure|Fantasy
3410,3479,Ladyhawke (1985),Adventure|Fantasy|Romance
