### Рекомендации на основе содержани


    Использовать dataset MovieLens
    Построить рекомендации (регрессия, предсказываем оценку) на фичах:

    TF-IDF на тегах и жанрах
    Средние оценки (+ median, variance, etc.) пользователя и фильма

    Оценить RMSE на тестовой выборке



In [1]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.neighbors import NearestNeighbors

In [2]:
links = pd.read_csv('links.csv')
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')
tags = pd.read_csv('tags.csv')

In [3]:
ratings['movieId'].value_counts()

356       329
318       317
296       307
593       279
2571      278
         ... 
5986        1
100304      1
34800       1
83976       1
8196        1
Name: movieId, Length: 9724, dtype: int64

In [8]:
np.mean(ratings[ratings['movieId'] == 100304].rating)

3.0

In [11]:
print(max(ratings['movieId']))

193609


In [12]:
movieId_ratings = pd.DataFrame()
movieId = []
rating = []
for i in tqdm(range(1, 193610)):
    var = np.mean(ratings[ratings['movieId'] == i].rating)
    if var != 'Nan':
        movieId.append(i)
        rating.append(var)
movieId_ratings['movieId'] = movieId
movieId_ratings['ratings'] = rating

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=193609.0), HTML(value='')))




In [13]:
movieId_ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193609 entries, 0 to 193608
Data columns (total 2 columns):
movieId    193609 non-null int64
ratings    9724 non-null float64
dtypes: float64(1), int64(1)
memory usage: 3.0 MB


In [14]:
movieId_ratings.dropna(inplace=True)

In [15]:
movieId_ratings.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9724 entries, 0 to 193608
Data columns (total 2 columns):
movieId    9724 non-null int64
ratings    9724 non-null float64
dtypes: float64(1), int64(1)
memory usage: 227.9 KB


In [16]:
movies_with_ratings = movies.join(movieId_ratings.set_index('movieId'), on='movieId')

In [17]:
movies_with_ratings.dropna(inplace=True)
movies_with_ratings

Unnamed: 0,movieId,title,genres,ratings
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3.920930
1,2,Jumanji (1995),Adventure|Children|Fantasy,3.431818
2,3,Grumpier Old Men (1995),Comedy|Romance,3.259615
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,2.357143
4,5,Father of the Bride Part II (1995),Comedy,3.071429
5,6,Heat (1995),Action|Crime|Thriller,3.946078
6,7,Sabrina (1995),Comedy|Romance,3.185185
7,8,Tom and Huck (1995),Adventure|Children,2.875000
8,9,Sudden Death (1995),Action,3.125000
9,10,GoldenEye (1995),Action|Adventure|Thriller,3.496212


### Подготовим наш датасет

In [21]:
from sklearn.model_selection import train_test_split

In [22]:
def change_string(s):
    return " ".join(s.replace(' ', '').replace('-','').split('|'))

In [24]:
movie_genres = [change_string(g) for g in movies_with_ratings.genres.values]

In [25]:
count_vect = CountVectorizer()

In [26]:
X_train_counts = count_vect.fit_transform(movie_genres)

In [27]:
X_train_counts.toarray()

array([[0, 1, 1, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [1, 0, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [28]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [29]:
y = movies_with_ratings['ratings'] 

In [30]:
X_test, X_train, y_test, y_train = train_test_split(X_train_tfidf, y, test_size=0.3, random_state = 1000)

In [33]:
from sklearn.linear_model import LinearRegression
algo = LinearRegression()

In [35]:
algo =  LinearRegression().fit(X_train, y_train) 

### Предскажем

In [39]:
y_pred = algo.predict(X_test)
y_pred

array([3.48844733, 3.51841891, 3.42807802, ..., 3.46891167, 3.40478545,
       3.75      ])

In [40]:
len(y_test)

6806

In [42]:
print("RMSE =",np.sqrt(np.mean((y_test-y_pred)**2)))

RMSE = 0.82975199272167
