Использовать dataset MovieLens

Построить рекомендации (регрессия, предсказываем оценку) на фичах:
TF-IDF на тегах и жанрах
Средние оценки (+ median, variance, etc.) пользователя и фильма

Оценить RMSE на тестовой выборке

In [210]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from tqdm import tqdm_notebook

In [5]:
movies = pd.read_csv('movies.csv')
tags = pd.read_csv('tags.csv')
ratings = pd.read_csv('ratings.csv')

In [6]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [7]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [8]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


Считаем среднюю, медиану, мин и макс оценку по пользователю
======

In [73]:
ratings.head()
ratings_new = ratings.copy()

In [74]:
del ratings_new['timestamp']

In [75]:
median_rating = pd.DataFrame(ratings_new.groupby('userId').rating.median())

In [76]:
median_rating = median_rating.reset_index().rename(columns={'rating' : 'median_user_rating'})
median_rating.head()

Unnamed: 0,userId,median_user_rating
0,1,5.0
1,2,4.0
2,3,0.5
3,4,4.0
4,5,4.0


In [77]:
variance_rating = pd.DataFrame(ratings_new.groupby('userId').rating.var().round(2))

In [78]:
variance_rating = variance_rating.reset_index().rename(columns={'rating' : 'variance_user_rating'})
variance_rating.head()

Unnamed: 0,userId,variance_user_rating
0,1,0.64
1,2,0.65
2,3,4.37
3,4,1.73
4,5,0.98


In [79]:
min_rating = pd.DataFrame(ratings_new.groupby('userId').rating.min())

In [80]:
min_rating = min_rating.reset_index().rename(columns={'rating' : 'min_user_rating'})
min_rating.head()

Unnamed: 0,userId,min_user_rating
0,1,1.0
1,2,2.0
2,3,0.5
3,4,1.0
4,5,1.0


In [81]:
max_rating = pd.DataFrame(ratings_new.groupby('userId').rating.max())

In [82]:
max_rating = max_rating.reset_index().rename(columns={'rating' : 'max_user_rating'})
max_rating.head()

Unnamed: 0,userId,max_user_rating
0,1,5.0
1,2,5.0
2,3,5.0
3,4,5.0
4,5,5.0


In [83]:
mean_rating = pd.DataFrame(ratings_new.groupby('userId').rating.mean().round(2))

In [84]:
mean_rating = mean_rating.reset_index().rename(columns={'rating' : 'mean_user_rating'})
mean_rating.head()

Unnamed: 0,userId,mean_user_rating
0,1,4.37
1,2,3.95
2,3,2.44
3,4,3.56
4,5,3.64


In [85]:
ratings_full = pd.merge(median_rating,variance_rating, on='userId', how='left')

In [86]:
ratings_full = pd.merge(ratings_full, min_rating, on='userId', how='left')

In [87]:
ratings_full = pd.merge(ratings_full, max_rating, on='userId', how='left')

In [88]:
ratings_full = pd.merge(ratings_full, mean_rating, on='userId', how='left')

In [89]:
ratings_full.tail()

Unnamed: 0,userId,median_user_rating,variance_user_rating,min_user_rating,max_user_rating,mean_user_rating
605,606,4.0,0.52,0.5,5.0,3.66
606,607,4.0,0.93,1.0,5.0,3.79
607,608,3.0,1.16,0.5,5.0,3.13
608,609,3.0,0.2,3.0,4.0,3.27
609,610,3.5,0.74,0.5,5.0,3.69


In [90]:
ratings_full.shape

(610, 6)

Считаем среднюю, медиану, мин и макс оценку по фильму
========

In [106]:
ratings_movies = ratings.copy()
ratings_movies.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [107]:
del ratings_movies['timestamp']

In [108]:
median_rating_movies = pd.DataFrame(ratings_movies.groupby('movieId').rating.median())

In [109]:
median_rating_movies = median_rating_movies.reset_index().rename(columns={'rating' : 'median_movie_rating'})
median_rating_movies.head()

Unnamed: 0,movieId,median_movie_rating
0,1,4.0
1,2,3.5
2,3,3.0
3,4,3.0
4,5,3.0


In [110]:
variance_rating_movies = pd.DataFrame(ratings_movies.groupby('movieId').rating.var().round(2))

In [111]:
variance_rating_movies = variance_rating_movies.reset_index().rename(columns={'rating' : 'variance_movie_rating'})
variance_rating_movies.head()

Unnamed: 0,movieId,variance_movie_rating
0,1,0.7
1,2,0.78
2,3,1.11
3,4,0.73
4,5,0.82


In [112]:
min_rating_movies = pd.DataFrame(ratings_movies.groupby('movieId').rating.min())

In [113]:
min_rating_movies = min_rating_movies.reset_index().rename(columns={'rating' : 'min_movie_rating'})
min_rating_movies.head()

Unnamed: 0,movieId,min_movie_rating
0,1,0.5
1,2,0.5
2,3,0.5
3,4,1.0
4,5,0.5


In [114]:
max_rating_movies = pd.DataFrame(ratings_movies.groupby('movieId').rating.max())

In [115]:
max_rating_movies = max_rating_movies.reset_index().rename(columns={'rating' : 'max_movie_rating'})
max_rating_movies.head()

Unnamed: 0,movieId,max_movie_rating
0,1,5.0
1,2,5.0
2,3,5.0
3,4,3.0
4,5,5.0


In [116]:
mean_rating_movies = pd.DataFrame(ratings_movies.groupby('movieId').rating.mean().round(2))

In [117]:
mean_rating_movies = mean_rating_movies.reset_index().rename(columns={'rating' : 'mean_movie_rating'})
mean_rating_movies.head()

Unnamed: 0,movieId,mean_movie_rating
0,1,3.92
1,2,3.43
2,3,3.26
3,4,2.36
4,5,3.07


In [118]:
ratings_movie_full = pd.merge(median_rating_movies, variance_rating_movies, on='movieId', how='left')

In [119]:
ratings_movie_full = pd.merge(ratings_movie_full, min_rating_movies, on='movieId', how='left')

In [120]:
ratings_movie_full = pd.merge(ratings_movie_full, max_rating_movies, on='movieId', how='left')

In [121]:
ratings_movie_full = pd.merge(ratings_movie_full, mean_rating_movies, on='movieId', how='left')

In [124]:
ratings_movie_full.head()

Unnamed: 0,movieId,median_movie_rating,variance_movie_rating,min_movie_rating,max_movie_rating,mean_movie_rating
0,1,4.0,0.7,0.5,5.0,3.92
1,2,3.5,0.78,0.5,5.0,3.43
2,3,3.0,1.11,0.5,5.0,3.26
3,4,3.0,0.73,1.0,3.0,2.36
4,5,3.0,0.82,0.5,5.0,3.07


In [126]:
ratings_movie_full.shape

(9724, 6)

In [128]:
ratings_movie_full.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9724 entries, 0 to 9723
Data columns (total 6 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   movieId                9724 non-null   int64  
 1   median_movie_rating    9724 non-null   float64
 2   variance_movie_rating  6278 non-null   float64
 3   min_movie_rating       9724 non-null   float64
 4   max_movie_rating       9724 non-null   float64
 5   mean_movie_rating      9724 non-null   float64
dtypes: float64(5), int64(1)
memory usage: 531.8 KB


In [183]:
ratings_movie_full.isnull().any()

movieId                  False
median_movie_rating      False
variance_movie_rating     True
min_movie_rating         False
max_movie_rating         False
mean_movie_rating        False
dtype: bool

В variance_movie_rating - есть NaN значения

In [187]:
ratings_movie_full = ratings_movie_full.fillna(0.0)

In [188]:
ratings_movie_full.isnull().any()

movieId                  False
median_movie_rating      False
variance_movie_rating    False
min_movie_rating         False
max_movie_rating         False
mean_movie_rating        False
dtype: bool

TF-IDF
=====

По жанрам
====

In [134]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [131]:
movies_new = movies.copy()
def new_str(s):
    return ' '.join(s.lower().replace(' ', '').replace('-', '').split('|'))

In [135]:
movie_genres = [new_str(g) for g in movies.genres.values]

In [139]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(movie_genres)

In [143]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [159]:
movie_genres = pd.DataFrame(X_train_tfidf.toarray(), columns=count_vect.get_feature_names()).reset_index()

In [165]:
movie_genres = movie_genres.rename(columns={'index':'movieId'})
movie_genres.head()

Unnamed: 0,movieId,action,adventure,animation,children,comedy,crime,documentary,drama,fantasy,...,horror,imax,musical,mystery,nogenreslisted,romance,scifi,thriller,war,western
0,0,0.0,0.416846,0.516225,0.504845,0.267586,0.0,0.0,0.0,0.48299,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,0.0,0.512361,0.0,0.620525,0.0,0.0,0.0,0.0,0.593662,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,0.0,0.0,0.0,0.0,0.570915,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.821009,0.0,0.0,0.0,0.0
3,3,0.0,0.0,0.0,0.0,0.505015,0.0,0.0,0.466405,0.0,...,0.0,0.0,0.0,0.0,0.0,0.726241,0.0,0.0,0.0,0.0
4,4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


По тэгам
====

In [168]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [170]:
tags_new = tags.copy()
del tags_new['timestamp']

In [172]:
movies_with_tags = movies.join(tags_new.set_index('movieId'), on='movieId')

In [174]:
movies_with_tags.head()

Unnamed: 0,movieId,title,genres,userId,tag
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336.0,pixar
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,474.0,pixar
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,567.0,fun
1,2,Jumanji (1995),Adventure|Children|Fantasy,62.0,fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy,62.0,magic board game


In [176]:
movies_with_tags.dropna(inplace=True)

In [177]:
tag_strings = []
movies = []

for movie, group in movies_with_tags.groupby('movieId'):
    tag_strings.append(' '.join([str(s).replace(' ', '').replace('-', '') for s in group.tag.values]))
    movies.append(movie)

In [178]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(tag_strings)

In [179]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [180]:
movies_with_tags = pd.DataFrame(X_train_tfidf.toarray(), columns=count_vect.get_feature_names()).reset_index()

In [181]:
movies_with_tags.head()

Unnamed: 0,index,06oscarnominatedbestmovieanimation,1900s,1920s,1950s,1960s,1970s,1980s,1990s,2001like,...,worldwari,worldwarii,writing,wrongfulimprisonment,wry,youngermen,zither,zoekazan,zombies,zooeydeschanel
0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [182]:
movies_with_tags = movies_with_tags.rename(columns={'index':'movieId'})
movies_with_tags.head()

Unnamed: 0,movieId,06oscarnominatedbestmovieanimation,1900s,1920s,1950s,1960s,1970s,1980s,1990s,2001like,...,worldwari,worldwarii,writing,wrongfulimprisonment,wry,youngermen,zither,zoekazan,zombies,zooeydeschanel
0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Объединяем датасеты
======

In [189]:
ratings_movie_full.head()

Unnamed: 0,movieId,median_movie_rating,variance_movie_rating,min_movie_rating,max_movie_rating,mean_movie_rating
0,1,4.0,0.7,0.5,5.0,3.92
1,2,3.5,0.78,0.5,5.0,3.43
2,3,3.0,1.11,0.5,5.0,3.26
3,4,3.0,0.73,1.0,3.0,2.36
4,5,3.0,0.82,0.5,5.0,3.07


In [190]:
ratings_full.head()

Unnamed: 0,userId,median_user_rating,variance_user_rating,min_user_rating,max_user_rating,mean_user_rating
0,1,5.0,0.64,1.0,5.0,4.37
1,2,4.0,0.65,2.0,5.0,3.95
2,3,0.5,4.37,0.5,5.0,2.44
3,4,4.0,1.73,1.0,5.0,3.56
4,5,4.0,0.98,1.0,5.0,3.64


In [197]:
df = ratings[['userId', 'movieId', 'rating']]
df = pd.merge(df, ratings_movie_full, how='left', on='movieId')

In [198]:
df = pd.merge(df, ratings_full, how='left', on='userId')

In [199]:
df = pd.merge(df, movie_genres, how='left', on='movieId')

In [200]:
df = pd.merge(df, movies_with_tags, how='left', on='movieId')

In [201]:
df.head()

Unnamed: 0,userId,movieId,rating,median_movie_rating,variance_movie_rating,min_movie_rating,max_movie_rating,mean_movie_rating,median_user_rating,variance_user_rating,...,worldwari,worldwarii,writing,wrongfulimprisonment,wry,youngermen,zither,zoekazan,zombies,zooeydeschanel
0,1,1,4.0,4.0,0.7,0.5,5.0,3.92,5.0,0.64,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,3,4.0,3.0,1.11,0.5,5.0,3.26,5.0,0.64,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,6,4.0,4.0,0.67,1.0,5.0,3.95,5.0,0.64,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,47,5.0,4.0,0.85,0.5,5.0,3.98,5.0,0.64,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,50,5.0,4.5,0.64,1.0,5.0,4.24,5.0,0.64,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [205]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100836 entries, 0 to 100835
Columns: 1505 entries, userId to zooeydeschanel
dtypes: float64(1503), int64(2)
memory usage: 1.1 GB


In [206]:
df.dropna(inplace=True)

In [208]:
df.isnull().any().any()

False

Нормализуем данные
=====

In [211]:
stadart_scaler = StandardScaler()
df_norm = pd.DataFrame(stadart_scaler.fit_transform(df), columns=df.columns)
df_norm.head()

Unnamed: 0,userId,movieId,rating,median_movie_rating,variance_movie_rating,min_movie_rating,max_movie_rating,mean_movie_rating,median_user_rating,variance_user_rating,...,worldwari,worldwarii,writing,wrongfulimprisonment,wry,youngermen,zither,zoekazan,zombies,zooeydeschanel
0,-1.687275,-1.511071,0.378886,0.622337,-0.526928,-0.830226,0.25682,0.615563,2.53071,-0.62823,...,-0.019119,-0.025023,-0.03621,-0.005518,-0.005518,-0.030239,-0.049726,-0.047524,-0.0825,-0.022079
1,-1.687275,-1.506722,0.378886,-1.143958,0.816215,-0.830226,0.25682,-0.667253,2.53071,-0.62823,...,-0.019119,-0.025023,-0.03621,-0.005518,-0.005518,-0.030239,-0.049726,-0.047524,-0.0825,-0.022079
2,-1.687275,-1.500199,0.378886,0.622337,-0.625206,-0.11741,0.25682,0.673872,2.53071,-0.62823,...,-0.019119,-0.025023,-0.03621,-0.005518,-0.005518,-0.030239,-0.049726,-0.047524,-0.0825,-0.022079
3,-1.687275,-1.411055,1.333557,0.622337,-0.035534,-0.830226,0.25682,0.732182,2.53071,-0.62823,...,-0.019119,-0.025023,-0.03621,-0.005518,-0.005518,-0.030239,-0.049726,-0.047524,-0.0825,-0.022079
4,-1.687275,-1.404532,1.333557,1.505485,-0.723485,-0.11741,0.25682,1.237534,2.53071,-0.62823,...,-0.019119,-0.025023,-0.03621,-0.005518,-0.005518,-0.030239,-0.049726,-0.047524,-0.0825,-0.022079


In [212]:
df_norm.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32839 entries, 0 to 32838
Columns: 1505 entries, userId to zooeydeschanel
dtypes: float64(1505)
memory usage: 377.1 MB


Делим данные на тестовые и обучающие
======

In [213]:
y = df_norm['rating']
x = df_norm.drop(columns=['rating'])

In [214]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

Модель Регрессии
====

In [215]:
lr_t = LinearRegression(n_jobs=-1)
lr_t.fit(x_train, y_train)
y_pred_lr = lr_t.predict(x_test)
print('RMSE: ', np.sqrt(mean_squared_error(y_test, y_pred_lr)))

RMSE:  523272005516.142
