1. Использовать dataset MovieLens

2. Построить рекомендации (регрессия, предсказываем оценку) на фичах:
   - TF-IDF на тегах и жанрах
   - Средние оценки (+ median, variance, etc.) пользователя и фильма

3. Оценить RMSE на тестовой выборке

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso, Ridge
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer

from tqdm import tqdm_notebook

# Подготовка данных

In [2]:
links = pd.read_csv('links.csv')
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')
tags = pd.read_csv('tags.csv')

In [3]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [5]:
movies['genres_tf_idf'] = movies.apply(lambda r: ' '.join(r['genres'].split('|')), axis=1)

In [6]:
movies['tags_tf_idf'] = movies.apply(lambda r: ' '.join(tags[tags['movieId']==r['movieId']]['tag'].values), axis=1)

In [7]:
movies.drop('genres',axis=1,inplace=True)
movies.head()

Unnamed: 0,movieId,title,genres_tf_idf,tags_tf_idf
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,pixar pixar fun
1,2,Jumanji (1995),Adventure Children Fantasy,fantasy magic board game Robin Williams game
2,3,Grumpier Old Men (1995),Comedy Romance,moldy old
3,4,Waiting to Exhale (1995),Comedy Drama Romance,
4,5,Father of the Bride Part II (1995),Comedy,pregnancy remake


In [8]:
def get_tfidf(df, fld):
    cnt_vec = CountVectorizer()
    processed = cnt_vec.fit_transform(df[fld])
    tfidf = TfidfTransformer()
    tfidf_dense = tfidf.fit_transform(processed).todense()
    columns = [None for i in range(len(cnt_vec.vocabulary_))]
    for k in cnt_vec.vocabulary_:
        columns[cnt_vec.vocabulary_[k]] = k
    df_tfidf = pd.DataFrame(tfidf_dense, columns=columns)
    return pd.concat((df.drop(fld,1), df_tfidf), axis=1)

Будем применять tf-idf и на тэгах, и на жанрах

In [9]:
movies_with_tfidf = get_tfidf(movies, 'genres_tf_idf')
movies_with_tfidf = get_tfidf(movies_with_tfidf, 'tags_tf_idf')
movies_with_tfidf.head()

Unnamed: 0,movieId,title,action,adventure,animation,children,comedy,crime,documentary,drama,...,york,you,younger,your,zellweger,zither,zoe,zombie,zombies,zooey
0,1,Toy Story (1995),0.0,0.416846,0.516225,0.504845,0.267586,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Jumanji (1995),0.0,0.512361,0.0,0.620525,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Grumpier Old Men (1995),0.0,0.0,0.0,0.0,0.570915,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,Waiting to Exhale (1995),0.0,0.0,0.0,0.0,0.505015,0.0,0.0,0.466405,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,Father of the Bride Part II (1995),0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
movies_with_tfidf.columns

Index(['movieId', 'title', 'action', 'adventure', 'animation', 'children',
       'comedy', 'crime', 'documentary', 'drama',
       ...
       'york', 'you', 'younger', 'your', 'zellweger', 'zither', 'zoe',
       'zombie', 'zombies', 'zooey'],
      dtype='object', length=1770)

In [12]:
fc = [c for c in movies_with_tfidf.columns if c not in ['genres', 'genres_tf_idf', 'tags_tf_idf']]

In [13]:
movies_for_ds = movies_with_tfidf[fc]

In [14]:
movies_with_ratings = pd.merge(ratings, movies_for_ds, on='movieId')

Для прогнозирования рейтинга для конкретного пользователя обучимся только на его фильмах

Выберем пользователя с большим количеством оцененных фильмов

In [15]:
ratings.groupby('userId')[['movieId']].count().sort_values('movieId', ascending=False).head()

Unnamed: 0_level_0,movieId
userId,Unnamed: 1_level_1
414,2698
599,2478
474,2108
448,1864
274,1346


In [16]:
TARGET_USER = 414

Создадим процедуру для тренировки на фильмах, оцененных заданным пользователем, с валидацией и прогнозом на отложенной выборке строк для сравнения с реальными оценками пользователя

In [18]:
def get_prediction(df, target_user, random_state = 7):
    df = df[df['userId']==target_user]
    drop_col_list = ['userId','movieId','timestamp','title','rating']
    df_for_user, df_for_user_test = train_test_split(df, test_size = 0.1, random_state = random_state)
    X, y = df_for_user.drop(drop_col_list,axis=1,inplace=False), df_for_user['rating']
    del df_for_user
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state = random_state)
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_valid = sc.transform(X_valid)
    X_test = sc.transform(df_for_user_test.drop(drop_col_list,axis=1,inplace=False))
    
    models = [Lasso, Ridge, DecisionTreeRegressor, SVR, RandomForestRegressor]
    models_results = {}
    for m in tqdm_notebook(models):
        model = m()
        model.fit(X_train, y_train)
        models_results[m.__name__] = [model,
                                      model.score(X_train ,y_train),
                                      model.score(X_valid, y_valid),
                                      mean_squared_error(model.predict(X_train), y_train)**.5,
                                      mean_squared_error(model.predict(X_valid), y_valid)**.5
                                     ]
        print("{}. r2_train: {:.4f}, r2_test: {:.4f}, rmse_train: {:.4f}, rmse_test: {:.4f}".format(
            m.__name__, 
            models_results[m.__name__][1],
            models_results[m.__name__][2],
            models_results[m.__name__][3],
            models_results[m.__name__][4]
        ))
    
    best_ = sorted(models_results.values(), key=lambda t: t[4], reverse=False)[0]
    best_model = best_[0]
    print('-'*100)
    print(f'BEST VALIDATION_RMSE = {best_[4]}\nMODEL = {best_model}')
    rslt = pd.concat([df_for_user_test.reset_index().drop('index',1),
                      pd.DataFrame(columns=['predicted_rating'],data=best_model.predict(X_test))],
                     axis=1)
    print(f"RMSE TEST = {mean_squared_error(rslt['predicted_rating'], rslt['rating'])**.5}")
    return rslt[drop_col_list + ['predicted_rating'] + [c for c in rslt.columns if c not in drop_col_list + ['predicted_rating']]]

In [19]:
%%time
target_user_ratings = get_prediction(movies_with_ratings, TARGET_USER)

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

Lasso. r2_train: 0.0000, r2_test: -0.0005, rmse_train: 0.9309, rmse_test: 0.9309
Ridge. r2_train: 0.3431, r2_test: -0.0142, rmse_train: 0.7545, rmse_test: 0.9373
DecisionTreeRegressor. r2_train: 0.5294, r2_test: -0.1565, rmse_train: 0.6386, rmse_test: 1.0009
SVR. r2_train: 0.3128, r2_test: 0.1146, rmse_train: 0.7717, rmse_test: 0.8758
RandomForestRegressor. r2_train: 0.4756, r2_test: -0.0089, rmse_train: 0.6741, rmse_test: 0.9348

----------------------------------------------------------------------------------------------------
BEST VALIDATION_RMSE = 0.8757575090342732
MODEL = SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)
RMSE TEST = 0.900768594006565
CPU times: user 33.9 s, sys: 3.87 s, total: 37.8 s
Wall time: 41.9 s


In [20]:
target_user_ratings.head()

Unnamed: 0,userId,movieId,timestamp,title,rating,predicted_rating,action,action.1,action.2,action.3,...,york,you,younger,your,zellweger,zither,zoe,zombie,zombies,zooey
0,414,205,961438413,Unstrung Heroes (1995),3.0,3.40024,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,414,1257,961595667,Better Off Dead... (1985),5.0,2.901,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,414,6,961515642,Heat (1995),3.0,3.025846,0.549328,0.0,0.549328,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,414,2724,1025108900,Runaway Bride (1999),1.0,2.900074,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,414,43928,1216151152,Ultraviolet (2006),2.5,3.004893,0.384787,0.0,0.384787,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Добавим средние оценки пользователя и фильмов для обучения по всем пользователям

In [21]:
users_rating = ratings.groupby('userId')['rating'].agg(['mean','median','std','var']).reset_index()
movie_rating = ratings.groupby('movieId')['rating'].agg(['mean','median','std','var']).reset_index()

In [22]:
%%time
movies_with_ratings = movies_with_ratings.merge(users_rating, on='userId')
movies_with_ratings = movies_with_ratings.merge(movie_rating, on='movieId')
del users_rating, movie_rating

CPU times: user 7.14 s, sys: 6.45 s, total: 13.6 s
Wall time: 14.1 s


Сделаем процедуру для тренировки на фильмах, оцененных всеми пользователеми, с валидацией и прогнозом на отложенной выборке строк для сравнения с реальными оценками пользователя

In [24]:
def get_prediction2(df, target_user, random_state = 7):
    
    df = df[df['userId']==target_user]
    drop_col_list = ['userId','movieId','timestamp','title','rating']
    df_for_user, df_for_user_test = train_test_split(df, test_size = 0.1, random_state = random_state)
    X, y = df_for_user.drop(drop_col_list,axis=1,inplace=False), df_for_user['rating']
    del df_for_user
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state = random_state)
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_valid = sc.transform(X_valid)
    X_test = sc.transform(df_for_user_test.drop(drop_col_list,axis=1,inplace=False))
    
    models = [Lasso, Ridge, DecisionTreeRegressor, SVR, RandomForestRegressor]
    models_results = {}
    for m in tqdm_notebook(models):
        model = m()
        model.fit(X_train, y_train)
        models_results[m.__name__] = [model,
                                      model.score(X_train ,y_train),
                                      model.score(X_valid, y_valid),
                                      mean_squared_error(model.predict(X_train), y_train)**.5,
                                      mean_squared_error(model.predict(X_valid), y_valid)**.5
                                     ]
        print("{}. r2_train: {:.4f}, r2_test: {:.4f}, rmse_train: {:.4f}, rmse_test: {:.4f}".format(
            m.__name__, 
            models_results[m.__name__][1],
            models_results[m.__name__][2],
            models_results[m.__name__][3],
            models_results[m.__name__][4]
        ))
   
    best_ = sorted(models_results.values(), key=lambda t: t[4], reverse=False)[0]
    best_model = best_[0]
    print('-'*100)
    print(f'BEST VALIDATION_RMSE = {best_[4]}\nMODEL = {best_model}')
    rslt = pd.concat([df_for_user_test.reset_index().drop('index',1),
                      pd.DataFrame(columns=['predicted_rating'],data=best_model.predict(X_test))],
                     axis=1)
    print(f"RMSE TEST = {mean_squared_error(rslt['predicted_rating'], rslt['rating'])**.5}")
    return rslt[drop_col_list + ['predicted_rating'] + [c for c in rslt.columns if c not in drop_col_list + ['predicted_rating']]]

In [25]:
movies_with_ratings.fillna(0,inplace=True)

In [26]:
%%time
predicted_ratings = get_prediction2(movies_with_ratings, TARGET_USER)

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

Lasso. r2_train: 0.0000, r2_test: -0.0094, rmse_train: 0.9397, rmse_test: 0.9315
Ridge. r2_train: 0.6309, r2_test: 0.4208, rmse_train: 0.5709, rmse_test: 0.7056
DecisionTreeRegressor. r2_train: 0.9907, r2_test: -0.0152, rmse_train: 0.0904, rmse_test: 0.9342
SVR. r2_train: 0.6000, r2_test: 0.4538, rmse_train: 0.5943, rmse_test: 0.6852
RandomForestRegressor. r2_train: 0.9209, r2_test: 0.4702, rmse_train: 0.2643, rmse_test: 0.6748

----------------------------------------------------------------------------------------------------
BEST VALIDATION_RMSE = 0.6748459304167922
MODEL = RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_sc

RMSE улучшился при добавлении метрик по фильмам

In [27]:
predicted_ratings.head()

Unnamed: 0,userId,movieId,timestamp,title,rating,predicted_rating,action,action.1,action.2,action.3,...,zombies,zooey,mean_x,median_x,std_x,var_x,mean_y,median_y,std_y,var_y
0,414,1093,961436657,"Doors, The (1991)",3.0,2.865,0.0,0.0,0.0,0.0,...,0.0,0.0,3.391957,3.5,0.933307,0.871062,3.405405,3.5,0.991897,0.983859
1,414,59369,1283877039,Taken (2008),3.5,3.785,0.515078,0.0,0.515078,0.0,...,0.0,0.0,3.391957,3.5,0.933307,0.871062,3.619048,4.0,1.103263,1.217189
2,414,6,961515642,Heat (1995),3.0,4.16,0.549328,0.0,0.549328,0.0,...,0.0,0.0,3.391957,3.5,0.933307,0.871062,3.946078,4.0,0.817224,0.667856
3,414,3827,989248906,Space Cowboys (2000),2.0,2.83,0.41273,0.0,0.41273,0.0,...,0.0,0.0,3.391957,3.5,0.933307,0.871062,2.891304,3.0,0.811235,0.658103
4,414,41285,1216147618,Match Point (2005),3.0,3.46,0.0,0.0,0.0,0.0,...,0.0,0.0,3.391957,3.5,0.933307,0.871062,3.441176,3.5,0.966345,0.933824
