In [1]:
import pandas as pd
import numpy as np

from tqdm.notebook import tqdm

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.neighbors import NearestNeighbors

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV

from sklearn.linear_model import Ridge
from sklearn.tree import DecisionTreeRegressor

from sklearn.metrics import mean_absolute_error, mean_squared_error

%matplotlib inline

In [2]:
def change_string(s):
    return ' '.join(s.replace(' ', '').replace('-', '').split('|'))

In [3]:
links = pd.read_csv('D:/GitRepo/Нетология/Рекомендательные системы/netology-recsys-master/netology-recsys-master/lecture-1/links.csv')
movies = pd.read_csv('D:/GitRepo/Нетология/Рекомендательные системы/netology-recsys-master/netology-recsys-master/lecture-1/movies.csv')
ratings = pd.read_csv('D:/GitRepo/Нетология/Рекомендательные системы/netology-recsys-master/netology-recsys-master/lecture-1/ratings.csv')
tags = pd.read_csv('D:/GitRepo/Нетология/Рекомендательные системы/netology-recsys-master/netology-recsys-master/lecture-1/tags.csv')

In [4]:
movies.tail(6)

Unnamed: 0,movieId,title,genres
9736,193579,Jon Stewart Has Left the Building (2015),Documentary
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation
9741,193609,Andrew Dice Clay: Dice Rules (1991),Comedy


In [5]:
ratings.head(3)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224


In [6]:
tags.head(3)

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992


### Pre-processing

USERS&MOVIES:

In [7]:
df_1 = ratings[['userId', 'movieId', 'rating']]
df_1.head(3)

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0


In [8]:
# Users
df_1_1 = pd.DataFrame(df_1.groupby('userId').rating.mean()).rename(columns={'rating': 'user_mean_rating'})
df_1_1 = pd.merge(df_1_1, pd.DataFrame(df_1.groupby('userId').rating.median()).rename(columns={'rating': 'user_median_rating'}),
         left_index=True,
         right_index=True)
df_1_1 = pd.merge(df_1_1, pd.DataFrame(df_1.groupby('userId').rating.var()).rename(columns={'rating': 'user_variance_rating'}),
         left_index=True,
         right_index=True)
df_1_1 = pd.merge(df_1_1, pd.DataFrame(df_1.groupby('userId').rating.max()).rename(columns={'rating': 'user_max_rating'}),
         left_index=True,
         right_index=True)
df_1_1 = pd.merge(df_1_1, pd.DataFrame(df_1.groupby('userId').rating.min()).rename(columns={'rating': 'user_min_rating'}),
         left_index=True,
         right_index=True)
df_1_1.head(2)

Unnamed: 0_level_0,user_mean_rating,user_median_rating,user_variance_rating,user_max_rating,user_min_rating
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,4.366379,5.0,0.640077,5.0,1.0
2,3.948276,4.0,0.649015,5.0,2.0


In [9]:
# Movies
df_1_2 = pd.DataFrame(df_1.groupby('movieId').rating.mean()).rename(columns={'rating': 'movie_mean_rating'})
df_1_2 = pd.merge(df_1_2, pd.DataFrame(df_1.groupby('movieId').rating.median()).rename(columns={'rating': 'movie_median_rating'}),
         left_index=True,
         right_index=True)
df_1_2 = pd.merge(df_1_2, pd.DataFrame(df_1.groupby('movieId').rating.var()).rename(columns={'rating': 'movie_variance_rating'}),
         left_index=True,
         right_index=True).fillna(0)
df_1_2 = pd.merge(df_1_2, pd.DataFrame(df_1.groupby('movieId').rating.max()).rename(columns={'rating': 'movie_max_rating'}),
         left_index=True,
         right_index=True)
df_1_2 = pd.merge(df_1_2, pd.DataFrame(df_1.groupby('movieId').rating.min()).rename(columns={'rating': 'movie_min_rating'}),
         left_index=True,
         right_index=True)
df_1_2.head(2)

Unnamed: 0_level_0,movie_mean_rating,movie_median_rating,movie_variance_rating,movie_max_rating,movie_min_rating
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,3.92093,4.0,0.69699,5.0,0.5
2,3.431818,3.5,0.777419,5.0,0.5


GENRES:

In [10]:
df_2 = movies[['movieId', 'genres']]
movie_genres = [change_string(g) for g in df_2.genres.values]
count_vect = CountVectorizer()
buffer = count_vect.fit_transform(movie_genres)
tfidf_transformer  = TfidfTransformer()
buffer = tfidf_transformer.fit_transform(buffer)
df_2 = pd.DataFrame(buffer.toarray(), columns=count_vect.get_feature_names())
df_2 = pd.merge(movies[['movieId']], df_2, how='left', left_index=True, right_index=True)
df_2.index = df_2.movieId
df_2.drop(columns=['movieId'], inplace=True)
df_2.fillna(0.0, inplace=True)
df_2.head(2)

Unnamed: 0_level_0,action,adventure,animation,children,comedy,crime,documentary,drama,fantasy,filmnoir,horror,imax,musical,mystery,nogenreslisted,romance,scifi,thriller,war,western
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,0.0,0.416846,0.516225,0.504845,0.267586,0.0,0.0,0.0,0.48299,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.512361,0.0,0.620525,0.0,0.0,0.0,0.0,0.593662,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


TAGS:

In [11]:
movie_gtags = [change_string(g) for g in tags.tag.values]
count_vect = CountVectorizer()
buffer = count_vect.fit_transform(movie_gtags)
buffer1 = pd.DataFrame(buffer.toarray(), columns=count_vect.get_feature_names())
common_tags = buffer1.sum().sort_values(ascending=False)[buffer1.sum().sort_values(ascending=False) > 5].index
tfidf_transformer  = TfidfTransformer()
buffer = tfidf_transformer.fit_transform(buffer)
df_3 = pd.DataFrame(buffer.toarray(), columns=count_vect.get_feature_names())
df_3 = df_3[common_tags]
df_3 = pd.merge(movies[['movieId']], df_3, how='left', left_index=True, right_index=True)
df_3.index = df_3.movieId
df_3.drop(columns=['movieId'], inplace=True)
df_3.fillna(0.0, inplace=True)
df_3.head(2)

Unnamed: 0_level_0,innetflixqueue,atmospheric,thoughtprovoking,funny,scifi,surreal,superhero,disney,quirky,religion,...,wedding,zombies,twins,hitmen,visuallystunning,fantasy,dystopia,gambling,greatsoundtrack,gothic
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Final dataset:

In [12]:
df = ratings[['userId', 'movieId', 'rating']]
df = pd.merge(df, df_1_1, how='left', left_on='userId', right_index=True)
df = pd.merge(df, df_1_2, how='left', left_on='movieId', right_index=True)
df = pd.merge(df, df_2, how='left', left_on='movieId', right_index=True)
df = pd.merge(df, df_3, how='left', left_on='movieId', right_index=True)
df

Unnamed: 0,userId,movieId,rating,user_mean_rating,user_median_rating,user_variance_rating,user_max_rating,user_min_rating,movie_mean_rating,movie_median_rating,...,wedding,zombies,twins,hitmen,visuallystunning,fantasy_y,dystopia,gambling,greatsoundtrack,gothic
0,1,1,4.0,4.366379,5.0,0.640077,5.0,1.0,3.920930,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,3,4.0,4.366379,5.0,0.640077,5.0,1.0,3.259615,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,6,4.0,4.366379,5.0,0.640077,5.0,1.0,3.946078,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,47,5.0,4.366379,5.0,0.640077,5.0,1.0,3.975369,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,50,5.0,4.366379,5.0,0.640077,5.0,1.0,4.237745,4.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100831,610,166534,4.0,3.688556,3.5,0.735173,5.0,0.5,3.333333,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100832,610,168248,5.0,3.688556,3.5,0.735173,5.0,0.5,4.142857,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100833,610,168250,5.0,3.688556,3.5,0.735173,5.0,0.5,3.633333,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100834,610,168252,5.0,3.688556,3.5,0.735173,5.0,0.5,4.280000,4.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
df.isna().sum().sort_values()

userId         0
creepy         0
stylized       0
family         0
martialarts    0
              ..
aliens         0
dreamlike      0
blackcomedy    0
highschool     0
gothic         0
Length: 170, dtype: int64

### Scaling features:

In [14]:
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)
X.head(3)

Unnamed: 0,userId,movieId,rating,user_mean_rating,user_median_rating,user_variance_rating,user_max_rating,user_min_rating,movie_mean_rating,movie_median_rating,...,wedding,zombies,twins,hitmen,visuallystunning,fantasy_y,dystopia,gambling,greatsoundtrack,gothic
0,-1.780374,-0.54697,0.478112,1.873188,2.568746,-0.583897,0.182856,0.145061,0.742308,0.696738,...,-0.031664,-0.053145,-0.026544,-0.038469,-0.045245,-0.04676,-0.04264,-0.028528,-0.015104,-0.050549
1,-1.780374,-0.546914,0.478112,1.873188,2.568746,-0.583897,0.182856,0.145061,-0.428247,-0.93145,...,-0.031664,-0.053145,-0.026544,-0.038469,-0.045245,-0.04676,-0.04264,-0.028528,-0.015104,-0.050549
2,-1.780374,-0.54683,0.478112,1.873188,2.568746,-0.583897,0.182856,0.145061,0.786821,0.696738,...,-0.031664,-0.053145,-0.026544,-0.038469,-0.045245,-0.04676,-0.04264,-0.028528,-0.015104,-0.050549


In [28]:
X.shape

(100836, 170)

### Sampling

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X.drop(columns=['rating']), 
                                                    X['rating'], 
                                                    test_size=0.1)
X_train, X_dev, y_train, y_dev = train_test_split(X_train, 
                                                  y_train, 
                                                  test_size=0.1)
print('Train:', X_train.shape, y_train.shape)
print('Dev:', X_dev.shape, y_dev.shape)
print('Test:', X_test.shape, y_test.shape)

Train: (81676, 169) (81676,)
Dev: (9076, 169) (9076,)
Test: (10084, 169) (10084,)


### Regression

In [16]:
params = {'alpha':list(np.arange(0.1, 100.0, 0.1)),
          'max_iter':[500, 1000, 2000, 5000, 10000],
          'tol':[1e-3, 1e-2, 1e-4, 1e-5],
          'solver':['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']          
          }
linear_model = RandomizedSearchCV(Ridge(), params, cv=10, random_state=13, n_iter=10)
linear_model = linear_model.fit(X_train,y_train).best_estimator_
linear_model

Ridge(alpha=36.2, copy_X=True, fit_intercept=True, max_iter=2000,
      normalize=False, random_state=None, solver='sparse_cg', tol=0.0001)

In [17]:
print('RMSE:', np.sqrt(mean_squared_error(y_dev, linear_model.predict(X_dev))))
print('MAE:', mean_absolute_error(y_dev, linear_model.predict(X_dev)))

RMSE: 0.7766534922980494
MAE: 0.5909364063603018
