In [68]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer

In [39]:
movies = pd.read_csv('movies.csv')
tags = pd.read_csv('tags.csv')
ratings = pd.read_csv('ratings.csv')

In [7]:
movies['genres_split'] = movies.genres.str.split('|')
movies['genres_space'] = movies.apply(lambda r: ' '.join(r['genres_split']), axis=1)

In [8]:
movies.head()

Unnamed: 0,movieId,title,genres,genres_split,genres_cnt,genres_space
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,"[Adventure, Animation, Children, Comedy, Fantasy]",5,Adventure Animation Children Comedy Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy,"[Adventure, Children, Fantasy]",3,Adventure Children Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance,"[Comedy, Romance]",2,Comedy Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,"[Comedy, Drama, Romance]",3,Comedy Drama Romance
4,5,Father of the Bride Part II (1995),Comedy,[Comedy],1,Comedy


In [9]:
cnt_vec = CountVectorizer()
processed = cnt_vec.fit_transform(movies['genres_space'])
tfidf = TfidfTransformer()
tfidf_dense = tfidf.fit_transform(processed).todense()

In [12]:
columns_genres = [None for i in range(len(cnt_vec.vocabulary_))]
for k in cnt_vec.vocabulary_:
    columns_genres[cnt_vec.vocabulary_[k]] = k

In [96]:
df_tfidf = pd.DataFrame(tfidf_dense, columns=columns_genres)
df_tfidf =  pd.concat((movies, df_tfidf), axis=1)
df_tfidf = df_tfidf.drop(['genres', 'title'], axis=1)
df_tfidf.head()

Unnamed: 0,movieId,action,adventure,animation,children,comedy,crime,documentary,drama,fantasy,...,listed,musical,mystery,no,noir,romance,sci,thriller,war,western
0,1,0.0,0.416846,0.516225,0.504845,0.267586,0.0,0.0,0.0,0.48299,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,0.0,0.512361,0.0,0.620525,0.0,0.0,0.0,0.0,0.593662,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,0.0,0.0,0.0,0.0,0.570915,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.821009,0.0,0.0,0.0,0.0
3,4,0.0,0.0,0.0,0.0,0.505015,0.0,0.0,0.466405,0.0,...,0.0,0.0,0.0,0.0,0.0,0.726241,0.0,0.0,0.0,0.0
4,5,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
tags_space = tags.groupby('movieId')['tag'].apply(lambda x: "%s" % ' '.join(x)).reset_index()

In [77]:
tags_space.head()

Unnamed: 0,movieId,tag
0,1,pixar pixar fun
1,2,fantasy magic board game Robin Williams game
2,3,moldy old
3,5,pregnancy remake
4,7,remake


In [22]:
processed_tags = cnt_vec.fit_transform(tags_space['tag'])
tfidf_dense_tags = tfidf.fit_transform(processed_tags).todense()


In [23]:
columns_tags = [None for i in range(len(cnt_vec.vocabulary_))]
for k in cnt_vec.vocabulary_:
    columns_tags[cnt_vec.vocabulary_[k]] = k

In [99]:
df_tfidf_tags = pd.DataFrame(tfidf_dense_tags, columns=columns_tags)
df_tfidf_tags =  pd.concat((tags_space, df_tfidf_tags), axis=1)
df_tfidf_tags = df_tfidf_tags.drop(['tag'], axis=1)

In [112]:
tfidf_fin = pd.merge(df_tfidf, df_tfidf_tags, on='movieId', how='outer')

In [122]:
tfidf_fin = tfidf_fin.drop(['movieId'], axis=1)

In [129]:
tfidf_fin

Unnamed: 0,action_x,adventure_x,animation_x,children_x,comedy_x,crime_x,documentary_x,drama_x,fantasy_x,fi_x,...,york,you,younger,your,zellweger,zither,zoe,zombie,zombies,zooey
0,0.000000,0.416846,0.516225,0.504845,0.267586,0.000000,0.0,0.000000,0.482990,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.000000,0.512361,0.000000,0.620525,0.000000,0.000000,0.0,0.000000,0.593662,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.000000,0.000000,0.000000,0.000000,0.570915,0.000000,0.0,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.000000,0.000000,0.000000,0.000000,0.505015,0.000000,0.0,0.466405,0.000000,0.000000,...,,,,,,,,,,
4,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,0.0,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.549328,0.000000,0.000000,0.000000,0.000000,0.635947,0.0,0.000000,0.000000,0.000000,...,,,,,,,,,,
6,0.000000,0.000000,0.000000,0.000000,0.570915,0.000000,0.0,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.000000,0.636699,0.000000,0.771112,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,...,,,,,,,,,,
8,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,...,,,,,,,,,,
9,0.553065,0.629522,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,...,,,,,,,,,,


In [123]:
movies_with_tfidf = pd.concat((movies, tfidf_fin), axis=1)

In [124]:
movies_with_tfidf.head()

Unnamed: 0,movieId,title,genres,action_x,adventure_x,animation_x,children_x,comedy_x,crime_x,documentary_x,...,york,you,younger,your,zellweger,zither,zoe,zombie,zombies,zooey
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0.0,0.416846,0.516225,0.504845,0.267586,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Jumanji (1995),Adventure|Children|Fantasy,0.0,0.512361,0.0,0.620525,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Grumpier Old Men (1995),Comedy|Romance,0.0,0.0,0.0,0.0,0.570915,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,0.0,0.0,0.0,0.0,0.505015,0.0,0.0,...,,,,,,,,,,
4,5,Father of the Bride Part II (1995),Comedy,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [125]:
movies_for_ds = movies_with_tfidf.drop(['genres'], axis=1)

In [126]:
movies_with_ratings = pd.merge(ratings, movies_for_ds, on='movieId')

In [137]:
movies_with_ratings = movies_with_ratings.fillna(0)

In [138]:
TARGET_USER = 474

In [139]:
df_for_user = movies_with_ratings[movies_with_ratings['userId']==TARGET_USER]

In [140]:
X = df_for_user.drop(['userId', 'movieId', 'timestamp', 'title', 'rating'], axis=1)
y = df_for_user['rating']

In [141]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [142]:
models = [LinearRegression, Lasso, Ridge, RandomForestRegressor, SVR]

In [143]:
for m in models:
    model = m()
    model.fit(X_train, y_train)
    print("{}. r2_train: {:.4f}, r2_test: {:.4f}, mse_train: {:.4f}, mse_test: {:.4f}".format(
        m.__name__, model.score(X_train ,y_train), model.score(X_test, y_test),
        mean_squared_error(model.predict(X_train), y_train),
        mean_squared_error(model.predict(X_test), y_test)
    ))

LinearRegression. r2_train: 0.5362, r2_test: -10047512299447121577246720.0000, mse_train: 0.3137, mse_test: 7458282304353642219569152.0000
Lasso. r2_train: 0.0000, r2_test: -0.0032, mse_train: 0.6764, mse_test: 0.7447
Ridge. r2_train: 0.4675, r2_test: 0.1507, mse_train: 0.3602, mse_test: 0.6305




RandomForestRegressor. r2_train: 0.6849, r2_test: 0.0257, mse_train: 0.2131, mse_test: 0.7233




SVR. r2_train: 0.0054, r2_test: -0.0250, mse_train: 0.6727, mse_test: 0.7608
