In [68]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer

In [39]:
movies = pd.read_csv('movies.csv')
tags = pd.read_csv('tags.csv')
ratings = pd.read_csv('ratings.csv')

In [7]:
movies['genres_split'] = movies.genres.str.split('|')
movies['genres_cnt'] = movies.apply(lambda r: len(r['genres_split']), axis=1)
movies['genres_space'] = movies.apply(lambda r: ' '.join(r['genres_split']), axis=1)

In [8]:
movies.head()

Unnamed: 0,movieId,title,genres,genres_split,genres_cnt,genres_space
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,"[Adventure, Animation, Children, Comedy, Fantasy]",5,Adventure Animation Children Comedy Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy,"[Adventure, Children, Fantasy]",3,Adventure Children Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance,"[Comedy, Romance]",2,Comedy Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,"[Comedy, Drama, Romance]",3,Comedy Drama Romance
4,5,Father of the Bride Part II (1995),Comedy,[Comedy],1,Comedy


In [9]:
cnt_vec = CountVectorizer()
processed = cnt_vec.fit_transform(movies['genres_space'])
tfidf = TfidfTransformer()
tfidf_dense = tfidf.fit_transform(processed).todense()

In [12]:
columns_genres = [None for i in range(len(cnt_vec.vocabulary_))]
for k in cnt_vec.vocabulary_:
    columns_genres[cnt_vec.vocabulary_[k]] = k

In [13]:
df_tfidf = pd.DataFrame(tfidf_dense, columns=columns_genres)


In [14]:
df_tfidf.head()

Unnamed: 0,action,adventure,animation,children,comedy,crime,documentary,drama,fantasy,fi,...,listed,musical,mystery,no,noir,romance,sci,thriller,war,western
0,0.0,0.416846,0.516225,0.504845,0.267586,0.0,0.0,0.0,0.48299,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.512361,0.0,0.620525,0.0,0.0,0.0,0.0,0.593662,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.570915,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.821009,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.505015,0.0,0.0,0.466405,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.726241,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [20]:
tags_space = tags.groupby('movieId')['tag'].apply(lambda x: "%s" % ' '.join(x)).reset_index()

In [21]:
tags_space.head()

Unnamed: 0,movieId,tag
0,1,pixar pixar fun
1,2,fantasy magic board game Robin Williams game
2,3,moldy old
3,5,pregnancy remake
4,7,remake


In [22]:
processed_tags = cnt_vec.fit_transform(tags_space['tag'])
tfidf_dense_tags = tfidf.fit_transform(processed_tags).todense()


In [23]:
columns_tags = [None for i in range(len(cnt_vec.vocabulary_))]
for k in cnt_vec.vocabulary_:
    columns_tags[cnt_vec.vocabulary_[k]] = k

In [25]:
df_tfidf_tags = pd.DataFrame(tfidf_dense_tags, columns=columns_tags)

In [30]:
movies_with_tfidf = pd.concat((movies, df_tfidf, df_tfidf_tags), axis=1)

In [34]:
movies_with_tfidf.head()

Unnamed: 0,movieId,title,genres,genres_split,genres_cnt,genres_space,action,adventure,animation,children,...,york,you,younger,your,zellweger,zither,zoe,zombie,zombies,zooey
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,"[Adventure, Animation, Children, Comedy, Fantasy]",5,Adventure Animation Children Comedy Fantasy,0.0,0.416846,0.516225,0.504845,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Jumanji (1995),Adventure|Children|Fantasy,"[Adventure, Children, Fantasy]",3,Adventure Children Fantasy,0.0,0.512361,0.0,0.620525,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Grumpier Old Men (1995),Comedy|Romance,"[Comedy, Romance]",2,Comedy Romance,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,"[Comedy, Drama, Romance]",3,Comedy Drama Romance,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,Father of the Bride Part II (1995),Comedy,[Comedy],1,Comedy,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [36]:
movies_for_ds = movies_with_tfidf.drop(['genres', 'genres_split', 'genres_cnt', 'genres_space'], axis=1)

Unnamed: 0,movieId,title,action,adventure,animation,children,comedy,crime,documentary,drama,...,york,you,younger,your,zellweger,zither,zoe,zombie,zombies,zooey
0,1,Toy Story (1995),0.000000,0.416846,0.516225,0.504845,0.267586,0.000000,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Jumanji (1995),0.000000,0.512361,0.000000,0.620525,0.000000,0.000000,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Grumpier Old Men (1995),0.000000,0.000000,0.000000,0.000000,0.570915,0.000000,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,Waiting to Exhale (1995),0.000000,0.000000,0.000000,0.000000,0.505015,0.000000,0.0,0.466405,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,Father of the Bride Part II (1995),0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,6,Heat (1995),0.549328,0.000000,0.000000,0.000000,0.000000,0.635947,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,7,Sabrina (1995),0.000000,0.000000,0.000000,0.000000,0.570915,0.000000,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,8,Tom and Huck (1995),0.000000,0.636699,0.000000,0.771112,0.000000,0.000000,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,9,Sudden Death (1995),1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,10,GoldenEye (1995),0.553065,0.629522,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [40]:
movies_with_ratings = pd.merge(ratings, movies_for_ds, on='movieId')

In [41]:
movies_with_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,action,adventure,animation,children,comedy,...,york,you,younger,your,zellweger,zither,zoe,zombie,zombies,zooey
0,1,1,4.0,964982703,Toy Story (1995),0.0,0.416846,0.516225,0.504845,0.267586,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,5,1,4.0,847434962,Toy Story (1995),0.0,0.416846,0.516225,0.504845,0.267586,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,7,1,4.5,1106635946,Toy Story (1995),0.0,0.416846,0.516225,0.504845,0.267586,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,15,1,2.5,1510577970,Toy Story (1995),0.0,0.416846,0.516225,0.504845,0.267586,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,17,1,4.5,1305696483,Toy Story (1995),0.0,0.416846,0.516225,0.504845,0.267586,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [42]:
TARGET_USER = 474

In [43]:
df_for_user = movies_with_ratings[movies_with_ratings['userId']==TARGET_USER]

In [62]:
df_for_user = df_for_user.fillna(0)

In [63]:
X = df_for_user.drop(['userId', 'movieId', 'timestamp', 'title', 'rating'], axis=1)
y = df_for_user['rating']

In [65]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [66]:
models = [LinearRegression, Lasso, Ridge, RandomForestRegressor, SVR]

In [69]:
for m in models:
    model = m()
    model.fit(X_train, y_train)
    print("{}. r2_train: {:.4f}, r2_test: {:.4f}, mse_train: {:.4f}, mse_test: {:.4f}".format(
        m.__name__, model.score(X_train ,y_train), model.score(X_test, y_test),
        mean_squared_error(model.predict(X_train), y_train),
        mean_squared_error(model.predict(X_test), y_test)
    ))

LinearRegression. r2_train: 0.3053, r2_test: -187209538064799386470187008.0000, mse_train: 0.4743, mse_test: 134450811357975169407647744.0000
Lasso. r2_train: 0.0000, r2_test: -0.0014, mse_train: 0.6827, mse_test: 0.7192
Ridge. r2_train: 0.2707, r2_test: 0.0798, mse_train: 0.4979, mse_test: 0.6609




RandomForestRegressor. r2_train: 0.4582, r2_test: -0.0127, mse_train: 0.3699, mse_test: 0.7273




SVR. r2_train: -0.0067, r2_test: 0.0061, mse_train: 0.6873, mse_test: 0.7138
