In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

from tqdm.notebook import tqdm

import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.model_selection import train_test_split

from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error

import warnings

%matplotlib inline
warnings.filterwarnings("ignore")

In [2]:
# Load the three input tables (read in the order movies, ratings, tags) from
# CSV files located in the notebook's working directory.
movies, ratings, tags = map(
    pd.read_csv,
    ('movies.csv', 'ratings.csv', 'tags.csv'),
)

In [3]:
# Report (rows, columns) for each table, in the order tags, movies, ratings.
for frame in (tags, movies, ratings):
    print(frame.shape)

(3683, 4)
(9742, 3)
(100836, 4)


In [4]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [5]:
# One-column frame holding each movieId that appears at least once in `tags`.
tags_data = pd.DataFrame(pd.unique(tags['movieId']))

In [6]:
# One-column frame holding each distinct movieId listed in `movies`.
movies_data = pd.DataFrame(pd.unique(movies['movieId']))

In [7]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [8]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [9]:
len(ratings.movieId.unique())

9724

In [10]:
# Start from an empty frame; the following cells add the per-movie mean rating
# column to it and then turn the movieId index into a regular column.
mean_movie_ratings = pd.DataFrame()

In [11]:
# Average rating of every movie over all users who rated it; the resulting
# Series is indexed by movieId.
mean_movie_ratings['mean_movie_rating'] = ratings.groupby('movieId')['rating'].mean()

In [12]:
# Move movieId out of the index into a regular column so the frame can later
# be merged on 'movieId'.
# NOTE(review): re-running only this cell would presumably add an extra
# 'index' column, since the frame is rebound to its own output — confirm.
mean_movie_ratings = mean_movie_ratings.reset_index()

In [13]:
mean_movie_ratings.head()

Unnamed: 0,movieId,mean_movie_rating
0,1,3.92093
1,2,3.431818
2,3,3.259615
3,4,2.357143
4,5,3.071429


In [14]:
def change_string(s):
    """Turn a '|'-separated label string into space-separated one-word tokens.

    Spaces and hyphens inside the string are removed, so that every label
    collapses into a single word (e.g. ``'Sci-Fi'`` becomes ``'SciFi'``), and
    each ``'|'`` separator is replaced by one space.

    Parameters
    ----------
    s : str
        Raw label string such as ``'Adventure|Children|Fantasy'``.

    Returns
    -------
    str
        The cleaned string, e.g. ``'Adventure Children Fantasy'``.
    """
    # Single pass over the text: '|' becomes ' ', while ' ' and '-' are deleted.
    table = str.maketrans('|', ' ', ' -')
    return s.translate(table)

In [15]:
# One cleaned genre "document" per movie, in the same row order as `movies`.
movie_genres = list(map(change_string, movies['genres'].values))

In [16]:
len(movie_genres)

9742

In [17]:
# Bag-of-words counts over the genre documents: one row per entry of
# `movie_genres`, one column per distinct genre token.
# NOTE(review): despite the name, `X_train_counts` is fitted on every movie,
# not on a training split; both names are reused later for the tag features.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(movie_genres)

In [18]:
# Re-weight the raw genre counts with TF-IDF (default TfidfTransformer settings).
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [19]:
# Convert the TF-IDF result to a dense matrix so it can be wrapped in a
# DataFrame (the recorded shape is 9742 movies x 20 genre tokens).
genres_tfidf = X_train_tfidf.todense()

In [21]:
genres_tfidf.shape

(9742, 20)

In [22]:
# Wrap the dense matrix in a DataFrame; rows and columns get default integer
# labels (columns 0..19 in the recorded output), rows follow `movies` order.
genres_tfidf = pd.DataFrame(genres_tfidf)

In [23]:
genres_tfidf.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,0.0,0.416846,0.516225,0.504845,0.267586,0.0,0.0,0.0,0.48299,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.512361,0.0,0.620525,0.0,0.0,0.0,0.0,0.593662,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.570915,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.821009,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.505015,0.0,0.0,0.466405,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.726241,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
# Single-column frame with the movie ids, in the same row order as `movies`.
genres_movieId = movies['movieId'].to_frame()

In [25]:
genres_movieId.head()

Unnamed: 0,movieId
0,1
1,2
2,3
3,4
4,5


In [26]:
# Put the movieId column in front of the genre TF-IDF columns. The join is on
# the row index, so it relies on both frames sharing the same row order.
# NOTE(review): the cell rebinds `genres_tfidf` to its own output, so running
# it twice would presumably fail on the duplicated 'movieId' column — confirm.
genres_tfidf = genres_movieId.join(genres_tfidf)

In [27]:
genres_tfidf.head()

Unnamed: 0,movieId,0,1,2,3,4,5,6,7,8,...,10,11,12,13,14,15,16,17,18,19
0,1,0.0,0.416846,0.516225,0.504845,0.267586,0.0,0.0,0.0,0.48299,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,0.0,0.512361,0.0,0.620525,0.0,0.0,0.0,0.0,0.593662,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,0.0,0.0,0.0,0.0,0.570915,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.821009,0.0,0.0,0.0,0.0
3,4,0.0,0.0,0.0,0.0,0.505015,0.0,0.0,0.466405,0.0,...,0.0,0.0,0.0,0.0,0.0,0.726241,0.0,0.0,0.0,0.0
4,5,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [28]:
# Attach every tag row to its movie by movieId. `DataFrame.join` is a left
# join by default, so a movie with several tags is repeated once per tag and a
# movie without tags keeps one row with missing values in the tag columns.
movies_with_tags = movies.join(tags.set_index('movieId'), on = 'movieId')

In [29]:
# Keep only the rows that have no missing value in any column; this is what
# removes the movies that received no tag in the join.
movies_with_tags = movies_with_tags.dropna()

In [30]:
# Build one tag "document" per movie: every tag is squeezed into a single word
# (spaces and hyphens removed) and the words of a movie are joined by spaces.
# `movies_list[i]` is the movieId that `tag_strings[i]` belongs to.
movies_list = []
tag_strings = []

for movie_id, movie_rows in tqdm(movies_with_tags.groupby('movieId')):
    compact_tags = [
        str(raw_tag).replace(' ', '').replace('-', '')
        for raw_tag in movie_rows['tag'].values
    ]
    movies_list.append(movie_id)
    tag_strings.append(' '.join(compact_tags))

HBox(children=(FloatProgress(value=0.0, max=1572.0), HTML(value='')))




In [31]:
tag_strings[: 5]

['pixar pixar fun',
 'fantasy magicboardgame RobinWilliams game',
 'moldy old',
 'pregnancy remake',
 'remake']

In [32]:
movies_list[0:5]

[1, 2, 3, 5, 7]

In [33]:
len(tag_strings)

1572

In [34]:
# Bag-of-words counts over the per-movie tag documents.
# This rebinds `count_vect` and `X_train_counts`, which previously held the
# genre vectorizer and genre counts.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(tag_strings)

In [35]:
# TF-IDF weighting of the tag counts; rebinds `tfidf_transformer` and
# `X_train_tfidf`, which previously held the genre versions.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [36]:
# Dense copy of the tag TF-IDF matrix (recorded shape: 1572 tagged movies x
# 1472 tag tokens).
tags_tfidf = X_train_tfidf.todense()

In [37]:
tags_tfidf.shape

(1572, 1472)

In [38]:
# Wrap the dense matrix in a DataFrame with default integer row and column
# labels; row i corresponds to `tag_strings[i]`.
tags_tfidf = pd.DataFrame(tags_tfidf)

In [39]:
# Attach the movie ids in the order the tag documents were built. Assigning a
# Series aligns on the index; both sides use the default 0..n-1 index here, so
# row i receives `movies_list[i]`.
tags_tfidf['movieId'] = pd.Series(movies_list)

In [40]:
tags_tfidf.shape

(1572, 1473)

In [41]:
# Combine genre and tag features per movie. `merge` is an inner join by
# default, so only movies that have tag features remain (1572 rows recorded).
# Column labels 0..19 exist in both inputs and therefore come out suffixed
# ('0_x'.. for genres, '0_y'.. for tags); the remaining tag columns keep their
# integer labels, so the result mixes string and integer column labels.
genres_tags = genres_tfidf.merge(tags_tfidf, on = 'movieId')

In [42]:
genres_tags.head()

Unnamed: 0,movieId,0_x,1_x,2_x,3_x,4_x,5_x,6_x,7_x,8_x,...,1462,1463,1464,1465,1466,1467,1468,1469,1470,1471
0,1,0.0,0.416846,0.516225,0.504845,0.267586,0.0,0.0,0.0,0.48299,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,0.0,0.512361,0.0,0.620525,0.0,0.0,0.0,0.0,0.593662,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,0.0,0.0,0.0,0.0,0.570915,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,5,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,7,0.0,0.0,0.0,0.0,0.570915,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [43]:
genres_tags.shape

(1572, 1493)

In [44]:
# Add the per-movie mean rating as one more feature column. The default inner
# join drops tagged movies that have no rating at all (1572 -> 1554 rows in
# the recorded outputs).
genres_tags_ratings = genres_tags.merge(mean_movie_ratings, on = 'movieId')

In [45]:
genres_tags_ratings.shape

(1554, 1494)

In [46]:
genres_tags_ratings.head()

Unnamed: 0,movieId,0_x,1_x,2_x,3_x,4_x,5_x,6_x,7_x,8_x,...,1463,1464,1465,1466,1467,1468,1469,1470,1471,mean_movie_rating
0,1,0.0,0.416846,0.516225,0.504845,0.267586,0.0,0.0,0.0,0.48299,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.92093
1,2,0.0,0.512361,0.0,0.620525,0.0,0.0,0.0,0.0,0.593662,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.431818
2,3,0.0,0.0,0.0,0.0,0.570915,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.259615
3,5,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.071429
4,7,0.0,0.0,0.0,0.0,0.570915,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.185185


In [47]:
# Accumulator for the held-out (actual, predicted) rating pairs produced by
# the per-user modelling loop further down.
test_pred = pd.DataFrame()

In [48]:
# Array of the distinct user ids present in `ratings`, in order of first
# appearance.
users = ratings['userId'].unique()

In [49]:
# Per-user rating model.
#
# For every user: take the movies they rated that also have genre/tag
# features, hold out 20% of those ratings, fit a Lasso regression on the rest
# and keep the held-out (actual, predicted) pairs. The pairs of all users are
# pooled into `test_pred` (columns 'test' and 'pred'; the row index restarts
# at 0 for each user) so that a global error can be computed afterwards.
#
# The result is rebuilt from scratch on every run, so re-running this cell
# does not duplicate rows.

# Fixed seed so the train/test split of each user is reproducible.
rand_state = 50

# Recent scikit-learn releases refuse to fit on a frame whose column labels
# mix integers and strings. The merged feature table has both ('0_x', ...,
# 20, 21, ...), so work on a copy with every label converted to a string.
movie_features = genres_tags_ratings.rename(columns=str)

per_user_results = []

for userId in tqdm(users):
    # Build a new frame instead of dropping columns in place on a filtered
    # slice of `ratings` (which triggers SettingWithCopy behaviour).
    user_ratings = (
        ratings.loc[ratings.userId == userId]
        .drop(['userId', 'timestamp'], axis=1)
        .merge(movie_features, on='movieId')
    )

    # train_test_split needs at least one row on each side of the split;
    # users with fewer than two usable ratings cannot be evaluated.
    if len(user_ratings) < 2:
        continue

    y = user_ratings.rating
    X = user_ratings.drop(['rating', 'movieId'], axis=1)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=rand_state
    )

    # The user's mean rating is computed on the training part only, so the
    # held-out ratings do not leak into the features.
    # NOTE(review): this column is constant within one user's model, so it
    # presumably carries no signal for a linear model with an intercept —
    # confirm whether it was meant to be used differently.
    mean_user_rating = np.mean(y_train)
    X_train = X_train.assign(mean_user_rating=mean_user_rating)
    X_test = X_test.assign(mean_user_rating=mean_user_rating)

    estimator = Lasso(alpha=0.1)
    estimator.fit(X_train, y_train)
    y_pred = estimator.predict(X_test)

    # Pair actual and predicted ratings positionally (fresh 0..k-1 index).
    per_user_results.append(
        pd.DataFrame({'test': y_test.to_numpy(), 'pred': y_pred})
    )

# Concatenate once at the end: DataFrame.append no longer exists in current
# pandas, and growing a frame inside the loop is quadratic.
if per_user_results:
    test_pred = pd.concat(per_user_results)
else:
    test_pred = pd.DataFrame(columns=['test', 'pred'])

HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))




In [50]:
test_pred.shape

(9910, 2)

In [51]:
test_pred

Unnamed: 0,test,pred
0,3.0,4.395604
1,5.0,4.395604
2,4.0,4.395604
3,5.0,4.395604
4,4.0,4.395604
...,...,...
67,3.0,3.496577
68,5.0,4.116023
69,5.0,4.371497
70,3.5,4.160540


In [52]:
# Pooled actual and predicted ratings over all users, as two Series.
y_test, y_pred = test_pred['test'], test_pred['pred']

In [53]:
# Root-mean-squared error over the pooled held-out ratings of all users.
# The root is taken explicitly because the `squared=False` flag of
# `mean_squared_error` is deprecated and absent from recent scikit-learn
# releases; the value is the same.
np.sqrt(mean_squared_error(y_test, y_pred))

0.8335290867940339