In [1]:
import pandas as pd
import numpy as np
import math
import io
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

%pylab inline

Populating the interactive namespace from numpy and matplotlib


## **Data Loading**

In [2]:
# Interactive upload step; requires the Google Colab runtime (google.colab).
# `uploaded` is later indexed by file name (e.g. uploaded['tags.csv']) and
# each value is wrapped in io.BytesIO, i.e. it holds the raw file bytes.
from google.colab import files
uploaded = files.upload()

Saving links.csv to links.csv
Saving movies.csv to movies.csv
Saving ratings.csv to ratings.csv
Saving tags.csv to tags.csv


In [3]:
# Parse the uploaded tags.csv bytes into a DataFrame.
# NOTE(review): the bytes are decoded as ISO-8859-1 -- confirm this matches
# the real encoding of the CSV, otherwise non-ASCII tags will be garbled.
tags_csv = io.BytesIO(uploaded['tags.csv'])
tags = pd.read_csv(tags_csv, encoding="ISO-8859-1")
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [4]:
# links.csv holds identifiers that can be used to link each movie to other
# sources of movie data: one row per movie after the header row, with the
# columns movieId, imdbId and tmdbId.
links_csv = io.BytesIO(uploaded['links.csv'])
links = pd.read_csv(links_csv, encoding="ISO-8859-1")
links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [5]:
# Parse the uploaded movies.csv bytes (movieId, title, pipe-separated genres).
# NOTE(review): decoded as ISO-8859-1 -- confirm against the file's real
# encoding, since titles may contain non-ASCII characters.
movies_csv = io.BytesIO(uploaded['movies.csv'])
movies = pd.read_csv(movies_csv, encoding="ISO-8859-1")
movies.head()


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
# Parse the uploaded ratings.csv bytes (userId, movieId, rating, timestamp).
ratings_csv = io.BytesIO(uploaded['ratings.csv'])
ratings = pd.read_csv(ratings_csv, encoding="ISO-8859-1")
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


## **Let's have a look at Tags**

Let's first select a movie among our db of movies...

In [7]:
# Row of the movies table whose movieId is 1.
movies.loc[movies['movieId'] == 1]

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


... and look at tags associated with this movie

In [8]:
# Every tag row that users attached to movieId 1.
tags.loc[tags.movieId == 1]

Unnamed: 0,userId,movieId,tag,timestamp
629,336,1,pixar,1139045764
981,474,1,pixar,1137206825
2886,567,1,fun,1525286013


As you can see in the tags dataset, each row holds a single tag that one user gave to one movie, so the tags of a movie are spread over several rows. We will create a function that concatenates all the distinct tags of a movie into a single string. See below.

In [0]:
def _concatenate_tags_of_movie(tags):
    tags_as_str = ' '.join(set(tags))
    return tags_as_str

In [10]:
# One row per movieId, with all of that movie's tags collapsed into the
# single text column `movie_tags`.
tags_per_movie = (
    tags.groupby('movieId')['tag']
    .agg(_concatenate_tags_of_movie)
    .rename('movie_tags')
    .reset_index()
)
tags_per_movie.head()

Unnamed: 0,movieId,movie_tags
0,1,fun pixar
1,2,game fantasy Robin Williams magic board game
2,3,old moldy
3,5,pregnancy remake
4,7,remake


Let's now select the concatenated tags of our first movie: Toy Story.

In [11]:
# Concatenated tags of movieId 1.
tags_per_movie.loc[tags_per_movie.movieId == 1]

Unnamed: 0,movieId,movie_tags
0,1,fun pixar


In [12]:
# Per-movie rating statistics: mean, median and number of ratings.
# NOTE(review): `num_tags` is the group size of the *ratings* table, i.e. the
# number of ratings per movie, not a tag count; the name is kept unchanged
# because later cells carry this column along.
avg_ratings = (
    ratings.groupby('movieId')['rating']
    .agg(['mean', 'median', 'size'])
    .rename(columns={
        'mean': 'rating_mean',
        'median': 'rating_median',
        'size': 'num_tags',
    })
    .reset_index()
)
avg_ratings.head()

Unnamed: 0,movieId,rating_mean,rating_median,num_tags
0,1,3.92093,4.0,215
1,2,3.431818,3.5,110
2,3,3.259615,3.0,52
3,4,2.357143,3.0,7
4,5,3.071429,3.0,49


Now let's merge all of our information into a single dataframe. We will remove movies without tags (if any) to avoid NAs.

In [13]:
# Attach the rating statistics to every movie (left join keeps all movies).
movies_with_ratings = movies.merge(avg_ratings, how='left', on='movieId')

# Attach the concatenated tags, then keep only movies that have tags.
# The index is rebuilt so that row labels equal row positions (0..n-1).
my_data = (
    movies_with_ratings
    .merge(tags_per_movie, how='left', on='movieId')
    .dropna(subset=['movie_tags'])
    .reset_index(drop=True)
)
my_data.head()

Unnamed: 0,movieId,title,genres,rating_mean,rating_median,num_tags,movie_tags
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3.92093,4.0,215.0,fun pixar
1,2,Jumanji (1995),Adventure|Children|Fantasy,3.431818,3.5,110.0,game fantasy Robin Williams magic board game
2,3,Grumpier Old Men (1995),Comedy|Romance,3.259615,3.0,52.0,old moldy
3,5,Father of the Bride Part II (1995),Comedy,3.071429,3.0,49.0,pregnancy remake
4,7,Sabrina (1995),Comedy|Romance,3.185185,3.0,54.0,remake


## **Algo Building**
As we have explained in the paper, the content-based filtering algorithm is similar to feature document analysis. We use the concept of TF-IDF to down-weight tags that appear in lots of movies (and therefore don't describe any one of them with much accuracy) and to promote tags that define each movie more relevantly.
Having done so, we move on to the cosine similarity calculation (also explained in the paper), again using scikit-learn, which provides a `cosine_similarity` function.

### TF-IDF
Note that scikit-learn provides a class to do this, called `TfidfVectorizer`.


In [15]:
# Each movie's concatenated tag string is treated as one document; row i of
# the resulting matrix corresponds to row i of my_data.
tf_idf = TfidfVectorizer()
movies_tf_idf = tf_idf.fit_transform(my_data['movie_tags'])
movies_tf_idf.shape

(1572, 1744)

### **Cosine Similarity**

In [44]:
# Pairwise cosine similarity between the TF-IDF vectors of all movies.
# Computed once and reused below (it was previously evaluated twice, with the
# first result left unused).
cosine = cosine_similarity(movies_tf_idf)

# Label both axes with the movieId of the corresponding my_data row so the
# matrix can be read movie-by-movie. As before, row labels keep the movieId
# values as they are while column labels are their string form.
indices = my_data.movieId
movies_cosine = pd.DataFrame(
    cosine,
    index=indices.values,
    columns=indices.astype(str).values,
)
movies_cosine.head()


Unnamed: 0,1,2,3,5,7,11,14,16,17,21,22,25,26,28,29,31,32,34,36,38,39,40,41,43,45,46,47,50,52,58,62,92,96,101,104,107,110,111,116,122,...,135133,135518,135536,136864,138036,139385,139644,140174,141890,143367,144210,148626,148881,152077,152711,153070,155288,156371,156605,158872,158966,161634,164179,164909,167746,168248,168252,170945,174053,174055,176371,176419,179401,180031,180985,183611,184471,187593,187595,193565
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.285481,0.0,0.0,0.0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,1.0,0.675448,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.153779,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.675448,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.22767,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## **Most Similar Movies to Toy Story**

In [45]:
# Row 0 of the similarity matrix is the first movie of my_data (movieId 1);
# show its ten highest similarity scores, itself included.
toy_story_similarities = movies_cosine.iloc[0]
toy_story_similarities.sort_values(ascending=False).head(10)


1         1.000000
2355      0.724941
122918    0.688811
3114      0.285423
108932    0.232746
115617    0.227600
89745     0.204283
68954     0.181712
296       0.029864
2291      0.000000
Name: 1, dtype: float64

In [46]:
# movieIds of the three most similar movies, copied by hand from the ranking
# printed by the previous cell (they must be updated if that ranking changes).
top_3 = [2355, 122918, 3114]
is_top_3 = my_data['movieId'].isin(top_3)
my_top_3 = my_data.loc[is_top_3]
my_top_3.loc[:, ['title', 'genres', 'movie_tags']]

Unnamed: 0,title,genres,movie_tags
544,"Bug's Life, A (1998)",Adventure|Animation|Children|Comedy,Pixar
666,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,animation Disney funny Tom Hanks Pixar origina...
1524,Guardians of the Galaxy 2 (2017),Action|Adventure|Sci-Fi,fun


## **Recommendation for a User**
We first have to select a user from our database; let's take user 1.

In [47]:
# All ratings given by user 1.
is_user_1 = ratings['userId'] == 1
user_ratings = ratings.loc[is_user_1]
print(user_ratings.shape)

(232, 4)


In [48]:
# reset_index() exposes each movie's row position in my_data as the column
# `index`; the inner join then keeps only the movies this user rated that
# also have tags.
user_data = pd.merge(my_data.reset_index(), user_ratings, on='movieId')
print(user_data.shape)
user_data.head()
# This user rated many movies (232, of which only 114 are present in the tag
# data), which should lead to better prediction accuracy.

(114, 11)


Unnamed: 0,index,movieId,title,genres,rating_mean,rating_median,num_tags,movie_tags,userId,rating,timestamp
0,0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3.92093,4.0,215.0,fun pixar,1,4.0,964982703
1,2,3,Grumpier Old Men (1995),Comedy|Romance,3.259615,3.0,52.0,old moldy,1,4.0,964981247
2,26,47,Seven (a.k.a. Se7en) (1995),Mystery|Thriller,3.975369,4.0,203.0,twist ending serial killer mystery,1,5.0,964983815
3,27,50,"Usual Suspects, The (1995)",Crime|Mystery|Thriller,4.237745,4.5,204.0,twist ending heist mindfuck thriller tricky su...,1,5.0,964982931
4,33,101,Bottle Rocket (1996),Adventure|Comedy|Crime|Romance,3.782609,4.0,23.0,off-beat comedy crime quirky,1,5.0,964980868


Let's compute the standardized weights that we explained in the paper.

In [49]:
# Standardize the user's ratings with preprocessing.scale, so that movies
# rated above the user's own average get a positive weight and movies rated
# below it get a negative one.
rating_z_scores = preprocessing.scale(user_data['rating'])
user_data['weight'] = rating_z_scores
user_data.head()

Unnamed: 0,index,movieId,title,genres,rating_mean,rating_median,num_tags,movie_tags,userId,rating,timestamp,weight
0,0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3.92093,4.0,215.0,fun pixar,1,4.0,964982703,-0.496031
1,2,3,Grumpier Old Men (1995),Comedy|Romance,3.259615,3.0,52.0,old moldy,1,4.0,964981247,-0.496031
2,26,47,Seven (a.k.a. Se7en) (1995),Mystery|Thriller,3.975369,4.0,203.0,twist ending serial killer mystery,1,5.0,964983815,0.733263
3,27,50,"Usual Suspects, The (1995)",Crime|Mystery|Thriller,4.237745,4.5,204.0,twist ending heist mindfuck thriller tricky su...,1,5.0,964982931,0.733263
4,33,101,Bottle Rocket (1996),Adventure|Comedy|Crime|Romance,3.782609,4.0,23.0,off-beat comedy crime quirky,1,5.0,964980868,0.733263


In [50]:
# `index` holds, for every movie the user rated, its row position in my_data,
# which is also its row in movies_tf_idf (the matrix was fitted on my_data).
rated_positions = user_data['index'].values

# User profile: weighted sum of the TF-IDF vectors of the rated movies, using
# the standardized ratings as weights.
user_profile = np.dot(movies_tf_idf[rated_positions].toarray().T, user_data['weight'].values)

# Similarity of the profile to every movie, then movie positions ordered from
# most to least similar. numpy functions are called through `np` explicitly
# instead of relying on the names injected by `%pylab inline`.
C = cosine_similarity(np.atleast_2d(user_profile), movies_tf_idf)
R = np.argsort(C)[:, ::-1]

# Drop the movies the user has already rated; a set gives constant-time
# membership checks instead of scanning the array for every candidate.
already_rated = set(rated_positions.tolist())
recommendations = [i for i in R[0] if i not in already_rated]

# `recommendations` contains row positions, so select them positionally.
my_data['title'].iloc[recommendations].head(10)


1359                           Lucky Number Slevin (2006)
441                          Spanish Prisoner, The (1997)
147                Snow White and the Seven Dwarfs (1937)
142                                        Aladdin (1992)
243                                     Cinderella (1950)
249                         Fox and the Hound, The (1981)
500     101 Dalmatians (One Hundred and One Dalmatians...
499                             Lady and the Tramp (1955)
148                           Beauty and the Beast (1991)
152                                Aristocats, The (1970)
Name: title, dtype: object