In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sklearn

In [2]:
# Load the cleaned ratings table from the working directory; each row pairs a
# user's rating with the rated movie's title, genres and overview.
df = pd.read_csv("cleaned.csv")

In [3]:
# Preview the first rows to check the columns were parsed as expected.
df.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,Overview
0,1,223,4.0,2005-04-02 23:46:13,Clerks,Comedy,A day in the lives of two convenience clerks n...
1,1,2959,4.0,2004-09-10 03:08:18,Fight Club,Action|Crime|Drama|Thriller,An insomniac office worker and a devil-may-car...
2,1,3499,4.0,2005-04-02 23:35:18,Misery,Drama|Horror|Thriller,After a famous author is rescued from a car cr...
3,1,4011,4.0,2005-04-02 23:43:26,Snatch,Comedy|Crime|Thriller,"Unscrupulous boxing promoters, violent bookmak..."
4,1,4027,4.0,2005-04-02 23:44:20,"O Brother, Where Art Thou?",Adventure|Comedy|Crime,"In the deep south during the 1930s, three esca..."


In [4]:
# (number of rows, number of columns) of the loaded table.
df.shape

(4437943, 7)

In [5]:
# Count missing values per column.
df.isna().sum()

userId       0
movieId      0
rating       0
timestamp    0
title        0
genres       0
Overview     0
dtype: int64

In [6]:
# Column dtypes and the memory footprint of the frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4437943 entries, 0 to 4437942
Data columns (total 7 columns):
 #   Column     Dtype  
---  ------     -----  
 0   userId     int64  
 1   movieId    int64  
 2   rating     float64
 3   timestamp  object 
 4   title      object 
 5   genres     object 
 6   Overview   object 
dtypes: float64(1), int64(2), object(4)
memory usage: 237.0+ MB


### TF-IDF

In [7]:
# Work on a copy so the TF-IDF steps below leave `df` untouched.
df_tf = df.copy()

In [8]:
# Sanity check: the copy has the same shape as the original frame.
df_tf.shape

(4437943, 7)

In [9]:
# List the available columns before selecting the movie-level ones.
df_tf.columns

Index(['userId', 'movieId', 'rating', 'timestamp', 'title', 'genres',
       'Overview'],
      dtype='object')

In [10]:
# Keep only the movie-level columns; the per-rating columns are not needed
# for a genre-based similarity.
df_tf1 = df_tf.loc[:, ['movieId', 'title', 'genres']]

In [11]:
# Preview the movie-level columns (still one row per rating at this point).
df_tf1.head()

Unnamed: 0,movieId,title,genres
0,223,Clerks,Comedy
1,2959,Fight Club,Action|Crime|Drama|Thriller
2,3499,Misery,Drama|Horror|Thriller
3,4011,Snatch,Comedy|Crime|Thriller
4,4027,"O Brother, Where Art Thou?",Adventure|Comedy|Crime


In [12]:
# Row count before de-duplication: one row per rating.
df_tf1.shape

(4437943, 3)

In [13]:
# Collapse the per-rating rows so each distinct (movieId, title, genres)
# combination appears once, keeping its first occurrence.
df_tf1 = df_tf1.loc[~df_tf1.duplicated()]

In [14]:
# Row count after de-duplication: one row per distinct movie record.
df_tf1.shape

(479, 3)

In [15]:
# Preview the de-duplicated movie table.
df_tf1.head()

Unnamed: 0,movieId,title,genres
0,223,Clerks,Comedy
1,2959,Fight Club,Action|Crime|Drama|Thriller
2,3499,Misery,Drama|Horror|Thriller
3,4011,Snatch,Comedy|Crime|Thriller
4,4027,"O Brother, Where Art Thou?",Adventure|Comedy|Crime


In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
from itertools import combinations

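Here we build, for each movie, every combination of its genres of size 1 up to k (k = 3 here). In mathematical terms this is the power set of the movie's genres, restricted to non-empty subsets of at most k elements.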

In [17]:
# Custom analyzer: instead of splitting text into words, each genre string is
# tokenised into every combination of 1, 2 or 3 of its '|'-separated genres,
# so each combination (a tuple) becomes one TF-IDF feature.
tf = TfidfVectorizer(
    analyzer=lambda genre_string: (
        combo
        for size in range(1, 4)
        for combo in combinations(genre_string.split('|'), r=size)
    )
)
# One row per movie in df_tf1, one column per genre combination seen.
tfidf_matrix = tf.fit_transform(df_tf1['genres'])
tfidf_matrix.shape

(479, 414)

This results in the following TF-IDF vectors (note that only a random subset of the rows and columns is shown).

In [18]:
# Preview a random slice of the TF-IDF matrix, labelled by movie title (rows)
# and genre combination (columns).
# `.toarray()` yields a plain ndarray; `.todense()` returns the discouraged
# `np.matrix` type. A fixed `random_state` keeps the preview reproducible.
pd.DataFrame(
    tfidf_matrix.toarray(),
    columns=tf.get_feature_names_out(),
    index=df_tf1.title,
).sample(5, axis=1, random_state=0).sample(10, axis=0, random_state=0)

Unnamed: 0_level_0,"(Action, Crime)","(Adventure, Children, Romance)","(Comedy, Musical, War)","(Adventure, War)","(Drama, Fantasy, Musical)"
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Joker,0.0,0.0,0.0,0.0,0.0
Sweet Smell of Success,0.0,0.0,0.0,0.0,0.0
Inherit the Wind,0.0,0.0,0.0,0.0,0.0
End of Watch,0.0,0.0,0.0,0.0,0.0
White Heat,0.0,0.0,0.0,0.0,0.0
Shrek,0.0,0.19196,0.0,0.0,0.0
Kal Ho Naa Ho,0.0,0.0,0.0,0.0,0.0
Ferris Bueller's Day Off,0.0,0.0,0.0,0.0,0.0
Magnolia,0.0,0.0,0.0,0.0,0.0
Kagemusha,0.0,0.0,0.0,0.0,0.0


Compute the cosine similarities between all tf-idf vectors

In [19]:
from sklearn.metrics.pairwise import cosine_similarity
# Pairwise cosine similarity between every pair of rows (movies) of the
# TF-IDF matrix; with a single argument the matrix is compared with itself.
cosine_sim = cosine_similarity(tfidf_matrix)

In [20]:
# Label both axes of the similarity matrix with movie titles so that similar
# movies can be looked up by name.
cosine_sim_df = pd.DataFrame(cosine_sim, index=df_tf1['title'], columns=df_tf1['title'])
print('Shape:', cosine_sim_df.shape)
# Show five randomly chosen columns; the fixed seed makes the preview
# identical on every run.
cosine_sim_df.sample(5, axis=1, random_state=0).round(2)

Shape: (479, 479)


title,Breaking the Waves,Once,Judgment at Nuremberg,Do the Right Thing,Braveheart
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Clerks,0.00,0.00,0.00,0.00,0.00
Fight Club,0.03,0.01,0.11,0.11,0.18
Misery,0.04,0.02,0.13,0.13,0.02
Snatch,0.00,0.00,0.00,0.00,0.00
"O Brother, Where Art Thou?",0.00,0.00,0.00,0.00,0.00
...,...,...,...,...,...
Haider,0.04,0.18,0.15,0.15,0.02
Me and Earl and the Dying Girl,0.27,0.13,1.00,1.00,0.15
Black,0.27,0.13,1.00,1.00,0.15
Joker,0.00,0.00,0.00,0.00,0.00


Using `argpartition`, we find the positions of the k highest values in the column of the similarity matrix `M` that belongs to a given movie `i`. We then use those positions to index the columns of `M`, slicing in reverse so that the results run from the highest similarity to the lowest.

In [21]:
def genre_recommendations(i, M, items, k=10):
    """
    Recommends movies based on a similarity dataframe

    Parameters
    ----------
    i : str
        Movie to get recommendations for (a column label of the similarity
        dataframe)
    M : pd.DataFrame
        Similarity dataframe, symmetric, with movies as indices and columns
    items : pd.DataFrame
        Contains both the title and some other features used to define
        similarity. It is merged with the selected movies on the column(s)
        the two frames share (the name of ``M``'s column index).
    k : int
        Amount of recommendations to return

    Returns
    -------
    pd.DataFrame
        At most ``k`` rows of ``items``, ordered from the most to the least
        similar movie, with movie ``i`` itself excluded. Fewer rows are
        returned when ``M`` holds fewer than ``k`` other movies.

    Raises
    ------
    KeyError
        If ``i`` is not a column of ``M``.
    """
    scores = M.loc[:, i].to_numpy()
    # One extra candidate is needed because the movie itself normally ranks
    # among the top matches and is dropped below. Clamp to the number of
    # movies so a large k does not push argpartition out of bounds.
    n_top = min(k + 1, len(scores))
    # argpartition only guarantees the order of the positions it is given,
    # so pass every one of the n_top trailing positions that are consumed.
    ix = scores.argpartition(range(-1, -(n_top + 1), -1))
    # Walk the trailing positions backwards: highest similarity first.
    closest = M.columns[ix[-1:-(n_top + 1):-1]]
    closest = closest.drop(i, errors='ignore')
    # The inner merge keeps the order of the left frame, i.e. the ranking.
    return pd.DataFrame(closest).merge(items).head(k)

Testing the recommender

In [22]:
# Look up the query movie to see which genres the recommendations should match.
df_tf1.loc[df_tf1['title'] == 'Fight Club']

Unnamed: 0,movieId,title,genres
1,2959,Fight Club,Action|Crime|Drama|Thriller


In [23]:
# Ask for the default number of recommendations (k=10) for 'Fight Club';
# only title and genres are passed so the result shows why each movie matched.
genre_recommendations('Fight Club', cosine_sim_df, df_tf1[['title', 'genres']])

Unnamed: 0,title,genres
0,The Dark Knight,Action|Crime|Drama|Thriller
1,Taken,Action|Crime|Drama|Thriller
2,Kill Bill: Vol. 1,Action|Crime|Thriller
3,Die Hard,Action|Crime|Thriller
4,Dirty Harry,Action|Crime|Thriller
5,Heat,Action|Crime|Thriller
6,Die Hard: With a Vengeance,Action|Crime|Thriller
7,Scarface,Action|Crime|Drama
8,Man on Fire,Action|Crime|Drama|Mystery|Thriller
9,Mississippi Burning,Crime|Drama|Thriller
