# Building a content-based recommender using tf-idf

#### Import libraries and data

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer #obtaining tdf vectors
from itertools import combinations #finding combination of genres for a given movie
from sklearn.metrics.pairwise import cosine_similarity #To compute the cosine similarities between all tf-idf vectors


movies = pd.read_csv("../../Data/ml-latest-small/movies.csv",index_col=0)

In [2]:
movies.head()

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy


# Alternative: directly use year seperated df

In [3]:
#movies=pd.read_csv("../../Data/ml-latest-small/PreprocessedData_ml_latest_year_small.csv",index_col=0)
#movies["title"] = movies["title"].str.lower()
#movies=movies.drop(["userId","rating","year"],axis=1).drop_duplicates()
#movies=movies.sort_values(by=['movieId'])
#movies.set_index('movieId',inplace=True)

In [4]:
#movies.head()

#### Transform the genres into a meaningful representation of numbers using TfidVectorizer

In [5]:
tf = TfidfVectorizer(stop_words = 'english', analyzer = 'word')
tfidf_matrix = tf.fit_transform(movies['genres'])

In [6]:
##to get an impression of what the result looks like, uncomment the line below
#pd.DataFrame(tfidf_matrix.todense(), columns=tf.get_feature_names(), index=movies.title).sample(10, axis=1).sample(10, axis=0)


#### Use cosine similarity to find similar vectors

In [7]:
cosine_sim = cosine_similarity(tfidf_matrix)
cosine_sim_df = pd.DataFrame(cosine_sim, index=movies['title'], columns=movies['title'])
print('Shape:', cosine_sim_df.shape)
#find a sample of the result below
cosine_sim_df.sample(5, axis=1).round(2) 

Shape: (9742, 9742)


title,Iron Eagle (1986),Spies Like Us (1985),West Side Story (1961),"Fly II, The (1989)","Doom Generation, The (1995)"
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Toy Story (1995),0.00,0.27,0.00,0.0,0.13
Jumanji (1995),0.00,0.00,0.00,0.0,0.00
Grumpier Old Men (1995),0.00,0.57,0.42,0.0,0.27
Waiting to Exhale (1995),0.00,0.51,0.52,0.0,0.45
Father of the Bride Part II (1995),0.00,1.00,0.00,0.0,0.48
...,...,...,...,...,...
Black Butler: Book of the Atlantic (2017),0.23,0.32,0.00,0.0,0.15
No Game No Life: Zero (2017),0.00,0.35,0.00,0.0,0.17
Flint (2017),0.00,0.00,0.33,0.0,0.44
Bungo Stray Dogs: Dead Apple (2018),0.31,0.00,0.00,0.0,0.00


### find the highest tf-idf score for a given movie, function declaration

In [8]:
def genre_recommendations(i, M, items, k=10):
    """
    Recommends movies based on a similarity dataframe

    Parameters
    ----------
    i : str
        Movie (index of the similarity dataframe)
    M : pd.DataFrame
        Similarity dataframe, symmetric, with movies as indices and columns
    items : pd.DataFrame
        Contains both the title and some other features used to define similarity
    k : int
        Amount of recommendations to return

    """
    ix = M.loc[:,i].to_numpy().argpartition(range(-1,-k,-1))
    closest = M.columns[ix[-1:-(k+2):-1]]
    closest = closest.drop(i, errors='ignore')
    return pd.DataFrame(closest).merge(items).head(k)

#### find your movie on what the recommendation should be based on, and paste the name in the function below

In [13]:
# for example, find a harry potter movie
movies[movies["title"].str.contains('Harry')]

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
98203,"Twilight Saga: Breaking Dawn - Part 2, The (2012)",Adventure|Drama|Fantasy|Romance|IMAX


In [11]:
similar_movies=genre_recommendations('Harry Potter and the Order of the Phoenix (2007)', cosine_sim_df, movies[['title', 'genres']],10)


In [14]:
similar_movies.head(10)

Unnamed: 0,title,genres
0,Jack the Giant Slayer (2013),Adventure|Fantasy|IMAX
1,Harry Potter and the Prisoner of Azkaban (2004),Adventure|Fantasy|IMAX
2,"Hobbit: An Unexpected Journey, The (2012)",Adventure|Fantasy|IMAX
3,"Hobbit: The Desolation of Smaug, The (2013)",Adventure|Fantasy|IMAX
4,Alice in Wonderland (2010),Adventure|Fantasy|IMAX
5,Thor (2011),Action|Adventure|Drama|Fantasy|IMAX
6,"Twilight Saga: Breaking Dawn - Part 2, The (2012)",Adventure|Drama|Fantasy|Romance|IMAX
7,Harry Potter and the Goblet of Fire (2005),Adventure|Fantasy|Thriller|IMAX
8,Star Wars: Episode VII - The Force Awakens (2015),Action|Adventure|Fantasy|Sci-Fi|IMAX
9,Man of Steel (2013),Action|Adventure|Fantasy|Sci-Fi|IMAX
