# Building a content-based recommender using tf-idf

## Import libraries and data

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer #obtaining tdf vectors
from itertools import combinations #finding combination of genres for a given movie
from sklearn.metrics.pairwise import cosine_similarity #To compute the cosine similarities between all tf-idf vectors


# Read the preprocessed CSV; its first column becomes the row index.
df = pd.read_csv(
    "../../Data/ml-latest-small/PreprocessedData_ml_latest_year_small.csv",
    index_col=0,
)


## Drop duplicates and keep only movieId, title, and genres

In [2]:
# Build a per-row "title + movieId" key and keep one row per key.
# The ID has to be converted element-wise with .astype(str): str(df['movieId'])
# would turn the *whole* Series into a single constant repr string, so the key
# would depend on the title alone and distinct movies sharing a title would be
# silently merged in `df`.
df['pasteIDandMovie'] = df['title'] + df['movieId'].astype(str)
df = df.drop_duplicates(subset=['pasteIDandMovie'])

# The similarity DataFrame built later in this notebook is indexed by title and
# looked up with a single title, so `movies` must have unique titles. Make that
# requirement explicit here (first occurrence of each title is kept).
movies = (
    df[['movieId', 'title', 'genres']]
    .drop_duplicates(subset=['title'])
    .sort_values(by=['movieId'])
    .reset_index(drop=True)
)


In [3]:
# Preview the first five rows of the de-duplicated movie table.
movies.head(n=5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


## Load the class

In [4]:
# Execute the helper script inside this notebook's namespace (-i), so the names
# it defines become available to later cells.
# NOTE(review): presumably this is where tfidf_cosine_sim_model comes from - confirm.
%run -i CB_TFIDF_CosineSimilarity.py 

## Transform the genres into a meaningful numeric representation using TfidfVectorizer

In [5]:
# Word-level TF-IDF vectorizer with the built-in English stop-word list.
tf = TfidfVectorizer(analyzer='word', stop_words='english')

# Learn the vocabulary from the genre strings and encode every movie as one
# row of the resulting TF-IDF matrix.
tfidf_matrix = tf.fit_transform(movies['genres'])

In [6]:
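## To get an impression of what the result looks like, uncomment the line below.
## Note: on scikit-learn >= 1.0 use tf.get_feature_names_out(); get_feature_names() was removed in 1.2.
#pd.DataFrame(tfidf_matrix.todense(), columns=tf.get_feature_names(), index=movies.title).sample(10, axis=1).sample(10, axis=0)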


## Use cosine similarity to find similar vectors

In [7]:
#cosine_sim = cosine_similarity(tfidf_matrix)
#cosine_sim_df = pd.DataFrame(cosine_sim, index=movies['title'], columns=movies['title'])

# Wrap the TF-IDF matrix in the model object; its `cosine_sim` attribute is
# read below to build the similarity table.
model = tfidf_cosine_sim_model(tfidf_matrix)

# Label both axes with the movie titles so entries can be looked up by title.
cosine_sim_df = pd.DataFrame(
    data=model.cosine_sim,
    index=movies['title'],
    columns=movies['title'],
)

print('Shape:', cosine_sim_df.shape)

# Preview: five randomly chosen columns, rounded to two decimals.
cosine_sim_df.sample(n=5, axis=1).round(2)

Shape: (9719, 9719)


title,Firestarter (1984),"Frighteners, The (1996)",Transformers: Revenge of the Fallen (2009),Signs (2002),Date Night (2010)
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Toy Story (1995),0.0,0.11,0.16,0.0,0.12
Jumanji (1995),0.0,0.00,0.19,0.0,0.00
Grumpier Old Men (1995),0.0,0.24,0.00,0.0,0.79
Waiting to Exhale (1995),0.0,0.21,0.00,0.0,0.70
Father of the Bride Part II (1995),0.0,0.42,0.00,0.0,0.45
...,...,...,...,...,...
Black Butler: Book of the Atlantic (2017),0.0,0.13,0.15,0.0,0.41
No Game No Life: Zero (2017),0.0,0.15,0.00,0.0,0.16
Flint (2017),0.0,0.00,0.00,0.0,0.00
Bungo Stray Dogs: Dead Apple (2018),0.0,0.00,0.19,0.0,0.36


## Find the movies with the highest cosine similarity to a given movie (function declaration)


This function has been moved into the CB_TFIDF_CosineSimilarity.py script.

In [8]:
# Inert reference copy: everything between the triple quotes is a plain string
# literal, so the function inside it is never defined or executed. The
# recommendation call used later in this notebook is model.genre_recommendation(...).
'''
def genre_recommendation(query_title):
    """
    Recommends movies based on a similarity dataframe
    Parameters
    ----------
    query_title : Movie title (string)

    """
    items= movies[['title', 'genres']]
    #select column with the input movie title, and change it to numpy array 
    #resulting array of indices indicates the positions of the elements that would be in the first i positions
    sel = cosine_sim_df.loc[:,query_title].to_numpy().argpartition(range(-1,-10,-1)) 
    #resulting subset of column names is ordered in descending order of the corresponding values in the title column. 
    #This subset is then assigned to the variable ct    
    ct = cosine_sim_df.columns[sel[-1:-(10+2):-1]]
    #drop columns title from input and merge the df with the original dataframe. show only first i results. 
    ct = ct.drop(query_title, errors='ignore')
    
    xx = pd.DataFrame(ct).merge(items).head(10)
    
    #add similarity score to xx
    xx['Similarity Score'] = cosine_sim_df.loc[query_title, xx['title']].values
    
    return xx

'''

'\ndef genre_recommendation(query_title):\n    """\n    Recommends movies based on a similarity dataframe\n    Parameters\n    ----------\n    query_title : Movie title (string)\n\n    """\n    items= movies[[\'title\', \'genres\']]\n    #select column with the input movie title, and change it to numpy array \n    #resulting array of indices indicates the positions of the elements that would be in the first i positions\n    sel = cosine_sim_df.loc[:,query_title].to_numpy().argpartition(range(-1,-10,-1)) \n    #resulting subset of column names is ordered in descending order of the corresponding values in the title column. \n    #This subset is then assigned to the variable ct    \n    ct = cosine_sim_df.columns[sel[-1:-(10+2):-1]]\n    #drop columns title from input and merge the df with the original dataframe. show only first i results. \n    ct = ct.drop(query_title, errors=\'ignore\')\n    \n    xx = pd.DataFrame(ct).merge(items).head(10)\n    \n    #add similarity score to xx\n    x

### Find the movie the recommendation should be based on, and paste its title into the function call below

In [9]:
# for example, find a harry potter movie
#movies[movies["title"].str.contains('Harry')]

## Recommendation Example

In [10]:
# Ask the model for movies similar to the chosen title; the movie table is
# passed along as the second argument.
similar_movies = model.genre_recommendation(
    'Harry Potter and the Order of the Phoenix (2007)',
    movies,
)

In [11]:
# Display the recommendations returned for the query title above.
similar_movies

Unnamed: 0,title,movieId,genres,Similarity Score
0,Pete's Dragon (2016),160573,Adventure|Children|Fantasy,1.0
1,"NeverEnding Story, The (1984)",2161,Adventure|Children|Fantasy,1.0
2,"Chronicles of Narnia: The Lion, the Witch and ...",41566,Adventure|Children|Fantasy,1.0
3,Seventh Son (2014),119655,Adventure|Children|Fantasy,1.0
4,"Chronicles of Narnia: Prince Caspian, The (2008)",59501,Adventure|Children|Fantasy,1.0
5,Bridge to Terabithia (2007),50601,Adventure|Children|Fantasy,1.0
6,Return to Oz (1985),2093,Adventure|Children|Fantasy,1.0
7,Chronicles of Narnia: The Voyage of the Dawn T...,82169,Adventure|Children|Fantasy,1.0
8,Alice Through the Looking Glass (2016),158813,Adventure|Children|Fantasy,1.0
9,Alice in Wonderland (1933),80748,Adventure|Children|Fantasy,1.0


## Save your model

In [12]:
import pickle

filename = 'Model_tfidf_cosine_sim.sav'

# Write through a context manager so the file handle is flushed and closed even
# if pickling raises; passing a bare open(...) to pickle.dump leaves the handle
# unclosed.
# NOTE(review): pickle stores a reference to the model's class, not its code.
# Whoever loads this file needs the same class available under the same module
# path - confirm how the class from the helper script will be made available.
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)