# The goal of this analysis is to build a content-based recommender system

In [1]:
# import necessary libraries
import pandas as pd

from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Feature tables built from IMDB data that the author scraped with a Selenium web scraper.
# NOTE(review): `svd_df` is loaded from 'tfidf_df.pkl' - confirm which representation it holds.
svd_df = pd.read_pickle('Pickles/tfidf_df.pkl')
genre_df = pd.read_pickle('Pickles/genre_df.pkl')
director_df = pd.read_pickle('Pickles/director_df.pkl')
actor_df = pd.read_pickle('Pickles/actor_df.pkl')
prodcom_df = pd.read_pickle('Pickles/prodcom_df.pkl')

# Outer-join every feature table onto the first one, keyed by movie title,
# so a title present in any table ends up in the combined frame.
df = (
    svd_df
    .merge(genre_df, on='Title', how='outer')
    .merge(director_df, on='Title', how='outer')
    .merge(actor_df, on='Title', how='outer')
    .merge(prodcom_df, on='Title', how='outer')
)

In [3]:
# Report the size of the merged table and how many rows contain at least one missing value.
print('Total samples: {s} | Total features {f}'.format(s=df.shape[0], f=df.shape[1]))
print("Rows with NA: {n}".format(n=int(df.isnull().any(axis=1).sum())))

Total samples: 1708 | Total features 11229
Rows with NA: 0


# Show the most similar movies to any given movie

In [4]:
# Build the movie-by-movie cosine similarity table.
titles = df['Title']
matrix_df = df.drop(columns='Title')  # every column except the title is used as a feature

cos_sim_matrix = cosine_similarity(matrix_df, matrix_df)

# One similarity column per movie title, plus the titles as a regular column
# so that each row can be mapped back to the movie it describes.
cos_sim_df = pd.DataFrame(cos_sim_matrix, columns=titles).assign(Title=titles)

In [5]:
def display_top_movies(movie_title, n_movies, similarity_matrix_df):
    """Print the movies most similar to ``movie_title``.

    Parameters
    ----------
    movie_title : str
        Title to look up; must be one of the columns of ``similarity_matrix_df``.
    n_movies : int
        Maximum number of similar movies to print (expected to be non-negative).
    similarity_matrix_df : pandas.DataFrame
        One column of similarity scores per movie title, plus a ``'Title'``
        column naming the movie each row belongs to.

    Returns
    -------
    None
        The ranking is printed, not returned.

    Raises
    ------
    KeyError
        If ``movie_title`` (or the ``'Title'`` column) is missing from
        ``similarity_matrix_df``.
    """
    row_titles = similarity_matrix_df['Title']
    scores = similarity_matrix_df[movie_title]

    # Exclude the queried movie by title instead of assuming it sorts first:
    # with ties (e.g. another movie scoring 1.0, or floating-point noise on the
    # diagonal) the first sorted entry is not guaranteed to be the movie itself.
    other_scores = scores[row_titles != movie_title]

    # A stable sort keeps tied movies in their original row order, so repeated
    # runs print the same ranking.
    top_series = other_scores.sort_values(ascending=False, kind='mergesort').head(n_movies)

    print(movie_title, '\n---------------------------------------')
    for row_label, score in top_series.items():
        # `row_label` is an index label, so look the title up with .loc; a
        # positional .iloc lookup is only correct for a default RangeIndex.
        print(row_titles.loc[row_label].ljust(40, ' '), ' :', round(score, 4))
        

In [6]:
# Ten closest matches for 'Toy Story 2 (1999)'.
display_top_movies('Toy Story 2 (1999)', n_movies=10, similarity_matrix_df=cos_sim_df)

Toy Story 2 (1999) 
---------------------------------------
Toy Story (1995)                          : 0.7987
Toy Story 3 (2010)                        : 0.6999
A Bug's Life (1998)                       : 0.3232
Brave (2012)                              : 0.3147
Cars (2006)                               : 0.3074
Up (2009)                                 : 0.29
Monsters, Inc. (2001)                     : 0.2843
WALL·E (2008)                             : 0.2812
Cars 2 (2011)                             : 0.2727
Ratatouille (2007)                        : 0.2688


In [7]:
# Ten closest matches for 'Avengers: Endgame (2019)'.
display_top_movies('Avengers: Endgame (2019)', n_movies=10, similarity_matrix_df=cos_sim_df)

Avengers: Endgame (2019) 
---------------------------------------
Avengers: Infinity War (2018)             : 0.7415
Captain America: Civil War (2016)         : 0.518
Avengers: Age of Ultron (2015)            : 0.4528
Avengers Assemble (2012)                  : 0.4168
Captain America: The Winter Soldier (2014)  : 0.2381
Iron Man 2 (2010)                         : 0.2381
Thor: Ragnarok (2017)                     : 0.2345
Spider-Man: Homecoming (2017)             : 0.1932
Ant-Man and the Wasp (2018)               : 0.1886
Guardians of the Galaxy: Vol. 2 (2017)    : 0.185


In [8]:
# Ten closest matches for 'Pulp Fiction (1994)'.
display_top_movies('Pulp Fiction (1994)', n_movies=10, similarity_matrix_df=cos_sim_df)

Pulp Fiction (1994) 
---------------------------------------
Jackie Brown (1997)                       : 0.2843
Kill Bill: Vol. 1 (2003)                  : 0.2475
Kill Bill: Vol. 2 (2004)                  : 0.2085
Fight Club (1999)                         : 0.205
The Hateful Eight (2015)                  : 0.1936
The Dark Knight (2008)                    : 0.1779
Inception (2010)                          : 0.1728
The Whole Nine Yards (2000)               : 0.1727
Hostage (2005)                            : 0.1578
Unbreakable (2000)                        : 0.151


In [9]:
# Ten closest matches for 'Harry Potter and the Half-Blood Prince (2009)'.
display_top_movies(
    'Harry Potter and the Half-Blood Prince (2009)',
    n_movies=10,
    similarity_matrix_df=cos_sim_df,
)

Harry Potter and the Half-Blood Prince (2009) 
---------------------------------------
Harry Potter and the Deathly Hallows: Part 1 (2010)  : 0.4533
Harry Potter and the Deathly Hallows: Part 2 (2011)  : 0.43
Harry Potter and the Chamber of Secrets (2002)  : 0.404
Harry Potter and the Goblet of Fire (2005)  : 0.4005
Harry Potter and the Order of the Phoenix (2007)  : 0.315
Harry Potter and the Prisoner of Azkaban (2004)  : 0.2601
Alice in Wonderland (2010)                : 0.2213
Fantastic Mr. Fox (2009)                  : 0.2194
Chicken Run (2000)                        : 0.193
Mr. & Mrs. Smith (2005)                   : 0.1806


In [10]:
# Ten closest matches for 'The Social Network (2010)'.
display_top_movies('The Social Network (2010)', n_movies=10, similarity_matrix_df=cos_sim_df)

The Social Network (2010) 
---------------------------------------
The Pursuit of Happyness (2006)           : 0.2985
Moneyball (2011)                          : 0.2722
Before Sunset (2004)                      : 0.2653
Zombieland (2009)                         : 0.2579
Erin Brockovich (2000)                    : 0.2537
Rocky Balboa (2006)                       : 0.2424
Hacksaw Ridge (2016)                      : 0.2407
Lion (2016)                               : 0.2378
Raging Bull (1980)                        : 0.2319
Awakenings (1990)                         : 0.2315
