# Use PCA to reduce the dimensionality of our dataset to enable better similarity ratings

In [1]:
# import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# I acquired this dataset myself from the IMDB website using a selenium webscraper.
# Every pickle below is joined on its 'Title' column.
svd_df = pd.read_pickle('Pickles/tfidf_df.pkl')
actor_df = pd.read_pickle('Pickles/actor_df.pkl')
director_df = pd.read_pickle('Pickles/director_df.pkl')
genre_df = pd.read_pickle('Pickles/genre_df.pkl')
prodcom_df = pd.read_pickle('Pickles/prodcom_df.pkl')

# Outer-join each remaining frame onto the tf-idf frame, in a fixed order,
# so that a title present in any one of the frames is kept.
df = svd_df
for feature_df in (genre_df, director_df, actor_df, prodcom_df):
    df = df.merge(feature_df, on='Title', how='outer')
df.shape

(1708, 11229)

In [3]:
# Peek at the first row of the merged frame to sanity-check its columns
df.head(1)

Unnamed: 0,00,007,10,100,1000,100000,10th,11,12,127,...,Uncharted,Smart Egg Pictures,Rogue,Produzioni De,Big Beach Films,Appian Way,LBI Entertainment,Matten Productions,Cinergi Pictures Entertainment,Savvy Media Holdings
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
from sklearn.decomposition import PCA

N_COMPONENTS = 100  # number of principal components kept
RANDOM_STATE = 42   # PCA's randomized SVD solver is stochastic; seed it so re-runs reproduce the same components

titles = df['Title']
feature_matrix = df.drop(columns='Title')  # every column except the movie title

# NOTE(review): PCA is fitted on the *transposed* matrix and the per-movie
# representation is read from `components_` (unit-norm loadings) rather than
# from the usual `fit_transform(feature_matrix)` projection. Kept as-is to
# preserve the existing results -- confirm this is the intended embedding.
pca = PCA(n_components=N_COMPONENTS, random_state=RANDOM_STATE).fit(feature_matrix.T)  # fit PCA on all our features

pca_df = pd.DataFrame(pca.components_.T)  # create df from components: one row per movie, one column per component

# write this dataframe to a pickle (on a copy, so pca_df stays purely numeric
# for the similarity computation); column labels are converted to strings
pca_pickle_df = pca_df.copy()
pca_pickle_df.columns = pca_pickle_df.columns.astype(str)
pca_pickle_df['Title'] = titles
pca_pickle_df.to_pickle(f'Pickles/pca_{N_COMPONENTS}_df.pkl')

pca_df.shape

(1708, 100)

In [5]:
# Pairwise cosine similarity between every pair of rows of pca_df;
# with a single argument the rows are compared against themselves.
cos_sim_matrix = cosine_similarity(pca_df)
cos_sim_matrix.shape

(1708, 1708)

In [6]:
# Wrap the similarity matrix in a frame whose columns are labelled with
# `titles`, and add a 'Title' column holding the same titles.
cos_sim_df = pd.DataFrame(cos_sim_matrix, columns=titles).assign(Title=titles)

# Show the most similar movies to any given movie, using all features

In [7]:
def display_top_movies(movie_title, n_movies, similarity_matrix_df):
    """Print the movies most similar to ``movie_title``, best match first.

    Parameters
    ----------
    movie_title : str
        Movie to look up; must be the label of a column in
        ``similarity_matrix_df``.
    n_movies : int
        Maximum number of similar movies to print (values below 0 print none).
    similarity_matrix_df : pandas.DataFrame
        Frame with one similarity column per movie title, plus a 'Title'
        column naming the movie that each row represents.

    Raises
    ------
    KeyError
        If ``movie_title`` or 'Title' is not a column of the frame.
    """
    # Exclude the movie's own row by title rather than assuming it sorts first:
    # another movie tied at the top similarity could otherwise be dropped in
    # its place, leaving the movie listed as similar to itself.
    is_other_movie = similarity_matrix_df['Title'] != movie_title

    # Keep title and score together so the printed title always comes from the
    # same row as its score, whatever the frame's index looks like.
    ranked = (
        similarity_matrix_df.loc[is_other_movie, ['Title', movie_title]]
        .sort_values(movie_title, ascending=False)
        .head(max(n_movies, 0))
    )

    print(movie_title, '\n---------------------------------------')
    for title, score in zip(ranked['Title'], ranked[movie_title]):
        # print movie title and similarity to our chosen movie
        print(title.ljust(40, ' '), ' :', round(score, 4))
        

In [8]:
# Ten closest matches for an animated sequel
display_top_movies(movie_title='Toy Story 2 (1999)', n_movies=10, similarity_matrix_df=cos_sim_df)

Toy Story 2 (1999) 
---------------------------------------
Toy Story (1995)                          : 0.9944
Toy Story 3 (2010)                        : 0.9861
Brave (2012)                              : 0.6045
Ratatouille (2007)                        : 0.5481
A Bug's Life (1998)                       : 0.5129
Up (2009)                                 : 0.5068
Coco (2017)                               : 0.4684
WALL·E (2008)                             : 0.4512
You've Got Mail (1998)                    : 0.4352
Monsters, Inc. (2001)                     : 0.4333


In [9]:
# Ten closest matches for a franchise blockbuster
display_top_movies(movie_title='Avengers: Endgame (2019)', n_movies=10, similarity_matrix_df=cos_sim_df)

Avengers: Endgame (2019) 
---------------------------------------
Avengers: Infinity War (2018)             : 0.9653
Captain America: Civil War (2016)         : 0.8307
Avengers: Age of Ultron (2015)            : 0.8076
Avengers Assemble (2012)                  : 0.6471
Captain America: The Winter Soldier (2014)  : 0.5265
Wind River (2017)                         : 0.5264
Don Jon (2013)                            : 0.4228
Her (2013)                                : 0.3774
Ghost World (2001)                        : 0.3606
Match Point (2005)                        : 0.3242


In [10]:
# Ten closest matches for Pulp Fiction
display_top_movies(movie_title='Pulp Fiction (1994)', n_movies=10, similarity_matrix_df=cos_sim_df)

Pulp Fiction (1994) 
---------------------------------------
Jackie Brown (1997)                       : 0.7252
Hostage (2005)                            : 0.6866
Kill Bill: Vol. 2 (2004)                  : 0.675
Kill Bill: Vol. 1 (2003)                  : 0.671
Fight Club (1999)                         : 0.5152
Die Hard: With a Vengeance (1995)         : 0.5023
Unbreakable (2000)                        : 0.4528
There Will Be Blood (2007)                : 0.443
Chocolat (2000)                           : 0.4374
The Hateful Eight (2015)                  : 0.4358


In [11]:
# Ten closest matches for a mid-series Harry Potter film
display_top_movies(
    movie_title='Harry Potter and the Half-Blood Prince (2009)', n_movies=10, similarity_matrix_df=cos_sim_df
)

Harry Potter and the Half-Blood Prince (2009) 
---------------------------------------
Harry Potter and the Chamber of Secrets (2002)  : 0.7625
Harry Potter and the Order of the Phoenix (2007)  : 0.7532
Harry Potter and the Goblet of Fire (2005)  : 0.7449
Harry Potter and the Deathly Hallows: Part 2 (2011)  : 0.7338
Harry Potter and the Deathly Hallows: Part 1 (2010)  : 0.7318
Harry Potter and the Prisoner of Azkaban (2004)  : 0.6504
In Time (2011)                            : 0.4688
The Girl Next Door (2004)                 : 0.4542
Assassin's Creed (2016)                   : 0.4492
Harry Potter and the Philosopher's Stone (2001)  : 0.4422


In [18]:
# Ten closest matches for a standalone drama
display_top_movies(movie_title='The Social Network (2010)', n_movies=10, similarity_matrix_df=cos_sim_df)

The Social Network (2010) 
---------------------------------------
The Girl with the Dragon Tattoo (2011)    : 0.5792
Moneyball (2011)                          : 0.5609
Captain Phillips (2013)                   : 0.5494
Steve Jobs (2015)                         : 0.4974
Under the Skin (2013)                     : 0.4902
Zombieland (2009)                         : 0.4862
Mystic River (2003)                       : 0.4826
Before Sunset (2004)                      : 0.4804
The Pursuit of Happyness (2006)           : 0.4804
Doubt (2008)                              : 0.467


## Our reduced features seem to give a more accurate reflection of similarity