## Recommender: Overview of Movie Synopsis

This notebook uses the 'genres' and 'keywords' columns, which together give a concise description of each movie's themes, to make a recommender. It uses a CountVectorizer to turn the combined text into word counts and compares movies by cosine similarity (in contrast to a TfidfVectorizer, which would weigh infrequently used words more heavily).

In [16]:
###############
### IMPORTS ###
###############

import pandas as pd
import numpy as np

import matplotlib as plt
import seaborn as sns

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import pickle

In [17]:
# Load every column of the merged movie dataset to inspect what is available
df_all = pd.read_csv('data/dataframe_merged_small.csv')

In [18]:
# Report the dimensions and the column labels of the full dataframe
for label, value in (
    ('Shape of dataframe: ', df_all.shape),
    ('Columns of dataframe: ', df_all.columns),
):
    print(label, value)

Shape of dataframe:  (10876, 29)
Columns of dataframe:  Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count', 'cast', 'crew', 'keywords', 'director',
       'weighted_rating'],
      dtype='object')


In [19]:
# Re-read the same file, keeping only the columns this recommender needs
# (the resulting column order follows the file, not the order of the usecols list)
df = pd.read_csv('data/dataframe_merged_small.csv', usecols=['id', 'title', 'genres', 'keywords'])

In [20]:
# Report the dimensions and the column labels of the trimmed dataframe
for label, value in (
    ('Shape of dataframe: ', df.shape),
    ('Columns of dataframe: ', df.columns),
):
    print(label, value)

Shape of dataframe:  (10876, 4)
Columns of dataframe:  Index(['genres', 'id', 'title', 'keywords'], dtype='object')


In [21]:
# Preview the trimmed dataframe
df

Unnamed: 0,genres,id,title,keywords
0,"['Animation', 'Comedy', 'Family']",862,Toy Story,"['jealousy', 'toy', 'boy', 'friendship', 'frie..."
1,"['Adventure', 'Fantasy', 'Family']",8844,Jumanji,"['board game', 'disappearance', ""based on chil..."
2,"['Romance', 'Comedy']",15602,Grumpier Old Men,"['fishing', 'best friend', 'duringcreditssting..."
3,"['Action', 'Crime', 'Drama', 'Thriller']",949,Heat,"['robbery', 'detective', 'bank', 'obsession', ..."
4,"['Comedy', 'Romance']",11860,Sabrina,"['paris', 'brother brother relationship', 'cha..."
...,...,...,...,...
10871,['Comedy'],19307,Carry On Camping,"['holiday', 'nudist camp', 'camping', 'tent', ..."
10872,"['Drama', 'Family', 'Fantasy']",18098,Arabian Nights,[]
10873,"['Drama', 'Foreign']",52103,Pickpocket,['independent film']
10874,"['Family', 'Animation', 'Romance', 'Comedy']",455661,In a Heartbeat,"['love', 'teenager', 'lgbt', 'short']"


In [22]:
# Join [genres] and [keywords] together
# These two columns both describe a movie's themes and it's sensible to join them together
# Each cell holds a list written out as a string, so '+' glues the two strings end to end
df['genres_and_keywords'] = df['genres'] + df['keywords'].astype(str)

In [23]:
# Inspect the combined text for the first movie
df.loc[0, 'genres_and_keywords']

"['Animation', 'Comedy', 'Family']['jealousy', 'toy', 'boy', 'friendship', 'friends']"

In [24]:
# Check if [genres_and_keywords] has NaN values and take the sum to see how many
# NOTE: this is the column that gets fed to the vectorizer, so it is the one worth checking;
#       nothing is replaced here, the cell only counts the missing values
df['genres_and_keywords'].isnull().sum()

0

In [39]:
# Count how often each word occurs per movie, ignoring English stop words
vectorizer = CountVectorizer(stop_words='english')
# doc_word is the resulting document-term matrix: one row per movie
doc_word = vectorizer.fit_transform(df['genres_and_keywords'])

In [40]:
# Pairwise cosine similarity between all movies; when only one matrix is given,
# its rows are compared against themselves
cosine_sim = cosine_similarity(doc_word)

In [41]:
# Look at the cosine_sim matrix (row i, column j = similarity between movie i and movie j)
pd.DataFrame(cosine_sim)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,10866,10867,10868,10869,10870,10871,10872,10873,10874,10875
0,1.000000,0.102062,0.125000,0.000000,0.102062,0.000000,0.102062,0.204124,0.000000,0.000000,...,0.125000,0.612372,0.182574,0.353553,0.176777,0.133631,0.204124,0.000000,0.375000,0.102062
1,0.102062,1.000000,0.000000,0.000000,0.000000,0.077152,0.083333,0.166667,0.000000,0.000000,...,0.000000,0.166667,0.074536,0.000000,0.000000,0.000000,0.333333,0.000000,0.102062,0.083333
2,0.125000,0.000000,1.000000,0.000000,0.204124,0.000000,0.204124,0.000000,0.000000,0.000000,...,0.125000,0.204124,0.091287,0.000000,0.176777,0.133631,0.000000,0.000000,0.250000,0.102062
3,0.000000,0.000000,0.000000,1.000000,0.000000,0.178174,0.096225,0.000000,0.105409,0.222222,...,0.235702,0.000000,0.086066,0.000000,0.166667,0.000000,0.192450,0.166667,0.000000,0.000000
4,0.102062,0.000000,0.204124,0.000000,1.000000,0.000000,0.166667,0.000000,0.000000,0.000000,...,0.102062,0.166667,0.074536,0.000000,0.144338,0.109109,0.000000,0.000000,0.204124,0.083333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10871,0.133631,0.000000,0.133631,0.000000,0.109109,0.000000,0.109109,0.000000,0.000000,0.000000,...,0.133631,0.218218,0.097590,0.000000,0.188982,1.000000,0.000000,0.000000,0.133631,0.109109
10872,0.204124,0.333333,0.000000,0.192450,0.000000,0.000000,0.166667,0.166667,0.182574,0.192450,...,0.204124,0.333333,0.149071,0.000000,0.288675,0.000000,1.000000,0.288675,0.204124,0.166667
10873,0.000000,0.000000,0.000000,0.166667,0.000000,0.000000,0.144338,0.000000,0.158114,0.166667,...,0.176777,0.000000,0.000000,0.000000,0.250000,0.000000,0.288675,1.000000,0.000000,0.144338
10874,0.375000,0.102062,0.250000,0.000000,0.204124,0.000000,0.306186,0.204124,0.000000,0.000000,...,0.125000,0.612372,0.273861,0.353553,0.176777,0.133631,0.204124,0.000000,1.000000,0.204124


In [48]:
# Find the row (and its index label) for this title
df.loc[df['title'].eq('The Avengers')]

Unnamed: 0,genres,id,title,keywords,genres_and_keywords
6701,"['Science Fiction', 'Action', 'Adventure']",24428,The Avengers,"['new york', 'shield', 'marvel comic', 'superh...","['Science Fiction', 'Action', 'Adventure']['ne..."


In [50]:
# Find the row (and its index label) for this title
df.loc[df['title'].eq('Avengers: Age of Ultron')]

Unnamed: 0,genres,id,title,keywords,genres_and_keywords
8392,"['Action', 'Adventure', 'Science Fiction']",99861,Avengers: Age of Ultron,"['marvel comic', 'sequel', 'superhero', 'based...","['Action', 'Adventure', 'Science Fiction']['ma..."


In [52]:
# Find the row (and its index label) for this title
df.loc[df['title'].eq('Iron Man')]

Unnamed: 0,genres,id,title,keywords,genres_and_keywords
5402,"['Action', 'Science Fiction', 'Adventure']",1726,Iron Man,"['middle east', 'arms dealer', 'malibu', 'marv...","['Action', 'Science Fiction', 'Adventure']['mi..."


In [53]:
# Find the row (and its index label) for this title
df.loc[df['title'].eq('Frozen')]

Unnamed: 0,genres,id,title,keywords,genres_and_keywords
7638,"['Animation', 'Adventure', 'Family']",109445,Frozen,"['queen', 'musical', 'princess', 'betrayal', '...","['Animation', 'Adventure', 'Family']['queen', ..."


In [65]:
# Similarity of LotR: Fellowship to LotR: Two Towers
# Index the array directly (row 2627, column 2276) instead of wrapping the whole
# matrix in a DataFrame just to read a single value
cosine_sim[2627, 2276]

0.8581163303210333

In [66]:
# Similarity of LotR: Fellowship to The Hobbit
# Index the array directly (row 7206, column 2276) instead of wrapping the whole
# matrix in a DataFrame just to read a single value
cosine_sim[7206, 2276]

0.8999999999999999

In [67]:
# Similarity of LotR: Fellowship to Frozen
# Index the array directly (row 7638, column 2276) instead of wrapping the whole
# matrix in a DataFrame just to read a single value
cosine_sim[7638, 2276]

0.11180339887498948

In [44]:
# Save cosine_sim array to use in hybrid recommendation system
# NOTE(review): presumably the similarity_matrix/ folder must already exist — confirm
np.save('similarity_matrix/cos_themes_small.npy', cosine_sim)

In [45]:
# Construct a reverse mapping from movie title to the dataframe's row index
# (no index reset happens here; df still carries the index it was loaded with)
# NOTE(review): titles may repeat, in which case one title maps to several rows — confirm
indices = pd.Series(df.index, index=df['title'])

In [46]:
# Function that takes in movie title as input and outputs most similar movies
def get_recommendations(title, cosine_sim=cosine_sim, top_n=10):
    """Return the titles of the movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Movie title to look up in the module-level ``indices`` mapping.
    cosine_sim : 2-D array, optional
        Pairwise similarity matrix; row ``i`` holds the similarity of movie
        ``i`` to every movie. Defaults to the matrix computed in this notebook.
    top_n : int, optional
        Number of recommendations to return (default 10).

    Returns
    -------
    pandas.Series
        Titles of the ``top_n`` most similar movies, most similar first,
        indexed by their row in ``df``. The queried movie is never included.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    # Get the index of the movie that matches the title
    idx = indices[title]

    # A title shared by several movies makes the lookup return a Series
    # instead of a single position; use the first match in that case
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # Get the pairwise similarity scores of all *other* movies with that movie.
    # The queried movie is excluded by position rather than by assuming it sorts
    # first: another movie with an identical vector also scores 1.0 and may
    # come ahead of it in the ordering.
    sim_scores = [
        (position, score)
        for position, score in enumerate(cosine_sim[idx])
        if position != idx
    ]

    # Sort the movies based on the similarity scores (stable, so ties keep
    # their original dataframe order)
    sim_scores = sorted(sim_scores, key=lambda pair: pair[1], reverse=True)

    # Keep the positions of the top_n most similar movies
    movie_indices = [position for position, _ in sim_scores[:top_n]]

    # Return the titles of the most similar movies
    return df['title'].iloc[movie_indices]

In [47]:
# Ten most similar movies to 'The Avengers' by genres and keywords
get_recommendations('The Avengers')

5440                    The Incredible Hulk
1777                                  X-Men
6130                             Iron Man 2
8392                Avengers: Age of Ultron
8394                                Ant-Man
2767                                     X2
8398                         Doctor Strange
7836    Captain America: The Winter Soldier
4756                  X-Men: The Last Stand
7363                            Planet Hulk
Name: title, dtype: object

In [32]:
# Ten most similar movies to 'Toy Story' by genres and keywords
get_recommendations('Toy Story')

8772          Barbie and the Three Musketeers
8311               Toy Story That Time Forgot
8123                        Hawaiian Vacation
6179                              Toy Story 3
6220         The Bugs Bunny/Road Runner Movie
8376                   Scooby-Doo! Camp Scare
8562                        Stitch! The Movie
8954        Scooby-Doo! and the Samurai Sword
8957    Scooby-Doo! and the Loch Ness Monster
8958                      Big Top Scooby-Doo!
Name: title, dtype: object