In [2]:
###############
### IMPORTS ###
###############

import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# Load the full merged movie dataset; in the cells shown it is only used to
# inspect the shape and the available columns.
df_all = pd.read_csv('data/dataframe_merged.csv')

In [4]:
# Report the dimensions and column names of the full dataset.
print('Shape of dataframe: ', df_all.shape)
print('Columns of dataframe: ', df_all.columns)

Shape of dataframe:  (46628, 28)
Columns of dataframe:  Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count', 'cast', 'crew', 'keywords', 'director'],
      dtype='object')


In [5]:
# Re-read the same CSV, keeping only the identifier, the title and the three
# metadata columns (genres, cast, director) that the recommender is built from.
df = pd.read_csv('data/dataframe_merged.csv', usecols=['id', 'title', 'genres', 'cast', 'director'])

In [6]:
# Confirm the reduced frame has the same number of rows and only the requested
# columns. The columns come back in file order, not in the order given to `usecols`.
print('Shape of dataframe: ', df.shape)
print('Columns of dataframe: ', df.columns)

Shape of dataframe:  (46628, 5)
Columns of dataframe:  Index(['genres', 'id', 'title', 'cast', 'director'], dtype='object')


In [7]:
# Preview the reduced dataframe before any cleaning.
df

Unnamed: 0,genres,id,title,cast,director
0,"['Animation', 'Comedy', 'Family']",862,Toy Story,"['Tom Hanks', 'Tim Allen', 'Don Rickles', 'Jim...",John Lasseter
1,"['Adventure', 'Fantasy', 'Family']",8844,Jumanji,"['Robin Williams', 'Jonathan Hyde', 'Kirsten D...",Joe Johnston
2,"['Romance', 'Comedy']",15602,Grumpier Old Men,"['Walter Matthau', 'Jack Lemmon', 'Ann-Margret...",Howard Deutch
3,"['Comedy', 'Drama', 'Romance']",31357,Waiting to Exhale,"['Whitney Houston', 'Angela Bassett', 'Loretta...",Forest Whitaker
4,['Comedy'],11862,Father of the Bride Part II,"['Steve Martin', 'Diane Keaton', 'Martin Short...",Charles Shyer
...,...,...,...,...,...
46623,"['Drama', 'Family']",439050,Subdue,"['Leila Hatami', 'Kourosh Tahami', 'Elham Korda']",Hamid Nematollah
46624,['Drama'],111109,Century of Birthing,"['Angel Aquino', 'Perry Dizon', 'Hazel Orencio...",Lav Diaz
46625,"['Action', 'Drama', 'Thriller']",67758,Betrayal,"['Erika Eleniak', 'Adam Baldwin', 'Julie du Pa...",Mark L. Lester
46626,[],227506,Satan Triumphant,"['Iwan Mosschuchin', 'Nathalie Lissenko', 'Pav...",Yakov Protazanov


In [8]:
def clean_data(x):
    """Lowercase a metadata value and strip every space from it.

    Squashing "First Last" into "firstlast" keeps a person's name together as
    a single token so that it is not split apart during vectorization.

    Parameters
    ----------
    x : list of str, str, or anything else
        A list is cleaned element by element; a string is cleaned as a whole.

    Returns
    -------
    list of str, or str
        The cleaned list or string. Any other input (for example a missing
        director stored as NaN) yields the empty string.

    Notes
    -----
    NOTE(review): values loaded straight from the CSV are strings such as
    "['Tom Hanks', 'Tim Allen']", not lists, so they go through the string
    branch and the brackets and quotes are kept in the result.
    """
    # Real lists: normalise each entry on its own.
    if isinstance(x, list):
        return [entry.replace(" ", "").lower() for entry in x]

    # Plain strings (e.g. a director name): normalise the whole value.
    if isinstance(x, str):
        return x.replace(" ", "").lower()

    # Anything else is treated as "no value".
    return ''

In [9]:
# Apply clean_data to each metadata column used by the recommender.
# Each column is overwritten in `df`, so re-running this cell operates on
# already-cleaned values (harmless here: the cleaning is idempotent for strings).
features = ['cast', 'director', 'genres']

for feature in features:
    df[feature] = df[feature].apply(clean_data)

In [10]:
# Preview the dataframe after cleaning: cast, director and genres are now
# lowercased with all spaces removed.
df

Unnamed: 0,genres,id,title,cast,director
0,"['animation','comedy','family']",862,Toy Story,"['tomhanks','timallen','donrickles','jimvarney...",johnlasseter
1,"['adventure','fantasy','family']",8844,Jumanji,"['robinwilliams','jonathanhyde','kirstendunst'...",joejohnston
2,"['romance','comedy']",15602,Grumpier Old Men,"['waltermatthau','jacklemmon','ann-margret','s...",howarddeutch
3,"['comedy','drama','romance']",31357,Waiting to Exhale,"['whitneyhouston','angelabassett','lorettadevi...",forestwhitaker
4,['comedy'],11862,Father of the Bride Part II,"['stevemartin','dianekeaton','martinshort','ki...",charlesshyer
...,...,...,...,...,...
46623,"['drama','family']",439050,Subdue,"['leilahatami','kouroshtahami','elhamkorda']",hamidnematollah
46624,['drama'],111109,Century of Birthing,"['angelaquino','perrydizon','hazelorencio','jo...",lavdiaz
46625,"['action','drama','thriller']",67758,Betrayal,"['erikaeleniak','adambaldwin','juliedupage','j...",markl.lester
46626,[],227506,Satan Triumphant,"['iwanmosschuchin','nathalielissenko','pavelpa...",yakovprotazanov


In [19]:
# Spot-check one cleaned cast entry. The value is a single string, not a Python
# list: the lists were stored as text in the CSV, so clean_data lowercased the
# whole string and stripped its spaces rather than processing each name.
df['cast'].loc[3024]

"['tomhanks','timallen','joancusack','kelseygrammer','donrickles']"

In [53]:
def create_metasoup(x):
    """Combine a movie's cast, director and genres into one text string.

    Parameters
    ----------
    x : mapping or pandas.Series
        A row providing the keys 'cast', 'director' and 'genres'. Each value
        may be a string (used as is) or a list/tuple of tokens.

    Returns
    -------
    str
        The three fields in the order cast, director, genres, separated by
        single spaces.
    """
    def _as_text(value):
        # Strings (including stringified lists read from the CSV) pass through
        # unchanged, which keeps the output identical for string inputs.
        if isinstance(value, str):
            return value
        # Real lists must be joined with a separator; joining on '' would glue
        # every name into one giant token and destroy the features.
        if isinstance(value, (list, tuple)):
            return ' '.join(str(token) for token in value)
        # Missing values (None, NaN, ...) contribute nothing instead of raising.
        return ''

    return ' '.join(_as_text(x[field]) for field in ('cast', 'director', 'genres'))
# Build the combined metadata string for every movie; axis=1 passes each row
# of `df` to create_metasoup.
df['metasoup'] = df.apply(create_metasoup, axis=1)

In [54]:
# Preview the combined metadata string of each movie.
df['metasoup']

0        ['tomhanks','timallen','donrickles','jimvarney...
1        ['robinwilliams','jonathanhyde','kirstendunst'...
2        ['waltermatthau','jacklemmon','ann-margret','s...
3        ['whitneyhouston','angelabassett','lorettadevi...
4        ['stevemartin','dianekeaton','martinshort','ki...
                               ...                        
46623    ['leilahatami','kouroshtahami','elhamkorda'] h...
46624    ['angelaquino','perrydizon','hazelorencio','jo...
46625    ['erikaeleniak','adambaldwin','juliedupage','j...
46626    ['iwanmosschuchin','nathalielissenko','pavelpa...
46627                                   [] daisyasquith []
Name: metasoup, Length: 46628, dtype: object

In [55]:
# Show the complete, untruncated metasoup for the first row (Toy Story).
df['metasoup'].loc[0]

"['tomhanks','timallen','donrickles','jimvarney','wallaceshawn'] johnlasseter ['animation','comedy','family']"

In [56]:
# Turn each metasoup string into a row of token counts (bag of words).
# CountVectorizer's default tokenizer keeps runs of two or more word characters,
# so the brackets, quotes and commas left over from the stringified lists are
# discarded and each squashed name or genre becomes one token.
# NOTE(review): names containing '-' or '.' (e.g. 'ann-margret', 'markl.lester')
# are still split at that punctuation.
vectorizer = CountVectorizer(stop_words='english')
doc_word = vectorizer.fit_transform(df['metasoup'])

In [57]:
# Compute the cosine similarity between every pair of movies from doc_word.
# NOTE(review): the result is a dense n x n array. With the 46,628 rows loaded
# here that is about 2.17 billion entries (roughly 17 GB if stored as float64),
# so confirm the machine has enough memory before running this cell.
cosine_sim = cosine_similarity(doc_word, doc_word)

In [58]:
# View the similarity matrix as a table: entry (i, j) is the similarity between
# rows i and j of `df`, and each movie's similarity with itself is on the diagonal.
pd.DataFrame(cosine_sim)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,46618,46619,46620,46621,46622,46623,46624,46625,46626,46627
0,1.000000,0.111111,0.111111,0.111111,0.117851,0.000000,0.117851,0.105409,0.000000,0.000000,...,0.000000,0.0,0.0,0.0,0.000000,0.136083,0.000000,0.000000,0.0,0.0
1,0.111111,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.210819,0.100504,0.111111,...,0.000000,0.0,0.0,0.0,0.000000,0.136083,0.000000,0.000000,0.0,0.0
2,0.111111,0.000000,1.000000,0.222222,0.117851,0.000000,0.235702,0.000000,0.000000,0.000000,...,0.000000,0.0,0.0,0.0,0.111111,0.000000,0.000000,0.000000,0.0,0.0
3,0.111111,0.000000,0.222222,1.000000,0.117851,0.105409,0.235702,0.105409,0.000000,0.000000,...,0.000000,0.0,0.0,0.0,0.222222,0.136083,0.125988,0.105409,0.0,0.0
4,0.117851,0.000000,0.117851,0.117851,1.000000,0.000000,0.125000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46623,0.136083,0.136083,0.000000,0.136083,0.000000,0.129099,0.000000,0.258199,0.000000,0.000000,...,0.000000,0.0,0.0,0.0,0.136083,1.000000,0.154303,0.129099,0.0,0.0
46624,0.000000,0.000000,0.000000,0.125988,0.000000,0.119523,0.000000,0.119523,0.000000,0.000000,...,0.000000,0.0,0.0,0.0,0.125988,0.154303,1.000000,0.119523,0.0,0.0
46625,0.000000,0.000000,0.000000,0.105409,0.000000,0.300000,0.000000,0.200000,0.190693,0.210819,...,0.105409,0.0,0.0,0.0,0.210819,0.129099,0.119523,1.000000,0.0,0.0
46626,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,1.0,0.0


In [59]:
# # Save cosine_sim array to use in hybrid recommendation system
# np.save('cosine_similarity/cos_metadata.npy', cosine_sim)

In [60]:
# Build a reverse lookup from movie title to row index of `df`.
# No index reset happens here; `df` still carries the index it was loaded with,
# which is used both as a label and as a row position into the similarity matrix.
# Titles are not guaranteed to be unique: for a repeated title, `indices[title]`
# returns a Series with every matching row instead of a single integer.
indices = pd.Series(df.index, index=df['title'])

In [61]:
def get_recommendations(title, cosine_sim=cosine_sim, top_n=10):
    """Return the titles of the movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Movie title to look up in the module-level ``indices`` mapping.
    cosine_sim : array-like, optional
        Pairwise similarity matrix in which row/column ``i`` corresponds to
        row ``i`` of ``df``. Defaults to the module-level ``cosine_sim``.
    top_n : int, optional
        Maximum number of recommendations to return (default 10).

    Returns
    -------
    pandas.Series
        Titles of up to ``top_n`` movies, most similar first, indexed by their
        row in ``df``. The queried movie itself is never included.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    # Get the row of the movie that matches the title.
    idx = indices[title]

    # Several movies can share one title, in which case the lookup yields a
    # Series of rows rather than a scalar; use the first match.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    # Pairwise similarity scores of all movies with that movie.
    scores = np.asarray(cosine_sim[idx], dtype=float).ravel()

    # Stable sort on the negated scores: descending similarity, ties broken by
    # ascending row number (the same order a stable reverse sort produces).
    ranked = np.argsort(-scores, kind='stable')

    # Drop the queried movie explicitly instead of assuming it is ranked first:
    # an exact duplicate at a lower row ties at 1.0 and would push it down.
    ranked = ranked[ranked != idx][:top_n]

    # Return the titles of the most similar movies.
    return df['title'].iloc[ranked]

In [62]:
# Example query: movies whose cast, director and genres are closest to
# 'The Dark Knight Rises'.
get_recommendations('The Dark Knight Rises')

12589      The Dark Knight
10210        Batman Begins
18940            Last Exit
34488                 Rege
11463         The Prestige
516      Romeo Is Bleeding
9038        State of Grace
11524          Harsh Times
15083          Harry Brown
24090            Quicksand
Name: title, dtype: object

In [63]:
# Example query: movies whose cast, director and genres are closest to 'Toy Story'.
get_recommendations('Toy Story')

3024                    Toy Story 2
22126          Toy Story of Terror!
25999               Partysaurus Rex
26001    Toy Story That Time Forgot
29198               Superstar Goofy
15519                   Toy Story 3
3336              Creature Comforts
41622                       Lorenzo
10754                      Luxo Jr.
19301                       Tin Toy
Name: title, dtype: object