In [201]:
###############
### IMPORTS ###
###############

import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity, pairwise_distances
import nltk

In [5]:
df_all = pd.read_csv('data/dataframe_merged.csv')

In [6]:
print('Shape of dataframe: ', df_all.shape)
print('Columns of dataframe: ', df_all.columns)

Shape of dataframe:  (46628, 28)
Columns of dataframe:  Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count', 'cast', 'crew', 'keywords', 'director'],
      dtype='object')


In [7]:
df = pd.read_csv('data/dataframe_merged.csv', usecols=['id', 'title', 'genres', 'cast', 'director'])

In [8]:
print('Shape of dataframe: ', df.shape)
print('Columns of dataframe: ', df.columns)

Shape of dataframe:  (46628, 5)
Columns of dataframe:  Index(['genres', 'id', 'title', 'cast', 'director'], dtype='object')


In [9]:
df

Unnamed: 0,genres,id,title,cast,director
0,"['Animation', 'Comedy', 'Family']",862,Toy Story,"['Tom Hanks', 'Tim Allen', 'Don Rickles', 'Jim...",John Lasseter
1,"['Adventure', 'Fantasy', 'Family']",8844,Jumanji,"['Robin Williams', 'Jonathan Hyde', 'Kirsten D...",Joe Johnston
2,"['Romance', 'Comedy']",15602,Grumpier Old Men,"['Walter Matthau', 'Jack Lemmon', 'Ann-Margret...",Howard Deutch
3,"['Comedy', 'Drama', 'Romance']",31357,Waiting to Exhale,"['Whitney Houston', 'Angela Bassett', 'Loretta...",Forest Whitaker
4,['Comedy'],11862,Father of the Bride Part II,"['Steve Martin', 'Diane Keaton', 'Martin Short...",Charles Shyer
...,...,...,...,...,...
46623,"['Drama', 'Family']",439050,Subdue,"['Leila Hatami', 'Kourosh Tahami', 'Elham Korda']",Hamid Nematollah
46624,['Drama'],111109,Century of Birthing,"['Angel Aquino', 'Perry Dizon', 'Hazel Orencio...",Lav Diaz
46625,"['Action', 'Drama', 'Thriller']",67758,Betrayal,"['Erika Eleniak', 'Adam Baldwin', 'Julie du Pa...",Mark L. Lester
46626,[],227506,Satan Triumphant,"['Iwan Mosschuchin', 'Nathalie Lissenko', 'Pav...",Yakov Protazanov


In [54]:
# This will join first and last names to a single string (and lowercase) so that they do not
# become split during the vectorization process
def clean_data(x):
    if isinstance(x, list):
        return [str.lower(i.replace(" ", "")) for i in x]
    else:
        #Check if director exists. If not, return empty string
        if isinstance(x, str):
            return str.lower(x.replace(" ", ""))
        else:
            return ''

In [48]:
# Apply clean_data function to your features.
features = ['cast', 'director']

for feature in features:
    df[feature] = df[feature].apply(clean_data)

In [49]:
df

Unnamed: 0,genres,id,title,cast,director,metasoup
0,"['animation','comedy','family']",862,Toy Story,"['tomhanks','timallen','donrickles','jimvarney...",johnlasseter,"['tomhanks','timallen','donrickles','jimvarney..."
1,"['adventure','fantasy','family']",8844,Jumanji,"['robinwilliams','jonathanhyde','kirstendunst'...",joejohnston,"['robinwilliams','jonathanhyde','kirstendunst'..."
2,"['romance','comedy']",15602,Grumpier Old Men,"['waltermatthau','jacklemmon','ann-margret','s...",howarddeutch,"['waltermatthau','jacklemmon','ann-margret','s..."
3,"['comedy','drama','romance']",31357,Waiting to Exhale,"['whitneyhouston','angelabassett','lorettadevi...",forestwhitaker,"['whitneyhouston','angelabassett','lorettadevi..."
4,['comedy'],11862,Father of the Bride Part II,"['stevemartin','dianekeaton','martinshort','ki...",charlesshyer,"['stevemartin','dianekeaton','martinshort','ki..."
...,...,...,...,...,...,...
46623,"['drama','family']",439050,Subdue,"['leilahatami','kouroshtahami','elhamkorda']",hamidnematollah,"['leilahatami','kouroshtahami','elhamkorda'] h..."
46624,['drama'],111109,Century of Birthing,"['angelaquino','perrydizon','hazelorencio','jo...",lavdiaz,"['angelaquino','perrydizon','hazelorencio','jo..."
46625,"['action','drama','thriller']",67758,Betrayal,"['erikaeleniak','adambaldwin','juliedupage','j...",markl.lester,"['erikaeleniak','adambaldwin','juliedupage','j..."
46626,[],227506,Satan Triumphant,"['iwanmosschuchin','nathalielissenko','pavelpa...",yakovprotazanov,"['iwanmosschuchin','nathalielissenko','pavelpa..."


In [70]:
# Clean the text further by only keeping alphanumerics
import re

def create_metasoup(x):
    string = ''.join(x['cast']) + ' ' + x['director']
    return re.sub(r'\W+', ' ', string)
df['metasoup'] = df.apply(create_metasoup, axis=1)

In [84]:
df['metasoup']

0         tomhanks timallen donrickles jimvarney wallac...
1         robinwilliams jonathanhyde kirstendunst bradl...
2         waltermatthau jacklemmon ann margret sophialo...
3         whitneyhouston angelabassett lorettadevine le...
4         stevemartin dianekeaton martinshort kimberlyw...
                               ...                        
46623     leilahatami kouroshtahami elhamkorda hamidnem...
46624     angelaquino perrydizon hazelorencio joeltorre...
46625     erikaeleniak adambaldwin juliedupage jamesrem...
46626     iwanmosschuchin nathalielissenko pavelpavlov ...
46627                                         daisyasquith
Name: metasoup, Length: 46628, dtype: object

In [88]:
df['metasoup'].loc[3]

' whitneyhouston angelabassett lorettadevine lelarochon gregoryhines forestwhitaker'

In [101]:
# Using .split() will tokenize the row
df['metasoup'].loc[3].split()

['whitneyhouston',
 'angelabassett',
 'lorettadevine',
 'lelarochon',
 'gregoryhines',
 'forestwhitaker']

In [94]:
df['metasoup']

0         tomhanks timallen donrickles jimvarney wallac...
1         robinwilliams jonathanhyde kirstendunst bradl...
2         waltermatthau jacklemmon ann margret sophialo...
3         whitneyhouston angelabassett lorettadevine le...
4         stevemartin dianekeaton martinshort kimberlyw...
                               ...                        
46623     leilahatami kouroshtahami elhamkorda hamidnem...
46624     angelaquino perrydizon hazelorencio joeltorre...
46625     erikaeleniak adambaldwin juliedupage jamesrem...
46626     iwanmosschuchin nathalielissenko pavelpavlov ...
46627                                         daisyasquith
Name: metasoup, Length: 46628, dtype: object

In [102]:
# Tokenize each row in df['metasoup']
df['split_metasoup'] = df['metasoup'].apply(lambda x: x.split())

In [162]:
# Convert datatype to set
df['split_metasoup'] = df['metasoup'].apply(lambda x: set(x))

In [177]:
# tokenize dataset
df['split_metasoup'] = df['metasoup'].apply(lambda x: set(nltk.ngrams(nltk.word_tokenize(x), n=1)))

In [189]:
df['split_metasoup'].loc[0]

{('donrickles',),
 ('jimvarney',),
 ('johnlasseter',),
 ('timallen',),
 ('tomhanks',),
 ('wallaceshawn',)}

In [None]:
df['split_metasoup'].loc[3024]

In [179]:
df['metasoup'].loc[0]

' tomhanks timallen donrickles jimvarney wallaceshawn johnlasseter'

In [203]:
# Calculate jaccard distance between Toy Story and Toy Story II
1 - nltk.jaccard_distance(df['split_metasoup'].loc[0], df['split_metasoup'].loc[3024])

0.5

In [210]:
# Create a shorter dataset to test creating df_jaccard below
df_short = df.head(100)
df_short

Unnamed: 0,genres,id,title,cast,director,metasoup,split_metasoup
0,"['animation','comedy','family']",862,Toy Story,"['tomhanks','timallen','donrickles','jimvarney...",johnlasseter,tomhanks timallen donrickles jimvarney wallac...,"{(jimvarney,), (wallaceshawn,), (timallen,), (..."
1,"['adventure','fantasy','family']",8844,Jumanji,"['robinwilliams','jonathanhyde','kirstendunst'...",joejohnston,robinwilliams jonathanhyde kirstendunst bradl...,"{(kirstendunst,), (bonniehunt,), (robinwilliam..."
2,"['romance','comedy']",15602,Grumpier Old Men,"['waltermatthau','jacklemmon','ann-margret','s...",howarddeutch,waltermatthau jacklemmon ann margret sophialo...,"{(jacklemmon,), (ann,), (sophialoren,), (walte..."
3,"['comedy','drama','romance']",31357,Waiting to Exhale,"['whitneyhouston','angelabassett','lorettadevi...",forestwhitaker,whitneyhouston angelabassett lorettadevine le...,"{(gregoryhines,), (forestwhitaker,), (whitneyh..."
4,['comedy'],11862,Father of the Bride Part II,"['stevemartin','dianekeaton','martinshort','ki...",charlesshyer,stevemartin dianekeaton martinshort kimberlyw...,"{(stevemartin,), (martinshort,), (paisley,), (..."
...,...,...,...,...,...,...,...
95,['drama'],406,La Haine,"['vincentcassel','hubertkoundé','saïdtaghmaoui...",mathieukassovitz,vincentcassel hubertkoundé saïdtaghmaoui abde...,"{(abdelahmedghili,), (vincentcassel,), (souley..."
96,"['action','adventure','drama','sciencefiction'...",45549,Shopping,"['sadiefrost','judelaw','seanpertwee','fraserj...",paulw.s.anderson,sadiefrost judelaw seanpertwee fraserjames se...,"{(seanbean,), (sadiefrost,), (seanpertwee,), (..."
97,['documentary'],63076,Heidi Fleiss: Hollywood Madam,"['nickbroomfield','heidifleiss','madamalex','i...",nickbroomfield,nickbroomfield heidifleiss madamalex ivannagy...,"{(heidifleiss,), (corinnebohrer,), (ivannagy,)..."
98,"['drama','thriller']",11062,City Hall,"['alpacino','johncusack','bridgetfonda','danny...",haroldbecker,alpacino johncusack bridgetfonda dannyaiello ...,"{(dannyaiello,), (martinlandau,), (bridgetfond..."


In [214]:
# Write a function to calculte jaccard distance
# Without the try-except loop, an error occurs: 'cannot divide by 0', when creating df_jaccard below
# https://python.gotrained.com/nltk-edit-distance-jaccard-distance/#Jaccard_Distance
def calculate_jaccard_dist(metasoup_A, metasoup_B):
    try:
        jaccard = 1 - nltk.jaccard_distance(metasoup_A, metasoup_B)
        return jaccard
    except:
        return 0

In [215]:
# Create df_jaccard (which will be our jaccard_similarity table)
df_jaccard = pd.DataFrame()

df_jaccard = df.apply(lambda y: 
    df.apply(lambda x: calculate_jaccard_dist(y['split_metasoup'], x['split_metasoup']), axis=1)
                         ,axis=1)

SystemError: <built-in function is_scalar> returned a result with an error set

In [None]:
df_jaccard

In [None]:
jaccard_similarity = df_jaccard.to_numpy()

In [None]:
np.save('cosine_similarity/jaccard_metadata.npy', jaccard_similarity)

In [106]:
df_jaccard = pd.DataFrame(df_jaccard, index=df['split_metasoup'], columns=df['split_metasoup']

SyntaxError: unexpected EOF while parsing (<ipython-input-106-8f28163a4acf>, line 1)

In [None]:
df_jaccard

In [137]:
vectorizer = CountVectorizer(stop_words='english')
doc_word = vectorizer.fit_transform(df['metasoup'])

In [138]:
pd.DataFrame(doc_word)

Unnamed: 0,0
0,"(0, 82715)\t1\n (0, 81986)\t1\n (0, 21226)..."
1,"(0, 71841)\t1\n (0, 41385)\t1\n (0, 46611)..."
2,"(0, 86205)\t1\n (0, 35175)\t1\n (0, 5036)\..."
3,"(0, 86626)\t1\n (0, 4745)\t1\n (0, 50681)\..."
4,"(0, 78898)\t1\n (0, 20297)\t1\n (0, 55225)..."
...,...
46623,"(0, 48924)\t1\n (0, 46987)\t1\n (0, 22880)..."
46624,"(0, 39903)\t1\n (0, 4792)\t1\n (0, 48485)\..."
46625,"(0, 36169)\t1\n (0, 357)\t1\n (0, 54618)\t..."
46626,"(0, 34902)\t1\n (0, 87585)\t1\n (0, 61484)..."


In [139]:
# Compute the cosine similarity matrix from doc_word
cosine_sim = cosine_similarity(doc_word, doc_word)

In [151]:
pd.DataFrame(cosine_sim)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,46618,46619,46620,46621,46622,46623,46624,46625,46626,46627
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46623,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
46624,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
46625,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
46626,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [147]:
pairwise = doc_word * doc_word.T

In [150]:
pd.DataFrame(pairwise.toarray())

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,46618,46619,46620,46621,46622,46623,46624,46625,46626,46627
0,6,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,6,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,7,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,6,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,7,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46623,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,4,0,0,0,0
46624,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,6,0,0,0
46625,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,7,0,0
46626,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,6,0


In [140]:
pd.DataFrame(cosine_sim)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,46618,46619,46620,46621,46622,46623,46624,46625,46626,46627
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46623,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
46624,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
46625,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
46626,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [59]:
# # Save cosine_sim array to use in hybrid recommendation system
# np.save('cosine_similarity/cos_metadata.npy', cosine_sim)

In [141]:
# Reset index of our dataframe and construct reverse mapping as before
indices = pd.Series(df.index, index=df['title'])

In [152]:
# Function that takes in movie title as input and outputs most similar movies
def get_recommendations(title, cosine_sim=cosine_sim):
    # Get the index of the movie that matches the title
    idx = indices[title]

    # Get the pairwsie similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:11]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar movies
    return df['title'].iloc[movie_indices]

In [153]:
get_recommendations('The Dark Knight Rises')

10210                                        Batman Begins
12589                                      The Dark Knight
11463                                         The Prestige
23076                                         Interstellar
21393    Batman Unmasked: The Psychology of the Dark Kn...
29128                                             Child 44
15651                                            Inception
18210                                            TV Junkie
26110                                            Doodlebug
26111                                            Doodlebug
Name: title, dtype: object

In [154]:
get_recommendations('Toy Story')

3024                    Toy Story 2
22126          Toy Story of Terror!
26001    Toy Story That Time Forgot
25999               Partysaurus Rex
10754                      Luxo Jr.
19301                       Tin Toy
19355                   Red's Dream
19405                   Knick Knack
15519                   Toy Story 3
24723             Hawaiian Vacation
Name: title, dtype: object