## Standard Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

Ignoring harmless warnings

In [2]:
import warnings
# The 'ignore' action is installed with no category or module restriction, so it
# silences every warning raised later in the notebook, not just known-harmless ones.
warnings.filterwarnings('ignore')

## Loading and Cleaning Data

In [3]:
# Read the three CSV files that sit next to the notebook, in the same order as
# before: movie metadata, credits, keywords.
mov_df, cred_df, key_df = (
    pd.read_csv(csv_path)
    for csv_path in ("./movies_metadata.csv", "./credits.csv", "./keywords.csv")
)

In [5]:
def clean_id(x):
    """Convert a raw ``id`` value to ``int``, or return ``np.nan`` if it cannot be converted.

    Args:
        x: The raw value; anything ``int()`` accepts is converted.

    Returns:
        ``int(x)`` on success, otherwise ``np.nan``.
    """
    try:
        return int(x)
    # Catch only the errors int() raises for unconvertible values (bad strings,
    # None, NaN, infinity). A bare ``except`` would also swallow
    # KeyboardInterrupt / SystemExit and make the cell impossible to interrupt.
    except (TypeError, ValueError, OverflowError):
        return np.nan
# Run clean_id over every id; values it cannot convert come back as NaN.
mov_df['id'] = mov_df['id'].map(clean_id)

In [11]:
# Drop the rows whose id could not be parsed (clean_id left NaN there), then cast
# the column back to integers: the NaN placeholders make pandas store the ids as
# floats, and the column is used as the merge key in the following cells.
mov_df = mov_df[mov_df['id'].notnull()].astype({'id': 'int64'})


In [14]:
# Inner-join the credits frame onto the movies by the shared 'id' column.
mov_df = pd.merge(mov_df, cred_df, on='id')

In [16]:
# Inner-join the keywords frame onto the movies by the shared 'id' column.
mov_df = pd.merge(mov_df, key_df, on='id')

In [17]:
# Preview the first two rows of the merged frame.
mov_df.head(2)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,spoken_languages,status,tagline,title,video,vote_average,vote_count,cast,crew,keywords
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...","[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...","[{'id': 10090, 'name': 'board game'}, {'id': 1..."


In [19]:
# Keep only the columns the recommender uses. The explicit .copy() makes df an
# independent frame, so the column assignments in later cells are not chained
# assignments on a selection of mov_df.
df = mov_df['id title vote_average vote_count genres keywords cast crew release_date'.split()].copy()

In [32]:
from ast import literal_eval

# Parse every cell of the nested columns with literal_eval, turning the text
# read from the CSV files into Python objects.
features = ['genres', 'keywords', 'cast', 'crew']
for column in features:
    df[column] = df[column].apply(literal_eval)

In [33]:
# Preview the first two rows after the literal_eval pass.
df.head(2)

Unnamed: 0,id,title,vote_average,vote_count,genres,keywords,cast,crew,release_date
0,862,Toy Story,7.7,5415.0,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...","[{'id': 931, 'name': 'jealousy'}, {'id': 4290,...","[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",1995-10-30
1,8844,Jumanji,6.9,2413.0,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...","[{'id': 10090, 'name': 'board game'}, {'id': 1...","[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",1995-12-15


In [34]:
def get_director(x):
    """Return the name of the first crew entry whose job is 'Director'.

    Args:
        x: An iterable of crew dicts, each with 'job' and 'name' keys.

    Returns:
        The 'name' of the first matching entry, or ``np.nan`` when no entry
        has the job 'Director'.
    """
    director_names = (
        member['name'] for member in x if member['job'] == 'Director'
    )
    # next() stops at the first match and falls back to NaN if there is none.
    return next(director_names, np.nan)

In [35]:
# Derive the director of each movie from its crew list.
df['director'] = df['crew'].map(get_director)

In [36]:
# Preview the first five rows, now including the 'director' column.
df.head(5)

Unnamed: 0,id,title,vote_average,vote_count,genres,keywords,cast,crew,release_date,director
0,862,Toy Story,7.7,5415.0,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...","[{'id': 931, 'name': 'jealousy'}, {'id': 4290,...","[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",1995-10-30,John Lasseter
1,8844,Jumanji,6.9,2413.0,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...","[{'id': 10090, 'name': 'board game'}, {'id': 1...","[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",1995-12-15,Joe Johnston
2,15602,Grumpier Old Men,6.5,92.0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...","[{'id': 1495, 'name': 'fishing'}, {'id': 12392...","[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",1995-12-22,Howard Deutch
3,31357,Waiting to Exhale,6.1,34.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...","[{'id': 818, 'name': 'based on novel'}, {'id':...","[{'cast_id': 1, 'character': 'Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",1995-12-22,Forest Whitaker
4,11862,Father of the Bride Part II,5.7,173.0,"[{'id': 35, 'name': 'Comedy'}]","[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n...","[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",1995-02-10,Charles Shyer


In [37]:
def generate_list(x, limit=3):
    """Return the 'name' values of the first ``limit`` entries of ``x``.

    Args:
        x: A list of dicts, each with a 'name' key. Any non-list value is
            treated as having no entries.
        limit: Maximum number of names to keep (non-negative). Defaults to 3,
            the value that was previously hard-coded; ``None`` keeps every name.

    Returns:
        A list of at most ``limit`` names in their original order, or ``[]``
        when ``x`` is not a list.
    """
    if not isinstance(x, list):
        return []
    # Slicing already copes with lists shorter than ``limit``, so no explicit
    # length check is needed.
    return [entry['name'] for entry in x][:limit]

In [38]:
# Reduce each nested column to a short list of names via generate_list.
for column in ('genres', 'cast', 'keywords'):
    df[column] = df[column].apply(generate_list)

In [41]:
# The crew lists are no longer needed once the director has been extracted.
df = df.drop(columns=['crew'])

In [42]:
# Preview the frame after the nested columns were reduced to name lists.
df.head()

Unnamed: 0,id,title,vote_average,vote_count,genres,keywords,cast,release_date,director
0,862,Toy Story,7.7,5415.0,"[Animation, Comedy, Family]","[jealousy, toy, boy]","[Tom Hanks, Tim Allen, Don Rickles]",1995-10-30,John Lasseter
1,8844,Jumanji,6.9,2413.0,"[Adventure, Fantasy, Family]","[board game, disappearance, based on children'...","[Robin Williams, Jonathan Hyde, Kirsten Dunst]",1995-12-15,Joe Johnston
2,15602,Grumpier Old Men,6.5,92.0,"[Romance, Comedy]","[fishing, best friend, duringcreditsstinger]","[Walter Matthau, Jack Lemmon, Ann-Margret]",1995-12-22,Howard Deutch
3,31357,Waiting to Exhale,6.1,34.0,"[Comedy, Drama, Romance]","[based on novel, interracial relationship, sin...","[Whitney Houston, Angela Bassett, Loretta Devine]",1995-12-22,Forest Whitaker
4,11862,Father of the Bride Part II,5.7,173.0,[Comedy],"[baby, midlife crisis, confidence]","[Steve Martin, Diane Keaton, Martin Short]",1995-02-10,Charles Shyer


In [40]:
def sanitize(x):
    """Lower-case ``x`` and strip the spaces out of it.

    Args:
        x: A list of strings, a single string, or any other value.

    Returns:
        For a list, a new list with every item lower-cased and its spaces
        removed; for a string, the same transformation applied to it; for
        anything else, the empty string.
    """
    if isinstance(x, list):
        return [item.replace(" ", "").lower() for item in x]
    if isinstance(x, str):
        return x.replace(" ", "").lower()
    # Neither a list nor a string (e.g. a missing value).
    return ''

In [43]:
# Normalise every text feature so multi-word names become single tokens.
features = ['genres', 'keywords', 'cast', 'director']
for column in features:
    df[column] = df[column].apply(sanitize)

In [44]:
# Preview the frame after sanitising the text features.
df.head()

Unnamed: 0,id,title,vote_average,vote_count,genres,keywords,cast,release_date,director
0,862,Toy Story,7.7,5415.0,"[animation, comedy, family]","[jealousy, toy, boy]","[tomhanks, timallen, donrickles]",1995-10-30,johnlasseter
1,8844,Jumanji,6.9,2413.0,"[adventure, fantasy, family]","[boardgame, disappearance, basedonchildren'sbook]","[robinwilliams, jonathanhyde, kirstendunst]",1995-12-15,joejohnston
2,15602,Grumpier Old Men,6.5,92.0,"[romance, comedy]","[fishing, bestfriend, duringcreditsstinger]","[waltermatthau, jacklemmon, ann-margret]",1995-12-22,howarddeutch
3,31357,Waiting to Exhale,6.1,34.0,"[comedy, drama, romance]","[basedonnovel, interracialrelationship, single...","[whitneyhouston, angelabassett, lorettadevine]",1995-12-22,forestwhitaker
4,11862,Father of the Bride Part II,5.7,173.0,[comedy],"[baby, midlifecrisis, confidence]","[stevemartin, dianekeaton, martinshort]",1995-02-10,charlesshyer


In [45]:
def create_metadata(x):
    """Build one space-separated string from a row's keywords, cast, director and genres.

    Args:
        x: A mapping (e.g. a DataFrame row) with list-of-string entries under
            'keywords', 'cast' and 'genres', and a string under 'director'.

    Returns:
        The keywords, cast, director and genres, in that order, joined by
        single spaces.
    """
    parts = [
        ' '.join(x['keywords']),
        ' '.join(x['cast']),
        x['director'],
        ' '.join(x['genres']),
    ]
    return ' '.join(parts)

In [46]:
# Build the metadata string row by row.
df['metadata'] = df.apply(create_metadata, axis='columns')

In [53]:
# Keep only the columns needed from here on. .copy() gives an independent frame,
# so adding release_year below is not a chained assignment on a selection.
df = df['id title metadata vote_average vote_count release_date'.split()].copy()
# Parse the whole column in one vectorised call instead of calling
# pd.to_datetime once per row; rows without a date yield NaN for the year.
df['release_year'] = pd.to_datetime(df['release_date']).dt.year
df = df.drop('release_date',axis = 1)
df.head()

Unnamed: 0,id,title,metadata,vote_average,vote_count,release_year
0,862,Toy Story,jealousy toy boy tomhanks timallen donrickles ...,7.7,5415.0,1995.0
1,8844,Jumanji,boardgame disappearance basedonchildren'sbook ...,6.9,2413.0,1995.0
2,15602,Grumpier Old Men,fishing bestfriend duringcreditsstinger walter...,6.5,92.0,1995.0
3,31357,Waiting to Exhale,basedonnovel interracialrelationship singlemot...,6.1,34.0,1995.0
4,11862,Father of the Bride Part II,baby midlifecrisis confidence stevemartin dian...,5.7,173.0,1995.0


## Vectorization of Metadata

In [48]:
from sklearn.feature_extraction.text import CountVectorizer

Unlike the plot-based Recommender System, we are going to use CountVectorizer instead of TfidfVectorizer. This is because the metadata also contains the cast: suppose an actor has worked in a large share of the films. In that case, TfidfVectorizer will assign a very small weight to that actor.<br>To avoid this, we'll use CountVectorizer, which gives equal weight to all the words.

In [49]:
# Turn each movie's metadata string into a vector of token counts; the
# vectorizer is configured with stop_words='english'.
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(df['metadata'])

One disadvantage of using CountVectorizer is that we have to use a manual and computationally heavy method of calculating the cosine similarity.<br>
Because of this, the code below may throw a **MemoryError**.<br>If it does, a practical solution is to run the notebook on a machine with more memory, for example on a cloud-based service like *GCP* or *AWS*.

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
# Similarity of every row of count_matrix against every other row. This is the
# step the surrounding text warns may raise MemoryError.
cosine_sim = cosine_similarity(count_matrix, count_matrix)

## Defining the Recommendation Function

In [51]:
# Renumber the rows 0..n-1 so the index labels are row positions. drop=True
# discards the old index instead of inserting it as an extra 'index' column,
# which also makes this cell safe to re-run.
df = df.reset_index(drop=True)
# Reverse lookup: movie title -> row position in df.
indices = pd.Series(df.index, index=df['title'])

In [None]:
def get_recommendations(title,cosine_sim = cosine_sim,data = df,indices = indices,top_n = 10):
    """Return the titles of the movies most similar to ``title``.

    Args:
        title: Title to look up in ``indices``.
        cosine_sim: Square similarity matrix; row ``i`` holds the similarity of
            movie ``i`` to every movie.
        data: DataFrame with a 'title' column, in the same row order as
            ``cosine_sim``.
        indices: Series mapping a title to its row position.
        top_n: Number of recommendations to return (default 10).

    Returns:
        A Series of up to ``top_n`` titles, most similar first, never
        including the queried movie itself.

    Raises:
        KeyError: If ``title`` is not present in ``indices``.
    """
    idx = indices[title]
    # If the title index holds the same title more than once, the lookup
    # returns a Series of positions; use the first match.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)
    # Exclude the queried movie by position rather than assuming it sorts
    # first: another movie with equal similarity can tie with it, and the
    # stable sort would then place that movie ahead of the query.
    sim_scores = [
        (position, score)
        for position, score in enumerate(cosine_sim[idx])
        if position != idx
    ]
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)
    movies_indices = [position for position, _ in sim_scores[:top_n]]
    return data['title'].iloc[movies_indices]

In [37]:
# Call the recommender defined in this notebook; `content_recommender` is not
# defined anywhere in it.
get_recommendations('The Lion King', cosine_sim, df, indices)

29607                                          Cheburashka
40904                   VeggieTales: Josh and the Big Wall
40913    VeggieTales: Minnesota Cuke and the Search for...
27768                                 The Little Matchgirl
15209             Spiderman: The Ultimate Villain Showdown
16613                            Cirque du Soleil: Varekai
24654                                  The Seventh Brother
29198                                      Superstar Goofy
30244                                              My Love
31179                Pokémon: Arceus and the Jewel of Life
Name: title, dtype: object

We can see that our results vary greatly from the previous results based on Plot<br>
Here we can see that most of our recommended movies are animation based<hr>
***An interesting observation is that both Pokémon: Arceus and the Jewel of Life and The Lion King feature cartoon anthropomorphic characters who return after a few years to exact revenge on those who had wronged them!***