<h1>Plot Description Based Recommender</h1>

In [95]:
import pandas as pd
import numpy as  np
from sklearn.feature_extraction.text import TfidfVectorizer

In [96]:
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

In [97]:
pd.set_option('display.max_columns', 23)
movies.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800


In [98]:
credits.head(1)

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [99]:
movies = movies.merge(credits, on='title')

In [100]:
#Define a TF-IDF Vectorizer Object. Remove all english stopwords
tfidf = TfidfVectorizer(stop_words='english')

#Replace NaN with an empty string
movies['overview'] = movies['overview'].fillna('')

#Construct the required TF-IDF matrix by applying the fit_transform method on the overview feature
tfidf_matrix = tfidf.fit_transform(movies['overview'])

#Output the shape of tfidf_matrix
tfidf_matrix.shape

(4809, 20978)

In [101]:
# Import linear_kernel to compute the dot product
from sklearn.metrics.pairwise import linear_kernel

In [102]:
# Compute the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [103]:
#Construct a reverse mapping of indices and movie titles, and drop duplicate titles, if any
indices = pd.Series(movies.index, index=movies['title']).drop_duplicates()

In [104]:
# Function that takes in movie title as input and gives recommendations 
def content_recommender(title, cosine_sim=cosine_sim, df=movies, indices=indices):
    # Obtain the index of the movie that matches the title
    idx = indices[title]

    # Get the pairwsie similarity scores of all movies with that movie
    # And convert it into a list of tuples as described above
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on the cosine similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies. Ignore the first movie.
    sim_scores = sim_scores[1:11]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar movies
    return df['title'].iloc[movie_indices]

In [105]:
content_recommender('The Lion King')

78                               The Jungle Book
3993                            The Wizard of Oz
4325                                  Waiting...
4363                                October Baby
1249                           Quest for Camelot
1555                           The Addams Family
1237                              The Art of War
3314                         Rumble in the Bronx
2523                           The King's Speech
15      The Chronicles of Narnia: Prince Caspian
Name: title, dtype: object

<h1>Metadata Based Recommender</h1>

In [1]:
import pandas as pd
import numpy as  np
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

In [3]:
pd.set_option('display.max_columns', 23)
movies.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800


In [4]:
movies['genres'][3]

'[{"id": 28, "name": "Action"}, {"id": 80, "name": "Crime"}, {"id": 18, "name": "Drama"}, {"id": 53, "name": "Thriller"}]'

In [5]:
movies.rename(columns={'id': 'movie_id'}, inplace=True)

In [6]:
credits.head(1)

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [7]:
# Function to convert all non-integer IDs to NaN
def clean_ids(x):
    try:
        return int(x)
    except:
        return np.nan

In [8]:
#Clean the ids of df
movies['movie_id'] = movies['movie_id'].apply(clean_ids)

#Filter all rows that have a null ID
movies = movies[movies['movie_id'].notnull()]

In [9]:
# Convert IDs into integer
movies['movie_id'] = movies['movie_id'].astype('int')

# Merge keywords and credits into your main metadata dataframe
movies = movies.merge(credits, on='title')

#Display the head of df
movies.head(1)

Unnamed: 0,budget,genres,homepage,movie_id_x,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,movie_id_y,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,19995,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [10]:
movies['production_companies'][1]

'[{"name": "Walt Disney Pictures", "id": 2}, {"name": "Jerry Bruckheimer Films", "id": 130}, {"name": "Second Mate Productions", "id": 19936}]'

In [11]:
movies.drop('movie_id_y', axis=1, inplace=True)
movies.rename(columns={'movie_id_x': 'movie_id'}, inplace=True)

In [12]:
# Convert the stringifield objects into the native python objects
from ast import literal_eval

features = ['cast', 'crew', 'keywords', 'original_language']
for feature in features:
    movies[feature] = movies[feature].apply(literal_eval)

ValueError: malformed node or string on line 1: <ast.Name object at 0x000001DFBDB39F00>

In [13]:
#Convert all NaN into stringified empty lists
movies['genres'] = movies['genres'].fillna('[]')

#Apply literal_eval to convert stringified empty lists to the list object
movies['genres'] = movies['genres'].apply(literal_eval)

#Convert list of dictionaries to a list of strings
movies['genres'] = movies['genres'].apply(lambda x: [i['name'].lower() for i in x] if isinstance(x, list) else [])

In [14]:
#Convert all NaN into stringified empty lists
movies['production_companies'] = movies['production_companies'].fillna('[]')

#Apply literal_eval to convert stringified empty lists to the list object
movies['production_companies'] = movies['production_companies'].apply(literal_eval)

#Convert list of dictionaries to a list of strings
movies['production_companies'] = movies['production_companies'].apply(lambda x: [i['name'].lower() for i in x] if isinstance(x, list) else [])

In [15]:
#Convert all NaN into stringified empty lists
movies['production_countries'] = movies['production_countries'].fillna('[]')

#Apply literal_eval to convert stringified empty lists to the list object
movies['production_countries'] = movies['production_countries'].apply(literal_eval)

#Convert list of dictionaries to a list of strings
movies['production_countries'] = movies['production_countries'].apply(lambda x: [i['name'].lower() for i in x] if isinstance(x, list) else [])

In [16]:
#Print the first cast member of the first movie in df
movies.iloc[0]['crew'][0]

{'credit_id': '52fe48009251416c750aca23',
 'department': 'Editing',
 'gender': 0,
 'id': 1721,
 'job': 'Editor',
 'name': 'Stephen E. Rivkin'}

In [17]:
movies['crew'][0]

[{'credit_id': '52fe48009251416c750aca23',
  'department': 'Editing',
  'gender': 0,
  'id': 1721,
  'job': 'Editor',
  'name': 'Stephen E. Rivkin'},
 {'credit_id': '539c47ecc3a36810e3001f87',
  'department': 'Art',
  'gender': 2,
  'id': 496,
  'job': 'Production Design',
  'name': 'Rick Carter'},
 {'credit_id': '54491c89c3a3680fb4001cf7',
  'department': 'Sound',
  'gender': 0,
  'id': 900,
  'job': 'Sound Designer',
  'name': 'Christopher Boyes'},
 {'credit_id': '54491cb70e0a267480001bd0',
  'department': 'Sound',
  'gender': 0,
  'id': 900,
  'job': 'Supervising Sound Editor',
  'name': 'Christopher Boyes'},
 {'credit_id': '539c4a4cc3a36810c9002101',
  'department': 'Production',
  'gender': 1,
  'id': 1262,
  'job': 'Casting',
  'name': 'Mali Finn'},
 {'credit_id': '5544ee3b925141499f0008fc',
  'department': 'Sound',
  'gender': 2,
  'id': 1729,
  'job': 'Original Music Composer',
  'name': 'James Horner'},
 {'credit_id': '52fe48009251416c750ac9c3',
  'department': 'Directing',
  

In [18]:
movies['cast'][0]

[{'cast_id': 242,
  'character': 'Jake Sully',
  'credit_id': '5602a8a7c3a3685532001c9a',
  'gender': 2,
  'id': 65731,
  'name': 'Sam Worthington',
  'order': 0},
 {'cast_id': 3,
  'character': 'Neytiri',
  'credit_id': '52fe48009251416c750ac9cb',
  'gender': 1,
  'id': 8691,
  'name': 'Zoe Saldana',
  'order': 1},
 {'cast_id': 25,
  'character': 'Dr. Grace Augustine',
  'credit_id': '52fe48009251416c750aca39',
  'gender': 1,
  'id': 10205,
  'name': 'Sigourney Weaver',
  'order': 2},
 {'cast_id': 4,
  'character': 'Col. Quaritch',
  'credit_id': '52fe48009251416c750ac9cf',
  'gender': 2,
  'id': 32747,
  'name': 'Stephen Lang',
  'order': 3},
 {'cast_id': 5,
  'character': 'Trudy Chacon',
  'credit_id': '52fe48009251416c750ac9d3',
  'gender': 1,
  'id': 17647,
  'name': 'Michelle Rodriguez',
  'order': 4},
 {'cast_id': 8,
  'character': 'Selfridge',
  'credit_id': '52fe48009251416c750ac9e1',
  'gender': 2,
  'id': 1771,
  'name': 'Giovanni Ribisi',
  'order': 5},
 {'cast_id': 7,
  'c

In [19]:
# Extract the director's name. If director is not listed, return NaN
def get_director(x):
    for crew_member in x:
        if crew_member['job'] == 'Director':
            return crew_member['name']
    return np.nan

In [20]:
# Extract the director's name. If director is not listed, return NaN
def get_producer(x):
    for crew_member in x:
        if crew_member['job'] == 'Producer':
            return crew_member['name']
    return np.nan

In [21]:
# Extract the director's name. If director is not listed, return NaN
def get_screenwriter(x):
    for crew_member in x:
        if crew_member['job'] == 'Writer':
            return crew_member['name']
    return np.nan

In [22]:
# Extract the director's name. If director is not listed, return NaN
def get_editor(x):
    for crew_member in x:
        if crew_member['job'] == 'Editor':
            return crew_member['name']
    return np.nan

In [23]:
# Extract the director's name. If director is not listed, return NaN
def get_screenplay(x):
    for crew_member in x:
        if crew_member['job'] == 'Screenplay':
            return crew_member['name']
    return np.nan

In [24]:
#Define the new director feature
movies['director'] = movies['crew'].apply(get_director)

#Print the directors of the first five movies
movies['director'].head()

0        James Cameron
1       Gore Verbinski
2           Sam Mendes
3    Christopher Nolan
4       Andrew Stanton
Name: director, dtype: object

In [25]:
# Returns the list top 3 elements or entire list; whichever is more.
def generate_list(x):
    if isinstance(x, list):
        names = [i['name'] for i in x]
        #Check if more than 3 elements exist. If yes, return only first three. If no, return entire list.
        if len(names) > 3:
            names = names[:10]
        return names

    #Return empty list in case of missing/malformed data
    return []

In [26]:
movies['keywords'][0]

[{'id': 1463, 'name': 'culture clash'},
 {'id': 2964, 'name': 'future'},
 {'id': 3386, 'name': 'space war'},
 {'id': 3388, 'name': 'space colony'},
 {'id': 3679, 'name': 'society'},
 {'id': 3801, 'name': 'space travel'},
 {'id': 9685, 'name': 'futuristic'},
 {'id': 9840, 'name': 'romance'},
 {'id': 9882, 'name': 'space'},
 {'id': 9951, 'name': 'alien'},
 {'id': 10148, 'name': 'tribe'},
 {'id': 10158, 'name': 'alien planet'},
 {'id': 10987, 'name': 'cgi'},
 {'id': 11399, 'name': 'marine'},
 {'id': 13065, 'name': 'soldier'},
 {'id': 14643, 'name': 'battle'},
 {'id': 14720, 'name': 'love affair'},
 {'id': 165431, 'name': 'anti war'},
 {'id': 193554, 'name': 'power relations'},
 {'id': 206690, 'name': 'mind and soul'},
 {'id': 209714, 'name': '3d'}]

In [27]:
#Apply the generate_list function to cast and keywords
movies['cast'] = movies['cast'].apply(generate_list)
movies['keywords'] = movies['keywords'].apply(generate_list)

In [28]:
#Only consider a maximum of 3 genres
movies['genres'] = movies['genres'].apply(lambda x: x[:])

In [29]:
#Only consider a maximum of 3 genres
movies['production_companies'] = movies['production_companies'].apply(lambda x: x[:])

In [30]:
# Print the new features of the first 5 movies along with title
movies[['title', 'cast', 'director', 'keywords', 'genres', 'production_companies']].head()

Unnamed: 0,title,cast,director,keywords,genres,production_companies
0,Avatar,"[Sam Worthington, Zoe Saldana, Sigourney Weave...",James Cameron,"[culture clash, future, space war, space colon...","[action, adventure, fantasy, science fiction]","[ingenious film partners, twentieth century fo..."
1,Pirates of the Caribbean: At World's End,"[Johnny Depp, Orlando Bloom, Keira Knightley, ...",Gore Verbinski,"[ocean, drug abuse, exotic island, east india ...","[adventure, fantasy, action]","[walt disney pictures, jerry bruckheimer films..."
2,Spectre,"[Daniel Craig, Christoph Waltz, Léa Seydoux, R...",Sam Mendes,"[spy, based on novel, secret agent, sequel, mi...","[action, adventure, crime]","[columbia pictures, danjaq, b24]"
3,The Dark Knight Rises,"[Christian Bale, Michael Caine, Gary Oldman, A...",Christopher Nolan,"[dc comics, crime fighter, terrorist, secret i...","[action, crime, drama, thriller]","[legendary pictures, warner bros., dc entertai..."
4,John Carter,"[Taylor Kitsch, Lynn Collins, Samantha Morton,...",Andrew Stanton,"[based on novel, mars, medallion, space travel...","[action, adventure, science fiction]",[walt disney pictures]


In [134]:
# Function to sanitize data to prevent ambiguity. It removes spaces and converts to lowercase
def sanitize(x):
    if isinstance(x, list):
        #Strip spaces and convert to lowercase
        return [str.lower(i.replace(" ", "")) for i in x]
    else:
        #Check if director exists. If not, return empty string
        if isinstance(x, str):
            return str.lower(x.replace(" ", ""))
        else:
            return ''

In [143]:
#Apply the generate_list function to cast, keywords, director and genres
for feature in ['cast', 'director', 'genres', 'keywords', 'production_companies']:
    movies[feature] = movies[feature].apply(sanitize)

In [148]:
#Function that creates a soup out of the desired metadata
def create_soup(x):
    return ' '.join(x['keywords']) + ' ' + ' '.join(x['cast']) + ' ' + x['director'] + ' ' + x['director'] + ' '.join(x['genres']) + ' ' + x['director'] + ' '.join(x['production_companies'])

In [149]:
# Create the new soup feature
movies['soup'] = movies.apply(create_soup, axis=1)

In [150]:
#Display the soup of the first movie
movies.iloc[0]['soup']

'cultureclash future spacewar spacecolony society spacetravel futuristic romance space alien samworthington zoesaldana sigourneyweaver stephenlang michellerodriguez giovanniribisi joeldavidmoore cchpounder wesstudi lazalonso jamescameron action adventure fantasy sciencefiction ingeniousfilmpartners twentiethcenturyfoxfilmcorporation duneentertainment lightstormentertainment'

In [151]:
# Import CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

#Define a new CountVectorizer object and create vectors for the soup
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(movies['soup'])

In [152]:
#Import cosine_similarity function
from sklearn.metrics.pairwise import cosine_similarity

#Compute the cosine similarity score (equivalent to dot product for tf-idf vectors)
cosine_sim2 = cosine_similarity(count_matrix, count_matrix)

In [153]:
# Reset index of your df and construct reverse mapping again
movies = movies.reset_index()
indices2 = pd.Series(movies.index, index=movies['title'])

In [154]:
content_recommender('The Lion King', cosine_sim2, movies, indices2)

3064                    Teacher's Pet
812                        Pocahontas
77                         Inside Out
358         Atlantis: The Lost Empire
465                     Fantasia 2000
194                          Dinosaur
430                     Lilo & Stitch
1907    The Greatest Game Ever Played
2298    Thomas and the Magic Railroad
2188           Pooh's Heffalump Movie
Name: title, dtype: object