In [1]:
#import library
import pandas as pd

# Read the movies metadata CSV into a DataFrame
metadata = pd.read_csv("movies_metadata.csv", low_memory=False)

# Preview the first 100 rows
metadata.head(n=100)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,False,,3000000,"[{'id': 18, 'name': 'Drama'}]",,406,tt0113247,fr,La Haine,Aimlessly whiling away their days in the concr...,...,1995-05-31,0.0,98.0,"[{'iso_639_1': 'fr', 'name': 'Français'}]",Released,Three Young Friends... One Last Chance.,La Haine,False,7.9,695.0
96,False,,0,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",,45549,tt0111173,en,Shopping,"A dark, hip, urban story of a barren and anony...",...,1994-12-06,0.0,105.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,No one leaves without paying...,Shopping,False,5.6,13.0
97,False,,0,"[{'id': 99, 'name': 'Documentary'}]",http://www.nickbroomfield.com/heidifleiss.html,63076,tt0113283,en,Heidi Fleiss: Hollywood Madam,A documentary crew from the BBC arrives in L.A...,...,1995-12-27,0.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Heidi Fleiss: Hollywood Madam,False,6.8,4.0
98,False,,0,"[{'id': 18, 'name': 'Drama'}, {'id': 53, 'name...",,11062,tt0115907,en,City Hall,The accidental shooting of a boy in New York l...,...,1996-02-16,0.0,111.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,It started with a shootout on a rainswept stre...,City Hall,False,6.0,67.0


In [2]:
# C: mean of the 'vote_average' column over all movies
C = metadata.loc[:, 'vote_average'].mean()
C

5.618207215133889

In [3]:
# m: minimum vote count needed to enter the chart,
# taken as the 0.90 quantile of the 'vote_count' column
m = metadata.loc[:, 'vote_count'].quantile(q=0.90)
m

160.0

In [4]:
# Keep only the movies whose vote count reaches the threshold m.
# Filter first and copy afterwards: this avoids duplicating the whole of
# `metadata` just to discard most of it, and leaves q_movies as an independent
# DataFrame that later cells can add columns to.
q_movies = metadata.loc[metadata['vote_count'] >= m].copy()
q_movies.shape

(4555, 24)

In [5]:
#Weighted rating of a movie, following the IMDB formula
def weighted_rating(x, m=m, C=C):
    """Blend a movie's own rating with the overall mean rating.

    Parameters:
        x: object supporting x['vote_count'] and x['vote_average']
           (for example one row of the movies DataFrame).
        m: vote-count threshold used as the weight of the mean rating;
           defaults to the notebook-level ``m``.
        C: mean rating; defaults to the notebook-level ``C``.

    Returns:
        v/(v+m) * R + m/(v+m) * C, where v is the vote count and R the
        vote average read from ``x``.
    """
    votes = x['vote_count']
    rating = x['vote_average']
    # Shared denominator of both weights
    total = votes + m
    return (votes/total * rating) + (m/total * C)

In [6]:
# Add a 'score' column holding the weighted rating of every qualified movie.
# weighted_rating() only performs column lookups and arithmetic on its argument,
# so it is called once on the whole DataFrame (vectorised) instead of once per
# row through apply(axis=1); the resulting values are the same.
q_movies['score'] = weighted_rating(q_movies)

In [7]:
# Order the qualified movies from the highest score to the lowest
q_movies = q_movies.sort_values(by='score', ascending=False)

# Show the 20 best-scoring movies
q_movies.loc[:, ['title', 'vote_count', 'vote_average', 'score']].head(20)

Unnamed: 0,title,vote_count,vote_average,score
314,The Shawshank Redemption,8358.0,8.5,8.445869
834,The Godfather,6024.0,8.5,8.425439
10309,Dilwale Dulhania Le Jayenge,661.0,9.1,8.421453
12481,The Dark Knight,12269.0,8.3,8.265477
2843,Fight Club,9678.0,8.3,8.256385
292,Pulp Fiction,8670.0,8.3,8.251406
522,Schindler's List,4436.0,8.3,8.206639
23673,Whiplash,4376.0,8.3,8.205404
5481,Spirited Away,3968.0,8.3,8.196055
2211,Life Is Beautiful,3643.0,8.3,8.187171


# Content-based filtering model

In [8]:
# Print the plot overviews of the first 5 movies.
# An explicit print() is required: a notebook cell only displays its last
# expression, which here is the shape, so a bare .head() would be discarded.
print(metadata['overview'].head())
metadata['overview'].shape

(45466,)

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer configured with the built-in English stop-word list
tfidf = TfidfVectorizer(stop_words="english")

In [10]:
# Missing overviews become empty strings so the column contains only text
metadata['overview'] = metadata['overview'].fillna(value='')
metadata['overview'].shape

(45466,)

In [11]:
# Fit the vectorizer and build the TF-IDF matrix in one step.
# Only the first 31000 overviews are used, so rows of `metadata` beyond that
# position have no TF-IDF vector.
tfidf_matrix = tfidf.fit_transform(metadata['overview'].iloc[:31000])

# Show the dimensions of tfidf_matrix
tfidf_matrix.shape

(31000, 59830)

In [12]:
# Mapping from feature integer indices to feature names (ten sample entries).
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
_get_names = getattr(tfidf, "get_feature_names_out", None) or tfidf.get_feature_names
# list() keeps the displayed result a plain list on both APIs
list(_get_names()[5000:5010])

['batty',
 'batwing',
 'batwoman',
 'batya',
 'batyi',
 'baubles',
 'bauby',
 'bauchard',
 'bauder',
 'bauer']

In [13]:
# Import linear_kernel
from sklearn.metrics.pairwise import linear_kernel

In [14]:
# Pairwise similarity of every vectorised overview with every other one.
# With the second argument omitted, linear_kernel compares the matrix with itself.
# NOTE(review): the dot product equals cosine similarity only if the TF-IDF rows
# are unit-normalised (TfidfVectorizer's default norm='l2') - confirm.
cosine_sim = linear_kernel(tfidf_matrix)

In [15]:
# Reverse map from movie title to row index.
# Series.drop_duplicates() compares the *values* (the row indices, which are
# unique for a freshly read CSV), so it never removed repeated titles and
# indices[title] could return several rows. De-duplicate on the title index
# instead, keeping the first movie that carries each title.
indices = pd.Series(metadata.index, index=metadata['title'])
indices = indices[~indices.index.duplicated(keep='first')]

In [16]:
# First ten title -> row index pairs
indices.head(10)

title
Toy Story                      0
Jumanji                        1
Grumpier Old Men               2
Waiting to Exhale              3
Father of the Bride Part II    4
Heat                           5
Sabrina                        6
Tom and Huck                   7
Sudden Death                   8
GoldenEye                      9
dtype: int64

In [17]:
# Function that takes in movie title as input and outputs most similar movies
def get_recommendations(title, cosine_sim=cosine_sim, top_n=10):
    """Return the titles of the movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Movie title, looked up in the notebook-level ``indices`` Series.
    cosine_sim : 2-D array-like
        Pairwise similarity matrix; row ``i`` holds the scores of movie ``i``
        against every movie covered by the matrix.
    top_n : int, default 10
        Number of recommendations to return.

    Returns
    -------
    pandas.Series
        Entries of ``metadata['title']`` for the ``top_n`` highest-scoring
        movies, best first, never including the queried movie itself.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    IndexError
        If the movie's row is not covered by ``cosine_sim``.
    """
    # Get the index of the movie that matches the title
    idx = indices[title]
    # A title shared by several movies yields a Series; use the first match
    if hasattr(idx, 'iloc'):
        idx = idx.iloc[0]
    idx = int(idx)

    # The similarity matrix may cover fewer rows than `metadata`; fail with a
    # clear message instead of an opaque out-of-bounds error
    n_rows = len(cosine_sim)
    if idx >= n_rows:
        raise IndexError(
            f"{title!r} is at row {idx}, but the similarity matrix only "
            f"covers rows 0..{n_rows - 1}"
        )

    # Pair every *other* movie with its similarity score. Excluding idx
    # explicitly (rather than dropping whichever entry sorts first) keeps the
    # result correct when another movie ties with the query movie's own score.
    sim_scores = [
        (i, score) for i, score in enumerate(cosine_sim[idx]) if i != idx
    ]

    # Highest scores first; sorted() is stable, so ties keep row order
    sim_scores = sorted(sim_scores, key=lambda pair: pair[1], reverse=True)

    # Row positions of the top_n most similar movies
    movie_indices = [i for i, _ in sim_scores[:top_n]]

    # Return the titles of the most similar movies
    return metadata['title'].iloc[movie_indices]

In [18]:
# Movies whose overviews are most similar to "The Dark Knight Rises"
get_recommendations(title='The Dark Knight Rises')

12481                                      The Dark Knight
150                                         Batman Forever
1328                                        Batman Returns
15511                           Batman: Under the Red Hood
585                                                 Batman
21194    Batman Unmasked: The Psychology of the Dark Kn...
9230                    Batman Beyond: Return of the Joker
18035                                     Batman: Year One
19792              Batman: The Dark Knight Returns, Part 1
3095                          Batman: Mask of the Phantasm
Name: title, dtype: object

In [19]:
# Movies whose overviews are most similar to "The Godfather"
get_recommendations(title='The Godfather')

1178                                The Godfather: Part II
1914                               The Godfather: Part III
23126                                           Blood Ties
11297                                     Household Saints
10821                                             Election
17729                                    Short Sharp Shock
26293                                   Beck 28 - Familjen
30787                                    The Sign of Venus
30169    Paolo Barca, maestro elementare, praticamente ...
8653                                          Violent City
Name: title, dtype: object

In [20]:
# Movies whose overviews are most similar to "Jumanji"
get_recommendations(title='Jumanji')

21633                   Table No. 21
6166                       Brainscan
8801                         Quintet
17223                 The Dark Angel
30981                   Turkey Shoot
9503                       Word Wars
16843                         DeVour
13601    The Mindscape of Alan Moore
8079                         Masques
30398                         Pixels
Name: title, dtype: object

In [21]:
# Movies whose overviews are most similar to "Toy Story"
get_recommendations(title='Toy Story')

15348                    Toy Story 3
2997                     Toy Story 2
10301         The 40 Year Old Virgin
24523                      Small Fry
23843    Andy Hardy's Blonde Trouble
29202                     Hot Splash
8327                       The Champ
27206     Life Begins for Andy Hardy
1071           Rebel Without a Cause
26304         You're Only Young Once
Name: title, dtype: object