In [2]:
#import dependencies
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [3]:
#load the meta data into a dataframe
df_meta = pd.read_csv('data/archive/movies_metadata.csv', low_memory=False)
df_meta.head(5)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


We must use a weighted rating system so that a movie's ranking represents its rating and the number of ratings that it receives.  This incentivizes popularity as well as rating in the ranking.  We will use the following ranking formula: 

\begin{equation} \text Weighted Rating (\bf WR) = \left({{\bf v} \over {\bf v} + {\bf m}} \cdot R\right) + \left({{\bf m} \over {\bf v} + {\bf m}} \cdot C\right) \end{equation}

where,
* v is the number of votes for the movie
* m is the minimum votes required to be listed in the chart (hyperparameter that filters for non-popular movies; maybe this is a feature in future clustering)
* R is the average rating of the movie
* C is the mean vote across the whole report

In [4]:
# calculate the mean vote across all movies
C = df_meta['vote_average'].mean()
print(C)

#minimum number of votes m based on a percentile
percentile = 0.9 #meaning we want all movies in consideration to be in the top 10 percent of reviewed movies
m = df_meta['vote_count'].quantile(percentile)

#filter out unpopular movies
df_top_movies = df_meta[df_meta['vote_count']>m]

5.618207215134185


In [5]:
print(df_meta.shape)
print(df_top_movies.shape)

(45466, 24)
(4538, 24)


In [6]:
#define a function to calculate the weighted rating
def weighted_rating(df, m=m, C=C):
    v = df['vote_count']
    R = df['vote_average']
    return (v/(v+m)*R)+(m/(m+v)*C)


In [7]:
df_top_movies['scores'] = df_top_movies.apply(weighted_rating, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_top_movies['scores'] = df_top_movies.apply(weighted_rating, axis=1)


In [8]:
#sort in descending order
df_top_movies = df_top_movies.sort_values('scores', ascending=False)
df_top_movies[['title', 'vote_count', 'vote_average', 'scores']].head(15)

Unnamed: 0,title,vote_count,vote_average,scores
314,The Shawshank Redemption,8358.0,8.5,8.445869
834,The Godfather,6024.0,8.5,8.425439
10309,Dilwale Dulhania Le Jayenge,661.0,9.1,8.421453
12481,The Dark Knight,12269.0,8.3,8.265477
2843,Fight Club,9678.0,8.3,8.256385
292,Pulp Fiction,8670.0,8.3,8.251406
522,Schindler's List,4436.0,8.3,8.206639
23673,Whiplash,4376.0,8.3,8.205404
5481,Spirited Away,3968.0,8.3,8.196055
2211,Life Is Beautiful,3643.0,8.3,8.187171


In [9]:
#for a content-based recommender. we can compute pariwise cosine similarity scores to recommend like movies to one particular movie
#print plot overview of movies found in the meta data
df_meta['overview'].head()

0    Led by Woody, Andy's toys live happily in his ...
1    When siblings Judy and Peter discover an encha...
2    A family wedding reignites the ancient feud be...
3    Cheated on, mistreated and stepped on, the wom...
4    Just when George Banks has recovered from his ...
Name: overview, dtype: object

In [10]:
#need to compute Term Frequency-Inverse Document Frequency (TF-IDF) of each overview.  
# This will give a matrix that shows word frequency inversely weighted by frequency in all documents
#create tfidf object with all stopwords removed.  language = english
tfidf = TfidfVectorizer(stop_words='english')

#filter all NaN by replacing with an empty string
df_meta['overview'] = df_meta['overview'].fillna('')

#fit the tfidf to the overviews found in the metadata
tfidf_matrix = tfidf.fit_transform(df_meta['overview'])

print(tfidf_matrix.shape)

(45466, 75827)


In [11]:
'''
Note: This block of code crashed my macbook air 2019 (intel processor) everytime I tried to run it.  

Since we have already found a vectorized tfidf matrix, we essentially just need to take the dot product of it to get a matrix of cosine similarities
The linear kernel function takes a dotproduct of matrices
'''
cos_sim_matrix = linear_kernel(tfidf_matrix, tfidf_matrix)
#each column represents a movie and its cosine similarity score with another movie
print(cos_sim_matrix.shape)

: 