In [40]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet

import warnings; warnings.simplefilter('ignore')

In [2]:
metadata = pd.read_csv('/Users/anthonymiyoro/Documents/code/MoviePredictor/data/movies_metadata.csv', low_memory=False)
links_small = pd.read_csv('/Users/anthonymiyoro/Documents/code/MoviePredictor/data/links_small.csv')

In [7]:
links_small.head(2)

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0


In [5]:
links_small = links_small[links_small['tmdbId'].notnull()]['tmdbId'].astype('int')

metadata = metadata.drop([19730, 29503, 35587])

In [6]:
metadata['id'] = metadata['id'].astype('int')

# Collect all the metadata for movies in the links_small dataset
smd = metadata[metadata['id'].isin(links_small)]

In [7]:
# Fill empty spaces and create new description column
smd['tagline'] = smd['tagline'].fillna('')
smd['description'] = smd['overview'] + smd['tagline']
smd['description'] = smd['description'].fillna('')

In [17]:
# Convert descriptions into a corpus then weigh the individual words using tfidf
# https://www.quora.com/How-does-TfidfVectorizer-work-in-laymans-terms

tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(smd['description'])

tfidf_matrix.shape

(9099, 268124)

In [18]:
# Cosine Similarity

# I will be using the Cosine Similarity to calculate a numeric quantity that denotes the similarity between two movies. 
# Mathematically, it is defined as follows:

# cosine(x,y)=x.y⊺||x||.||y||
 
# Since we have used the TF-IDF Vectorizer, calculating the Dot Product will directly give us the Cosine Similarity Score. 
# Therefore, we will use sklearn's linear_kernel instead of cosine_similarities since it is much faster.

In [19]:
# Calculate the cosine similarity f each movies description to another in the dataset
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

cosine_sim[0]

array([ 1.        ,  0.00680476,  0.        , ...,  0.        ,
        0.00344913,  0.        ])

In [20]:
# Create a new series based on movie titles using the similarity matrix
smd = smd.reset_index()
titles = smd['title']
indices = pd.Series(smd.index, index=smd['title'])

In [3]:
# Get the most similar movies using their titles
def get_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:31]
    movie_indices = [i[0] for i in sim_scores]
    return titles.iloc[movie_indices]

In [25]:
get_recommendations('Cars').head(10)

7814               Cars 2
1829      Bride of Chucky
3564          Bagdad Cafe
4937            Silverado
2391        On Any Sunday
3383               Driven
3473    Cannonball Run II
2069     Cookie's Fortune
5454            The Clock
871        The Great Race
Name: title, dtype: object

In [8]:
## NEW METHOD

# Import new data then change the ids to integers
credits = pd.read_csv('/Users/anthonymiyoro/Documents/code/MoviePredictor/data/credits.csv')
keywords = pd.read_csv('/Users/anthonymiyoro/Documents/code/MoviePredictor/data/keywords.csv')

# Change all ids to integers
keywords['id'] = keywords['id'].astype('int')
credits['id'] = credits['id'].astype('int')
metadata['id'] = metadata['id'].astype('int')

In [9]:
# Merge cast, crew, genres and credits into one dataframe
metadata = metadata.merge(credits, on='id')
metadata = metadata.merge(keywords, on='id')

In [10]:
metadata.shape


(46628, 27)

In [11]:
# Merge credits and keywords to metadata in the smaller dataset
smd = metadata[metadata['id'].isin(links_small)]
smd.shape

(9219, 27)

In [42]:
# From the crew we will only pick the director as a feature.
# From the cast, we will only pick the first 3 mentioned as we assume that they are the most influential

In [12]:
smd['cast'] = smd['cast'].apply(literal_eval)
smd['crew'] = smd['crew'].apply(literal_eval)
smd['keywords'] = smd['keywords'].apply(literal_eval)
smd['cast_size'] = smd['cast'].apply(lambda x: len(x))
smd['crew_size'] = smd['crew'].apply(lambda x: len(x))

In [73]:
vote_counts = metadata[metadata['vote_count'].notnull()]['vote_count'].astype('int')
vote_averages = metadata[metadata['vote_average'].notnull()]['vote_average'].astype('int')
C = vote_averages.mean()
C

5.238696808510638

In [74]:
m = vote_counts.quantile(0.95)
m

425.0

In [76]:
metadata['year'] = pd.to_datetime(metadata['release_date'], errors='coerce').apply(lambda x: str(x).split('-')[0] if x != np.nan else np.nan)

In [78]:
qualified = metadata[(metadata['vote_count'] >= m) & (metadata['vote_count'].notnull()) & (metadata['vote_average'].notnull())][['title', 'year', 'vote_count', 'vote_average', 'popularity', 'genres']]
qualified['vote_count'] = qualified['vote_count'].astype('int')
qualified['vote_average'] = qualified['vote_average'].astype('int')
qualified.shape

(2335, 6)

In [79]:
def weighted_rating(x):
    v = x['vote_count']
    R = x['vote_average']
    return (v/(v+m) * R) + (m/(m+v) * C)

In [80]:
qualified['wr'] = qualified.apply(weighted_rating, axis=1)

In [81]:
# List movies with the highest weighted ratings
qualified = qualified.sort_values('wr', ascending=False).head(250)

In [13]:
# Function that collects directors name
def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan

In [14]:
# Create new column that will hold the diroctors name
smd['director'] = smd['crew'].apply(get_director)

In [15]:
# Collect the first 3 cast members
smd['cast'] = smd['cast'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
smd['cast'] = smd['cast'].apply(lambda x: x[:3] if len(x) >=3 else x)

smd['keywords'] = smd['keywords'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

In [16]:
# For each movie in the dataset, there will be a metadata dump in which 
# all the genres, director, main actors and keywords. There will then be a count matrix from a count vectoriser with 
# which we calculate the cosine similarities and return movies that are most similar. 

# For the genre and credit data, we will strip spaces and convert to lowercase. We will also mention the director 3 
# times to increase its weighting to that above the cast.

smd['cast'] = smd['cast'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])

smd['director'] = smd['director'].astype('str').apply(lambda x: str.lower(x.replace(" ", "")))
smd['director'] = smd['director'].apply(lambda x: [x,x, x])

In [17]:
# For the keywords 

In [18]:
s = smd.apply(lambda x: pd.Series(x['keywords']),axis=1).stack().reset_index(level=1, drop=True)
s.name = 'keyword'

In [49]:
s = s.value_counts()
s[:5]



independent film        610
woman director          550
murder                  399
duringcreditsstinger    327
based on novel          318
Name: keyword, dtype: int64

In [50]:
s = s[s > 1]

stemmer = SnowballStemmer('english')
stemmer.stem('dogs')

'dog'

In [19]:
def filter_keywords(x):
    words = []
    for i in x:
        if i in s:
            words.append(i)
    return words

I use the TMDB Ratings to come up with our Top Movies Chart. I will use IMDB's weighted rating formula to construct my chart. Mathematically, it is represented as follows:

Weighted Rating (WR) =  (vv+m.R)+(mv+m.C)
 
where,

v is the number of votes for the movie
m is the minimum votes required to be listed in the chart
R is the average rating of the movie
C is the mean vote across the whole report
The next step is to determine an appropriate value for m, the minimum votes required to be listed in the chart. We will use 95th percentile as our cutoff. In other words, for a movie to feature in the charts, it must have more votes than at least 95% of the movies in the list.

I will build our overall Top 250 Chart and will define a function to build charts for a particular genre. Let's begin!

In [44]:
vote_counts = metadata[metadata['vote_count'].notnull()]['vote_count'].astype('int')
vote_averages = metadata[metadata['vote_average'].notnull()]['vote_average'].astype('int')
C = vote_averages.mean()
C

5.238696808510638

In [45]:
m = vote_counts.quantile(0.95)
m

425.0

In [48]:
metadata['year'] = pd.to_datetime(metadata['release_date'], errors='coerce').apply(lambda x: str(x).split('-')[0] if x != np.nan else np.nan)

In [49]:
qualified = metadata[(metadata['vote_count'] >= m) & (metadata['vote_count'].notnull()) & (metadata['vote_average'].notnull())][['title', 'year', 'vote_count', 'vote_average', 'popularity', 'genres']]
qualified['vote_count'] = qualified['vote_count'].astype('int')
qualified['vote_average'] = qualified['vote_average'].astype('int')
qualified.shape

(2335, 6)

In [41]:
# I use the TMDB Ratings to come up with our Top Movies Chart. I will use IMDB's weighted rating formula to construct my chart. Mathematically, it is represented as follows:
# Weighted Rating (WR) = (vv+m.R)+(mv+m.C) where, v is the number of votes for the movie m is the minimum votes required to be listed in the chart R is the 
# average rating of the movie C is the mean vote across the whole report The next step is to determine an appropriate value for m, the minimum votes required to be listed in the chart. 
# We will use 95th percentile as our cutoff. In other words, for a movie to feature in the charts, it must have more votes than at least 95% of the movies in the list. 
# I will build our overall Top 250 Chart and will define a function to build charts for a particular genre. Let's begin!


def weighted_rating(x):
    v = x['vote_count']
    R = x['vote_average']
    return (v/(v+m) * R) + (m/(m+v) * C)

In [52]:
smd['keywords'] = smd['keywords'].apply(filter_keywords)
smd['keywords'] = smd['keywords'].apply(lambda x: [stemmer.stem(i) for i in x])
smd['keywords'] = smd['keywords'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])

In [53]:
smd['soup'] = smd['keywords'] + smd['cast'] + smd['keywords'] + smd['cast'] +  smd['director'] 
# smd['soup'] =  + smd['genres']
smd['soup'] = smd['soup'].apply(lambda x: ' '.join(x))

In [27]:
count = CountVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
count_matrix = count.fit_transform(smd['soup'])

In [28]:
cosine_sim = cosine_similarity(count_matrix, count_matrix)

In [54]:
smd = smd.reset_index()
titles = smd['title']
indices = pd.Series(smd.index, index=smd['title'])

In [55]:
get_recommendations('The Dark Knight').head(6)

6218            Batman Begins
6623             The Prestige
8031    The Dark Knight Rises
2085                Following
4145                 Insomnia
8613             Interstellar
Name: title, dtype: object

In [31]:
# From our results, we can see that we need to remove the bad movies (those that have low ratings). 
# I will take the top 25 movies based on similarity scores and calculate the vote of the 60th percentile movie. 
# Then, using this as the value of m, we will calculate the weighted rating of each movie using IMDB's formula like we did in the Simple Recommender section.

In [82]:
# Returns a dataframe ??
def improved_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:26]
    movie_indices = [i[0] for i in sim_scores]
    
    movies = smd.iloc[movie_indices][['title', 'vote_count', 'vote_average']]
    vote_counts = movies[movies['vote_count'].notnull()]['vote_count'].astype('int')
    vote_averages = movies[movies['vote_average'].notnull()]['vote_average'].astype('int')
    C = vote_averages.mean()
    m = vote_counts.quantile(0.60)
    qualified = movies[(movies['vote_count'] >= m) & (movies['vote_count'].notnull()) & (movies['vote_average'].notnull())]
    qualified['vote_count'] = qualified['vote_count'].astype('int')
    qualified['vote_average'] = qualified['vote_average'].astype('int')
    qualified['wr'] = qualified.apply(weighted_rating, axis=1)
    qualified = qualified.sort_values('wr', ascending=False).head(6)
    return qualified
    

In [57]:
smd.columns

Index(['level_0', 'index', 'adult', 'belongs_to_collection', 'budget',
       'genres', 'homepage', 'id', 'imdb_id', 'original_language',
       'original_title', 'overview', 'popularity', 'poster_path',
       'production_companies', 'production_countries', 'release_date',
       'revenue', 'runtime', 'spoken_languages', 'status', 'tagline', 'title',
       'video', 'vote_average', 'vote_count', 'cast', 'crew', 'keywords',
       'cast_size', 'crew_size', 'director', 'soup'],
      dtype='object')

In [58]:
smd['soup'] = smd['keywords'] + smd['cast'] + smd['keywords'] + smd['cast'] +  smd['director'] 
# smd['soup'] =  + smd['genres']
smd['soup'] = smd['soup'].apply(lambda x: ' '.join(x))

In [87]:
improved_recommendations('Fast & Furious 6')

Unnamed: 0,title,vote_count,vote_average,wr
1607,Saving Private Ryan,5148,7,6.865682
8939,Furious 7,4253,7,6.839984
7870,Fast Five,2491,7,6.743294
3481,The Fast and the Furious,3485,6,5.91725
9006,Star Trek Beyond,2636,6,5.894298
7266,Fast & Furious,2426,6,5.886512
