In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet

import warnings
warnings.simplefilter('ignore')

## Simple Recommendation System
#### Returns the top 250 movies, based on the gnere you type during the search

In [2]:
mdata = pd.read_csv('./movies_metadata.csv')
mdata.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [3]:
mdata['genres'] = mdata['genres'].fillna('[]').apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
mdata.genres

0         [Animation, Comedy, Family]
1        [Adventure, Fantasy, Family]
2                   [Romance, Comedy]
3            [Comedy, Drama, Romance]
4                            [Comedy]
                     ...             
45461                 [Drama, Family]
45462                         [Drama]
45463       [Action, Drama, Thriller]
45464                              []
45465                              []
Name: genres, Length: 45466, dtype: object

In [4]:
mdata.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[Animation, Comedy, Family]",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[Adventure, Fantasy, Family]",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[Romance, Comedy]",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[Comedy, Drama, Romance]",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,[Comedy],,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [5]:
# We will use the Weighted Average formula : (v/(v+m) * R) + (m/(m+v) * C)
# v = No. of votes for the movie
# m = Min. votes required to be listed in the chart
# R = Avg. rating of the movie
# C = Mean vote across the whole report

In [6]:
# For m, for a movie to feature in the charts, it must have more votes than at least 95% of the movies in the list.
m = mdata[mdata.vote_count.notnull()].vote_count.astype('int').quantile(0.95)
C = mdata[mdata.vote_average.notnull()].vote_average.mean()
print("m : ", m, " and c : ", C)

m :  434.0  and c :  5.618207215134184


In [7]:
# Adding a Column with Year
mdata['year'] = pd.to_datetime(mdata.release_date, errors = 'coerce').apply(lambda x: str(x).split('-')[0] if x != np.nan else np.nan)
mdata.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,year
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[Animation, Comedy, Family]",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0,1995
1,False,,65000000,"[Adventure, Fantasy, Family]",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,1995
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[Romance, Comedy]",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0,1995
3,False,,16000000,"[Comedy, Drama, Romance]",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0,1995
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,[Comedy],,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0,1995


In [8]:
# Find the Movies that satisfy the criteria of 'm' and 'C' we calculated above. and also Reducing columns.
# Columns are: Title, Year of release, Vote_count, Average_vote, popularity, Genre and ratings
qualified = mdata[(mdata['vote_count'] >= m) & (mdata['vote_count'].notnull()) & 
                  (mdata['vote_average'].notnull())][['title', 'year', 'vote_count', 'vote_average', 'popularity', 'genres']]
qualified['vote_count'] = qualified['vote_count'].astype('int')
qualified.shape

(2274, 6)

In [9]:
def weighted_rating(x):
    v = x['vote_count']
    R = x['vote_average']
    return (v/(v+m) * R) + (m/(m+v) * C)

In [10]:
qualified['ratings'] = qualified.apply(weighted_rating, axis=1)
qualified

Unnamed: 0,title,year,vote_count,vote_average,popularity,genres,ratings
0,Toy Story,1995,5415,7.7,21.946943,"[Animation, Comedy, Family]",7.545529
1,Jumanji,1995,2413,6.9,17.015539,"[Adventure, Fantasy, Family]",6.704602
5,Heat,1995,1886,7.7,17.924927,"[Action, Crime, Drama, Thriller]",7.310561
9,GoldenEye,1995,1194,6.6,14.686036,"[Adventure, Action, Thriller]",6.338269
15,Casino,1995,1343,7.8,10.137389,"[Drama, Crime]",7.267137
...,...,...,...,...,...,...,...
44624,What Happened to Monday,2017,598,7.3,60.581223,"[Science Fiction, Thriller]",6.592734
44632,Atomic Blonde,2017,748,6.1,14.455104,"[Action, Thriller]",5.923098
44678,Dunkirk,2017,2712,7.5,30.938854,"[Action, Drama, History, Thriller, War]",7.240401
44842,Transformers: The Last Knight,2017,1440,6.2,39.186819,"[Action, Science Fiction, Thriller, Adventure]",6.065263


In [11]:
# Sorting based on rating and taking only top 250 Movies
qualified = qualified.sort_values('ratings', ascending=False).head(250)
qualified.head(20)

Unnamed: 0,title,year,vote_count,vote_average,popularity,genres,ratings
314,The Shawshank Redemption,1994,8358,8.5,51.645403,"[Drama, Crime]",8.357746
834,The Godfather,1972,6024,8.5,41.109264,"[Drama, Crime]",8.306334
12481,The Dark Knight,2008,12269,8.3,123.167259,"[Drama, Action, Crime, Thriller]",8.208376
2843,Fight Club,1999,9678,8.3,63.869599,[Drama],8.184899
292,Pulp Fiction,1994,8670,8.3,140.950236,"[Thriller, Crime]",8.172155
351,Forrest Gump,1994,8147,8.2,48.307194,"[Comedy, Drama, Romance]",8.069421
522,Schindler's List,1993,4436,8.3,41.725123,"[Drama, History, War]",8.061007
23673,Whiplash,2014,4376,8.3,64.29999,[Drama],8.058025
5481,Spirited Away,2001,3968,8.3,41.048867,"[Fantasy, Adventure, Animation, Family]",8.035598
1154,The Empire Strikes Back,1980,5998,8.2,19.470959,"[Adventure, Action, Science Fiction]",8.025793


In [12]:
s = mdata.apply(lambda x: pd.Series(x['genres']),axis=1).stack().reset_index(level=1, drop=True)
s.name = 'genre'
gen_df = mdata.drop('genres', axis=1).join(s)

# Above Step, Remove the genre column from mdata and converting the list of genre in 'Genre' Column into multiple rows.

In [13]:
gen_df

Unnamed: 0,adult,belongs_to_collection,budget,homepage,id,imdb_id,original_language,original_title,overview,popularity,...,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,year,genre
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,...,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0,1995,Animation
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,...,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0,1995,Comedy
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,...,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0,1995,Family
1,False,,65000000,,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,...,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,1995,Adventure
1,False,,65000000,,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,...,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,1995,Fantasy
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45463,False,,0,,67758,tt0303758,en,Betrayal,"When one of her hits goes wrong, a professiona...",0.903007,...,90.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,A deadly game of wits.,Betrayal,False,3.8,6.0,2003,Action
45463,False,,0,,67758,tt0303758,en,Betrayal,"When one of her hits goes wrong, a professiona...",0.903007,...,90.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,A deadly game of wits.,Betrayal,False,3.8,6.0,2003,Drama
45463,False,,0,,67758,tt0303758,en,Betrayal,"When one of her hits goes wrong, a professiona...",0.903007,...,90.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,A deadly game of wits.,Betrayal,False,3.8,6.0,2003,Thriller
45464,False,,0,,227506,tt0008536,en,Satana likuyushchiy,"In a small town live two brothers, one a minis...",0.003503,...,87.0,[],Released,,Satan Triumphant,False,0.0,0.0,1917,


In [14]:
# Doing all the things of votecounts and rating in this method, for a particular genre and returning top 250. (From mData)
def build_chart(genre, percentile=0.85):
    df = gen_df[gen_df['genre'] == genre]
    m = df[df.vote_count.notnull()].vote_count.astype('int').quantile(percentile)
    C = df[df.vote_average.notnull()]['vote_average'].mean()

    qualified = df[(df['vote_count'] >= m) & (df['vote_count'].notnull()) & 
                   (df['vote_average'].notnull())][['title', 'year', 'vote_count', 'vote_average', 'popularity']]
    qualified['vote_count'] = qualified['vote_count'].astype('int')
    
    qualified['ratings'] = qualified.apply(lambda x: (x['vote_count']/(x['vote_count']+m) * x['vote_average'])
                                                       + (m/(m+x['vote_count']) * C), axis=1)
    qualified = qualified.sort_values('ratings', ascending=False).head(250)    
    return qualified

In [15]:
build_chart('Science Fiction')

Unnamed: 0,title,year,vote_count,vote_average,popularity,ratings
1154,The Empire Strikes Back,1980,5998,8.2,19.470959,8.071052
15480,Inception,2010,14075,8.1,29.108149,8.045563
22879,Interstellar,2014,11187,8.1,32.213481,8.031857
256,Star Wars,1977,6778,8.1,42.149697,7.989310
1225,Back to the Future,1985,6239,8.0,25.778509,7.884509
...,...,...,...,...,...,...
5325,Men in Black II,2002,3188,6.1,16.775716,6.038171
17204,Attack the Block,2011,748,6.3,9.93263,6.036947
12700,The Incredible Hulk,2008,3086,6.1,19.125537,6.036299
4417,Short Circuit,1986,535,6.4,12.171218,6.033855


# Content Based Recommendations

In [16]:
mdata = mdata.drop([19730, 29503, 35587])

In [17]:
links_small = pd.read_csv(r'./links_small.csv')
links_small = links_small[links_small['tmdbId'].notnull()]['tmdbId'].astype('int')
mdata.id = mdata.id.astype('int')
small_df = mdata[mdata['id'].isin(links_small)]
small_df.shape

(9099, 25)

##### We will use Cosine Similarity to find similarity between 2 Movies numerically.

#### Movie Description Based Recommendation System

In [18]:
small_df.tagline = small_df.tagline.fillna('')
small_df.overview = small_df.overview.fillna('')
small_df['description'] = small_df.overview + small_df.tagline

#### TFID Vectorizer
##### Since we are using the TF-IDF Vectorizer, calculating the Dot Product will directly give us the Cosine Similarity Score.

In [19]:
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=0.0, stop_words='english')
tfidf_matrix = tf.fit_transform(small_df['description'])
tfidf_matrix.shape

(9099, 268124)

In [20]:
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [21]:
cosine_sim[0]

array([1.        , 0.00680476, 0.        , ..., 0.        , 0.00344913,
       0.        ])

In [22]:
small_df = small_df.reset_index()
titles = small_df['title']
indices = pd.Series(small_df.index, index=small_df['title'])

In [23]:
small_df

Unnamed: 0,index,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,...,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,year,description
0,0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[Animation, Comedy, Family]",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,...,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0,1995,"Led by Woody, Andy's toys live happily in his ..."
1,1,False,,65000000,"[Adventure, Fantasy, Family]",,8844,tt0113497,en,Jumanji,...,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,1995,When siblings Judy and Peter discover an encha...
2,2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[Romance, Comedy]",,15602,tt0113228,en,Grumpier Old Men,...,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0,1995,A family wedding reignites the ancient feud be...
3,3,False,,16000000,"[Comedy, Drama, Romance]",,31357,tt0114885,en,Waiting to Exhale,...,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0,1995,"Cheated on, mistreated and stepped on, the wom..."
4,4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,[Comedy],,11862,tt0113041,en,Father of the Bride Part II,...,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0,1995,Just when George Banks has recovered from his ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9094,40224,False,,15000000,"[Action, Adventure, Drama, Horror, Science Fic...",,315011,tt4262980,ja,シン・ゴジラ,...,120.0,"[{'iso_639_1': 'it', 'name': 'Italiano'}, {'is...",Released,A god incarnate. A city doomed.,Shin Godzilla,False,6.6,152.0,2016,From the mind behind Evangelion comes a hit la...
9095,40503,False,,0,"[Documentary, Music]",http://www.thebeatlesliveproject.com/,391698,tt2531318,en,The Beatles: Eight Days a Week - The Touring Y...,...,99.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The band you know. The story you don't.,The Beatles: Eight Days a Week - The Touring Y...,False,7.6,92.0,2016,"The band stormed Europe in 1963, and, in 1964,..."
9096,44821,False,"{'id': 34055, 'name': 'Pokémon Collection', 'p...",16000000,"[Adventure, Fantasy, Animation, Action, Family]",http://movies.warnerbros.com/pk3/,10991,tt0235679,ja,Pokémon 3: The Movie,...,93.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Pokémon: Spell of the Unknown,Pokémon: Spell of the Unknown,False,6.0,144.0,2000,When Molly Hale's sadness of her father's disa...
9097,44826,False,"{'id': 34055, 'name': 'Pokémon Collection', 'p...",0,"[Adventure, Fantasy, Animation, Science Fictio...",http://www.pokemon.com/us/movies/movie-pokemon...,12600,tt0287635,ja,劇場版ポケットモンスター セレビィ 時を越えた遭遇（であい）,...,75.0,"[{'iso_639_1': 'ja', 'name': '日本語'}]",Released,,Pokémon 4Ever: Celebi - Voice of the Forest,False,5.7,82.0,2001,"All your favorite Pokémon characters are back,..."


In [24]:
def get_recommendations(title, cosine_sim):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:16]
    movie_indices = [i[0] for i in sim_scores]
    return titles.iloc[movie_indices]

In [25]:
get_recommendations('Jumanji', cosine_sim)

8889                        Pixels
8608       Guardians of the Galaxy
6392                    Stay Alive
8154                Wreck-It Ralph
3196            Dungeons & Dragons
8670                         Ouija
5356      Night of the Living Dead
8211              Would You Rather
6323                 Grandma's Boy
4082     The Giant Spider Invasion
5803               Comfort and Joy
1644                     Peter Pan
2080                      eXistenZ
6545             Sleeping Dogs Lie
6285    Zathura: A Space Adventure
Name: title, dtype: object

In [26]:
get_recommendations('The Dark Knight', cosine_sim)

7931                      The Dark Knight Rises
132                              Batman Forever
1113                             Batman Returns
8227    Batman: The Dark Knight Returns, Part 2
7565                 Batman: Under the Red Hood
524                                      Batman
7901                           Batman: Year One
2579               Batman: Mask of the Phantasm
2696                                        JFK
8165    Batman: The Dark Knight Returns, Part 1
6144                              Batman Begins
7933         Sherlock Holmes: A Game of Shadows
5511                            To End All Wars
4489                                      Q & A
7344                        Law Abiding Citizen
Name: title, dtype: object

#### Genres, Cast and Keywords Based Recommendation System

In [27]:
credits = pd.read_csv(r'./credits.csv')
keywords = pd.read_csv(r'./keywords.csv')

In [28]:
keywords['id'] = keywords['id'].astype('int')
credits['id'] = credits['id'].astype('int')

In [29]:
md1 = mdata.merge(credits, on='id')
md1 = md1.merge(keywords, on='id')

In [30]:
small_df= md1[md1['id'].isin(links_small)]
small_df.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,status,tagline,title,video,vote_average,vote_count,year,cast,crew,keywords
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[Animation, Comedy, Family]",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,Released,,Toy Story,False,7.7,5415.0,1995,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...","[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,False,,65000000,"[Adventure, Fantasy, Family]",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,1995,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...","[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[Romance, Comedy]",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0,1995,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...","[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,False,,16000000,"[Comedy, Drama, Romance]",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0,1995,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...","[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,[Comedy],,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0,1995,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...","[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


In [31]:
small_df.shape

(9219, 28)

In [32]:
def director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan

small_df['director'] = small_df['crew'].apply(literal_eval).apply(lambda x: director(x))

In [33]:
small_df['cast'] = small_df['cast'].apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

In [34]:
small_df['cast'] = small_df['cast'].apply(lambda x: x[:3] if len(x) >=3 else x)

In [35]:
# Wrangling keywords
small_df['keywords'] = small_df['keywords'].apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

In [36]:
small_df

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,tagline,title,video,vote_average,vote_count,year,cast,crew,keywords,director
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[Animation, Comedy, Family]",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,,Toy Story,False,7.7,5415.0,1995,"[Tom Hanks, Tim Allen, Don Rickles]","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...","[jealousy, toy, boy, friendship, friends, riva...",John Lasseter
1,False,,65000000,"[Adventure, Fantasy, Family]",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,1995,"[Robin Williams, Jonathan Hyde, Kirsten Dunst]","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...","[board game, disappearance, based on children'...",Joe Johnston
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[Romance, Comedy]",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0,1995,"[Walter Matthau, Jack Lemmon, Ann-Margret]","[{'credit_id': '52fe466a9251416c75077a89', 'de...","[fishing, best friend, duringcreditsstinger, o...",Howard Deutch
3,False,,16000000,"[Comedy, Drama, Romance]",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0,1995,"[Whitney Houston, Angela Bassett, Loretta Devine]","[{'credit_id': '52fe44779251416c91011acb', 'de...","[based on novel, interracial relationship, sin...",Forest Whitaker
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,[Comedy],,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0,1995,"[Steve Martin, Diane Keaton, Martin Short]","[{'credit_id': '52fe44959251416c75039ed7', 'de...","[baby, midlife crisis, confidence, aging, daug...",Charles Shyer
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45985,False,"{'id': 34055, 'name': 'Pokémon Collection', 'p...",0,"[Adventure, Fantasy, Animation, Science Fictio...",http://www.pokemon.com/us/movies/movie-pokemon...,12600,tt0287635,ja,劇場版ポケットモンスター セレビィ 時を越えた遭遇（であい）,"All your favorite Pokémon characters are back,...",...,,Pokémon 4Ever: Celebi - Voice of the Forest,False,5.7,82.0,2001,"[Veronica Taylor, Rachael Lillis, Maddie Blaus...","[{'credit_id': '52fe45049251416c75048e35', 'de...","[sequel, pokémon, celebi]",Kunihiko Yuyama
46424,False,,0,"[Comedy, Drama]",,265189,tt2121382,sv,Turist,"While holidaying in the French Alps, a Swedish...",...,,Force Majeure,False,6.8,255.0,2014,"[Lisa Loven Kongsli, Johannes Bah Kuhnke, Clar...","[{'credit_id': '534fd1a80e0a267eb6000e32', 'de...","[female nudity, dark comedy, family vacation, ...",Ruben Östlund
46425,False,,0,"[Comedy, Drama]",,265189,tt2121382,sv,Turist,"While holidaying in the French Alps, a Swedish...",...,,Force Majeure,False,6.8,255.0,2014,"[Lisa Loven Kongsli, Johannes Bah Kuhnke, Clar...","[{'credit_id': '534fd1a80e0a267eb6000e32', 'de...","[female nudity, dark comedy, family vacation, ...",Ruben Östlund
46426,False,,0,"[Comedy, Drama]",,265189,tt2121382,sv,Turist,"While holidaying in the French Alps, a Swedish...",...,,Force Majeure,False,6.8,255.0,2014,"[Lisa Loven Kongsli, Johannes Bah Kuhnke, Clar...","[{'credit_id': '534fd1a80e0a267eb6000e32', 'de...","[female nudity, dark comedy, family vacation, ...",Ruben Östlund


In [37]:
# Strip Spaces and Convert to Lowercase
# and repeating director 3 times, to more weightage to director above all other crew.

small_df['cast'] = small_df['cast'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])
small_df['director'] = small_df['director'].astype('str').apply(lambda x: str.lower(x.replace(" ", "")))
small_df['director'] = small_df['director'].apply(lambda x: [x, x, x])

In [38]:
small_df

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,tagline,title,video,vote_average,vote_count,year,cast,crew,keywords,director
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[Animation, Comedy, Family]",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,,Toy Story,False,7.7,5415.0,1995,"[tomhanks, timallen, donrickles]","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...","[jealousy, toy, boy, friendship, friends, riva...","[johnlasseter, johnlasseter, johnlasseter]"
1,False,,65000000,"[Adventure, Fantasy, Family]",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,1995,"[robinwilliams, jonathanhyde, kirstendunst]","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...","[board game, disappearance, based on children'...","[joejohnston, joejohnston, joejohnston]"
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[Romance, Comedy]",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0,1995,"[waltermatthau, jacklemmon, ann-margret]","[{'credit_id': '52fe466a9251416c75077a89', 'de...","[fishing, best friend, duringcreditsstinger, o...","[howarddeutch, howarddeutch, howarddeutch]"
3,False,,16000000,"[Comedy, Drama, Romance]",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0,1995,"[whitneyhouston, angelabassett, lorettadevine]","[{'credit_id': '52fe44779251416c91011acb', 'de...","[based on novel, interracial relationship, sin...","[forestwhitaker, forestwhitaker, forestwhitaker]"
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,[Comedy],,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0,1995,"[stevemartin, dianekeaton, martinshort]","[{'credit_id': '52fe44959251416c75039ed7', 'de...","[baby, midlife crisis, confidence, aging, daug...","[charlesshyer, charlesshyer, charlesshyer]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45985,False,"{'id': 34055, 'name': 'Pokémon Collection', 'p...",0,"[Adventure, Fantasy, Animation, Science Fictio...",http://www.pokemon.com/us/movies/movie-pokemon...,12600,tt0287635,ja,劇場版ポケットモンスター セレビィ 時を越えた遭遇（であい）,"All your favorite Pokémon characters are back,...",...,,Pokémon 4Ever: Celebi - Voice of the Forest,False,5.7,82.0,2001,"[veronicataylor, rachaellillis, maddieblaustein]","[{'credit_id': '52fe45049251416c75048e35', 'de...","[sequel, pokémon, celebi]","[kunihikoyuyama, kunihikoyuyama, kunihikoyuyama]"
46424,False,,0,"[Comedy, Drama]",,265189,tt2121382,sv,Turist,"While holidaying in the French Alps, a Swedish...",...,,Force Majeure,False,6.8,255.0,2014,"[lisalovenkongsli, johannesbahkuhnke, clarawet...","[{'credit_id': '534fd1a80e0a267eb6000e32', 'de...","[female nudity, dark comedy, family vacation, ...","[rubenöstlund, rubenöstlund, rubenöstlund]"
46425,False,,0,"[Comedy, Drama]",,265189,tt2121382,sv,Turist,"While holidaying in the French Alps, a Swedish...",...,,Force Majeure,False,6.8,255.0,2014,"[lisalovenkongsli, johannesbahkuhnke, clarawet...","[{'credit_id': '534fd1a80e0a267eb6000e32', 'de...","[female nudity, dark comedy, family vacation, ...","[rubenöstlund, rubenöstlund, rubenöstlund]"
46426,False,,0,"[Comedy, Drama]",,265189,tt2121382,sv,Turist,"While holidaying in the French Alps, a Swedish...",...,,Force Majeure,False,6.8,255.0,2014,"[lisalovenkongsli, johannesbahkuhnke, clarawet...","[{'credit_id': '534fd1a80e0a267eb6000e32', 'de...","[female nudity, dark comedy, family vacation, ...","[rubenöstlund, rubenöstlund, rubenöstlund]"


In [39]:
# Further processing Keywords
kw = small_df.apply(lambda x: pd.Series(x['keywords']),axis=1).stack().reset_index(level=1, drop=True)
kw.name = 'keyword'
kw = kw.value_counts()
kw = kw[kw > 1]
kw

keyword
independent film        610
woman director          550
murder                  399
duringcreditsstinger    327
based on novel          318
                       ... 
boarder                   2
social climbing           2
covert operation          2
prisoners                 2
crystal                   2
Name: count, Length: 6709, dtype: int64

In [40]:
def filter_keywords(keys):
    words = []
    for i in keys:
        if i in kw:
            words.append(i)
    return words

small_df.keywords = small_df['keywords'].apply(lambda x: filter_keywords(x))

In [41]:
small_df

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,tagline,title,video,vote_average,vote_count,year,cast,crew,keywords,director
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[Animation, Comedy, Family]",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,,Toy Story,False,7.7,5415.0,1995,"[tomhanks, timallen, donrickles]","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...","[jealousy, toy, boy, friendship, friends, riva...","[johnlasseter, johnlasseter, johnlasseter]"
1,False,,65000000,"[Adventure, Fantasy, Family]",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,1995,"[robinwilliams, jonathanhyde, kirstendunst]","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...","[board game, disappearance, based on children'...","[joejohnston, joejohnston, joejohnston]"
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[Romance, Comedy]",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0,1995,"[waltermatthau, jacklemmon, ann-margret]","[{'credit_id': '52fe466a9251416c75077a89', 'de...","[fishing, best friend, duringcreditsstinger]","[howarddeutch, howarddeutch, howarddeutch]"
3,False,,16000000,"[Comedy, Drama, Romance]",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0,1995,"[whitneyhouston, angelabassett, lorettadevine]","[{'credit_id': '52fe44779251416c91011acb', 'de...","[based on novel, interracial relationship, sin...","[forestwhitaker, forestwhitaker, forestwhitaker]"
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,[Comedy],,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0,1995,"[stevemartin, dianekeaton, martinshort]","[{'credit_id': '52fe44959251416c75039ed7', 'de...","[baby, midlife crisis, confidence, aging, daug...","[charlesshyer, charlesshyer, charlesshyer]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45985,False,"{'id': 34055, 'name': 'Pokémon Collection', 'p...",0,"[Adventure, Fantasy, Animation, Science Fictio...",http://www.pokemon.com/us/movies/movie-pokemon...,12600,tt0287635,ja,劇場版ポケットモンスター セレビィ 時を越えた遭遇（であい）,"All your favorite Pokémon characters are back,...",...,,Pokémon 4Ever: Celebi - Voice of the Forest,False,5.7,82.0,2001,"[veronicataylor, rachaellillis, maddieblaustein]","[{'credit_id': '52fe45049251416c75048e35', 'de...","[sequel, pokémon, celebi]","[kunihikoyuyama, kunihikoyuyama, kunihikoyuyama]"
46424,False,,0,"[Comedy, Drama]",,265189,tt2121382,sv,Turist,"While holidaying in the French Alps, a Swedish...",...,,Force Majeure,False,6.8,255.0,2014,"[lisalovenkongsli, johannesbahkuhnke, clarawet...","[{'credit_id': '534fd1a80e0a267eb6000e32', 'de...","[female nudity, dark comedy, family vacation, ...","[rubenöstlund, rubenöstlund, rubenöstlund]"
46425,False,,0,"[Comedy, Drama]",,265189,tt2121382,sv,Turist,"While holidaying in the French Alps, a Swedish...",...,,Force Majeure,False,6.8,255.0,2014,"[lisalovenkongsli, johannesbahkuhnke, clarawet...","[{'credit_id': '534fd1a80e0a267eb6000e32', 'de...","[female nudity, dark comedy, family vacation, ...","[rubenöstlund, rubenöstlund, rubenöstlund]"
46426,False,,0,"[Comedy, Drama]",,265189,tt2121382,sv,Turist,"While holidaying in the French Alps, a Swedish...",...,,Force Majeure,False,6.8,255.0,2014,"[lisalovenkongsli, johannesbahkuhnke, clarawet...","[{'credit_id': '534fd1a80e0a267eb6000e32', 'de...","[female nudity, dark comedy, family vacation, ...","[rubenöstlund, rubenöstlund, rubenöstlund]"


In [42]:
stemmer = SnowballStemmer('english')
small_df['keywords'] = small_df['keywords'].apply(lambda x: [stemmer.stem(i) for i in x])
small_df['keywords'] = small_df['keywords'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])

In [43]:
##### Create a column "soup" which is a string that contains all the metadata that we want to feed to our vectorizer (director, keywords, genre, actors, etc)

In [44]:
small_df['soup'] = small_df['cast'] + small_df['director'] + small_df['genres'] + small_df['keywords']
small_df['soup'] = small_df['soup'].apply(lambda x: ' '.join(x))

In [45]:
small_df.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,title,video,vote_average,vote_count,year,cast,crew,keywords,director,soup
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[Animation, Comedy, Family]",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,Toy Story,False,7.7,5415.0,1995,"[tomhanks, timallen, donrickles]","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...","[jealousi, toy, boy, friendship, friend, rival...","[johnlasseter, johnlasseter, johnlasseter]",tomhanks timallen donrickles johnlasseter john...
1,False,,65000000,"[Adventure, Fantasy, Family]",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,Jumanji,False,6.9,2413.0,1995,"[robinwilliams, jonathanhyde, kirstendunst]","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...","[boardgam, disappear, basedonchildren'sbook, n...","[joejohnston, joejohnston, joejohnston]",robinwilliams jonathanhyde kirstendunst joejoh...
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[Romance, Comedy]",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,Grumpier Old Men,False,6.5,92.0,1995,"[waltermatthau, jacklemmon, ann-margret]","[{'credit_id': '52fe466a9251416c75077a89', 'de...","[fish, bestfriend, duringcreditssting]","[howarddeutch, howarddeutch, howarddeutch]",waltermatthau jacklemmon ann-margret howarddeu...
3,False,,16000000,"[Comedy, Drama, Romance]",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,Waiting to Exhale,False,6.1,34.0,1995,"[whitneyhouston, angelabassett, lorettadevine]","[{'credit_id': '52fe44779251416c91011acb', 'de...","[basedonnovel, interracialrelationship, single...","[forestwhitaker, forestwhitaker, forestwhitaker]",whitneyhouston angelabassett lorettadevine for...
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,[Comedy],,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,Father of the Bride Part II,False,5.7,173.0,1995,"[stevemartin, dianekeaton, martinshort]","[{'credit_id': '52fe44959251416c75039ed7', 'de...","[babi, midlifecrisi, confid, age, daughter, mo...","[charlesshyer, charlesshyer, charlesshyer]",stevemartin dianekeaton martinshort charlesshy...


In [46]:
count = CountVectorizer(analyzer='word', ngram_range=(1, 2), min_df=0.0, stop_words='english')
count_matrix = count.fit_transform(small_df['soup'])

In [47]:
cos_sim = cosine_similarity(count_matrix, count_matrix)

In [48]:
small_df = small_df.reset_index()
titles = small_df['title']
indices = pd.Series(small_df.index, index=small_df['title'])

In [49]:
get_recommendations('Jumanji', cos_sim)

1624                             Honey, I Shrunk the Kids
5130                                              Hidalgo
500                                        The Pagemaster
1663                                        The Rocketeer
7489                                          The Wolfman
3653                                    Jurassic Park III
2012                                          October Sky
7883                   Captain America: The First Avenger
4909                                   The Cat in the Hat
7390                            Where the Wild Things Are
7494    Percy Jackson & the Olympians: The Lightning T...
5055                                            Peter Pan
6635                            Arthur and the Invisibles
6506                                  Night at the Museum
567                             James and the Giant Peach
Name: title, dtype: object

In [50]:
get_recommendations('The Dark Knight', cos_sim)

7991                 The Dark Knight Rises
6186                         Batman Begins
6587                          The Prestige
2077                             Following
7608                             Inception
4125                              Insomnia
3373                               Memento
8573                          Interstellar
7619            Batman: Under the Red Hood
1122                        Batman Returns
8899               Kidnapping Mr. Heineken
5907                              Thursday
1252                        Batman & Robin
9004    Batman v Superman: Dawn of Justice
4005                  The Long Good Friday
Name: title, dtype: object

##### One thing that we notice about our recommendation system is that it recommends movies regardless of ratings and popularity.
##### Therefore, we will add a mechanism to remove bad movies and return movies which are popular and have had a good critical response. We will calculate the weighted rating of each movie using IMDB's formula like we did in the Simple Recommender section.

In [51]:
def improved_recommendation(title, cosine_sim):   
    idx = indices[title]
    sim_score = list(enumerate(cosine_sim[idx]))
    sim_score = sorted(sim_score, key = lambda x:x[1], reverse = True)[1:51]
    movie_index = [i[0] for i in sim_score]
    
    movies = small_df.iloc[movie_index][['title', 'vote_average', 'vote_count', 'genres', 'cast', 'year']]
    m = movies[movies.vote_count.notnull()]['vote_count'].quantile(0.75)
    C = movies[movies.vote_average.notnull()]['vote_average'].mean()
    
    qualified = movies[(movies.vote_count >= m) & (movies.vote_count.notnull()) & (movies.vote_average.notnull())]
    qualified['Rating'] = qualified.apply(weighted_rating, axis=1)
    qualified = qualified.sort_values('Rating', ascending=False)[['title','Rating', 'genres', 'cast', 'year']]
    return qualified.head(10)

In [52]:
improved_recommendation('The Dark Knight', cos_sim)

Unnamed: 0,title,Rating,genres,cast,year
7608,Inception,8.025763,"[Action, Thriller, Science Fiction, Mystery, A...","[leonardodicaprio, josephgordon-levitt, ellenp...",2010
8573,Interstellar,8.007315,"[Adventure, Drama, Science Fiction]","[matthewmcconaughey, jessicachastain, annehath...",2014
3373,Memento,7.86595,"[Mystery, Thriller]","[guypearce, carrie-annemoss, joepantoliano]",2000
6587,The Prestige,7.790919,"[Drama, Mystery, Thriller]","[hughjackman, christianbale, michaelcaine]",2006
7991,The Dark Knight Rises,7.511303,"[Action, Crime, Drama, Thriller]","[christianbale, michaelcaine, garyoldman]",2012
6186,Batman Begins,7.397206,"[Action, Crime, Drama]","[christianbale, michaelcaine, liamneeson]",2005
7539,Kick-Ass,6.975874,"[Action, Crime]","[aarontaylor-johnson, chloëgracemoretz, christ...",2010
524,Batman,6.767469,"[Fantasy, Action]","[jacknicholson, michaelkeaton, kimbasinger]",1989
1122,Batman Returns,6.400889,"[Action, Fantasy]","[michaelkeaton, dannydevito, michellepfeiffer]",1992
8423,Kick-Ass 2,6.190772,"[Action, Adventure, Crime]","[aarontaylor-johnson, chloëgracemoretz, christ...",2013


In [53]:
improved_recommendation('Jumanji', cos_sim)

Unnamed: 0,title,Rating,genres,cast,year
5420,Harry Potter and the Prisoner of Azkaban,7.560377,"[Adventure, Fantasy, Family]","[danielradcliffe, rupertgrint, emmawatson]",2004
3824,Harry Potter and the Philosopher's Stone,7.39285,"[Adventure, Fantasy, Family]","[danielradcliffe, rupertgrint, emmawatson]",2001
6322,Harry Potter and the Goblet of Fire,7.368104,"[Adventure, Fantasy, Family]","[danielradcliffe, rupertgrint, emmawatson]",2005
7706,Harry Potter and the Deathly Hallows: Part 1,7.367031,"[Adventure, Fantasy, Family]","[danielradcliffe, emmawatson, rupertgrint]",2010
4342,Harry Potter and the Chamber of Secrets,7.279172,"[Adventure, Fantasy, Family]","[danielradcliffe, rupertgrint, emmawatson]",2002
6765,Harry Potter and the Order of the Phoenix,7.27254,"[Adventure, Fantasy, Family, Mystery]","[danielradcliffe, rupertgrint, emmawatson]",2007
7305,Harry Potter and the Half-Blood Prince,7.26824,"[Adventure, Fantasy, Family]","[danielradcliffe, rupertgrint, emmawatson]",2009
521,Aladdin,7.203182,"[Animation, Family, Comedy, Adventure, Fantasy...","[scottweinger, robinwilliams, lindalarkin]",1992
7883,Captain America: The First Avenger,6.543993,"[Action, Adventure, Science Fiction]","[chrisevans, hugoweaving, tommyleejones]",2011
6506,Night at the Museum,6.299705,"[Action, Adventure, Comedy, Family, Fantasy]","[benstiller, jakecherry, carlagugino]",2006


In [54]:
improved_recommendation('Iron Man', cos_sim)

Unnamed: 0,title,Rating,genres,cast,year
8680,Guardians of the Galaxy,7.805216,"[Action, Science Fiction, Adventure]","[chrispratt, zoesaldana, davebautista]",2014
8590,Captain America: The Winter Soldier,7.463801,"[Action, Adventure, Science Fiction]","[chrisevans, samuell.jackson, scarlettjohansson]",2014
8622,X-Men: Days of Future Past,7.376051,"[Action, Adventure, Fantasy, Science Fiction]","[hughjackman, jamesmcavoy, michaelfassbender]",2014
7929,The Avengers,7.337808,"[Science Fiction, Action, Adventure]","[robertdowneyjr., chrisevans, markruffalo]",2012
8839,Deadpool,7.334897,"[Action, Adventure, Comedy]","[ryanreynolds, morenabaccarin, edskrein]",2016
8836,Avengers: Age of Ultron,7.200586,"[Action, Adventure, Science Fiction]","[robertdowneyjr., chrishemsworth, markruffalo]",2015
8840,Captain America: Civil War,7.018554,"[Adventure, Action, Science Fiction]","[chrisevans, robertdowneyjr., scarlettjohansson]",2016
8837,Ant-Man,6.907211,"[Science Fiction, Action, Adventure]","[paulrudd, michaeldouglas, evangelinelilly]",2015
8348,Iron Man 3,6.745349,"[Action, Adventure, Science Fiction]","[robertdowneyjr., gwynethpaltrow, doncheadle]",2013
7883,Captain America: The First Avenger,6.543993,"[Action, Adventure, Science Fiction]","[chrisevans, hugoweaving, tommyleejones]",2011


# Collaborative based Filtering Recommendations System
##### Collaborative Filtering is based on the idea that users similar to a me can be used to predict how much I will like a particular product or service those users have used/experienced but I have not.
##### User based filtering 
###### - These systems recommend products to a user that similar users have liked. For measuring the similarity between two users we can either use pearson correlation or cosine similarity.
##### Item Based Collaborative Filtering 
###### - Instead of measuring the similarity between users, the item-based CF recommends items based on their similarity with the items that the target user rated. Likewise, the similarity can be computed with Pearson Correlation or Cosine Similarity.

#### We will use Singular Value Decomposition(SVD) library to minimise RMSE for recommendations.

In [55]:
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate

reader = Reader()

In [58]:
ratings = pd.read_csv(r'./ratings_small.csv')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [61]:
# convert Pandas dataframe to surpise dataset for recommendations.
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

In [62]:
svd = SVD()
cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8957  0.8946  0.9008  0.8929  0.8981  0.8964  0.0028  
MAE (testset)     0.6879  0.6876  0.6943  0.6879  0.6917  0.6899  0.0027  
Fit time          0.31    0.31    0.31    0.30    0.31    0.31    0.00    
Test time         0.04    0.04    0.04    0.09    0.04    0.05    0.02    


{'test_rmse': array([0.8957411 , 0.89456811, 0.90078144, 0.89288433, 0.8981465 ]),
 'test_mae': array([0.68785822, 0.68761692, 0.69431682, 0.68791257, 0.69169849]),
 'fit_time': (0.3056979179382324,
  0.3077559471130371,
  0.3138549327850342,
  0.3020479679107666,
  0.3121800422668457),
 'test_time': (0.04407620429992676,
  0.042482852935791016,
  0.04102301597595215,
  0.09145498275756836,
  0.040960073471069336)}

In [66]:
# data.build_full_trainset() converts the dataset(data) into a Surprise Trainset object
# svd.fit(trainset) : trains the SVD model on the training data.
trainset = data.build_full_trainset()
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x106b81730>

In [67]:
# Lets check rating and reviews given by userid 100
ratings[ratings['userId'] == 100]

Unnamed: 0,userId,movieId,rating,timestamp
15273,100,1,4.0,854193977
15274,100,3,4.0,854194024
15275,100,6,3.0,854194023
15276,100,7,3.0,854194024
15277,100,25,4.0,854193977
15278,100,32,5.0,854193977
15279,100,52,3.0,854194056
15280,100,62,3.0,854193977
15281,100,86,3.0,854194208
15282,100,88,2.0,854194208


In [69]:
# This line predicts the rating that User 100 would give to Movie 302 using the trained SVD model.
svd.predict(100, 302)

Prediction(uid=100, iid=302, r_ui=None, est=3.228487870817355, details={'was_impossible': False})

# Hybrid Recommendation

#### We will build simple hybrod recommendation system that brings together both content and collaborative filtering based engine.
##### Input: USERID and MOVIE TITLE
##### Output: Similar movies sorted on the basis of expected ratings by that particular user.

In [71]:
id_map = pd.read_csv(r'./links_small.csv')
id_map = id_map[['movieId', 'tmdbId']].dropna()
id_map['tmdbId'] = id_map['tmdbId'].astype('int')
id_map.columns = ['movieId', 'id']
id_map = id_map.merge(small_df[['title', 'id']], on='id').set_index('title')

In [72]:
indices_map = id_map.set_index('id')

In [76]:
def hybrid(userId, title, cosine_sim):
    idx = indices[title]
    tmdbId = id_map.loc[title]['id']
    # print(idx)
    movie_id = id_map.loc[title]['movieId']
    
    sim_scores = list(enumerate(cosine_sim[int(idx)]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:51]
    movie_indices = [i[0] for i in sim_scores]
    
    movies = small_df.iloc[movie_indices][['title', 'genres', 'vote_count', 'vote_average', 'year', 'id']]
    movies['est_rating'] = movies['id'].apply(lambda x: svd.predict(userId, indices_map.loc[x]['movieId']).est)
    movies = movies.sort_values('est_rating', ascending=False)
    return movies.head(10)

In [96]:
hybrid(10, 'Journey 2: The Mysterious Island', cos_sim)

Unnamed: 0,title,genres,vote_count,vote_average,year,id,est_rating
958,The Empire Strikes Back,"[Adventure, Action, Science Fiction]",5998.0,8.2,1980,1891,4.193405
232,Star Wars,"[Adventure, Action, Science Fiction]",6778.0,8.1,1977,11,4.165456
8837,Ant-Man,"[Science Fiction, Action, Adventure]",6029.0,7.0,2015,102899,3.987863
8680,Guardians of the Galaxy,"[Action, Science Fiction, Adventure]",10014.0,7.9,2014,118340,3.947313
8590,Captain America: The Winter Soldier,"[Action, Adventure, Science Fiction]",5881.0,7.6,2014,100402,3.914631
6680,Next,"[Action, Science Fiction, Thriller]",880.0,5.9,2007,1738,3.862074
4194,Rollerball,"[Adventure, Action, Science Fiction]",115.0,6.0,1975,11484,3.804226
971,Return of the Jedi,"[Adventure, Action, Science Fiction]",4763.0,7.9,1983,1892,3.797341
8836,Avengers: Age of Ultron,"[Action, Adventure, Science Fiction]",6908.0,7.3,2015,99861,3.751558
8941,San Andreas,"[Action, Drama, Thriller]",3017.0,6.0,2015,254128,3.718257


In [97]:
hybrid(100, 'Avatar', cos_sim)

Unnamed: 0,title,genres,vote_count,vote_average,year,id,est_rating
999,The Terminator,"[Action, Thriller, Science Fiction]",4208.0,7.4,1984,218,3.801506
1229,The Fifth Element,"[Adventure, Fantasy, Action, Thriller, Science...",3962.0,7.3,1997,18,3.795875
1368,Titanic,"[Drama, Romance, Thriller]",7770.0,7.5,1997,597,3.712107
962,Aliens,"[Horror, Action, Thriller, Science Fiction]",3282.0,7.7,1986,679,3.687926
8357,Star Trek Into Darkness,"[Action, Adventure, Science Fiction]",4479.0,7.4,2013,54138,3.676456
2826,Predator,"[Science Fiction, Action, Adventure, Thriller]",2129.0,7.3,1987,106,3.646761
1613,Darby O'Gill and the Little People,"[Adventure, Fantasy, Science Fiction, Family]",35.0,6.7,1959,18887,3.642457
3983,Vampire Hunter D: Bloodlust,"[Action, Adventure, Animation, Fantasy, Horror...",92.0,7.0,2000,15999,3.613416
3173,The Time Machine,"[Thriller, Adventure, Fantasy, Science Fiction...",217.0,7.5,1960,2134,3.561398
1660,Return from Witch Mountain,"[Adventure, Fantasy, Science Fiction, Family]",38.0,5.6,1978,14822,3.5331


In [98]:
hybrid(1, 'Avatar', cos_sim)

Unnamed: 0,title,genres,vote_count,vote_average,year,id,est_rating
522,Terminator 2: Judgment Day,"[Action, Thriller, Science Fiction]",4274.0,7.7,1991,280,3.319828
999,The Terminator,"[Action, Thriller, Science Fiction]",4208.0,7.4,1984,218,3.134059
962,Aliens,"[Horror, Action, Thriller, Science Fiction]",3282.0,7.7,1986,679,3.113002
8357,Star Trek Into Darkness,"[Action, Adventure, Science Fiction]",4479.0,7.4,2013,54138,2.955264
344,True Lies,"[Action, Thriller]",1138.0,6.8,1994,36955,2.911642
1660,Return from Witch Mountain,"[Adventure, Fantasy, Science Fiction, Family]",38.0,5.6,1978,14822,2.907418
8984,Suicide Squad,"[Action, Adventure, Crime, Fantasy, Science Fi...",7717.0,5.9,2016,297761,2.878399
8401,Justice League: The Flashpoint Paradox,"[Fantasy, Science Fiction, Animation, Action, ...",458.0,7.3,2013,183011,2.87525
910,The Abyss,"[Adventure, Action, Thriller, Science Fiction]",822.0,7.1,1989,2756,2.865143
7055,Mutant Chronicles,"[Action, Adventure, Horror, Science Fiction]",142.0,5.1,2008,13256,2.863256
