In [1]:
# from surprise import Reader, Dataset, SVD, evaluate

import warnings; warnings.simplefilter('ignore')

In [2]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet


In [3]:
from surprise import Reader, Dataset, SVD, evaluate


In [4]:
import os
os.listdir('data/movies_dataset/')

['keywords.csv',
 'movies_metadata.csv',
 'ratings.csv',
 'links.csv',
 'credits.csv',
 'links_small.csv',
 'ratings_small.csv']

# popularity based

In [5]:
meta_data = pd.read_csv('data/movies_dataset/movies_metadata.csv')
meta_data.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [6]:
meta_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 24 columns):
adult                    45466 non-null object
belongs_to_collection    4494 non-null object
budget                   45466 non-null object
genres                   45466 non-null object
homepage                 7782 non-null object
id                       45466 non-null object
imdb_id                  45449 non-null object
original_language        45455 non-null object
original_title           45466 non-null object
overview                 44512 non-null object
popularity               45461 non-null object
poster_path              45080 non-null object
production_companies     45463 non-null object
production_countries     45463 non-null object
release_date             45379 non-null object
revenue                  45460 non-null float64
runtime                  45203 non-null float64
spoken_languages         45460 non-null object
status                   45379 non-null objec

In [7]:
meta_data['genres'][0]

"[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 'Family'}]"

In [10]:
for i in meta_data['genres']:
    if i == isinstance(i,list):
        print('true')

In [8]:
meta_data['genres'] = meta_data['genres'].fillna('[]').apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

In [15]:
# for i in range(meta_data.shape[0]):
#     if meta_data['genres'][i] == meta_data['genres1'][i]:
#         print(i)

In [9]:
meta_data.shape

(45466, 24)

In [10]:
vote_counts = meta_data[meta_data['vote_count'].notnull()]['vote_count'].astype('int')
meta_data[meta_data['vote_count'].notnull()]['vote_count'][:5]


0    5415.0
1    2413.0
2      92.0
3      34.0
4     173.0
Name: vote_count, dtype: float64

In [11]:
vote_counts[:5]

0    5415
1    2413
2      92
3      34
4     173
Name: vote_count, dtype: int64

In [12]:
vote_avgs = meta_data[meta_data['vote_average'].notnull()]['vote_average'].astype('int')
meta_data[meta_data['vote_average'].notnull()]['vote_average'][:5]


0    7.7
1    6.9
2    6.5
3    6.1
4    5.7
Name: vote_average, dtype: float64

In [13]:
vote_avgs[:5]

0    7
1    6
2    6
3    6
4    5
Name: vote_average, dtype: int64

### Weighted Rating (WR) = ((v/v+m) .R)+((m/v+m) .C)

where,

    v is the number of votes for the movie
    m is the minimum votes required to be listed in the chart
    R is the average rating of the movie
    C is the mean vote across the whole report


In [14]:
C = vote_avgs.mean()
m = vote_counts.quantile(0.95)

In [15]:
# import gc
# md = meta_data.copy
# del md
# gc.collect()

In [17]:
meta_data['year'] = pd.to_datetime(meta_data['release_date'], errors='coerce').apply(lambda x: str(x).split('-')[0] if x != np.nan else np.nan)

In [36]:
meta_data['year'] = pd.to_datetime(meta_data['release_date'], 
                                   errors='coerce').apply(lambda x: str(x).split('-'))

In [38]:
meta_data['year'][:5]

0    [1995, 10, 30 00:00:00]
1    [1995, 12, 15 00:00:00]
2    [1995, 12, 22 00:00:00]
3    [1995, 12, 22 00:00:00]
4    [1995, 02, 10 00:00:00]
Name: year, dtype: object

In [40]:
meta_data['release_date'][:5]

0    1995-10-30
1    1995-12-15
2    1995-12-22
3    1995-12-22
4    1995-02-10
Name: release_date, dtype: object

In [45]:
meta_data['year'] = pd.to_datetime(meta_data['release_date'], 
                                   errors='coerce').apply(lambda x: str(x).split('-')[0] if x != np.nan else np.nan)

In [18]:
meta_data['year'][:5]

0    1995
1    1995
2    1995
3    1995
4    1995
Name: year, dtype: object

In [19]:

qualified = meta_data[(meta_data['vote_count'] >= m) & (meta_data['vote_count'].notnull()) & (meta_data['vote_average'].notnull())][['title', 'year', 'vote_count', 'vote_average', 'popularity', 'genres']]
qualified['vote_count'] = qualified['vote_count'].astype('int')
qualified['vote_average'] = qualified['vote_average'].astype('int')
qualified.head()


Unnamed: 0,title,year,vote_count,vote_average,popularity,genres
0,Toy Story,1995,5415,7,21.9469,"[Animation, Comedy, Family]"
1,Jumanji,1995,2413,6,17.0155,"[Adventure, Fantasy, Family]"
5,Heat,1995,1886,7,17.9249,"[Action, Crime, Drama, Thriller]"
9,GoldenEye,1995,1194,6,14.686,"[Adventure, Action, Thriller]"
15,Casino,1995,1343,7,10.1374,"[Drama, Crime]"


In [20]:
qualified.shape

(2274, 6)

In [21]:
def weighted_rating(x):
    v = x['vote_count']
    R = x['vote_average']
    return (v/(v+m) * R) + (m/(m+v) * C)

qualified['wr'] = qualified.apply(weighted_rating, axis=1)


In [22]:
qualified.head()

Unnamed: 0,title,year,vote_count,vote_average,popularity,genres,wr
0,Toy Story,1995,5415,7,21.9469,"[Animation, Comedy, Family]",6.86977
1,Jumanji,1995,2413,6,17.0155,"[Adventure, Fantasy, Family]",5.884891
5,Heat,1995,1886,7,17.9249,"[Action, Crime, Drama, Thriller]",6.671675
9,GoldenEye,1995,1194,6,14.686,"[Adventure, Action, Thriller]",5.798701
15,Casino,1995,1343,7,10.1374,"[Drama, Crime]",6.571348


In [23]:
qualified = qualified.sort_values('wr', ascending=False).head(250)


In [24]:
qualified.head()

Unnamed: 0,title,year,vote_count,vote_average,popularity,genres,wr
15480,Inception,2010,14075,8,29.1081,"[Action, Thriller, Science Fiction, Mystery, A...",7.917588
12481,The Dark Knight,2008,12269,8,123.167,"[Drama, Action, Crime, Thriller]",7.905871
22879,Interstellar,2014,11187,8,32.2135,"[Adventure, Drama, Science Fiction]",7.897107
2843,Fight Club,1999,9678,8,63.8696,[Drama],7.881753
4863,The Lord of the Rings: The Fellowship of the Ring,2001,8892,8,32.0707,"[Adventure, Fantasy, Action]",7.871787


In [55]:
meta_data.apply(lambda x: pd.Series(x['genres']),axis=1).stack().reset_index(level=1, drop=True)


0        [{'id': 16, 'name': 'Animation'}, {'id': 35, '...
1        [{'id': 12, 'name': 'Adventure'}, {'id': 14, '...
2        [{'id': 10749, 'name': 'Romance'}, {'id': 35, ...
3        [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...
4                           [{'id': 35, 'name': 'Comedy'}]
5        [{'id': 28, 'name': 'Action'}, {'id': 80, 'nam...
6        [{'id': 35, 'name': 'Comedy'}, {'id': 10749, '...
7        [{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...
8        [{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...
9        [{'id': 12, 'name': 'Adventure'}, {'id': 28, '...
10       [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...
11       [{'id': 35, 'name': 'Comedy'}, {'id': 27, 'nam...
12       [{'id': 10751, 'name': 'Family'}, {'id': 16, '...
13       [{'id': 36, 'name': 'History'}, {'id': 18, 'na...
14       [{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...
15       [{'id': 18, 'name': 'Drama'}, {'id': 80, 'name...
16       [{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n.

In [25]:
s = meta_data.apply(lambda x: pd.Series(x['genres']),axis=1).stack().reset_index(level=1, drop=True)
s.name = 'genre'
s.head()
gen_meta_data = meta_data.drop('genres',axis=1).join(s)

In [61]:
s[0]

"[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 'Family'}]"

In [66]:
meta_data['genres'][3]

"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'name': 'Drama'}, {'id': 10749, 'name': 'Romance'}]"

In [67]:
gen_meta_data = meta_data.drop('genres',axis=1).join(s)

In [70]:
gen_meta_data.head()

Unnamed: 0,adult,belongs_to_collection,budget,homepage,id,imdb_id,original_language,original_title,overview,popularity,...,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,year,genre
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.9469,...,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0,1995,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '..."
1,False,,65000000,,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.0155,...,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,1995,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '..."
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,11.7129,...,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0,1995,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ..."
3,False,,16000000,,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",3.85949,...,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0,1995,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam..."
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,8.38752,...,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0,1995,"[{'id': 35, 'name': 'Comedy'}]"


In [71]:
meta_data.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,year
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0,1995
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,1995
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0,1995
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0,1995
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0,1995


### x['vote_count'] / (x['vote_count']+m) * x['vote_average']) + (m/(m+x['vote_count']) * C)

In [26]:

def build_chart(gen_md, genre, percentile=0.85):
    df = gen_md[gen_md['genre'] == genre]
    vote_counts = df[df['vote_count'].notnull()]['vote_count'].astype('int')
    vote_averages = df[df['vote_average'].notnull()]['vote_average'].astype('int')
    C = vote_averages.mean()
    m = vote_counts.quantile(percentile)
    
    qualified = df[(df['vote_count'] >= m) & (df['vote_count'].notnull()) & (df['vote_average'].notnull())][['title', 'year', 'vote_count', 'vote_average', 'popularity']]
    qualified['vote_count'] = qualified['vote_count'].astype('int')
    qualified['vote_average'] = qualified['vote_average'].astype('int')
    
    qualified['wr'] = qualified.apply(lambda x: (x['vote_count']/(x['vote_count']+m) * x['vote_average']) + (m/(m+x['vote_count']) * C), axis=1)
    qualified = qualified.sort_values('wr', ascending=False).head(250)
    
    return qualified


In [73]:
meta_data.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count', 'year'],
      dtype='object')

In [27]:
gen_meta_data[gen_meta_data['genre'] == 'Animation']

Unnamed: 0,adult,belongs_to_collection,budget,homepage,id,imdb_id,original_language,original_title,overview,popularity,...,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,year,genre
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.9469,...,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0,1995,Animation
12,False,"{'id': 117693, 'name': 'Balto Collection', 'po...",0,,21032,tt0112453,en,Balto,An outcast half-wolf risks his life to prevent...,12.1407,...,78.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Part Dog. Part Wolf. All Hero.,Balto,False,7.1,423.0,1995,Animation
47,False,"{'id': 136214, 'name': 'Pocahontas Collection'...",55000000,,10530,tt0114148,en,Pocahontas,History comes gloriously to life in Disney's e...,13.2801,...,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,An American legend comes to life.,Pocahontas,False,6.7,1509.0,1995,Animation
235,False,"{'id': 410261, 'name': 'A Goofy Movie Collecti...",0,,15789,tt0113198,en,A Goofy Movie,"Though Goofy always means well, his amiable cl...",10.178,...,78.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,It's the story of a father who couldn't be clo...,A Goofy Movie,False,6.7,404.0,1995,Animation
240,False,,0,,43475,tt0113234,en,Gumby: The Movie,The band is back together! Gumby reunites with...,0.090452,...,77.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The original green hero!,Gumby: The Movie,False,5.0,2.0,1995,Animation
309,False,"{'id': 144200, 'name': 'Swan Princess Series',...",35000000,http://www.sonypictures.com/movies/theswanprin...,22586,tt0111333,en,The Swan Princess,The beautiful princess Odette is transformed i...,8.91046,...,89.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,An enchanting classic destined to capture your...,The Swan Princess,False,6.5,251.0,1994,Animation
359,False,"{'id': 94032, 'name': 'The Lion King Collectio...",45000000,http://movies.disney.com/the-lion-king,8587,tt0110357,en,The Lion King,A young lion cub named Simba can't wait to be ...,21.6058,...,89.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Life's greatest adventure is finding your plac...,The Lion King,False,8.0,5520.0,1994,Animation
387,False,,0,,18242,tt0108069,en,The Secret Adventures of Tom Thumb,A boy born the size of a small doll is kidnapp...,0.381704,...,61.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,A nursery crime of epic proportions...,The Secret Adventures of Tom Thumb,False,7.1,8.0,1993,Animation
546,False,,18000000,,9479,tt0107688,en,The Nightmare Before Christmas,Tired of scaring humans every October 31 with ...,17.7309,...,76.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,A ghoulish tale with wicked humor & stunning a...,The Nightmare Before Christmas,False,7.6,2135.0,1993,Animation
552,False,,27000000,,15139,tt0110763,en,The Pagemaster,"Tyler knows a lot about accidents. So much so,...",7.436,...,80.0,"[{'iso_639_1': 'da', 'name': 'Dansk'}, {'iso_6...",Released,All The Adventure Your Imagination Can Hold.,The Pagemaster,False,6.2,178.0,1994,Animation


In [28]:
gen_meta_data[gen_meta_data['genre'] == 'Action']

Unnamed: 0,adult,belongs_to_collection,budget,homepage,id,imdb_id,original_language,original_title,overview,popularity,...,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,year,genre
5,False,,60000000,,949,tt0113277,en,Heat,"Obsessive master thief, Neil McCauley leads a ...",17.9249,...,170.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,A Los Angeles Crime Saga,Heat,False,7.7,1886.0,1995,Action
7,False,,0,,45325,tt0112302,en,Tom and Huck,"A mischievous young boy, Tom Sawyer, witnesses...",2.56116,...,97.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,The Original Bad Boys.,Tom and Huck,False,5.4,45.0,1995,Action
8,False,,35000000,,9091,tt0114576,en,Sudden Death,International action superstar Jean Claude Van...,5.23158,...,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Terror goes into overtime.,Sudden Death,False,5.5,174.0,1995,Action
9,False,"{'id': 645, 'name': 'James Bond Collection', '...",58000000,http://www.mgm.com/view/movie/757/Goldeneye/,710,tt0113189,en,GoldenEye,James Bond must unmask the mysterious head of ...,14.686,...,130.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,No limits. No fears. No substitutes.,GoldenEye,False,6.6,1194.0,1995,Action
14,False,,98000000,,1408,tt0112760,en,Cutthroat Island,"Morgan Adams and her slave, William Shaw, are ...",7.28448,...,119.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,The Course Has Been Set. There Is No Turning B...,Cutthroat Island,False,5.7,137.0,1995,Action
19,False,,60000000,,11517,tt0113845,en,Money Train,A vengeful New York transit cop decides to ste...,7.33791,...,103.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,"Get on, or GET OUT THE WAY!",Money Train,False,5.4,224.0,1995,Action
22,False,,50000000,,9691,tt0112401,en,Assassins,Assassin Robert Rath arrives at a funeral to k...,11.0659,...,132.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,"In the shadows of life, In the business of dea...",Assassins,False,6.0,394.0,1995,Action
41,False,,10000000,,11443,tt0112819,en,Dead Presidents,"Depicts a heist of old bills, retired from cir...",9.87957,...,119.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,"In this daring heist, the only color that coun...",Dead Presidents,False,6.6,80.0,1995,Action
43,False,"{'id': 9818, 'name': 'Mortal Kombat Collection...",18000000,,9312,tt0113855,en,Mortal Kombat,For nine generations an evil sorcerer has been...,10.8701,...,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Nothing In This World Has Prepared You For This.,Mortal Kombat,False,5.4,452.0,1995,Action
50,False,,0,,117164,tt0109950,en,Guardian Angel,Detective - turned - bodyguard Cynthia McKay (...,0.595949,...,93.0,[],Released,She's no angel of mercy.,Guardian Angel,False,6.3,3.0,1994,Action


In [29]:
build_chart(gen_meta_data,'Romance').head(10)

Unnamed: 0,title,year,vote_count,vote_average,popularity,wr
10309,Dilwale Dulhania Le Jayenge,1995,661,9,34.457,8.565285
351,Forrest Gump,1994,8147,8,48.3072,7.971357
876,Vertigo,1958,1162,8,18.2082,7.811667
40251,Your Name.,2016,1030,8,34.461252,7.789489
883,Some Like It Hot,1959,835,8,11.8451,7.745154
1132,Cinema Paradiso,1988,834,8,14.177,7.744878
19901,Paperman,2012,734,8,7.19863,7.713951
37863,Sing Street,2016,669,8,10.672862,7.689483
882,The Apartment,1960,498,8,11.9943,7.599317
38718,The Handmaiden,2016,453,8,16.727405,7.566166


In [30]:
build_chart(gen_meta_data,'Adventure').head(10)

Unnamed: 0,title,year,vote_count,vote_average,popularity,wr
15480,Inception,2010,14075,8,29.1081,7.906526
22879,Interstellar,2014,11187,8,32.2135,7.883426
4863,The Lord of the Rings: The Fellowship of the Ring,2001,8892,8,32.0707,7.854939
7000,The Lord of the Rings: The Return of the King,2003,8226,8,29.3244,7.843867
5814,The Lord of the Rings: The Two Towers,2002,7641,8,29.4235,7.832647
256,Star Wars,1977,6778,8,42.1497,7.812801
1225,Back to the Future,1985,6239,8,25.7785,7.797828
1154,The Empire Strikes Back,1980,5998,8,19.471,7.790329
5481,Spirited Away,2001,3968,8,41.0489,7.695056
9698,Howl's Moving Castle,2004,2049,8,16.136,7.465435


In [80]:
# build_chart('Romance').head(15)

## Content Based 

In [82]:
meta_data.shape

(45466, 25)

## i guess there are three NaN rows(mostly), so remove it

In [31]:
meta_data.iloc[[19730,29503,35587,35587+1,-10]]

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,year
19730,- Written by Ørnås,0.065736,/ff9qCepilowshEtG2GYWwzt2bs4.jpg,"[Carousel Productions, Vision View Entertainme...","[{'iso_3166_1': 'CA', 'name': 'Canada'}, {'iso...",1997-08-20,0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,...,,,,,,,,,,NaT
29503,Rune Balot goes to a casino connected to the ...,1.931659,/zV8bHuSL6WXoD6FWogP9j4x80bL.jpg,"[Aniplex, GoHands, BROSTA TV, Mardock Scramble...","[{'iso_3166_1': 'US', 'name': 'United States o...",2012-09-29,0,68.0,"[{'iso_639_1': 'ja', 'name': '日本語'}]",Released,...,,,,,,,,,,NaT
35587,Avalanche Sharks tells the story of a bikini ...,2.185485,/zaSf5OG7V8X8gqFvly88zDdRm46.jpg,"[Odyssey Media, Pulser Productions, Rogue Stat...","[{'iso_3166_1': 'CA', 'name': 'Canada'}]",2014-01-01,0,82.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,...,,,,,,,,,,NaT
35588,False,,0,"[War, Drama]",,151911,tt0029924,en,Blockade,A simple peasant is forced to take up arms to ...,...,0.0,85.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Romance under Fire!,Blockade,False,0.0,0.0,1938
45456,False,,0,"[Horror, Mystery, Thriller]",,84419,tt0038621,en,House of Horrors,An unsuccessful sculptor saves a madman named ...,...,0.0,65.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Meet...The CREEPER!,House of Horrors,False,6.3,8.0,1946


In [33]:
os.listdir('data/movies_dataset')

['keywords.csv',
 'movies_metadata.csv',
 'ratings.csv',
 'links.csv',
 'credits.csv',
 'links_small.csv',
 'ratings_small.csv']

In [41]:
import gc
del links_small
gc.collect()

146

In [45]:

links_small = pd.read_csv('data/movies_dataset/links_small.csv')
links_small.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [35]:
meta_data2 = meta_data.copy()
meta_data2.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,year
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[Animation, Comedy, Family]",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0,1995
1,False,,65000000,"[Adventure, Fantasy, Family]",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,1995
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[Romance, Comedy]",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0,1995
3,False,,16000000,"[Comedy, Drama, Romance]",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0,1995
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,[Comedy],,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0,1995


In [36]:
meta_data.shape, meta_data2.shape

((45466, 25), (45466, 25))

In [37]:
gen_meta_data.to_csv('gen_meta_data_first_model.csv')

In [49]:
links_small[links_small['tmdbId'].notnull()]['tmdbId'].astype('int')[:10]

0      862
1     8844
2    15602
3    31357
4    11862
5      949
6    11860
7    45325
8     9091
9      710
Name: tmdbId, dtype: int64

In [51]:
links_small.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [52]:
links_small = links_small[links_small['tmdbId'].notnull()]['tmdbId'].astype('int')

In [53]:
meta_data2 = meta_data2.drop([19730, 29503, 35587])


In [55]:
meta_data2['id'] = meta_data2['id'].astype('int')
small_meta_data = meta_data2[meta_data2['id'].isin(links_small)]
small_meta_data.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,year
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[Animation, Comedy, Family]",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0,1995
1,False,,65000000,"[Adventure, Fantasy, Family]",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,1995
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[Romance, Comedy]",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0,1995
3,False,,16000000,"[Comedy, Drama, Romance]",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0,1995
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,[Comedy],,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0,1995


In [56]:
meta_data2.shape, small_meta_data.shape

((45463, 25), (9099, 25))

In [60]:
small_meta_data[small_meta_data['genres'] == 'Adventure']

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,year


In [87]:
small_meta_data['genre'][:5]#.apply(split(','))

0     [Animation, Comedy, Family]
1    [Adventure, Fantasy, Family]
2               [Romance, Comedy]
3        [Comedy, Drama, Romance]
4                        [Comedy]
Name: genre, dtype: object

In [67]:
# gapminder.rename(columns={'pop':'population',
#                           'lifeExp':'life_exp',
#                           'gdpPercap':'gdp_per_cap'}, 
#                  inplace=True)
# df.rename(index=str, columns={"A": "a", "B": "c"})
small_meta_data.rename(columns={'genres':'genre'}, inplace=True)

## Movie Description Based 

In [89]:
small_meta_data['tagline'] = small_meta_data['tagline'].fillna('')
small_meta_data['description'] = small_meta_data['overview'] + small_meta_data['tagline']
small_meta_data['description'] = small_meta_data['description'].fillna('')

In [90]:
small_meta_data.head()

Unnamed: 0,adult,belongs_to_collection,budget,genre,homepage,id,imdb_id,original_language,original_title,overview,...,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,year,description
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[Animation, Comedy, Family]",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0,1995,"Led by Woody, Andy's toys live happily in his ..."
1,False,,65000000,"[Adventure, Fantasy, Family]",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,1995,When siblings Judy and Peter discover an encha...
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[Romance, Comedy]",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0,1995,A family wedding reignites the ancient feud be...
3,False,,16000000,"[Comedy, Drama, Romance]",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0,1995,"Cheated on, mistreated and stepped on, the wom..."
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,[Comedy],,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0,1995,Just when George Banks has recovered from his ...


In [91]:
# ngram_range ==>  to caculate the appropriate the freq of each word
# min_df      ==>  threshold to ignore the range 
# analyzer    ==>  consider words to vectorize (not character wise)
tf = TfidfVectorizer(analyzer='word', ngram_range=(1,2), min_df=0,stop_words='english')
tfidf_matrix = tf.fit_transform(small_meta_data['description'])

In [93]:
small_meta_data['description'][:5]

0    Led by Woody, Andy's toys live happily in his ...
1    When siblings Judy and Peter discover an encha...
2    A family wedding reignites the ancient feud be...
3    Cheated on, mistreated and stepped on, the wom...
4    Just when George Banks has recovered from his ...
Name: description, dtype: object

In [94]:
tfidf_matrix

<9099x268124 sparse matrix of type '<class 'numpy.float64'>'
	with 540591 stored elements in Compressed Sparse Row format>

In [95]:
small_meta_data.shape

(9099, 26)

ince we have used the TF-IDF Vectorizer, calculating the Dot Product will directly give us the Cosine Similarity Score. Therefore, we will use sklearn's linear_kernel instead of cosine_similarities since it is much faster.

In [99]:
# np.dot(tfidf_matrix,tfidf_matrix.T)

<9099x9099 sparse matrix of type '<class 'numpy.float64'>'
	with 28022309 stored elements in Compressed Sparse Row format>

In [100]:
cosine_sim = linear_kernel(tfidf_matrix,tfidf_matrix)

In [101]:
cosine_sim[0]

array([1.        , 0.00680476, 0.        , ..., 0.        , 0.00344913,
       0.        ])

In [102]:
cosine_sim.shape

(9099, 9099)

In [105]:
small_meta_data = small_meta_data.reset_index()
titles = small_meta_data['title']
titles.head()

0                      Toy Story
1                        Jumanji
2               Grumpier Old Men
3              Waiting to Exhale
4    Father of the Bride Part II
Name: title, dtype: object

In [106]:
indices = pd.Series(small_meta_data.index, index=small_meta_data['title'])
indices.head()

title
Toy Story                      0
Jumanji                        1
Grumpier Old Men               2
Waiting to Exhale              3
Father of the Bride Part II    4
dtype: int64

In [114]:
len(list(enumerate(cosine_sim[1]))),list(enumerate(cosine_sim[1]))[:10]

(9099,
 [(0, 0.0068047556717484225),
  (1, 1.000000000000001),
  (2, 0.01531062029116973),
  (3, 0.0),
  (4, 0.0022368099402477037),
  (5, 0.014747265950384274),
  (6, 0.0),
  (7, 0.0),
  (8, 0.0331370513549892),
  (9, 0.0)])

In [116]:
sorted(list(enumerate(cosine_sim[1])), key=lambda x:x[1], reverse=False)[:10]

[(3, 0.0),
 (6, 0.0),
 (7, 0.0),
 (9, 0.0),
 (11, 0.0),
 (12, 0.0),
 (15, 0.0),
 (16, 0.0),
 (18, 0.0),
 (19, 0.0)]

In [235]:
sorted(list(enumerate(cosine_sim[1])), key=lambda x:x[1], reverse=False)[:10]

[(2, 0.0),
 (3, 0.0),
 (4, 0.0),
 (5, 0.0),
 (6, 0.0),
 (10, 0.0),
 (11, 0.0),
 (13, 0.0),
 (15, 0.0),
 (16, 0.0)]

In [117]:
def get_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:31]
    movie_indices = [i[0] for i in sim_scores]
    return titles.iloc[movie_indices]

In [118]:
get_recommendations('The Godfather').head(10)

973      The Godfather: Part II
8387                 The Family
3509                       Made
4196         Johnny Dangerously
29               Shanghai Triad
5667                       Fury
2412             American Movie
1582    The Godfather: Part III
4221                    8 Women
2159              Summer of Sam
Name: title, dtype: object

In [119]:
get_recommendations('The Dark Knight').head(10)


7931                      The Dark Knight Rises
132                              Batman Forever
1113                             Batman Returns
8227    Batman: The Dark Knight Returns, Part 2
7565                 Batman: Under the Red Hood
524                                      Batman
7901                           Batman: Year One
2579               Batman: Mask of the Phantasm
2696                                        JFK
8165    Batman: The Dark Knight Returns, Part 1
Name: title, dtype: object

### Now, we will build a more sophisticated recommender that takes genre, keywords, cast and crew into consideration.

In [122]:
credits = pd.read_csv('data/movies_dataset/credits.csv')
keywords = pd.read_csv('data/movies_dataset/keywords.csv')

keywords['id'] = keywords['id'].astype('int')
credits['id'] = credits['id'].astype('int')


In [125]:
meta_data2.shape

(45463, 25)

In [126]:
meta_data2['id'] = meta_data2['id'].astype('int')
meta_data2.shape

(45463, 25)

In [127]:
meta_data2 = meta_data2.merge(credits, on='id')
meta_data2 = meta_data2.merge(keywords, on='id')

In [128]:
small_meta_data = meta_data2[meta_data2['id'].isin(links_small)]
small_meta_data.shape

(9219, 28)

In [131]:
small_meta_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9219 entries, 0 to 41669
Data columns (total 28 columns):
adult                    9219 non-null object
belongs_to_collection    1688 non-null object
budget                   9219 non-null object
genres                   9219 non-null object
homepage                 2001 non-null object
id                       9219 non-null int64
imdb_id                  9219 non-null object
original_language        9219 non-null object
original_title           9219 non-null object
overview                 9207 non-null object
popularity               9219 non-null object
poster_path              9216 non-null object
production_companies     9219 non-null object
production_countries     9219 non-null object
release_date             9219 non-null object
revenue                  9219 non-null float64
runtime                  9219 non-null float64
spoken_languages         9219 non-null object
status                   9217 non-null object
tagline          

### Crew: From the crew, we will only pick the director as our feature since the others don't contribute that much to the feel of the movie.
### Cast: Choosing Cast is a little more tricky. Lesser known actors and minor roles do not really affect people's opinion of a movie. Therefore, we must only select the major characters and their respective actors. Arbitrarily we will choose the top 3 actors that appear in the credits list

In [135]:
small_meta_data['cast'] = small_meta_data['cast'].apply(literal_eval)
small_meta_data['crew'] = small_meta_data['crew'].apply(literal_eval)
small_meta_data['keywords'] = small_meta_data['keywords'].apply(literal_eval)

small_meta_data['cast_size'] = small_meta_data['cast'].apply(lambda x: len(x))
small_meta_data['crew_size'] = small_meta_data['crew'].apply(lambda x: len(x))



In [136]:
def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan

small_meta_data['director'] = small_meta_data['crew'].apply(get_director)


In [138]:
small_meta_data['director'].head()

0      John Lasseter
1       Joe Johnston
2      Howard Deutch
3    Forest Whitaker
4      Charles Shyer
Name: director, dtype: object

In [143]:
small_meta_data['cast'].apply(lambda x: [i['name'] for i in x]).head()

0    [Tom Hanks, Tim Allen, Don Rickles, Jim Varney...
1    [Robin Williams, Jonathan Hyde, Kirsten Dunst,...
2    [Walter Matthau, Jack Lemmon, Ann-Margret, Sop...
3    [Whitney Houston, Angela Bassett, Loretta Devi...
4    [Steve Martin, Diane Keaton, Martin Short, Kim...
Name: cast, dtype: object

In [146]:
small_meta_data['cast'] = small_meta_data['cast'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
small_meta_data['cast'] = small_meta_data['cast'].apply(lambda x: x[:3] if len(x) >=3 else x)

small_meta_data['keywords'] = small_meta_data['keywords'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])


In [149]:
small_meta_data['cast'][:5]

0                  [Tom Hanks, Tim Allen, Don Rickles]
1       [Robin Williams, Jonathan Hyde, Kirsten Dunst]
2           [Walter Matthau, Jack Lemmon, Ann-Margret]
3    [Whitney Houston, Angela Bassett, Loretta Devine]
4           [Steve Martin, Diane Keaton, Martin Short]
Name: cast, dtype: object



### These are steps I follow in the preparation of my genres and credits data:

1. Strip Spaces and Convert to Lowercase from all our features. This way, our engine will not confuse between Johnny Depp and Johnny Galecki.
2. Mention Director 3 times to give it more weight relative to the entire cast.


In [150]:
small_meta_data['cast'] = small_meta_data['cast'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])

small_meta_data['director'] = small_meta_data['director'].astype('str').apply(lambda x: str.lower(x.replace(" ", "")))
small_meta_data['director'] = small_meta_data['director'].apply(lambda x: [x,x,x])



In [153]:
small_meta_data['director'].head()

0          [johnlasseter, johnlasseter, johnlasseter]
1             [joejohnston, joejohnston, joejohnston]
2          [howarddeutch, howarddeutch, howarddeutch]
3    [forestwhitaker, forestwhitaker, forestwhitaker]
4          [charlesshyer, charlesshyer, charlesshyer]
Name: director, dtype: object

In [154]:
small_meta_data['cast'].head()

0                  [tomhanks, timallen, donrickles]
1       [robinwilliams, jonathanhyde, kirstendunst]
2          [waltermatthau, jacklemmon, ann-margret]
3    [whitneyhouston, angelabassett, lorettadevine]
4           [stevemartin, dianekeaton, martinshort]
Name: cast, dtype: object

In [161]:
small_meta_data['keywords'].head()

0    [jealousy, toy, boy, friendship, friends, riva...
1    [board game, disappearance, based on children'...
2    [fishing, best friend, duringcreditsstinger, o...
3    [based on novel, interracial relationship, sin...
4    [baby, midlife crisis, confidence, aging, daug...
Name: keywords, dtype: object

In [165]:
temp = small_meta_data[:10]

In [166]:
temp.apply(lambda x: pd.Series(x['keywords']),axis=1)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
0,jealousy,toy,boy,friendship,friends,rivalry,boy next door,new toy,toy comes to life,,...,,,,,,,,,,
1,board game,disappearance,based on children's book,new home,recluse,giant insect,,,,,...,,,,,,,,,,
2,fishing,best friend,duringcreditsstinger,old men,,,,,,,...,,,,,,,,,,
3,based on novel,interracial relationship,single mother,divorce,chick flick,,,,,,...,,,,,,,,,,
4,baby,midlife crisis,confidence,aging,daughter,mother daughter relationship,pregnancy,contraception,gynecologist,,...,,,,,,,,,,
5,robbery,detective,bank,obsession,chase,shooting,thief,honor,murder,suspense,...,criminal mastermind,cult film,ex-con,heist movie,one last job,loner,bank job,neo-noir,gun fight,crime epic
6,paris,brother brother relationship,chauffeur,long island,fusion,millionaire,,,,,...,,,,,,,,,,
7,,,,,,,,,,,...,,,,,,,,,,
8,terrorist,hostage,explosive,vice president,,,,,,,...,,,,,,,,,,
9,cuba,falsely accused,secret identity,computer virus,secret base,secret intelligence service,kgb,satellite,special car,cossack,...,,,,,,,,,,


In [167]:
temp.apply(lambda x: pd.Series(x['keywords']),axis=1).stack()

0  0                         jealousy
   1                              toy
   2                              boy
   3                       friendship
   4                          friends
   5                          rivalry
   6                    boy next door
   7                          new toy
   8                toy comes to life
1  0                       board game
   1                    disappearance
   2         based on children's book
   3                         new home
   4                          recluse
   5                     giant insect
2  0                          fishing
   1                      best friend
   2             duringcreditsstinger
   3                          old men
3  0                   based on novel
   1         interracial relationship
   2                    single mother
   3                          divorce
   4                      chick flick
4  0                             baby
   1                   midlife crisis
   2        

In [168]:
temp.apply(lambda x: pd.Series(x['keywords']),axis=1).stack().reset_index()

Unnamed: 0,level_0,level_1,0
0,0,0,jealousy
1,0,1,toy
2,0,2,boy
3,0,3,friendship
4,0,4,friends
5,0,5,rivalry
6,0,6,boy next door
7,0,7,new toy
8,0,8,toy comes to life
9,1,0,board game


In [169]:
temp.apply(lambda x: pd.Series(x['keywords']),axis=1).stack().reset_index(level=1)

Unnamed: 0,level_1,0
0,0,jealousy
0,1,toy
0,2,boy
0,3,friendship
0,4,friends
0,5,rivalry
0,6,boy next door
0,7,new toy
0,8,toy comes to life
1,0,board game


In [170]:
temp.apply(lambda x: pd.Series(x['keywords']),axis=1).stack().reset_index(level=1,drop=True)

0                        jealousy
0                             toy
0                             boy
0                      friendship
0                         friends
0                         rivalry
0                   boy next door
0                         new toy
0               toy comes to life
1                      board game
1                   disappearance
1        based on children's book
1                        new home
1                         recluse
1                    giant insect
2                         fishing
2                     best friend
2            duringcreditsstinger
2                         old men
3                  based on novel
3        interracial relationship
3                   single mother
3                         divorce
3                     chick flick
4                            baby
4                  midlife crisis
4                      confidence
4                           aging
4                        daughter
4    mother da

In [171]:
s = small_meta_data.apply(lambda x: pd.Series(x['keywords']),axis=1).stack().reset_index(level=1, drop=True)
s.name = 'keyword'
s.head()

0      jealousy
0           toy
0           boy
0    friendship
0       friends
Name: keyword, dtype: object

In [176]:
s1 = s.value_counts()
s1[:5]

independent film        610
woman director          550
murder                  399
duringcreditsstinger    327
based on novel          318
Name: keyword, dtype: int64

In [182]:
s1[s1 == 1].sum(), s1.count()

(6231, 12940)

In [183]:
s1 = s1[s1>1]

In [184]:
del s
gc.collect()

64818

In [192]:
s1.head(),s1.shape

(independent film        610
 woman director          550
 murder                  399
 duringcreditsstinger    327
 based on novel          318
 Name: keyword, dtype: int64, (6709,))

In [193]:
stemmer = SnowballStemmer('english')
stemmer.stem('dogs'), stemmer.stem('walking')

('dog', 'walk')

In [198]:
def collect_words(x):
    words = []
    for i in x:
        if i in s1:
            words.append(i)
    return words

In [204]:
small_meta_data['keywords'] = small_meta_data['keywords'].apply(collect_words)
small_meta_data['keywords'] = small_meta_data['keywords'].apply(lambda x: [stemmer.stem(i) for i in x])
small_meta_data['keywords'] = small_meta_data['keywords'].apply(lambda x: [str.lower(i.replace(" ","")) for i in x])

In [208]:
small_meta_data['keywords'][0], small_meta_data['cast'][0]

(['jealousi',
  'toy',
  'boy',
  'friendship',
  'friend',
  'rivalri',
  'boynextdoor',
  'newtoy',
  'toycomestolif'],
 ['tomhanks', 'timallen', 'donrickles'])

In [211]:
a = ['toy', 'boy', 'friend']
b = ['topp','jopu']
print(a+b)
print(''.join(a + b))

['toy', 'boy', 'friend', 'topp', 'jopu']
toyboyfriendtoppjopu


In [215]:
small_meta_data['soup'] = small_meta_data['keywords'] + small_meta_data['cast'] + small_meta_data['director'] + small_meta_data['genres']

In [217]:
small_meta_data['soup'][0]

['jealousi',
 'toy',
 'boy',
 'friendship',
 'friend',
 'rivalri',
 'boynextdoor',
 'newtoy',
 'toycomestolif',
 'tomhanks',
 'timallen',
 'donrickles',
 'johnlasseter',
 'johnlasseter',
 'johnlasseter',
 'Animation',
 'Comedy',
 'Family']

In [219]:
small_meta_data['soup'].apply(lambda x: ' '.join(x))[0]

'jealousi toy boy friendship friend rivalri boynextdoor newtoy toycomestolif tomhanks timallen donrickles johnlasseter johnlasseter johnlasseter Animation Comedy Family'

In [220]:
small_meta_data['soup'] = small_meta_data['soup'].apply(lambda x: ' '.join(x))

In [223]:
count = CountVectorizer(analyzer='word', ngram_range=(1,2), min_df=0, stop_words='english')
count_matrix = count.fit_transform(small_meta_data['soup'])
count_matrix.shape

(9219, 107377)

In [224]:
count_matrix

<9219x107377 sparse matrix of type '<class 'numpy.int64'>'
	with 240050 stored elements in Compressed Sparse Row format>

In [225]:
cosine_sim = cosine_similarity(count_matrix,count_matrix)
small_meta_data.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,vote_average,vote_count,year,cast,crew,keywords,cast_size,crew_size,director,soup
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[Animation, Comedy, Family]",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,7.7,5415.0,1995,"[tomhanks, timallen, donrickles]","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...","[jealousi, toy, boy, friendship, friend, rival...",13,106,"[johnlasseter, johnlasseter, johnlasseter]",jealousi toy boy friendship friend rivalri boy...
1,False,,65000000,"[Adventure, Fantasy, Family]",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,6.9,2413.0,1995,"[robinwilliams, jonathanhyde, kirstendunst]","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...","[boardgam, disappear, basedonchildren'sbook, n...",26,16,"[joejohnston, joejohnston, joejohnston]",boardgam disappear basedonchildren'sbook newho...
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[Romance, Comedy]",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,6.5,92.0,1995,"[waltermatthau, jacklemmon, ann-margret]","[{'credit_id': '52fe466a9251416c75077a89', 'de...","[fish, bestfriend, duringcreditssting]",7,4,"[howarddeutch, howarddeutch, howarddeutch]",fish bestfriend duringcreditssting waltermatth...
3,False,,16000000,"[Comedy, Drama, Romance]",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,6.1,34.0,1995,"[whitneyhouston, angelabassett, lorettadevine]","[{'credit_id': '52fe44779251416c91011acb', 'de...","[basedonnovel, interracialrelationship, single...",10,10,"[forestwhitaker, forestwhitaker, forestwhitaker]",basedonnovel interracialrelationship singlemot...
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,[Comedy],,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,5.7,173.0,1995,"[stevemartin, dianekeaton, martinshort]","[{'credit_id': '52fe44959251416c75039ed7', 'de...","[babi, midlifecrisi, confid, age, daughter, mo...",12,7,"[charlesshyer, charlesshyer, charlesshyer]",babi midlifecrisi confid age daughter motherda...


In [228]:
small_meta_data = small_meta_data.reset_index()
titles = small_meta_data['title']
indices = pd.Series(small_meta_data.index, index=small_meta_data['title'])

In [230]:
indices.head()

title
Toy Story                      0
Jumanji                        1
Grumpier Old Men               2
Waiting to Exhale              3
Father of the Bride Part II    4
dtype: int64

In [231]:
get_recommendations('The Dark Knight').head(10)

8031         The Dark Knight Rises
6218                 Batman Begins
6623                  The Prestige
2085                     Following
7648                     Inception
4145                      Insomnia
3381                       Memento
8613                  Interstellar
7659    Batman: Under the Red Hood
1134                Batman Returns
Name: title, dtype: object

In [234]:
get_recommendations('The Conjuring 2').head(10)

8447                   The Conjuring
7848                       Insidious
5684                             Saw
8939                       Furious 7
6729                    Dead Silence
6831                  Death Sentence
2251    Omen III: The Final Conflict
4598               What a Girl Wants
8651                           Locke
7609               Cemetery Junction
Name: title, dtype: object

In [233]:
indices

title
Toy Story                                                0
Jumanji                                                  1
Grumpier Old Men                                         2
Waiting to Exhale                                        3
Father of the Bride Part II                              4
Heat                                                     5
Sabrina                                                  6
Tom and Huck                                             7
Sudden Death                                             8
GoldenEye                                                9
The American President                                  10
Dracula: Dead and Loving It                             11
Balto                                                   12
Nixon                                                   13
Cutthroat Island                                        14
Casino                                                  15
Sense and Sensibility                             

## Improve recommendation (by removing bad movies by top director or actor like batman and Robin)

Now, consider top 25 movies based on similarity scores and calculate the vote of the 60th percentile movie. Then, using this as the value of m, we will calculate the weighted rating of each movie using IMDB's formula 

In [241]:
def improved_recommendations(smd, title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:26]
    movie_indices = [i[0] for i in sim_scores]
    
    movies = smd.iloc[movie_indices][['title', 'vote_count', 'vote_average', 'year']]
    vote_counts = movies[movies['vote_count'].notnull()]['vote_count'].astype('int')
    vote_averages = movies[movies['vote_average'].notnull()]['vote_average'].astype('int')
    C = vote_averages.mean()
    m = vote_counts.quantile(0.60)
    qualified = movies[(movies['vote_count'] >= m) & (movies['vote_count'].notnull()) & (movies['vote_average'].notnull())]
    qualified['vote_count'] = qualified['vote_count'].astype('int')
    qualified['vote_average'] = qualified['vote_average'].astype('int')
    qualified['wr'] = qualified.apply(weighted_rating, axis=1)
    qualified = qualified.sort_values('wr', ascending=False).head(10)
    return qualified



In [242]:
improved_recommendations(small_meta_data,'The Dark Knight')

Unnamed: 0,title,vote_count,vote_average,year,wr
7648,Inception,14075,8,2010,7.917588
8613,Interstellar,11187,8,2014,7.897107
6623,The Prestige,4510,8,2006,7.758148
3381,Memento,4168,8,2000,7.740175
8031,The Dark Knight Rises,9263,7,2012,6.921448
6218,Batman Begins,7511,7,2005,6.904127
1134,Batman Returns,1706,6,1992,5.846862
132,Batman Forever,1529,5,1995,5.054144
9024,Batman v Superman: Dawn of Justice,7189,5,2016,5.013943
1260,Batman & Robin,1447,4,1997,4.287233


### Our content based engine suffers from some severe limitations. It is only capable of suggesting movies which are close to a certain movie. That is, it is not capable of capturing tastes and providing recommendations across genres.

## Collaborative Filtering (user-user or item-item)

In [244]:
reader = Reader() # surprise library by sklearn
ratings = pd.read_csv('data/movies_dataset/ratings_small.csv')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [245]:
# Load a dataset from a pandas dataframe.
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)
data.split(n_folds=5)

In [246]:
data

<surprise.dataset.DatasetAutoFolds at 0x7f3a718ce6a0>

In [247]:
svd = SVD()
evaluate(svd, data, measures=['RMSE', 'MAE'])




Evaluating RMSE, MAE of algorithm SVD.

------------
Fold 1
RMSE: 0.9004
MAE:  0.6902
------------
Fold 2
RMSE: 0.8980
MAE:  0.6918
------------
Fold 3
RMSE: 0.9007
MAE:  0.6934
------------
Fold 4
RMSE: 0.8965
MAE:  0.6898
------------
Fold 5
RMSE: 0.8934
MAE:  0.6874
------------
------------
Mean RMSE: 0.8978
Mean MAE : 0.6905
------------
------------


CaseInsensitiveDefaultDict(list,
                           {'rmse': [0.9003581766624212,
                             0.8980138898541418,
                             0.9006946002166543,
                             0.8965277865749257,
                             0.8934404981637667],
                            'mae': [0.6901908316036015,
                             0.691762046966211,
                             0.6933522630851731,
                             0.6898483023920445,
                             0.6874357574518688]})

In [249]:
print('check if it is working')

check if it is working


In [250]:
trainset = data.build_full_trainset()
svd.train(trainset)



<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f3a718ce518>

## pick any userid

In [251]:
ratings[ratings['userId'] == 1]

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205
5,1,1263,2.0,1260759151
6,1,1287,2.0,1260759187
7,1,1293,2.0,1260759148
8,1,1339,3.5,1260759125
9,1,1343,2.0,1260759131


In [252]:
svd.predict(1,302,3)

Prediction(uid=1, iid=302, r_ui=3, est=2.646637418812249, details={'was_impossible': False})

## Hybrid approach
### It use user-user relationship and previous history of the user.

In [255]:
def convert_int(x):
    try:
        return int(x)
    except:
        return np.nan

In [253]:
id_map = pd.read_csv('data/movies_dataset/links_small.csv')

In [254]:
id_map.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [257]:
id_map = id_map[['movieId','tmdbId']]
id_map['tmdbId'] = id_map['tmdbId'].apply(convert_int)

In [258]:
id_map.head()

Unnamed: 0,movieId,tmdbId
0,1,862.0
1,2,8844.0
2,3,15602.0
3,4,31357.0
4,5,11862.0


In [260]:
id_map.columns

Index(['movieId', 'tmdbId'], dtype='object')

In [261]:
id_map.columns = ['movieId','id']
id_map.head()

Unnamed: 0,movieId,id
0,1,862.0
1,2,8844.0
2,3,15602.0
3,4,31357.0
4,5,11862.0


In [263]:
id_map = id_map.merge(small_meta_data[['title','id']], on='id')
id_map.head()

Unnamed: 0,movieId,id,title
0,1,862.0,Toy Story
1,2,8844.0,Jumanji
2,3,15602.0,Grumpier Old Men
3,4,31357.0,Waiting to Exhale
4,5,11862.0,Father of the Bride Part II


In [265]:
id_map.index

Int64Index([   0,    1,    2,    3,    4,    5,    6,    7,    8,    9,
            ...
            9209, 9210, 9211, 9212, 9213, 9214, 9215, 9216, 9217, 9218],
           dtype='int64', length=9219)

In [270]:
id_map = id_map.set_index('title')

Unnamed: 0_level_0,movieId,id
title,Unnamed: 1_level_1,Unnamed: 2_level_1
Toy Story,1,862.0
Jumanji,2,8844.0
Grumpier Old Men,3,15602.0
Waiting to Exhale,4,31357.0
Father of the Bride Part II,5,11862.0


In [271]:
id_map.head()

Unnamed: 0_level_0,movieId,id
title,Unnamed: 1_level_1,Unnamed: 2_level_1
Toy Story,1,862.0
Jumanji,2,8844.0
Grumpier Old Men,3,15602.0
Waiting to Exhale,4,31357.0
Father of the Bride Part II,5,11862.0


In [272]:
indices_map = id_map.set_index('id')
indices_map.head()

Unnamed: 0_level_0,movieId
id,Unnamed: 1_level_1
862.0,1
8844.0,2
15602.0,3
31357.0,4
11862.0,5


In [274]:
indices.head()

title
Toy Story                      0
Jumanji                        1
Grumpier Old Men               2
Waiting to Exhale              3
Father of the Bride Part II    4
dtype: int64

In [281]:
def hybrid(smd, userId, title):
    idx = indices[title]
    tmdbId = id_map.loc[title]['id']
    #print(idx)
    movie_id = id_map.loc[title]['movieId']
    
    # content based filtering
    #####################################
    sim_scores = list(enumerate(cosine_sim[int(idx)]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:26]
    movie_indices = [i[0] for i in sim_scores]
#     print(movie_indices)
    #####################################
    movies = smd.iloc[movie_indices][['title', 'vote_count', 'vote_average', 'year', 'id']]
#     print(movies)

    # Collaborative Filtering
    #####################################
    movies['est'] = movies['id'].apply(lambda x: svd.predict(userId, indices_map.loc[x]['movieId']).est)
    #####################################

    movies = movies.sort_values('est', ascending=False)
    return movies


In [291]:
id_map.loc['Avatar'], indices['Avatar']

(movieId    72998.0
 id         19995.0
 Name: Avatar, dtype: float64, 7488)

In [282]:
hybrid(small_meta_data,1,'Avatar')

[974, 522, 1011, 922, 4347, 344, 1376, 8401, 3216, 8724, 1500, 7265, 3060, 4966, 6084, 8419, 1668, 4017, 2132, 8658, 2761, 7088, 831, 1621, 2014]
                                          title  vote_count  vote_average  \
974                                      Aliens      3282.0           7.7   
522                  Terminator 2: Judgment Day      4274.0           7.7   
1011                             The Terminator      4208.0           7.4   
922                                   The Abyss       822.0           7.1   
4347             Piranha Part Two: The Spawning        41.0           3.9   
344                                   True Lies      1138.0           6.8   
1376                                    Titanic      7770.0           7.5   
8401                    Star Trek Into Darkness      4479.0           7.4   
3216                         Dungeons & Dragons       159.0           3.9   
8724                          Jupiter Ascending      2816.0           5.2   
1500   

Unnamed: 0,title,vote_count,vote_average,year,id,est
1011,The Terminator,4208.0,7.4,1984,218,3.06467
8401,Star Trek Into Darkness,4479.0,7.4,2013,54138,3.028958
522,Terminator 2: Judgment Day,4274.0,7.7,1991,280,3.011393
974,Aliens,3282.0,7.7,1986,679,3.010484
1621,Darby O'Gill and the Little People,35.0,6.7,1959,18887,2.998947
8658,X-Men: Days of Future Past,6155.0,7.5,2014,127585,2.904452
2014,Fantastic Planet,140.0,7.6,1973,16306,2.895411
922,The Abyss,822.0,7.1,1989,2756,2.855093
1668,Return from Witch Mountain,38.0,5.6,1978,14822,2.784572
4347,Piranha Part Two: The Spawning,41.0,3.9,1981,31646,2.761964


In [284]:
def hybrid(smd, userId, title):
    idx = indices[title]
    tmdbId = id_map.loc[title]['id']
    #print(idx)
    movie_id = id_map.loc[title]['movieId']
    
    sim_scores = list(enumerate(cosine_sim[int(idx)]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:26]
    movie_indices = [i[0] for i in sim_scores]
#     print(movie_indices)
    movies = smd.iloc[movie_indices][['title', 'vote_count', 'vote_average', 'year', 'id']]
#     print(movies)
    movies['est'] = movies['id'].apply(lambda x: svd.predict(userId, indices_map.loc[x]['movieId']).est)
    movies = movies.sort_values('est', ascending=False)
    return movies

In [285]:
hybrid(small_meta_data,1,'Avatar').head(10)

Unnamed: 0,title,vote_count,vote_average,year,id,est
1011,The Terminator,4208.0,7.4,1984,218,3.06467
8401,Star Trek Into Darkness,4479.0,7.4,2013,54138,3.028958
522,Terminator 2: Judgment Day,4274.0,7.7,1991,280,3.011393
974,Aliens,3282.0,7.7,1986,679,3.010484
1621,Darby O'Gill and the Little People,35.0,6.7,1959,18887,2.998947
8658,X-Men: Days of Future Past,6155.0,7.5,2014,127585,2.904452
2014,Fantastic Planet,140.0,7.6,1973,16306,2.895411
922,The Abyss,822.0,7.1,1989,2756,2.855093
1668,Return from Witch Mountain,38.0,5.6,1978,14822,2.784572
4347,Piranha Part Two: The Spawning,41.0,3.9,1981,31646,2.761964


In [286]:
hybrid(small_meta_data,100,'Avatar').head(10)

Unnamed: 0,title,vote_count,vote_average,year,id,est
1011,The Terminator,4208.0,7.4,1984,218,3.982359
522,Terminator 2: Judgment Day,4274.0,7.7,1991,280,3.803937
1621,Darby O'Gill and the Little People,35.0,6.7,1959,18887,3.70792
974,Aliens,3282.0,7.7,1986,679,3.684303
8401,Star Trek Into Darkness,4479.0,7.4,2013,54138,3.68172
8658,X-Men: Days of Future Past,6155.0,7.5,2014,127585,3.579584
922,The Abyss,822.0,7.1,1989,2756,3.531269
3060,Sinbad and the Eye of the Tiger,39.0,6.3,1977,11940,3.463243
4966,Hercules in New York,63.0,3.7,1969,5227,3.427615
2014,Fantastic Planet,140.0,7.6,1973,16306,3.411477
