In [18]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer , CountVectorizer
from sklearn.metrics.pairwise import linear_kernel , cosine_similarity 
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
from surprise import Reader , Dataset , SVD , evaluate

import warnings; warnings.simplefilter('ignore')

# Simple Recommendar

In [19]:
md = pd.read_csv("movies_metadata.csv")
md.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,10/30/1995,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,12/15/1995,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,12/22/1995,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,12/22/1995,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,2/10/1995,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [22]:
md.shape

(45466, 24)

In [20]:
md['genres'] = md['genres'].fillna('[]').apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

In [21]:
md['genres']

0                        [Animation, Comedy, Family]
1                       [Adventure, Fantasy, Family]
2                                  [Romance, Comedy]
3                           [Comedy, Drama, Romance]
4                                           [Comedy]
5                   [Action, Crime, Drama, Thriller]
6                                  [Comedy, Romance]
7                 [Action, Adventure, Drama, Family]
8                      [Action, Adventure, Thriller]
9                      [Adventure, Action, Thriller]
10                          [Comedy, Drama, Romance]
11                                  [Comedy, Horror]
12                    [Family, Animation, Adventure]
13                                  [History, Drama]
14                               [Action, Adventure]
15                                    [Drama, Crime]
16                                  [Drama, Romance]
17                                   [Crime, Comedy]
18                        [Crime, Comedy, Adve

# Building top 250 Chart

In [30]:
vote_counts = md[md['vote_count'].notnull()]['vote_count'].astype('int')
vote_averages = md[md['vote_average'].notnull()]['vote_average'].astype('int')
C = vote_averages.mean()
C

5.244896612406511

In [31]:
m = vote_counts.quantile(0.95)
m

434.0

In [35]:
md['year'] = pd.to_datetime(md['release_date'], errors='coerce').apply(lambda x: str(x).split('-')[0] if x != np.nan else np.nan)

In [36]:
md['year']

0        1995
1        1995
2        1995
3        1995
4        1995
5        1995
6        1995
7        1995
8        1995
9        1995
10       1995
11       1995
12       1995
13       1995
14       1995
15       1995
16       1995
17       1995
18       1995
19       1995
20       1995
21       1995
22       1995
23       1995
24       1995
25       1995
26       1995
27       1995
28       1995
29       1995
         ... 
45436    2010
45437    2017
45438    2015
45439    1944
45440    2007
45441    2002
45442    1905
45443    1901
45444    1905
45445    1906
45446    1909
45447    1904
45448    1904
45449    2005
45450    1900
45451    1900
45452    1981
45453    2017
45454    2015
45455    1972
45456    1946
45457    2000
45458    2000
45459    1995
45460    1991
45461     NaT
45462    2011
45463    2003
45464    1917
45465    2017
Name: year, Length: 45466, dtype: object

In [47]:
qualified = md[(md['vote_count'] >= m) & (md['vote_count'].notnull()) & (md['vote_average'].notnull())][['title', 'year', 'vote_count', 'vote_average', 'popularity', 'genres']]
qualified['vote_count'] = qualified['vote_count'].astype('int')
qualified['vote_average'] = qualified['vote_average'].astype('int')
qualified.shape

(2274, 6)

In [49]:
# a function for weighted rating------
def weighted_rating(x):
    v = x['vote_count']
    R = x['vote_average']
    return (v/(v+m) * R) + (m/(m+v) * C)

In [51]:
qualified['wr'] = qualified.apply(weighted_rating, axis=1)
# applying weighted rating function

In [53]:
qualified['wr']

0        6.869770
1        5.884891
5        6.671675
9        5.798701
15       6.571348
17       5.663191
18       5.790195
31       6.737701
33       5.724609
38       5.740321
43       5.119961
46       7.811669
47       5.831336
49       7.682666
68       6.195655
69       5.842293
93       5.119020
95       6.325319
102      5.727132
108      6.801533
109      7.610008
142      5.848491
144      6.159255
147      6.632199
150      5.054144
155      5.778421
158      6.191385
160      5.753968
162      5.870366
170      5.098686
           ...   
42214    5.831075
42222    5.945051
42235    5.881691
42271    5.067568
42309    5.035654
42355    5.881905
42537    6.336485
42853    5.863053
42868    5.682447
42886    6.364708
42901    5.813268
42902    5.034164
43059    5.664570
43186    6.225112
43190    6.968322
43220    6.310041
43231    5.855186
43238    5.041583
43255    5.922654
43294    5.715525
43644    6.697372
43645    6.380216
44009    5.865470
44274    5.844611
44337    5

In [54]:
qualified = qualified.sort_values('wr', ascending=False).head(250)

# Top Movies

In [55]:
qualified.head(15)

Unnamed: 0,title,year,vote_count,vote_average,popularity,genres,wr
15480,Inception,2010,14075,8,29.1081,"[Action, Thriller, Science Fiction, Mystery, A...",7.917588
12481,The Dark Knight,2008,12269,8,123.167,"[Drama, Action, Crime, Thriller]",7.905871
22879,Interstellar,2014,11187,8,32.2135,"[Adventure, Drama, Science Fiction]",7.897107
2843,Fight Club,1999,9678,8,63.8696,[Drama],7.881753
4863,The Lord of the Rings: The Fellowship of the Ring,2001,8892,8,32.0707,"[Adventure, Fantasy, Action]",7.871787
292,Pulp Fiction,1994,8670,8,140.95,"[Thriller, Crime]",7.86866
314,The Shawshank Redemption,1994,8358,8,51.6454,"[Drama, Crime]",7.864
7000,The Lord of the Rings: The Return of the King,2003,8226,8,29.3244,"[Adventure, Fantasy, Action]",7.861927
351,Forrest Gump,1994,8147,8,48.3072,"[Comedy, Drama, Romance]",7.860656
5814,The Lord of the Rings: The Two Towers,2002,7641,8,29.4235,"[Adventure, Fantasy, Action]",7.851924


In [60]:
s = md.apply(lambda x: pd.Series(x['genres']),axis=1).stack().reset_index(level=1, drop=True)
s.name = 'genre'
gen_md = md.drop('genres', axis=1).join(s)

In [61]:
def build_chart(genre, percentile=0.85):
    df = gen_md[gen_md['genre'] == genre]
    vote_counts = df[df['vote_count'].notnull()]['vote_count'].astype('int')
    vote_averages = df[df['vote_average'].notnull()]['vote_average'].astype('int')
    C = vote_averages.mean()
    m = vote_counts.quantile(percentile)
    
    qualified = df[(df['vote_count'] >= m) & (df['vote_count'].notnull()) & (df['vote_average'].notnull())][['title', 'year', 'vote_count', 'vote_average', 'popularity']]
    qualified['vote_count'] = qualified['vote_count'].astype('int')
    qualified['vote_average'] = qualified['vote_average'].astype('int')
    
    qualified['wr'] = qualified.apply(lambda x: (x['vote_count']/(x['vote_count']+m) * x['vote_average']) + (m/(m+x['vote_count']) * C), axis=1)
    qualified = qualified.sort_values('wr', ascending=False).head(250)
    
    return qualified

# Top 15 Romantic Movies

In [64]:
build_chart('Romance').head(15)

Unnamed: 0,title,year,vote_count,vote_average,popularity,wr
10309,Dilwale Dulhania Le Jayenge,1995,661,9,34.457,8.565285
351,Forrest Gump,1994,8147,8,48.3072,7.971357
876,Vertigo,1958,1162,8,18.2082,7.811667
40251,Your Name.,2016,1030,8,34.461252,7.789489
883,Some Like It Hot,1959,835,8,11.8451,7.745154
1132,Cinema Paradiso,1988,834,8,14.177,7.744878
19901,Paperman,2012,734,8,7.19863,7.713951
37863,Sing Street,2016,669,8,10.672862,7.689483
882,The Apartment,1960,498,8,11.9943,7.599317
38718,The Handmaiden,2016,453,8,16.727405,7.566166


# CONTENT BASED RECOMMENDERS

In [70]:
links_small = pd.read_csv("links_small.csv")
links_small = links_small[links_small['tmdbId'].notnull()]['tmdbId'].astype('int')



In [88]:
links_small.shape

(9112,)

In [77]:
md.shape

(45466, 25)

In [78]:
md = md.drop([19730, 29503, 35587])
# dropping these from md dataset


In [79]:
md.shape

(45463, 25)

In [85]:
md['id'] = md['id'].astype('int')
md['id']


0           862
1          8844
2         15602
3         31357
4         11862
5           949
6         11860
7         45325
8          9091
9           710
10         9087
11        12110
12        21032
13        10858
14         1408
15          524
16         4584
17            5
18         9273
19        11517
20         8012
21         1710
22         9691
23        12665
24          451
25        16420
26         9263
27        17015
28          902
29        37557
          ...  
45436     45527
45437    455661
45438    327237
45439     84710
45440     39562
45441     14008
45442     44330
45443     49279
45444     44333
45445     49277
45446     49271
45447     44324
45448    122036
45449     14885
45450     49280
45451    106807
45452    276895
45453    404604
45454    420346
45455     67179
45456     84419
45457    390959
45458    289923
45459    222848
45460     30840
45461    439050
45462    111109
45463     67758
45464    227506
45465    461257
Name: id, Length: 45463,

In [89]:
smd = md[md['id'].isin(links_small)]
smd.shape
# there are 9099 movies in smallmovie metadataset which is 5 times less than the movies in original dataset

(9099, 26)

# Movie Description based Recommendar

In [90]:
smd

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,year,1318961311304
0,FALSE,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[Animation, Comedy, Family]",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0,1995,19730
1,FALSE,,65000000,"[Adventure, Fantasy, Family]",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,1995,19730
2,FALSE,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[Romance, Comedy]",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0,1995,19730
3,FALSE,,16000000,"[Comedy, Drama, Romance]",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0,1995,19730
4,FALSE,"{'id': 96871, 'name': 'Father of the Bride Col...",0,[Comedy],,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0,1995,19730
5,FALSE,,60000000,"[Action, Crime, Drama, Thriller]",,949,tt0113277,en,Heat,"Obsessive master thief, Neil McCauley leads a ...",...,170.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,A Los Angeles Crime Saga,Heat,False,7.7,1886.0,1995,19730
6,FALSE,,58000000,"[Comedy, Romance]",,11860,tt0114319,en,Sabrina,An ugly duckling having undergone a remarkable...,...,127.0,"[{'iso_639_1': 'fr', 'name': 'Français'}, {'is...",Released,You are cordially invited to the most surprisi...,Sabrina,False,6.2,141.0,1995,19730
7,FALSE,,0,"[Action, Adventure, Drama, Family]",,45325,tt0112302,en,Tom and Huck,"A mischievous young boy, Tom Sawyer, witnesses...",...,97.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,The Original Bad Boys.,Tom and Huck,False,5.4,45.0,1995,19730
8,FALSE,,35000000,"[Action, Adventure, Thriller]",,9091,tt0114576,en,Sudden Death,International action superstar Jean Claude Van...,...,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Terror goes into overtime.,Sudden Death,False,5.5,174.0,1995,19730
9,FALSE,"{'id': 645, 'name': 'James Bond Collection', '...",58000000,"[Adventure, Action, Thriller]",http://www.mgm.com/view/movie/757/Goldeneye/,710,tt0113189,en,GoldenEye,James Bond must unmask the mysterious head of ...,...,130.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,No limits. No fears. No substitutes.,GoldenEye,False,6.6,1194.0,1995,19730


In [93]:
smd.isna().sum()

adult                       0
belongs_to_collection    7425
budget                      0
genres                      0
homepage                 7125
id                          0
imdb_id                     0
original_language           0
original_title              0
overview                   12
popularity                  0
poster_path                 3
production_companies        0
production_countries        0
release_date                0
revenue                     0
runtime                     0
spoken_languages            0
status                      2
tagline                     0
title                       0
video                       0
vote_average                0
vote_count                  0
year                        0
1318961311304               0
description                 0
dtype: int64

In [92]:
smd['tagline'] = smd['tagline'].fillna('')
smd['description'] = smd['overview'] + smd['tagline']
smd['description'] = smd['description'].fillna('')


In [94]:
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(smd['description'])

In [107]:
tfidf_matrix.shape

(9099, 268123)

In [108]:
# cosine similarity
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [109]:
cosine_sim[0]

array([1.        , 0.00680476, 0.        , ..., 0.        , 0.00344913,
       0.        ])

# 30 similar Movies based on cosine similarity score

In [111]:
smd = smd.reset_index()
titles = smd['title']
indices = pd.Series(smd.index, index=smd['title'])

In [112]:
titles

0                                               Toy Story
1                                                 Jumanji
2                                        Grumpier Old Men
3                                       Waiting to Exhale
4                             Father of the Bride Part II
5                                                    Heat
6                                                 Sabrina
7                                            Tom and Huck
8                                            Sudden Death
9                                               GoldenEye
10                                 The American President
11                            Dracula: Dead and Loving It
12                                                  Balto
13                                                  Nixon
14                                       Cutthroat Island
15                                                 Casino
16                                  Sense and Sensibility
17            

In [113]:
indices

title
Toy Story                                                0
Jumanji                                                  1
Grumpier Old Men                                         2
Waiting to Exhale                                        3
Father of the Bride Part II                              4
Heat                                                     5
Sabrina                                                  6
Tom and Huck                                             7
Sudden Death                                             8
GoldenEye                                                9
The American President                                  10
Dracula: Dead and Loving It                             11
Balto                                                   12
Nixon                                                   13
Cutthroat Island                                        14
Casino                                                  15
Sense and Sensibility                             

In [116]:
def get_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:31]
    movie_indices = [i[0] for i in sim_scores]
    return titles.iloc[movie_indices]

# Top Recommendations for movies

In [117]:
get_recommendations('The Godfather').head(10)

973      The Godfather: Part II
8387                 The Family
3509                       Made
4196         Johnny Dangerously
29               Shanghai Triad
5667                       Fury
2412             American Movie
1582    The Godfather: Part III
4221                    8 Women
2159              Summer of Sam
Name: title, dtype: object

In [118]:
get_recommendations('The Dark Knight').head(10)

7931                      The Dark Knight Rises
132                              Batman Forever
1113                             Batman Returns
8227    Batman: The Dark Knight Returns, Part 2
7565                 Batman: Under the Red Hood
524                                      Batman
7901                           Batman: Year One
2579               Batman: Mask of the Phantasm
2696                                        JFK
8165    Batman: The Dark Knight Returns, Part 1
Name: title, dtype: object

# Recommendar based on Collabrative Filtering

In [129]:
reader = Reader()

In [130]:
ratings = pd.read_csv('ratings_small.csv')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [131]:
ratings.shape

(100004, 4)

In [132]:
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)
data.split(n_folds=5)

In [135]:
svd = SVD()
evaluate(svd, data, measures=['RMSE', 'MAE'])

Evaluating RMSE, MAE of algorithm SVD.

------------
Fold 1
RMSE: 0.9000
MAE:  0.6933
------------
Fold 2
RMSE: 0.8978
MAE:  0.6923
------------
Fold 3
RMSE: 0.8924
MAE:  0.6870
------------
Fold 4
RMSE: 0.9020
MAE:  0.6945
------------
Fold 5
RMSE: 0.8893
MAE:  0.6840
------------
------------
Mean RMSE: 0.8963
Mean MAE : 0.6902
------------
------------


CaseInsensitiveDefaultDict(list,
                           {'rmse': [0.9000070314684759,
                             0.8977656058622303,
                             0.8923898711716608,
                             0.9019651379690982,
                             0.8893065618989267],
                            'mae': [0.6932898358892997,
                             0.6923266432008054,
                             0.686993385068332,
                             0.6944900190115864,
                             0.6840176965489243]})

In [None]:
# RMSE is 0.8963 which is conssidered as quite good.

In [136]:
trainset = data.build_full_trainset()
svd.train(trainset)
# training dataset to have predictions.

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x13317063e80>

In [140]:
ratings[ratings['userId'] == 1]

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205
5,1,1263,2.0,1260759151
6,1,1287,2.0,1260759187
7,1,1293,2.0,1260759148
8,1,1339,3.5,1260759125
9,1,1343,2.0,1260759131


In [143]:
svd.predict(1, 302,3)
# it will just tell how userid 1 will predict movie with id as 302


Prediction(uid=1, iid=302, r_ui=3, est=2.5813400028365154, details={'was_impossible': False})