In [5]:
# selecting kaggle-token to download dataset
from google.colab import files
files.upload()

# checking if the token was uploaded correctly
does_exist = !ls -lha kaggle.json | grep 'kaggle.json'
does_exist = does_exist[0]

if does_exist.find('No such file or directory') == -1:
  print('Token succesfully uploaded')

  # coping token to kaggle expected directory, and get permissions to it
  !mkdir -p ~/.kaggle
  !cp kaggle.json ~/.kaggle/
  !chmod 600 ~/.kaggle/kaggle.json

  # install the kaggle api-client
  !pip install -q kaggle

  # list of kaggle available datasets
  # !kaggle datasets list

  # copy the 'The Movies Datasets' data set localy with the API-COMMAND
  !kaggle datasets download -d rounakbanik/the-movies-dataset

  # unzip datasets to data folder
  ! mkdir data
  ! unzip the-movies-dataset.zip -d data

else:
  print('Token not found, please try again!')

Saving kaggle.json to kaggle.json
Token succesfully uploaded
Downloading the-movies-dataset.zip to /content
 92% 209M/228M [00:01<00:00, 184MB/s]
100% 228M/228M [00:01<00:00, 150MB/s]
mkdir: cannot create directory ‘data’: File exists
Archive:  the-movies-dataset.zip
  inflating: data/credits.csv        
  inflating: data/keywords.csv       
  inflating: data/links.csv          
  inflating: data/links_small.csv    
  inflating: data/movies_metadata.csv  
  inflating: data/ratings.csv        
  inflating: data/ratings_small.csv  


In [6]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import warnings; warnings.simplefilter('ignore')

In [182]:
# load data
data = pd.read_csv('/content/data/movies_metadata.csv')
data_size = data.shape[0]
print('Data shape: ', data.shape)
data.head()

Data shape:  (45466, 24)


Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [183]:
# print number of null values of data features
useless_features = []
print('Number of null values of columns:')
for column in data.columns:
  num = pd.isnull(data[column]).sum()
  if (num / data_size) > 0.7:
    useless_features.append(column)
  print('{:<25}{}'.format(column, num))

print()
print('Useless features: ', useless_features)

Number of null values of columns:
adult                    0
belongs_to_collection    40972
budget                   0
genres                   0
homepage                 37684
id                       0
imdb_id                  17
original_language        11
original_title           0
overview                 954
popularity               5
poster_path              386
production_companies     3
production_countries     3
release_date             87
revenue                  6
runtime                  263
spoken_languages         6
status                   87
tagline                  25054
title                    6
video                    6
vote_average             6
vote_count               6

Useless features:  ['belongs_to_collection', 'homepage']


In [9]:
useless_features

['belongs_to_collection', 'homepage']

In [184]:
# remove columns with > 0.7 null values
data = data.drop(useless_features, axis=1)
data.isnull().sum()

adult                       0
budget                      0
genres                      0
id                          0
imdb_id                    17
original_language          11
original_title              0
overview                  954
popularity                  5
poster_path               386
production_companies        3
production_countries        3
release_date               87
revenue                     6
runtime                   263
spoken_languages            6
status                     87
tagline                 25054
title                       6
video                       6
vote_average                6
vote_count                  6
dtype: int64

In [12]:
# columns data types
data.dtypes

adult                    object
budget                   object
genres                   object
id                       object
imdb_id                  object
original_language        object
original_title           object
overview                 object
popularity               object
poster_path              object
production_companies     object
production_countries     object
release_date             object
revenue                 float64
runtime                 float64
spoken_languages         object
status                   object
tagline                  object
title                    object
video                    object
vote_average            float64
vote_count              float64
dtype: object

In [185]:
# fill null vaules
import datetime
import random

def generate_random_imbd_id():
  range_start = 10**(6-1)
  range_end = (10**6)-1
  return 'tt' + str(random.randint(range_start, range_end))

def fillna_mean_popularity():
  column = data['popularity']
  mean_value = column.loc[column.apply(lambda x: isinstance(x, float))].mean()
  column = column.apply(lambda x: mean_value if not isinstance(x, float) else x)
  column.fillna(mean_value, inplace= True)
  data['popularity'] = column

def generate_random_release_date():
  return datetime.datetime(
      random.randint(1990, 2020),
      random.randint(1, 12),
      random.randint(1, 30)
    )

def fillna_mean_revenue():
  column = data['revenue']
  mean_value = column.loc[column.apply(lambda x: isinstance(x, float))].mean()
  column = column.apply(lambda x: mean_value if not isinstance(x, float) else x)
  column.fillna(mean_value, inplace= True)
  data['revenue'] = column


def fillna_mean_runtime():
  column = data['runtime']
  mean_value = column.loc[column.apply(lambda x: isinstance(x, float))].mean()
  column = column.apply(lambda x: mean_value if not isinstance(x, float) else x)
  column.fillna(mean_value, inplace= True)
  data['runtime'] = column


def fillna_mean_vote_average():
  column = data['vote_average']
  mean_value = column.loc[column.apply(lambda x: isinstance(x, float))].mean()
  column = column.apply(lambda x: mean_value if not isinstance(x, float) else x)
  column.fillna(mean_value, inplace= True)
  data['vote_average'] = column


def fillna_mean_vote_count():
  column = data['vote_count']
  mean_value = column.loc[column.apply(lambda x: isinstance(x, float))].mean()
  column = column.apply(lambda x: mean_value if not isinstance(x, float) else x)
  column.fillna(mean_value, inplace= True)
  data['vote_count'] = column

fillna_mean_popularity()
fillna_mean_revenue()
fillna_mean_runtime()
fillna_mean_vote_average()
fillna_mean_vote_count()

values = {
  "imdb_id": generate_random_imbd_id(),
  "original_language": "en",
  "overview": "",
  "poster_path": "/random.jpg",
  "production_companies": "[{'name': 'Warner Bros.', 'id': 6194}]",
  "production_countries": "[{'iso_3166_1': 'US', 'name': 'United States of America'}]",
  "release_date": generate_random_release_date(),
  "spoken_languages": "[{'iso_639_1': 'en', 'name': 'English'}]",
  "status": "Released",
  "title": "Null",
  "video": False,
}
data.fillna(value=values, inplace= True)

In [14]:
data.isnull().sum()

adult                       0
budget                      0
genres                      0
id                          0
imdb_id                     0
original_language           0
original_title              0
overview                    0
popularity                  0
poster_path                 0
production_companies        0
production_countries        0
release_date                0
revenue                     0
runtime                     0
spoken_languages            0
status                      0
tagline                 25054
title                       0
video                       0
vote_average                0
vote_count                  0
dtype: int64

Can not fill tagline (We want to save what movies realy have tagline)

In [187]:
# cast to integer
data['id'] = data['id'].astype('int')

ValueError: ignored

***The id of one of rows is '1997-08-20', we search about these type rows and remove them.***

In [13]:
data[data['id'] == '1997-08-20']

Unnamed: 0,adult,budget,genres,id,imdb_id,original_language,original_title,overview,popularity,poster_path,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
19730,- Written by Ørnås,/ff9qCepilowshEtG2GYWwzt2bs4.jpg,"[{'name': 'Carousel Productions', 'id': 11176}...",1997-08-20,0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,3.393351,Midnight Man,...,1,11209350.0,94.128199,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Null,False,5.618207,109.897338


***We found three rows: [19730, 29503, 35587]***

In [186]:
data = data.drop([19730, 29503, 35587])

In [187]:
data['id'] = data['id'].astype('int')

In [17]:
data.dtypes

adult                    object
budget                   object
genres                   object
id                        int64
imdb_id                  object
original_language        object
original_title           object
overview                 object
popularity              float64
poster_path              object
production_companies     object
production_countries     object
release_date             object
revenue                 float64
runtime                 float64
spoken_languages         object
status                   object
tagline                  object
title                    object
video                      bool
vote_average            float64
vote_count              float64
dtype: object

I used the IMDB's weighted rating formula with the TMDB rating to construct my chart.

v: the number of votes for the movie
m: the minimum votes required to be listed in the chart
R: the average rating of the movie
C: the mean vote across the whole report

In the next step, I determined an appropriate value for m, the minimum votes required to be listed in the chart. For example, I used 95th percentile as our cutoff. In other words, for a movie to feature in the charts, it must have more votes than at least 95% of the movies in the list.

In [188]:
# casting vote_count column to integer
vote_counts = data[data['vote_count'].notnull()]['vote_count'].astype('int')

# m: the minimum votes required to be listed in the chart - it must have more votes than at least 95% of the movies in the list
m = vote_counts.quantile(0.95)
print('m', m)

# casting vote_average column to integer
vote_averages = data[data['vote_average'].notnull()]['vote_average'].astype('int')

# C: the mean vote across the whole report
C = vote_averages.mean()
print('C', C)

m 433.90000000000146
C 5.2448804522358845


Therefore, to qualify to be considered for the chart, a movie has to have at least 433.9 votes on TMDB. We also see that the average rating for a movie on TMDB is 5.245 on a scale of 10.

In [189]:
# calculate year column base release column
data['year'] = pd.to_datetime(data['release_date'], errors='coerce').apply(lambda x: str(x).split('-')[0] if x != np.nan else np.nan)

In [190]:
# gattering rows which have more votes than at least 95% of the movies in the list (with not null value for vote_count & vote_average)
# selecting 'title', 'year', 'vote_count', 'vote_average', 'popularity', 'genres' columns (Importnt columns for us)
qualified = data[(data['vote_count'] >= m) & (data['vote_count'].notnull()) & (data['vote_average'].notnull())][['title', 'year', 'vote_count', 'vote_average', 'popularity', 'genres']]

# cast vote_count to integer type for our data
qualified['vote_count'] = qualified['vote_count'].astype('int')

# cast vote_average to integer type for our data
qualified['vote_average'] = qualified['vote_average'].astype('int')

# shape of qualified data (With selected columns)
qualified.shape

(2274, 6)

 2274 Movies qualify to be on our chart.

In [191]:
qualified

Unnamed: 0,title,year,vote_count,vote_average,popularity,genres
0,Toy Story,1995,5415,7,21.946943,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '..."
1,Jumanji,1995,2413,6,17.015539,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '..."
5,Heat,1995,1886,7,17.924927,"[{'id': 28, 'name': 'Action'}, {'id': 80, 'nam..."
9,GoldenEye,1995,1194,6,14.686036,"[{'id': 12, 'name': 'Adventure'}, {'id': 28, '..."
15,Casino,1995,1343,7,10.137389,"[{'id': 18, 'name': 'Drama'}, {'id': 80, 'name..."
...,...,...,...,...,...,...
44624,What Happened to Monday,2017,598,7,3.393351,"[{'id': 878, 'name': 'Science Fiction'}, {'id'..."
44632,Atomic Blonde,2017,748,6,3.393351,"[{'id': 28, 'name': 'Action'}, {'id': 53, 'nam..."
44678,Dunkirk,2017,2712,7,3.393351,"[{'id': 28, 'name': 'Action'}, {'id': 18, 'nam..."
44842,Transformers: The Last Knight,2017,1440,6,3.393351,"[{'id': 28, 'name': 'Action'}, {'id': 878, 'na..."


In [22]:
# define calculating weighted_rating function to add its column to data
def weighted_rating(x):
    v = x['vote_count']
    R = x['vote_average']
    return (v/(v+m) * R) + (m/(m+v) * C)

# adding weighted-rating column to qualified data
qualified['weighted_rating'] = qualified.apply(weighted_rating, axis=1)

# sort qualified data base weighted_rating column
qualified = qualified.sort_values('weighted_rating', ascending=False).head(250)

**Top movies**

In [23]:
qualified.head(15)

Unnamed: 0,title,year,vote_count,vote_average,popularity,genres,weighted_rating
15480,Inception,2010,14075,8,29.108149,"[{'id': 28, 'name': 'Action'}, {'id': 53, 'nam...",7.917606
12481,The Dark Knight,2008,12269,8,123.167259,"[{'id': 18, 'name': 'Drama'}, {'id': 28, 'name...",7.905892
22879,Interstellar,2014,11187,8,32.213481,"[{'id': 12, 'name': 'Adventure'}, {'id': 18, '...",7.89713
2843,Fight Club,1999,9678,8,63.869599,"[{'id': 18, 'name': 'Drama'}]",7.881778
4863,The Lord of the Rings: The Fellowship of the Ring,2001,8892,8,32.070725,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",7.871814
292,Pulp Fiction,1994,8670,8,140.950236,"[{'id': 53, 'name': 'Thriller'}, {'id': 80, 'n...",7.868689
314,The Shawshank Redemption,1994,8358,8,51.645403,"[{'id': 18, 'name': 'Drama'}, {'id': 80, 'name...",7.864029
7000,The Lord of the Rings: The Return of the King,2003,8226,8,29.324358,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",7.861956
351,Forrest Gump,1994,8147,8,48.307194,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",7.860685
5814,The Lord of the Rings: The Two Towers,2002,7641,8,29.423537,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",7.851955


In the previous part, we saw the three Christopher Nolan Films, Inception, The Dark Knight, and Interstellar, occurred at the very top of our chart. The chart also indicates a strong bias of TMDB Users towards particular genres and directors.
Then I constructed our function that builds charts for particular genres. For this, I used to relax our default conditions to the 85th percentile instead of 95.


In [193]:
from ast import literal_eval

data['genres'] = data['genres'].fillna('[]').apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])


In [194]:
data

Unnamed: 0,adult,budget,genres,id,imdb_id,original_language,original_title,overview,popularity,poster_path,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,year
0,False,30000000,"[Animation, Comedy, Family]",862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,...,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0,1995
1,False,65000000,"[Adventure, Fantasy, Family]",8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,...,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,1995
2,False,0,"[Romance, Comedy]",15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,11.712900,/6ksm1sjKMFLbO7UY2i6G1ju9SML.jpg,...,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0,1995
3,False,16000000,"[Comedy, Drama, Romance]",31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",3.859495,/16XOMpEaLWkrcPqSQqhTmeJuqQl.jpg,...,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0,1995
4,False,0,[Comedy],11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,8.387519,/e64sOI48hQXyru7naBFyssKFxVd.jpg,...,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0,1995
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45461,False,0,"[Drama, Family]",439050,tt6209470,fa,رگ خواب,Rising and falling between a man and woman.,3.393351,/jldsYflnId4tTWPx8es3uzsB1I8.jpg,...,0.0,90.0,"[{'iso_639_1': 'fa', 'name': 'فارسی'}]",Released,Rising and falling between a man and woman,Subdue,False,4.0,1.0,2020
45462,False,0,[Drama],111109,tt2028550,tl,Siglo ng Pagluluwal,An artist struggles to finish his work while a...,3.393351,/xZkmxsNmYXJbKVsTRLLx3pqGHx7.jpg,...,0.0,360.0,"[{'iso_639_1': 'tl', 'name': ''}]",Released,,Century of Birthing,False,9.0,3.0,2011
45463,False,0,"[Action, Drama, Thriller]",67758,tt0303758,en,Betrayal,"When one of her hits goes wrong, a professiona...",3.393351,/d5bX92nDsISNhu3ZT69uHwmfCGw.jpg,...,0.0,90.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,A deadly game of wits.,Betrayal,False,3.8,6.0,2003
45464,False,0,[],227506,tt0008536,en,Satana likuyushchiy,"In a small town live two brothers, one a minis...",3.393351,/aorBPO7ak8e8iJKT5OcqYxU3jlK.jpg,...,0.0,87.0,[],Released,,Satan Triumphant,False,0.0,0.0,1917


In [25]:
# snippet transforms the 'genres' column in data into a new Series s that contains each individual genre as a separate value, with a single level index
snippet_genres = data.apply(lambda x: pd.Series(x['genres']), axis=1).stack().reset_index(level=1, drop=True)
snippet_genres.name = 'genre'
gen_data = data.drop('genres', axis=1).join(snippet_genres)
# we did this to make genre column intractive

In [26]:
gen_data

Unnamed: 0,adult,budget,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,...,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,year,genre
0,False,30000000,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,"[{'name': 'Pixar Animation Studios', 'id': 3}]",...,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0,1995,Animation
0,False,30000000,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,"[{'name': 'Pixar Animation Studios', 'id': 3}]",...,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0,1995,Comedy
0,False,30000000,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,"[{'name': 'Pixar Animation Studios', 'id': 3}]",...,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0,1995,Family
1,False,65000000,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...",...,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,1995,Adventure
1,False,65000000,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...",...,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,1995,Fantasy
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45463,False,0,67758,tt0303758,en,Betrayal,"When one of her hits goes wrong, a professiona...",3.393351,/d5bX92nDsISNhu3ZT69uHwmfCGw.jpg,"[{'name': 'American World Pictures', 'id': 6165}]",...,90.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,A deadly game of wits.,Betrayal,False,3.8,6.0,2003,Action
45463,False,0,67758,tt0303758,en,Betrayal,"When one of her hits goes wrong, a professiona...",3.393351,/d5bX92nDsISNhu3ZT69uHwmfCGw.jpg,"[{'name': 'American World Pictures', 'id': 6165}]",...,90.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,A deadly game of wits.,Betrayal,False,3.8,6.0,2003,Drama
45463,False,0,67758,tt0303758,en,Betrayal,"When one of her hits goes wrong, a professiona...",3.393351,/d5bX92nDsISNhu3ZT69uHwmfCGw.jpg,"[{'name': 'American World Pictures', 'id': 6165}]",...,90.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,A deadly game of wits.,Betrayal,False,3.8,6.0,2003,Thriller
45464,False,0,227506,tt0008536,en,Satana likuyushchiy,"In a small town live two brothers, one a minis...",3.393351,/aorBPO7ak8e8iJKT5OcqYxU3jlK.jpg,"[{'name': 'Yermoliev', 'id': 88753}]",...,87.0,[],Released,,Satan Triumphant,False,0.0,0.0,1917,


In [35]:
def build_chart(genre, head_num=200, percentile=0.85):
  # filter rows with input genre

  df = gen_data[gen_data['genre'] == genre]

  # cast vote_count to integer for filtered-data
  vote_counts = df[df['vote_count'].notnull()]['vote_count'].astype('int')

  # cast vote_average to integer for filtered-data
  vote_averages = df[df['vote_average'].notnull()]['vote_average'].astype('int')

  # calculate C (the mean vote across the whole report) for filtered-data
  C = vote_averages.mean()

  # caculate m (the minimum votes required to be listed in the chart - it must have more votes than at least {percentile} of the movies in the list) for filtered-data
  m = vote_counts.quantile(percentile)

  # make qualified-table
  qualified = df[(df['vote_count'] >= m) & (df['vote_count'].notnull()) & (df['vote_average'].notnull())][['title', 'year', 'vote_count', 'vote_average', 'popularity']]
  qualified['vote_count'] = qualified['vote_count'].astype('int')
  qualified['vote_average'] = qualified['vote_average'].astype('int')

  qualified['weighted_rating'] = qualified.apply(lambda x: (x['vote_count']/(x['vote_count']+m) * x['vote_average']) + (m/(m+x['vote_count']) * C), axis=1)
  qualified = qualified.sort_values('weighted_rating', ascending=False).head(head_num)

  return qualified

In [36]:
# try to enter head
build_chart("Romance", 15)

Unnamed: 0,title,year,vote_count,vote_average,popularity,weighted_rating
10309,Dilwale Dulhania Le Jayenge,1995,661,9,34.457024,8.565285
351,Forrest Gump,1994,8147,8,48.307194,7.971357
876,Vertigo,1958,1162,8,18.20822,7.811667
40251,Your Name.,2016,1030,8,3.393351,7.789489
883,Some Like It Hot,1959,835,8,11.845107,7.745154
1132,Cinema Paradiso,1988,834,8,14.177005,7.744878
19901,Paperman,2012,734,8,7.198633,7.713951
37863,Sing Street,2016,669,8,3.393351,7.689483
882,The Apartment,1960,498,8,11.994281,7.599317
38718,The Handmaiden,2016,453,8,3.393351,7.566166


To personalise our recommendations more, I am going to build an engine that computes similarity between movies based on certain metrics and suggests movies that are most similar to a particular movie that a user liked. Since we will be using movie metadata (or content) to build this engine, this also known as Content Based Filtering.

I will build two Content Based Recommenders based on:

Movie Overviews and Taglines

Movie Cast, Crew, Keywords and Genre

Also, as mentioned in the introduction, I will be using a subset of all the movies available to us due to limiting computing power available to me.

In [37]:
links_small = pd.read_csv('/content/data/links_small.csv')
links_small.head(5)

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [38]:
links_small = links_small[links_small['tmdbId'].notnull()]['tmdbId'].astype('int')

In [39]:
smd = data[data['id'].isin(links_small)]
smd.shape

(9099, 23)

We have 9099 movies avaiable in our small movies metadata dataset which is 5 times smaller than our original dataset of 45000 movies.

***Movie Description Based Recommender***

Let us first try to build a recommender using movie descriptions and taglines. We do not have a quantitative metric to judge our machine's performance so this will have to be done qualitatively.

In [40]:
smd['tagline'] = smd['tagline'].fillna('')
smd['description'] = smd['overview'] + smd['tagline']
smd['description'] = smd['description'].fillna('')

In [41]:
smd['description']

0        Led by Woody, Andy's toys live happily in his ...
1        When siblings Judy and Peter discover an encha...
2        A family wedding reignites the ancient feud be...
3        Cheated on, mistreated and stepped on, the wom...
4        Just when George Banks has recovered from his ...
                               ...                        
40224    From the mind behind Evangelion comes a hit la...
40503    The band stormed Europe in 1963, and, in 1964,...
44821    When Molly Hale's sadness of her father's disa...
44826    All your favorite Pokémon characters are back,...
45265    While holidaying in the French Alps, a Swedish...
Name: description, Length: 9099, dtype: object

***TfidfVectorizer***

The TfidfVectorizer class in scikit-learn is used to convert a collection of raw documents into a matrix of TF-IDF (Term Frequency-Inverse Document Frequency) features. TF-IDF is a numerical statistic that reflects the importance of a word in a document relative to a collection of documents.

We use tf to recommend movies base description [overview + tagline]

In [42]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(smd['description'])
tfidf_matrix.shape

(9099, 268124)

***Cosine Similarity***


I will be using the Cosine Similarity to calculate a numeric quantity that denotes the similarity between two movies. Mathematically, it is defined as follows:

cosine(x,y)=(x.y^⊺)/(||x||.||y||)

Since we have used the TF-IDF Vectorizer, calculating the Dot Product will directly give us the Cosine Similarity Score. Therefore, we will use sklearn's linear_kernel instead of cosine_similarities since it is much faster.

In [43]:
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
cosine_sim[0]

array([1.        , 0.00680476, 0.        , ..., 0.        , 0.00344913,
       0.        ])

We now have a pairwise cosine similarity matrix for all the movies in our dataset. The next step is to write a function that returns the 30 most similar movies based on the cosine similarity score.

In [44]:
smd = smd.reset_index()
titles = smd['title']
indices = pd.Series(smd.index, index=smd['title'])

In [45]:
def get_recommendations(title, head_num=200):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:head_num+1]
    movie_indices = [i[0] for i in sim_scores]
    return titles.iloc[movie_indices]

We're all set. Let us now try and get the top recommendations for a few movies and see how good the recommendations are.

In [46]:
get_recommendations('The Godfather', 10)

973      The Godfather: Part II
8387                 The Family
3509                       Made
4196         Johnny Dangerously
29               Shanghai Triad
5667                       Fury
2412             American Movie
1582    The Godfather: Part III
4221                    8 Women
2159              Summer of Sam
Name: title, dtype: object

In [47]:
get_recommendations('The Usual Suspects', 15)

5161            Enter the Dragon
2977                       Shaft
985                    The Sting
3657                Tango & Cash
5216             Throne of Blood
3977     All About the Benjamins
331                     The Mask
8010                    The Raid
3303    The Million Dollar Hotel
2392                     RoboCop
8291                       Syrup
4611                Once a Thief
2409                The Bachelor
6325                  Glory Road
8391                    Nebraska
Name: title, dtype: object

In [48]:
get_recommendations('The Dark Knight', 15)

7931                      The Dark Knight Rises
132                              Batman Forever
1113                             Batman Returns
8227    Batman: The Dark Knight Returns, Part 2
7565                 Batman: Under the Red Hood
524                                      Batman
7901                           Batman: Year One
2579               Batman: Mask of the Phantasm
2696                                        JFK
8165    Batman: The Dark Knight Returns, Part 1
6144                              Batman Begins
7933         Sherlock Holmes: A Game of Shadows
5511                            To End All Wars
4489                                      Q & A
7344                        Law Abiding Citizen
Name: title, dtype: object

We see that for The Dark Knight, our system is able to identify it as a Batman film and subsequently recommend other Batman films as its top recommendations. But unfortunately, that is all this system can do at the moment. This is not of much use to most people as it doesn't take into considerations very important features such as cast, crew, director and genre, which determine the rating and the popularity of a movie. Someone who liked The Dark Knight probably likes it more because of Nolan and would hate Batman Forever and every other substandard movie in the Batman Franchise.

Therefore, we are going to use much more suggestive metadata than Overview and Tagline. In the next subsection, we will build a more sophisticated recommender that takes genre, keywords, cast and crew into consideration.

***Metadata Based Recommender***

To build our standard metadata based content recommender, we will need to merge our current dataset with the crew and the keyword datasets. Let us prepare this data as our first step.

In [49]:
credits = pd.read_csv('/content/data/credits.csv')
credits.head(5)

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602
3,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357
4,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862


In [50]:
keywords = pd.read_csv('/content/data/keywords.csv')
keywords.head(5)

Unnamed: 0,id,keywords
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,31357,"[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,11862,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


In [51]:
keywords['id'] = keywords['id'].astype('int')
credits['id'] = credits['id'].astype('int')
data['id'] = data['id'].astype('int')

In [52]:
data.shape

(45463, 23)

In [53]:
data = data.merge(credits, on='id')
data = data.merge(keywords, on='id')
smd = data[data['id'].isin(links_small)]
smd.shape

(9219, 26)

We now have our cast, crew, genres and credits, all in one dataframe. Let us wrangle this a little more using the following intuitions:

Crew: From the crew, we will only pick the director as our feature since the others don't contribute that much to the feel of the movie.
Cast: Choosing Cast is a little more tricky. Lesser known actors and minor roles do not really affect people's opinion of a movie. Therefore, we must only select the major characters and their respective actors. Arbitrarily we will choose the top 3 actors that appear in the credits list.

In [54]:
smd['cast'] = smd['cast'].apply(literal_eval)
smd['crew'] = smd['crew'].apply(literal_eval)
smd['keywords'] = smd['keywords'].apply(literal_eval)
smd['cast_size'] = smd['cast'].apply(lambda x: len(x))
smd['crew_size'] = smd['crew'].apply(lambda x: len(x))

In [55]:
def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan

In [56]:
smd['director'] = smd['crew'].apply(get_director)

In [57]:
smd['cast'] = smd['cast'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
smd['cast'] = smd['cast'].apply(lambda x: x[:3] if len(x) >=3 else x)

In [58]:
smd['keywords'] = smd['keywords'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

My approach to building the recommender is going to be extremely hacky. What I plan on doing is creating a metadata dump for every movie which consists of genres, director, main actors and keywords. I then use a Count Vectorizer to create our count matrix as we did in the Description Recommender. The remaining steps are similar to what we did earlier: we calculate the cosine similarities and return movies that are most similar.

In [59]:
smd['cast'] = smd['cast'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])
smd['director'] = smd['director'].astype('str').apply(lambda x: str.lower(x.replace(" ", "")))
smd['director'] = smd['director'].apply(lambda x: [x,x, x])

Keywords

We will do a small amount of pre-processing of our keywords before putting them to any use. As a first step, we calculate the frequenct counts of every keyword that appears in the dataset.

In [60]:
s = smd.apply(lambda x: pd.Series(x['keywords']),axis=1).stack().reset_index(level=1, drop=True)
s.name = 'keyword'
s = s.value_counts()
s[:5]

independent film        610
woman director          550
murder                  399
duringcreditsstinger    327
based on novel          318
Name: keyword, dtype: int64

Keywords occur in frequencies ranging from 1 to 610. We do not have any use for keywords that occur only once. Therefore, these can be safely removed. Finally, we will convert every word to its stem so that words such as Dogs and Dog are

In [61]:
s = s[s > 1]

In [62]:
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer('english')
stemmer.stem('dogs')

'dog'

In [63]:
def filter_keywords(x):
    words = []
    for i in x:
        if i in s:
            words.append(i)
    return words

In [64]:
smd['keywords'] = smd['keywords'].apply(filter_keywords)
smd['keywords'] = smd['keywords'].apply(lambda x: [stemmer.stem(i) for i in x])
smd['keywords'] = smd['keywords'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])

In [65]:
smd['soup'] = smd['keywords'] + smd['cast'] + smd['director'] + smd['genres']
smd['soup'] = smd['soup'].apply(lambda x: ' '.join(x))

In [66]:
count = CountVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
count_matrix = count.fit_transform(smd['soup'])
cosine_sim = cosine_similarity(count_matrix, count_matrix)
smd = smd.reset_index()
titles = smd['title']
indices = pd.Series(smd.index, index=smd['title'])

In [None]:
smd.to_pickle('smd_data.pkl')

In [76]:
indices.to_csv('indices_data.csv')

In [None]:
np.save('cosine_sim_data.npy', cosine_sim)

We will reuse the get_recommendations function that we had written earlier. Since our cosine similarity scores have changed, we expect it to give us different (and probably better) results. Let us check for The Dark Knight again and see what recommendations I get this time around.

In [79]:
get_recommendations('The Dark Knight', 15)

8031                 The Dark Knight Rises
6218                         Batman Begins
6623                          The Prestige
2085                             Following
7648                             Inception
4145                              Insomnia
3381                               Memento
8613                          Interstellar
7659            Batman: Under the Red Hood
1134                        Batman Returns
8927               Kidnapping Mr. Heineken
5943                              Thursday
1260                        Batman & Robin
9024    Batman v Superman: Dawn of Justice
4021                  The Long Good Friday
Name: title, dtype: object

In [80]:
get_recommendations('Restoration', 15)

4396           The Emperor's Club
8772               The Best of Me
2981                     Soapdish
533                  One Fine Day
2115    A Midsummer Night's Dream
583                  Country Life
5417             The Razor's Edge
265     A Pyromaniac's Love Story
1429          Two Girls and a Guy
5089             The Great Gatsby
5090            Wuthering Heights
5660              Shall We Dance?
345      When a Man Loves a Woman
5263                 Pat and Mike
5633                       Lilith
Name: title, dtype: object

Popularity and Ratings

One thing that we notice about our recommendation system is that it recommends movies regardless of ratings and popularity. It is true that Batman and Robin has a lot of similar characters as compared to The Dark Knight but it was a terrible movie that shouldn't be recommended to anyone.

Therefore, we will add a mechanism to remove bad movies and return movies which are popular and have had a good critical response.

I will take the top 25 movies based on similarity scores and calculate the vote of the 60th percentile movie. Then, using this as the value of  m
 , we will calculate the weighted rating of each movie using IMDB's formula like we did in the Simple Recommender section.

In [81]:
def improved_recommendations(title, head_num=200):
  idx = indices[title]
  sim_scores = list(enumerate(cosine_sim[idx]))
  sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
  sim_scores = sim_scores[1:26]
  movie_indices = [i[0] for i in sim_scores]

  movies = smd.iloc[movie_indices][['title', 'vote_count', 'vote_average', 'year']]
  vote_counts = movies[movies['vote_count'].notnull()]['vote_count'].astype('int')
  vote_averages = movies[movies['vote_average'].notnull()]['vote_average'].astype('int')
  C = vote_averages.mean()
  m = vote_counts.quantile(0.60)
  qualified = movies[(movies['vote_count'] >= m) & (movies['vote_count'].notnull()) & (movies['vote_average'].notnull())]
  qualified['vote_count'] = qualified['vote_count'].astype('int')
  qualified['vote_average'] = qualified['vote_average'].astype('int')
  qualified['wr'] = qualified.apply(weighted_rating, axis=1)
  qualified = qualified.sort_values('wr', ascending=False).head(head_num)
  return qualified

In [126]:
indices

title
Toy Story                                                0
Jumanji                                                  1
Grumpier Old Men                                         2
Waiting to Exhale                                        3
Father of the Bride Part II                              4
                                                      ... 
The Last Brickmaker in America                        9214
Rustom                                                9215
Mohenjo Daro                                          9216
Shin Godzilla                                         9217
The Beatles: Eight Days a Week - The Touring Years    9218
Length: 9219, dtype: int64

In [82]:
improved_recommendations('The Dark Knight', 15)

Unnamed: 0,title,vote_count,vote_average,year,wr
7648,Inception,14075,8,2010,7.917606
8613,Interstellar,11187,8,2014,7.89713
6623,The Prestige,4510,8,2006,7.758198
3381,Memento,4168,8,2000,7.740228
8031,The Dark Knight Rises,9263,7,2012,6.921465
6218,Batman Begins,7511,7,2005,6.904147
1134,Batman Returns,1706,6,1992,5.846887
132,Batman Forever,1529,5,1995,5.054131
9024,Batman v Superman: Dawn of Justice,7189,5,2016,5.013939
1260,Batman & Robin,1447,4,1997,4.287178


In [103]:
improved_recommendations('Richard III', 15)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  qualified['vote_count'] = qualified['vote_count'].astype('int')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  qualified['vote_average'] = qualified['vote_average'].astype('int')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  qualified['wr'] = qualified.apply(weighted_rating, axis=1)


Unnamed: 0,title,vote_count,vote_average,year,wr
6596,Pan's Labyrinth,3122,7,2006,6.785836
8835,Unbroken,1437,7,2014,6.592952
9082,Beasts of No Nation,476,7,2015,6.163044
6970,The Counterfeiters,145,7,2007,5.684494
4561,Tears of the Sun,582,6,2003,5.677482
5639,Wimbledon,303,6,2004,5.555372
997,Henry V,73,7,1989,5.49764
3174,Tigerland,155,6,2000,5.44363
3979,Hart's War,246,5,2002,5.156278
6428,Firewall,270,5,2006,5.15095


***Collaborative Filtering***

Our content based engine suffers from some severe limitations. It is only capable of suggesting movies which are close to a certain movie. That is, it is not capable of capturing tastes and providing recommendations across genres.

Also, the engine that we built is not really personal in that it doesn't capture the personal tastes and biases of a user. Anyone querying our engine for recommendations based on a movie will receive the same recommendations for that movie, regardless of who s/he is.

Therefore, in this section, we will use a technique called Collaborative Filtering to make recommendations to Movie Watchers. Collaborative Filtering is based on the idea that users similar to a me can be used to predict how much I will like a particular product or service those users have used/experienced but I have not.

I will not be implementing Collaborative Filtering from scratch. Instead, I will use the Surprise library that used extremely powerful algorithms like Singular Value Decomposition (SVD) to minimise RMSE (Root Mean Square Error) and give great recommendations.

In [86]:
!pip install scikit-surprise
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate, KFold

Collecting scikit-surprise
  Downloading scikit-surprise-1.1.3.tar.gz (771 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/772.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m256.0/772.0 kB[0m [31m7.3 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m772.0/772.0 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.3-cp310-cp310-linux_x86_64.whl size=3096338 sha256=48cb2023c43d899b239c10e360848ad0d1e15f5ad9fe66c327da985b39fc6483
  Stored in directory: /root/.cache/pip/wheels/a5/ca/a8/4e28def53797fdc4363ca4af740db15a9c2f1595ebc51fb445
Successfully built scikit-surprise
Installing collected packages: scikit-

In [87]:
reader = Reader()
ratings = pd.read_csv('/content/data/ratings_small.csv')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [124]:
rating_data = pd.read_csv('/content/data/ratings_small.csv').drop(['timestamp'], axis=1)
removing_columns = ['index', 'adult', 'budget', 'genres', 'id', 'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies', 'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'video', 'vote_average', 'vote_count', 'year', 'cast', 'crew', 'keywords',
       'cast_size', 'crew_size', 'director', 'soup']
rating_title_data = rating_data.merge(smd, left_on='movieId', right_on='id').drop(removing_columns, axis=1)

In [125]:
rating_title_data.to_pickle('rating_data.pkl')

In [88]:
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)
kf = KFold(n_splits=5)

svd = SVD()
cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=kf)

{'test_rmse': array([0.88921908, 0.89477933, 0.90204096, 0.89327409, 0.90560663]),
 'test_mae': array([0.684199  , 0.68862778, 0.69422635, 0.68638943, 0.69649204]),
 'fit_time': (1.241581678390503,
  1.9399735927581787,
  2.110851764678955,
  3.0659844875335693,
  2.313386917114258),
 'test_time': (0.5774059295654297,
  0.36243748664855957,
  0.32106661796569824,
  0.28281712532043457,
  0.11279511451721191)}

In [89]:
trainset = data.build_full_trainset()
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fb607a43940>

In [90]:
ratings[ratings['userId'] == 1]

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205
5,1,1263,2.0,1260759151
6,1,1287,2.0,1260759187
7,1,1293,2.0,1260759148
8,1,1339,3.5,1260759125
9,1,1343,2.0,1260759131


In [91]:
svd.predict(1, 302, 3)

Prediction(uid=1, iid=302, r_ui=3, est=2.6549142760525566, details={'was_impossible': False})

In [None]:
from surprise import dump

dump.dump('model_file', algo=svd)

In [92]:
def convert_int(x):
    try:
        return int(x)
    except:
        return np.nan

In [93]:
id_map = pd.read_csv('/content/data/links_small.csv')[['movieId', 'tmdbId']]
id_map['tmdbId'] = id_map['tmdbId'].apply(convert_int)
id_map.columns = ['movieId', 'id']
id_map = id_map.merge(smd[['title', 'id']], on='id').set_index('title')

In [99]:
id_map.to_pickle('id_map_data.pkl')

In [96]:
indices_map = id_map.set_index('id')

In [100]:
indices_map.to_pickle('indices_map_data.pkl')

In [101]:
def hybrid(userId, title, head_num=10):
    idx = indices[title]
    tmdbId = id_map.loc[title]['id']

    movie_id = id_map.loc[title]['movieId']

    sim_scores = list(enumerate(cosine_sim[int(idx)]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:26]
    movie_indices = [i[0] for i in sim_scores]

    movies = smd.iloc[movie_indices][['title', 'vote_count', 'vote_average', 'year', 'id']]
    movies['est'] = movies['id'].apply(lambda x: svd.predict(userId, indices_map.loc[x]['movieId']).est)
    movies = movies.sort_values('est', ascending=False)
    return movies.head(head_num)

In [197]:
hybrid(80, 'Avatar', 13)

Unnamed: 0,title,vote_count,vote_average,year,id,est
974,Aliens,3282.0,7.7,1986,679,4.148646
522,Terminator 2: Judgment Day,4274.0,7.7,1991,280,3.939488
1011,The Terminator,4208.0,7.4,1984,218,3.768555
8658,X-Men: Days of Future Past,6155.0,7.5,2014,127585,3.744055
1621,Darby O'Gill and the Little People,35.0,6.7,1959,18887,3.743479
8401,Star Trek Into Darkness,4479.0,7.4,2013,54138,3.709726
2014,Fantastic Planet,140.0,7.6,1973,16306,3.614358
1376,Titanic,7770.0,7.5,1997,597,3.532304
1668,Return from Witch Mountain,38.0,5.6,1978,14822,3.512823
344,True Lies,1138.0,6.8,1994,36955,3.489947


In [196]:
hybrid(25, 'Avatar', 13)

Unnamed: 0,title,vote_count,vote_average,year,id,est
522,Terminator 2: Judgment Day,4274.0,7.7,1991,280,3.748513
974,Aliens,3282.0,7.7,1986,679,3.598249
8401,Star Trek Into Darkness,4479.0,7.4,2013,54138,3.558352
1011,The Terminator,4208.0,7.4,1984,218,3.49865
2014,Fantastic Planet,140.0,7.6,1973,16306,3.398614
8658,X-Men: Days of Future Past,6155.0,7.5,2014,127585,3.335864
1376,Titanic,7770.0,7.5,1997,597,3.316582
1621,Darby O'Gill and the Little People,35.0,6.7,1959,18887,3.305522
922,The Abyss,822.0,7.1,1989,2756,3.294811
4017,Hawk the Slayer,13.0,4.5,1980,25628,3.223319


In [113]:
hybrid(25, 'Richard III')

Unnamed: 0,title,vote_count,vote_average,year,id,est
997,Henry V,73.0,7.4,1989,10705,3.884769
6596,Pan's Labyrinth,3122.0,7.6,2006,1417,3.587221
6970,The Counterfeiters,145.0,7.3,2007,7862,3.547935
5291,Ambush,13.0,6.3,1999,49320,3.507349
8247,The Patience Stone,33.0,6.2,2012,128158,3.465927
1191,Prisoner of the Mountains,14.0,6.6,1996,55936,3.457796
3304,Hope and Glory,44.0,6.5,1987,32054,3.456722
2154,"Run Silent, Run Deep",31.0,7.5,1958,18784,3.389073
5779,The Unknown Soldier,19.0,7.8,1955,36439,3.377613
5056,King of Hearts,15.0,6.8,1966,17685,3.312065


In [102]:
hybrid(88, 'Mortal Kombat')

Unnamed: 0,title,vote_count,vote_average,year,id,est
8867,Warcraft,2325.0,6.3,2016,68735,3.553159
1046,Highlander,642.0,6.8,1986,8009,3.396544
1854,Soldier,226.0,6.1,1998,9425,3.375322
7990,The Three Musketeers,945.0,5.6,2011,52451,3.35909
7094,Death Race,1205.0,6.0,2008,10483,3.277658
6736,Meet the Robinsons,787.0,6.7,2007,1267,3.270112
6562,Cabin in the Sky,8.0,7.1,1943,59964,3.247353
5866,The Raiders of Atlantis,8.0,6.4,1983,29173,3.201157
7077,The Mummy: Tomb of the Dragon Emperor,1418.0,5.2,2008,1735,3.18456
7418,Ink,102.0,6.5,2009,24869,3.113554


In [112]:
def search(text):
  results = []
  results.append(smd[smd['title'].str.contains(text, case=False)]['title'])
  results.append(smd[smd['overview'].str.contains(text, case=False)]['title'])
  results.append(smd[smd['genres'].apply(lambda x: any(text.lower() in genre.lower() for genre in x))]['title'])
  return results[0].tolist() + results[1].tolist() + results[2].tolist()

In [117]:
search('ro')

['Cutthroat Island',
 'Four Rooms',
 'Dangerous Minds',
 'Across the Sea of Time',
 'Mighty Aphrodite',
 'From Dusk Till Dawn',
 'Bed of Roses',
 'The Crossing Guard',
 'The Juror',
 'Vampire in Brooklyn',
 'Broken Arrow',
 'Bottle Rocket',
 'Mr. Wrong',
 'Rumble in the Bronx',
 'The Neverending Story III: Escape from Fantasia',
 'The Brothers McMullen',
 'Rob Roy',
 'The Prophecy',
 'Unstrung Heroes',
 'The Browning Version',
 'Drop Zone',
 'Far from Home: The Adventures of Yellow Dog',
 'Leon: The Professional',
 "A Pyromaniac's Love Story",
 'Roommates',
 'The Secret of Roan Inish',
 'Tales from the Crypt: Demon Knight',
 'Tales from the Hood',
 'Bullets Over Broadway',
 'Crooklyn',
 'The Crow',
 'I Love Trouble',
 'Red Rock West',
 'A Bronx Tale',
 'Cops & Robbersons',
 'Dangerous Game',
 'Geronimo: An American Legend',
 'The Hudsucker Proxy',
 'Last Action Hero',
 'The Program',
 'The Road to Wellville',
 'RoboCop 3',
 'Robin Hood: Men in Tights',
 'Romeo Is Bleeding',
 'Romper St

In [158]:
def get_user_top_movies(user_id):
  return list(rating_title_data[rating_title_data['userId'] == user_id].sort_values('rating', ascending=False)['title'][:5])

get_user_top_movie(1)

In [180]:
def user_search(user_id):
  movies = get_user_top_movies(user_id)
  if (len(movies) < 1): return
  result = hybrid(user_id, movies[0])
  for i in range(1, len(movies)):
    result = pd.concat([result, hybrid(user_id, movies[i])], ignore_index=True)
  return result.sort_values('vote_average', ascending=False)

In [181]:
user_search(3)

Unnamed: 0,title,vote_count,vote_average,year,id,est
21,The Matrix,9079.0,7.9,1999,603,3.969753
20,Ghost in the Shell,854.0,7.8,1995,9323,4.15308
47,Amélie,3403.0,7.8,2001,194,3.618942
42,Breathless,322.0,7.7,1960,269,3.724446
22,Terminator 2: Judgment Day,4274.0,7.7,1991,280,3.70361
31,Three Colors: Blue,311.0,7.7,1993,108,3.887627
37,Pierrot le Fou,134.0,7.7,1965,2786,3.517382
0,"Paris, Texas",282.0,7.7,1984,655,3.995019
48,Pierrot le Fou,134.0,7.7,1965,2786,3.517382
2,Braveheart,3404.0,7.7,1995,197,3.885388
