In [None]:
# Simple Recommender

In [None]:
# sorting based on IMDb weighted average formula

In [None]:
# same for everyone for every movie

In [None]:
# Data source used for this model: IMDb movies dataset

In [83]:
import pandas as pd
import numpy as np

# import dataset - cleaned in the data cleaning file
movies = pd.read_csv('imdb movies_cleaned.csv')

  exec(code_obj, self.user_global_ns, self.user_ns)


In [66]:
# check shape and data types of the data
print(movies.shape)

(85855, 16)


In [4]:
print(movies.dtypes)

imdb_title_id          object
title                  object
original_title         object
year                   object
date_published         object
genre                  object
duration                int64
country                object
language               object
director               object
writer                 object
production_company     object
actors                 object
description            object
avg_vote              float64
votes                   int64
dtype: object


In [5]:
print(movies.isnull().sum())   # check for missing values

imdb_title_id         0
title                 0
original_title        0
year                  0
date_published        0
genre                 0
duration              0
country               0
language              0
director              0
writer                0
production_company    0
actors                0
description           0
avg_vote              0
votes                 0
dtype: int64


In [None]:
# calculate IMDB's weighted rating (score) for each movie
# v is the number of votes for the movie;
# m is the minimum votes required to be listed in the chart;
# R is the average rating of the movie; And
# C is the mean vote across the whole report

In [84]:
# C is the mean vote across the whole report
C= movies['avg_vote'].mean()   
C

5.898655873274613

In [89]:
# use 90th percentile as our cutoff
# for a movie to feature in the charts, it must have more votes than at least 90% of the movies in the list.
m= movies['votes'].quantile(0.9)
m

9819.600000000006

In [90]:
# filter the movies that qualify for the chart
filtered_movies = movies.copy().loc[movies['votes'] >= m]
filtered_movies.shape

(8586, 16)

In [None]:
# We see that 8586 movies are qualified to be in the list

In [91]:
def IMDB_weighted_rating(df, m=m, C=C):
    v = df['votes']
    R = df['avg_vote']
    # Calculation based on the IMDB formula
    return (v/(v+m) * R) + (m/(m+v) * C)

In [92]:
# Define a new feature 'score' and calculate its value with `IMDB_weighted_rating`
filtered_movies['score'] = filtered_movies.apply(IMDB_weighted_rating, axis=1)

In [93]:
# Sort movies based on score calculated above
filtered_movies = filtered_movies.sort_values('score', ascending=False)

# Print the top 10 movies
filtered_movies[['original_title','year','country','genre','votes', 'avg_vote', 'score']].head(10)

Unnamed: 0,original_title,year,country,genre,votes,avg_vote,score
28453,The Shawshank Redemption,1994,USA,Drama,2278845,9.3,9.285406
15528,The Godfather,1972,USA,"Crime, Drama",1572674,9.2,9.179515
48078,The Dark Knight,2008,"USA, UK","Action, Crime, Drama",2241615,9.0,8.986474
16556,The Godfather: Part II,1974,USA,"Crime, Drama",1098714,9.0,8.972528
28381,Pulp Fiction,1994,USA,"Crime, Drama",1780147,8.9,8.883535
34127,The Lord of the Rings: The Return of the King,2003,"New Zealand, USA","Action, Adventure, Drama",1604280,8.9,8.881741
27629,Schindler's List,1993,USA,"Biography, Drama, History",1183248,8.9,8.875297
8973,12 Angry Men,1957,USA,"Crime, Drama",668473,8.9,8.85655
57475,Inception,2010,"USA, UK","Action, Adventure, Sci-Fi",2002816,8.8,8.785844
32487,Fight Club,1999,"USA, Germany",Drama,1807440,8.8,8.784323


In [None]:
# Could further breakdown by Genre if needed...
# or try out diffent percentile level

In [94]:
# split the genre
movies['genre'] = movies['genre'].str.split(',')
movies = movies.explode('genre').reset_index()

In [96]:
def build_chart(genre, percentile=0.9):
    df = movies[movies['genre'] == genre]
    vote_counts = df[df['votes'].notnull()]['votes'].astype('int')
    vote_averages = df[df['avg_vote'].notnull()]['avg_vote'].astype('int')
    C = vote_averages.mean()
    m = vote_counts.quantile(percentile)
    
    qualified = df[(df['votes'] >= m) & (df['votes'].notnull()) & (df['votes'].notnull())][['original_title','year','country','genre','votes', 'avg_vote']]
    qualified['votes'] = qualified['votes'].astype('int')
    qualified['avg_vote'] = qualified['avg_vote'].astype('int')
    
    qualified['score'] = qualified.apply(lambda x: (x['votes']/(x['votes']+m) * x['avg_vote']) + (m/(m+x['votes']) * C), axis=1)
    qualified = qualified.sort_values('score', ascending=False).head(250)
    
    return qualified

In [102]:
build_chart('Adventure').head(10)

Unnamed: 0,original_title,year,country,genre,votes,avg_vote,score
105027,Interstellar,2014,"USA, UK, Canada",Adventure,1449256,8,7.970355
90939,Inglourious Basterds,2009,"Germany, USA",Adventure,1229958,8,7.96514
46039,Back to the Future,1985,USA,Adventure,1027330,8,7.958376
115106,Harry Potter and the Deathly Hallows: Part 2,2011,"UK, USA",Adventure,743339,8,7.942825
149047,The Martian,2015,"UK, USA, Hungary, Jordan",Adventure,740757,8,7.94263
136398,The Grand Budapest Hotel,2014,"Germany, USA",Adventure,687669,8,7.938307
28826,2001: A Space Odyssey,1968,"UK, USA",Adventure,587866,8,7.928125
103382,Into the Wild,2007,USA,Adventure,560692,8,7.924743
36290,Jaws,1975,USA,Adventure,535807,8,7.921355
35476,Monty Python and the Holy Grail,1975,UK,Adventure,491629,8,7.914521


In [103]:
build_chart('Horror').head(10)

Unnamed: 0,original_title,year,country,genre,votes,avg_vote,score
39796,Alien,1979,"UK, USA",Horror,768874,8,7.963386
23013,Psycho,1960,USA,Horror,586765,8,7.952164
43655,The Thing,1982,USA,Horror,360147,8,7.92267
34119,The Exorcist,1973,USA,Horror,354234,8,7.921405
158046,Get Out,2017,"USA, Japan",Horror,472430,7,6.956167
119761,It,2017,"Canada, USA",Horror,442715,7,6.953273
121027,The Conjuring,2013,USA,Horror,430412,7,6.95196
157551,Split,2016,"USA, Japan",Horror,408269,7,6.949401
93687,Saw,2004,USA,Horror,371979,7,6.944561
116646,The Cabin in the Woods,2011,USA,Horror,366795,7,6.943793
