## Simple recommender based on popularity

In [11]:
import pandas as pd

movies = pd.read_csv(r'data\movies.csv', low_memory=False)
ratings = pd.read_csv(r'data\ratings.csv', low_memory=False)
movie_ratings = pd.merge(movies, ratings)

In [12]:
# Additional columns  (Country, Age )
movie_ratings['user_country'] = 'user_country'
movie_ratings.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp,user_country
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,964982703,user_country
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,847434962,user_country
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,4.5,1106635946,user_country
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.5,1510577970,user_country
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17,4.5,1305696483,user_country


In [3]:
movie_ratings['rating'].value_counts(bins=10).sort_index() # check whether user ratings are valid or not (0.5-5)

(0.495, 0.95]     1370
(0.95, 1.4]       2811
(1.4, 1.85]       1791
(1.85, 2.3]       7551
(2.3, 2.75]       5550
(2.75, 3.2]      20047
(3.2, 3.65]      13136
(3.65, 4.1]      26818
(4.1, 4.55]       8551
(4.55, 5.0]      13211
Name: rating, dtype: int64

In [4]:
avg_rating_df = movie_ratings[['title', 'rating']].groupby(['title']).mean()
avg_rating_df = avg_rating_df.sort_values('rating', ascending=False)
avg_rating_df.head()

Unnamed: 0_level_0,rating
title,Unnamed: 1_level_1
Gena the Crocodile (1969),5.0
True Stories (1986),5.0
Cosmic Scrat-tastrophe (2015),5.0
Love and Pigeons (1985),5.0
Red Sorghum (Hong gao liang) (1987),5.0


In [5]:
avg_rating_df = movie_ratings.groupby('title')['rating'].agg(['mean', 'count'])
avg_rating_df.rename(columns={'mean': 'vote_average', 'count': 'vote_count'}, inplace=True)

avg_rating_df.sort_values(by='vote_average', ascending=False).head()

Unnamed: 0_level_0,vote_average,vote_count
title,Unnamed: 1_level_1,Unnamed: 2_level_1
Gena the Crocodile (1969),5.0,1
True Stories (1986),5.0,1
Cosmic Scrat-tastrophe (2015),5.0,1
Love and Pigeons (1985),5.0,1
Red Sorghum (Hong gao liang) (1987),5.0,1


In [6]:
# C is the mean vote across the whole report
C = avg_rating_df['vote_average'].mean()

# m is the minimum votes required to be listed
m = avg_rating_df['vote_count'].quantile(0.5)
q_movies = avg_rating_df.copy().loc[avg_rating_df['vote_count'] > m]
q_movies.sort_values(by='vote_average', ascending=False).head()


Unnamed: 0_level_0,vote_average,vote_count
title,Unnamed: 1_level_1,Unnamed: 2_level_1
"Trial, The (Procès, Le) (1962)",4.9,5
Adam's Rib (1949),4.75,4
Woman in the Dunes (Suna no onna) (1964),4.75,4
Black Mirror: White Christmas (2014),4.75,4
"Three Billboards Outside Ebbing, Missouri (2017)",4.75,8


In [7]:
def weighted_rating(x, m=m, C=C):
    v = x['vote_count']
    R = x['vote_average']
    return (v/(v+m) * R) + (m/(m+v) * C)

In [8]:
avg_rating_df2 = avg_rating_df.copy().loc[avg_rating_df['vote_count'] >= m] # Subset for vote_count above min votes.

avg_rating_df2['score'] = avg_rating_df2.apply(weighted_rating, axis=1)

avg_rating_df2 = avg_rating_df2.sort_values('score', ascending=False)

avg_rating_df2.head(10)

Unnamed: 0_level_0,vote_average,vote_count,score
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Shawshank Redemption, The (1994)",4.429022,317,4.418085
"Three Billboards Outside Ebbing, Missouri (2017)",4.75,8,4.344288
"Streetcar Named Desire, A (1951)",4.475,20,4.316833
Secrets & Lies (1996),4.590909,11,4.306226
"Trial, The (Procès, Le) (1962)",4.9,5,4.285896
Paths of Glory (1957),4.541667,12,4.285811
"Godfather, The (1972)",4.289062,192,4.273268
Guess Who's Coming to Dinner (1967),4.545455,11,4.270512
Fight Club (1999),4.272936,218,4.259218
Ran (1985),4.433333,15,4.238176
