## Simple recommender based on popularity among Friends

In [41]:
import pandas as pd

# Load the movie catalogue and the user ratings. Forward slashes keep these
# relative paths valid on every OS; a backslash is only a path separator on
# Windows and would be treated as part of the file name elsewhere.
movies = pd.read_csv('data/movies.csv', low_memory=False)
ratings = pd.read_csv('data/ratings.csv', low_memory=False)

# One row per (movie, rating) pair. The join key is named explicitly instead of
# relying on pd.merge's default of joining on every column the frames share.
movie_ratings = pd.merge(movies, ratings, on='movieId')

In [47]:
# Additional demo columns: a placeholder country and an example friend list.
movie_ratings['user_country'] = 'user_country'  # same placeholder on every row

# Example data for friends: only the rows rated by user 1 receive a friend list.
# The assigned Series is indexed by just those rows, so every other row ends up
# as NaN; a prior '' initialisation would be overwritten and is not needed.
is_example_user = movie_ratings['userId'] == 1
movie_ratings['friend_ids'] = (
    movie_ratings.loc[is_example_user, 'userId'].map(lambda _: [5, 7, 9, 10])
)

In [48]:
# Preview the first five merged rows, including the two demo columns.
movie_ratings.head(n=5)

Unnamed: 0,movieId,title,genres,userId,rating,timestamp,user_country,friend_ids
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,964982703,user_country,"[5, 7, 9, 10]"
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,847434962,user_country,
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,4.5,1106635946,user_country,
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.5,1510577970,user_country,
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17,4.5,1305696483,user_country,


In [50]:
# Check whether user ratings are valid (0.5-5): bucket them into five
# equal-width bins and list the bins from lowest to highest.
(
    movie_ratings['rating']
    .value_counts(bins=5)
    .sort_index()
)

(0.495, 1.4]     4181
(1.4, 2.3]       9342
(2.3, 3.2]      25597
(3.2, 4.1]      39954
(4.1, 5.0]      21762
Name: rating, dtype: int64

In [157]:
# Pull the friend_ids entries stored on the target user's own rating rows;
# they identify which users count as friends of the target user.
target_user = 1
is_target_row = movie_ratings['userId'] == target_user
friends_target_user = movie_ratings.loc[is_target_row, 'friend_ids']

In [158]:
# Take the friend list from the first of the target user's rows.
friends_list = friends_target_user.iloc[0]

In [159]:
# Display the friend IDs that were looked up for the target user.
friends_list

[5, 7, 9, 10]

In [156]:
# Keep only the rating rows whose userId is one of the target user's friends.
friend_index = friends_list
rated_by_friend = movie_ratings['userId'].isin(friend_index)
smaller_selection = movie_ratings[rated_by_friend]

In [132]:
# Preview the first five rating rows that came from friends.
smaller_selection.head(n=5)

Unnamed: 0,movieId,title,genres,userId,rating,timestamp,user_country,friend_ids
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,847434962,user_country,
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,4.5,1106635946,user_country,
1146,21,Get Shorty (1995),Comedy|Crime|Thriller,5,4.0,847435238,user_country,
1679,34,Babe (1995),Children|Drama,5,4.0,847434881,user_country,
1807,36,Dead Man Walking (1995),Crime|Drama,5,4.0,847435292,user_country,


In [133]:
# Mean rating per title among the friends' ratings, best-rated titles first.
avg_rating_df = (
    smaller_selection
    .groupby('title')[['rating']]
    .mean()
    .sort_values('rating', ascending=False)
)
avg_rating_df.head(n=5)

Unnamed: 0_level_0,rating
title,Unnamed: 1_level_1
Heavenly Creatures (1994),5.0
"King's Speech, The (2010)",5.0
Spirited Away (Sen to Chihiro no kamikakushi) (2001),5.0
Spectre (2015),5.0
Snow White and the Seven Dwarfs (1937),5.0


In [134]:
# Per-title statistics over the friends' ratings: the mean rating becomes
# 'vote_average' and the number of ratings becomes 'vote_count'.
avg_rating_df = (
    smaller_selection
    .groupby('title')['rating']
    .agg(['mean', 'count'])
    .rename(columns={'mean': 'vote_average', 'count': 'vote_count'})
)

# Preview the titles with the highest average rating.
avg_rating_df.sort_values(by='vote_average', ascending=False).head()

Unnamed: 0_level_0,vote_average,vote_count
title,Unnamed: 1_level_1,Unnamed: 2_level_1
Heavenly Creatures (1994),5.0,1
"King's Speech, The (2010)",5.0,1
Spirited Away (Sen to Chihiro no kamikakushi) (2001),5.0,1
Spectre (2015),5.0,1
Snow White and the Seven Dwarfs (1937),5.0,1


In [135]:
# C is the mean of the per-title average ratings in avg_rating_df.
C = avg_rating_df['vote_average'].mean()

# m is the minimum votes required to be listed: the median vote count.
m = avg_rating_df['vote_count'].quantile(0.5)

# NOTE(review): this preview keeps titles with strictly more than m votes, while
# the scoring cell further down filters with vote_count >= m -- confirm which
# threshold is intended.
has_more_than_m_votes = avg_rating_df['vote_count'] > m
q_movies = avg_rating_df.loc[has_more_than_m_votes].copy()
q_movies.sort_values(by='vote_average', ascending=False).head()


Unnamed: 0_level_0,vote_average,vote_count
title,Unnamed: 1_level_1,Unnamed: 2_level_1
Back to the Future (1985),5.0,2
Casino Royale (2006),4.75,2
"Lord of the Rings: The Two Towers, The (2002)",4.5,3
"Lord of the Rings: The Fellowship of the Ring, The (2001)",4.5,3
"Usual Suspects, The (1995)",4.25,2


In [136]:
def weighted_rating(x, m=m, C=C):
    """Blend an item's own average rating with the overall mean rating.

    Args:
        x: Object indexable by ``'vote_count'`` and ``'vote_average'``.
        m: Vote-count threshold used as the weight of the overall mean.
            Defaults to the notebook-level ``m`` as it was when this cell ran.
        C: Overall mean rating that low-vote items are pulled towards.
            Defaults to the notebook-level ``C`` as it was when this cell ran.

    Returns:
        ``v/(v+m) * R + m/(m+v) * C`` where ``v`` is the vote count and ``R``
        the vote average read from ``x``.
    """
    votes = x['vote_count']
    average = x['vote_average']
    own_weight = votes / (votes + m)    # share given to the item's own average
    prior_weight = m / (m + votes)      # share given to the overall mean C
    return (own_weight * average) + (prior_weight * C)

In [137]:
# Keep the titles whose vote_count is at least m. Filter first and copy only the
# surviving rows, so the 'score' column is added to an independent frame without
# duplicating the whole table up front.
meets_min_votes = avg_rating_df['vote_count'] >= m
avg_rating_df2 = avg_rating_df.loc[meets_min_votes].copy()

# weighted_rating only reads the 'vote_count' and 'vote_average' entries and
# does arithmetic on them, so it can score the whole frame in one vectorised
# call instead of being applied row by row.
avg_rating_df2['score'] = weighted_rating(avg_rating_df2)

# Rank by weighted score, best first.
avg_rating_df2 = avg_rating_df2.sort_values('score', ascending=False)

avg_rating_df2.head(10)

Unnamed: 0_level_0,vote_average,vote_count,score
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Back to the Future (1985),5.0,2,4.42458
Casino Royale (2006),4.75,2,4.257913
"Lord of the Rings: The Fellowship of the Ring, The (2001)",4.5,3,4.193435
"Lord of the Rings: The Two Towers, The (2002)",4.5,3,4.193435
Heavenly Creatures (1994),5.0,1,4.13687
Psycho (1960),5.0,1,4.13687
Spectre (2015),5.0,1,4.13687
Snow White and the Seven Dwarfs (1937),5.0,1,4.13687
Skyfall (2012),5.0,1,4.13687
First Daughter (2004),5.0,1,4.13687
