In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from math import sqrt

In [2]:
# Read the movie catalogue and the ratings sample from the working directory.
MOVIES_CSV = 'movies.csv'
RATINGS_CSV = 'ratings_sample.csv'

movies_df = pd.read_csv(MOVIES_CSV)
rating_df = pd.read_csv(RATINGS_CSV)

movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [3]:
# Split the release year out of the title,
# e.g. "Toy Story (1995)" -> title "Toy Story", year 1995.
#
# The patterns are raw strings so that "\(" and "\d" reach the regex engine
# instead of being parsed as (invalid) string escapes.
year_text = movies_df['title'].str.extract(r'\((\d{4})\)', expand=False)

# Titles without a "(yyyy)" part yield NaN. The nullable Int64 dtype keeps the
# column numeric in that case instead of silently leaving it as strings.
movies_df['year'] = pd.to_numeric(year_text, errors='coerce').astype('Int64')

# regex=True is spelled out: pandas >= 2.0 treats the pattern as a literal
# string by default, which would leave "(1995)" in every title.
movies_df['title'] = (
    movies_df['title']
    .str.replace(r'\(\d{4}\)', '', regex=True)
    .str.strip()
)
movies_df.head()

  movies_df['title'] = movies_df.title.str.replace('(\(\d\d\d\d\))', '')


Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji,Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men,Comedy|Romance,1995
3,4,Waiting to Exhale,Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II,Comedy,1995


In [4]:
# Drop the genres column. Rebinding with errors='ignore' (rather than an
# in-place drop) lets this cell be re-run without raising KeyError once the
# column is already gone.
movies_df = movies_df.drop(columns='genres', errors='ignore')
movies_df.head()

Unnamed: 0,movieId,title,year
0,1,Toy Story,1995
1,2,Jumanji,1995
2,3,Grumpier Old Men,1995
3,4,Waiting to Exhale,1995
4,5,Father of the Bride Part II,1995


In [5]:
# Drop the timestamp column. Rebinding with errors='ignore' (rather than an
# in-place drop) lets this cell be re-run without raising KeyError once the
# column is already gone.
rating_df = rating_df.drop(columns='timestamp', errors='ignore')
rating_df.head()

Unnamed: 0,userId,movieId,rating
0,1,169,2.5
1,1,2471,3.0
2,1,48516,5.0
3,2,2571,3.5
4,2,109487,4.0


In [6]:
# Ratings supplied by the user we want recommendations for, as (title, rating) pairs.
rated_titles = [
    ('Breakfast Club, The', 5),
    ('Toy Story', 3.5),
    ('Jumanji', 2),
    ("Pulp Fiction", 5),
    ('Akira', 4.5),
]

# One record per rated movie, then a frame with `title` and `rating` columns.
UserInput = [{'title': name, 'rating': score} for name, score in rated_titles]
input_movies = pd.DataFrame(UserInput)
input_movies

Unnamed: 0,title,rating
0,"Breakfast Club, The",5.0
1,Toy Story,3.5
2,Jumanji,2.0
3,Pulp Fiction,5.0
4,Akira,4.5


In [7]:
# Attach the catalogue movieId to every movie the user rated.
input_titles = input_movies['title'].tolist()
inputID = movies_df[movies_df['title'].isin(input_titles)]

# Join explicitly on `title` and pick the needed columns up front, instead of
# relying on whichever columns the two frames happen to share and dropping
# `year` afterwards. The result has the columns movieId, title, rating.
# Titles missing from the catalogue are dropped by the inner join; a title
# carried by several catalogue entries yields one row per entry.
input_movies = inputID[['movieId', 'title']].merge(
    input_movies[['title', 'rating']], on='title', how='inner'
)
input_movies

Unnamed: 0,movieId,title,rating
0,1,Toy Story,3.5
1,2,Jumanji,2.0
2,296,Pulp Fiction,5.0
3,1274,Akira,4.5
4,1968,"Breakfast Club, The",5.0


In [8]:
# Keep only the ratings given to movies that the input user has rated as well.
rated_movie_ids = input_movies['movieId'].tolist()
shares_input_movie = rating_df['movieId'].isin(rated_movie_ids)
userSubset = rating_df[shares_input_movie]
userSubset.head()

Unnamed: 0,userId,movieId,rating
19,4,296,4.0
441,12,1968,3.0
479,13,2,2.0
531,13,1274,5.0
681,14,296,2.0


In [9]:
# Group the overlapping ratings per user. The key is passed as a plain string:
# with a one-element list, pandas >= 2.0 yields 1-tuples such as (75,) as the
# group keys when iterating, which would end up as the userId values later on.
userSubset_group = userSubset.groupby('userId')

In [10]:
def _shared_movie_count(user_group):
    """Return the number of rating rows in a (userId, ratings) pair."""
    return len(user_group[1])


# Materialise the (userId, ratings) pairs and put the users with the most
# ratings on the input's movies first.
userSubset_group = list(userSubset_group)
userSubset_group.sort(key=_shared_movie_count, reverse=True)

In [11]:
# Only the first 100 (userId, ratings) pairs are compared against the input user.
MAX_CANDIDATE_USERS = 100
usg = userSubset_group[:MAX_CANDIDATE_USERS]

In [12]:
# Similarity of each candidate user to the input user (Pearson correlation).
from scipy.stats import pearsonr

# NOTE: the misspelled name `user_similraity` is kept because a later cell reads it.
user_similraity = {}
input_ratings = input_movies[['movieId', 'rating']]

for userId, ratings in usg:
    # Group keys can arrive as 1-tuples (pandas >= 2.0 with a list grouper).
    if isinstance(userId, tuple) and len(userId) == 1:
        userId = userId[0]

    # Pair the two rating vectors movie by movie. Correlating them by position
    # is only valid if both have the same length and the same movie order.
    paired = ratings.merge(
        input_ratings, on='movieId', suffixes=('_user', '_input')
    )

    # The correlation needs at least two shared movies and some variation on
    # both sides; otherwise pearsonr raises or returns NaN, so skip the user.
    if len(paired) < 2:
        continue
    if paired['rating_user'].nunique() < 2 or paired['rating_input'].nunique() < 2:
        continue

    user_similraity[userId] = pearsonr(
        paired['rating_user'], paired['rating_input']
    )[0]

In [13]:
# Turn the {userId: similarity} mapping into a frame with the columns
# `Similarity` and `userId` and a fresh 0..n-1 index.
user_similarity_df = (
    pd.DataFrame.from_dict(user_similraity, orient='index')
    .set_axis(['Similarity'], axis=1)
    .assign(userId=lambda frame: frame.index)
    .reset_index(drop=True)
)
user_similarity_df.head()

Unnamed: 0,Similarity,userId
0,0.827278,75
1,0.586009,106
2,0.83205,686
3,0.576557,815
4,0.943456,1040


In [14]:
# Keep the 50 candidates with the highest similarity to the input user.
ranked_by_similarity = user_similarity_df.sort_values(by='Similarity', ascending=False)
top_user = ranked_by_similarity.iloc[:50]
top_user.head()

Unnamed: 0,Similarity,userId
64,0.961678,12325
34,0.961538,6207
55,0.961538,10707
67,0.960769,13053
4,0.943456,1040


In [15]:
# Pull in every rating made by the selected users (one row per user/movie pair).
top_user_rating = pd.merge(top_user, rating_df, on='userId', how='inner')
top_user_rating.head()

Unnamed: 0,Similarity,userId,movieId,rating
0,0.961678,12325,1,3.5
1,0.961678,12325,2,1.5
2,0.961678,12325,3,3.0
3,0.961678,12325,5,0.5
4,0.961678,12325,6,2.5


In [16]:
# Weight each rating by how similar its author is to the input user.
similarity_weight = top_user_rating['Similarity']
top_user_rating['Weighted Rating'] = similarity_weight.mul(top_user_rating['rating'])
top_user_rating.head()

Unnamed: 0,Similarity,userId,movieId,rating,Weighted Rating
0,0.961678,12325,1,3.5,3.365874
1,0.961678,12325,2,1.5,1.442517
2,0.961678,12325,3,3.0,2.885035
3,0.961678,12325,5,0.5,0.480839
4,0.961678,12325,6,2.5,2.404196


In [17]:
# Per movie, total the similarity weights and the similarity-weighted ratings.
# The two columns are selected before summing so that unrelated columns
# (userId, rating) are not aggregated only to be thrown away.
temp = top_user_rating.groupby('movieId')[['Similarity', 'Weighted Rating']].sum()
temp.columns = ['Total Similarity', 'Total Weighted Rating']
temp.head()

Unnamed: 0_level_0,Total Similarity,Total Weighted Rating
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,38.376281,140.800834
2,38.376281,96.656745
3,10.253981,27.254477
4,0.929294,2.787882
5,11.723262,27.151751


In [18]:
# Score per movie = total weighted rating / total similarity, i.e. the
# similarity-weighted average rating. The frame stays indexed by movieId,
# carries movieId as a column too, and is ordered best score first.
recommendation_df = (
    (temp['Total Weighted Rating'] / temp['Total Similarity'])
    .to_frame('Total Score')
    .assign(movieId=lambda scores: scores.index)
    .sort_values(by='Total Score', ascending=False)
)
recommendation_df.head()

Unnamed: 0_level_0,Total Score,movieId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
3067,5.0,3067
26801,5.0,26801
6918,5.0,6918
1902,5.0,1902
6660,5.0,6660


In [19]:
# Look up the catalogue rows of the ten best-scoring movies (shown in catalogue order).
top_movie_ids = recommendation_df['movieId'].head(10).tolist()
is_recommended = movies_df['movieId'].isin(top_movie_ids)
movies_df.loc[is_recommended]

Unnamed: 0,movieId,title,year
119,121,"Boys of St. Vincent, The",1992
1819,1902,Dream for an Insomniac,1996
2981,3067,Women on the Verge of a Nervous Breakdown (Muj...,1988
3686,3776,Melody Time,1948
3759,3851,I'm the One That I Want,2000
6551,6660,"Red Shoes, The",1948
6559,6668,"Road Home, The (Wo de fu qin mu qin)",1999
6808,6918,"Unvanquished, The (Aparajito)",1957
9064,26801,Dragon Inn (Sun lung moon hak chan),1992
18106,90531,Shame,2011
