In [1]:
#Importing the libraries
import pandas as pd
from math import sqrt
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
#Reading in the data
movies_df= pd.read_csv('movies.csv') #movieId, title, genres, year
ratings_df= pd.read_csv('ratings.csv') #userId, movieId, rating, timestamp
#Dropping the columns we do not need
movies_df= movies_df.drop('genres', 1) #movieId, title, year
ratings_df= ratings_df.drop('timestamp', 1) #userId, movieId, rating

In [2]:
userInput = [
{'title':'Free Willy 2: The Adventure Home', 'rating':2.5},
{'title':'Crocodile Dundee II', 'rating':3.0},
{'title':'Departed, The', 'rating':5}
] 
inputMovies= pd.DataFrame(userInput)
inputMovies

Unnamed: 0,rating,title
0,2.5,Free Willy 2: The Adventure Home
1,3.0,Crocodile Dundee II
2,5.0,"Departed, The"


In [3]:
#Filtering out the movies by title
inputId= movies_df[movies_df['title'].isin(inputMovies['title'].tolist())]
#Then merging it so we can get the movieId. It's implicitly merging it by title.
inputMovies= pd.merge(inputId, inputMovies)
#Dropping information we won't use from the input dataframe
inputMovies= inputMovies.drop('year', 1)
#Final input dataframe
inputMovies

Unnamed: 0,movieId,title,rating
0,169,Free Willy 2: The Adventure Home,2.5
1,2471,Crocodile Dundee II,3.0
2,48516,"Departed, The",5.0


In [4]:
#Filtering out users that have watched movies that the input has watched and storing it
userSubset= ratings_df[ratings_df['movieId'].isin(inputMovies['movieId'].tolist())]
userSubset.head()

Unnamed: 0,userId,movieId,rating
0,1,169,2.5
1,1,2471,3.0
2,1,48516,5.0
491,13,169,1.0
663,14,169,3.0


In [5]:
#Groupby creates several sub dataframes where they all have the same value in the column specified as the parameter
userSubsetGroup = userSubset.groupby(['userId'])
userSubsetGroup.get_group(1130)

KeyError: 1130

In [6]:
#Sorting it so users with movies most in common with the input will have priority
userSubsetGroup = sorted(userSubsetGroup,  key=lambda x: len(x[1]), reverse=True) 
userSubsetGroup[0:3]

[(1,    userId  movieId  rating
  0       1      169     2.5
  1       1     2471     3.0
  2       1    48516     5.0), (815,        userId  movieId  rating
  73839     815      169     2.0
  74961     815     2471     2.0
  77282     815    48516     3.5), (4415,         userId  movieId  rating
  408795    4415      169     0.5
  409585    4415     2471     3.5
  410996    4415    48516     3.0)]

In [7]:
userSubsetGroup = userSubsetGroup[0:100]

In [8]:
#Store the Pearson Correlation in a dictionary, where the key is the user Id and the value is the coefficient
pearsonCorrelationDict= {}
#For every user group in our subset
for name, group in userSubsetGroup:
    #Let's start by sorting the input and current user group so the values aren't mixed up later on
    group = group.sort_values(by='movieId')
    inputMovies= inputMovies.sort_values(by='movieId')
    #Get the N for the formula
    nRatings= len(group)
    #Get the review scores for the movies that they both have in common
    temp_df= inputMovies[inputMovies['movieId'].isin(group['movieId'].tolist())]
    #And then store them in a temporary buffer variable in a list format to facilitate future calculations
    tempRatingList= temp_df['rating'].tolist()
    #Let's also put the current user group reviews in a list format
    tempGroupList= group['rating'].tolist()
    #Now let's calculate the pearsoncorrelation between two users, so called, x and y
    Sxx= sum([i**2 for i in tempRatingList]) -pow(sum(tempRatingList),2)/float(nRatings)
    Syy= sum([i**2 for i in tempGroupList]) -pow(sum(tempGroupList),2)/float(nRatings)
    Sxy= sum( i*j for i, j in zip(tempRatingList, tempGroupList)) -sum(tempRatingList)*sum(tempGroupList)/float(nRatings)
#If the denominator is different than zero, then divide, else, 0 correlation.
if Sxx!= 0 and Syy!= 0:
    pearsonCorrelationDict[name] = Sxy/sqrt(Sxx*Syy)
else:
    pearsonCorrelationDict[name] = 0

In [9]:
pearsonCorrelationDict.items()
pearsonDF = pd.DataFrame.from_dict(pearsonCorrelationDict, orient='index')
pearsonDF.columns= ['similarityIndex']
pearsonDF['userId'] = pearsonDF.index
pearsonDF.index= range(len(pearsonDF))
pearsonDF.head()

Unnamed: 0,similarityIndex,userId
0,0.981981,133624


In [10]:
topUsers=pearsonDF.sort_values(by='similarityIndex', ascending=False)[0:50]
topUsers.head()

Unnamed: 0,similarityIndex,userId
0,0.981981,133624


In [11]:
topUsersRating=topUsers.merge(ratings_df, left_on='userId', right_on='userId', how='inner')
topUsersRating.head()

Unnamed: 0,similarityIndex,userId,movieId,rating
0,0.981981,133624,1,2.5
1,0.981981,133624,2,2.5
2,0.981981,133624,3,2.0
3,0.981981,133624,5,1.5
4,0.981981,133624,6,3.5


In [12]:
#Multiplying the similarity by the user's ratings
topUsersRating['weightedRating'] = topUsersRating['similarityIndex']*topUsersRating['rating']
topUsersRating.head()
#Applying a sum to the topUsersafter grouping it up by userId
tempTopUsersRating= topUsersRating.groupby('movieId').sum()[['similarityIndex','weightedRating']]
tempTopUsersRating.columns= ['sum_similarityIndex','sum_weightedRating']
tempTopUsersRating.head()

Unnamed: 0_level_0,sum_similarityIndex,sum_weightedRating
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.981981,2.454951
2,0.981981,2.454951
3,0.981981,1.963961
5,0.981981,1.472971
6,0.981981,3.436932


In [13]:
#Creates an empty dataframe
recommendation_df= pd.DataFrame()
#Now we take the weighted average
recommendation_df['weighted average recommendation score'] = tempTopUsersRating['sum_weightedRating']/tempTopUsersRating['sum_similarityIndex']
recommendation_df['movieId'] = tempTopUsersRating.index
recommendation_df.head()

Unnamed: 0_level_0,weighted average recommendation score,movieId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,2.5,1
2,2.5,2
3,2.0,3
5,1.5,5
6,3.5,6


In [14]:
recommendation_df= recommendation_df.sort_values(by='weighted average recommendation score', ascending=False)
recommendation_df.head(10)

Unnamed: 0_level_0,weighted average recommendation score,movieId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1236,5.0,1236
1245,5.0,1245
1306,5.0,1306
2318,5.0,2318
4995,4.5,4995
1199,4.5,1199
34405,4.5,34405
923,4.5,923
3949,4.5,3949
3920,4.5,3920


In [15]:
movies_df.loc[movies_df['movieId'].isin(recommendation_df.head(10)['movieId'].tolist())]

Unnamed: 0,movieId,title,year
906,923,Citizen Kane,1941.0
1174,1199,Brazil,1985.0
1209,1236,Trust,1990.0
1217,1245,Miller's Crossing,1990.0
1277,1306,Until the End of the World (Bis ans Ende der W...,1991.0
2234,2318,Happiness,1998.0
3828,3920,"Faraway, So Close (In weiter Ferne, so nah!)",1993.0
3856,3949,Requiem for a Dream,2000.0
4900,4995,"Beautiful Mind, A",2001.0
10294,34405,Serenity,2005.0


In [None]:
userInputOriginal = [
{'title':'BreakfastClub, The', 'rating':5},
{'title':'ToyStory', 'rating':3.5},
{'title':'Jumanji', 'rating':2},
{'title':"PulpFiction", 'rating':5},
{'title':'Akira', 'rating':4.5}
] 