In [1]:
import pandas as pd
from math import sqrt
import numpy as np

In [2]:
#Read the movie catalogue and the user ratings from CSV files in the working directory
movies_df = pd.read_csv(filepath_or_buffer='movies.csv')
ratings_df = pd.read_csv(filepath_or_buffer='ratings.csv')

In [3]:
#Preview the first five rows of the raw movie catalogue
movies_df.head(5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
#Pull the release year out of titles such as "Toy Story (1995)".
#The year must be wrapped in parentheses so that titles which merely contain a
#four-digit number are not affected. The capture group sits inside the parentheses,
#so only the digits are kept. Raw strings keep the regex backslashes intact.
#Note: re-running this cell after the titles have been cleaned overwrites 'year' with NaN.
movies_df['year'] = movies_df['title'].str.extract(r'\((\d{4})\)', expand=False)
#Removing the years from the 'title' column. regex=True is stated explicitly because
#pandas >= 2.0 treats the pattern as a literal string by default.
movies_df['title'] = movies_df['title'].str.replace(r'\(\d{4}\)', '', regex=True)
#Trimming the whitespace left behind by the removal (.str.strip also tolerates missing titles)
movies_df['title'] = movies_df['title'].str.strip()

In [5]:
#Preview the catalogue after splitting the year out of the title
movies_df.head(5)

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji,Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men,Comedy|Romance,1995
3,4,Waiting to Exhale,Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II,Comedy,1995


With that, let's also drop the genres column since we won't need it for this particular recommendation system.

In [6]:
#Drop the genres column by keyword; passing the axis positionally (drop('genres', 1))
#is no longer accepted by pandas >= 2.0
movies_df = movies_df.drop(columns='genres')

In [7]:
#Preview the catalogue without the genres column
movies_df.head(5)

Unnamed: 0,movieId,title,year
0,1,Toy Story,1995
1,2,Jumanji,1995
2,3,Grumpier Old Men,1995
3,4,Waiting to Exhale,1995
4,5,Father of the Bride Part II,1995


In [8]:
#Preview the first five rows of the raw ratings
ratings_df.head(5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,169,2.5,1204927694
1,1,2471,3.0,1204927438
2,1,48516,5.0,1204927435
3,2,2571,3.5,1436165433
4,2,109487,4.0,1436165496


In [9]:
#Drop the timestamp column by keyword; passing the axis positionally (drop('timestamp', 1))
#is no longer accepted by pandas >= 2.0
ratings_df = ratings_df.drop(columns='timestamp')

In [10]:
#Preview the ratings without the timestamp column
ratings_df.head(5)

Unnamed: 0,userId,movieId,rating
0,1,169,2.5
1,1,2471,3.0
2,1,48516,5.0
3,2,2571,3.5
4,2,109487,4.0


# Collaborative Filtering

In [11]:
#Movies rated by the user we want recommendations for, as (title, rating) pairs.
#Titles must match the cleaned titles in movies_df exactly.
userInput = [
    {"title": movieTitle, "rating": movieRating}
    for movieTitle, movieRating in (
        ("Interstellar", 5),
        ("Matrix, The", 5),
        ("Toy Story", 2.5),
        ("Jumanji", 3),
        ("Fight Club", 4.5),
        ("Pulp Fiction", 5),
        ("Inception", 5),
        ("Intouchables", 4),
    )
]
inputMovies = pd.DataFrame(userInput)
inputMovies

Unnamed: 0,rating,title
0,5.0,Interstellar
1,5.0,"Matrix, The"
2,2.5,Toy Story
3,3.0,Jumanji
4,4.5,Fight Club
5,5.0,Pulp Fiction
6,5.0,Inception
7,4.0,Intouchables


In [12]:
#Filtering out the movies by title
inputId = movies_df[movies_df['title'].isin(inputMovies['title'].tolist())]
#Then merging on the title so each input rating gets its movieId. The key is named
#explicitly and only the title/rating columns are merged in, so the join does not depend on
#which columns the two frames happen to share and the cell stays safe to re-run.
inputMovies = pd.merge(inputId, inputMovies[['title', 'rating']], on='title')
#Dropping information we won't use from the input dataframe
#(keyword form: the positional axis argument is no longer accepted by pandas >= 2.0)
inputMovies = inputMovies.drop(columns='year')
#Final input dataframe
#If a movie you added above isn't here, it might not be in the original
#dataframe or it might be spelled differently; please check the capitalisation.
inputMovies

Unnamed: 0,movieId,title,rating
0,1,Toy Story,2.5
1,2,Jumanji,3.0
2,296,Pulp Fiction,5.0
3,2571,"Matrix, The",5.0
4,2959,Fight Club,4.5
5,79132,Inception,5.0
6,92259,Intouchables,4.0
7,109487,Interstellar,5.0


The table above lists my movies together with their movieIds.

Next, let's find the users who have rated these movies.

In [13]:
#Keep only the ratings that were given to one of the input movies
userSubset = ratings_df.loc[ratings_df['movieId'].isin(inputMovies['movieId'].tolist())]
print(userSubset.shape)
userSubset.head(5)


(317549, 3)


Unnamed: 0,userId,movieId,rating
3,2,2571,3.5
4,2,109487,4.0
19,4,296,4.0
105,4,2571,4.0
118,4,2959,4.0


In [14]:
#Group the subset by user. The key is passed as a plain label rather than a one-element
#list: with a list, recent pandas versions yield 1-tuple group keys such as (178,), which
#breaks get_group(1130) and the later merge on userId.
userSubsetGroup = userSubset.groupby('userId')

In [15]:
#Inspect the rows of the subset that belong to the user with userId 1130
userSubsetGroup.get_group(1130)

Unnamed: 0,userId,movieId,rating
104167,1130,1,0.5
104168,1130,2,4.0
104214,1130,296,4.0
104530,1130,2571,2.0
104576,1130,2959,4.5
105069,1130,79132,2.5


In [16]:
#Turn the groups into a list of (userId, ratings) pairs, largest group first, so that the
#users who rated the most input movies come first
userSubsetGroup = sorted(userSubsetGroup, key=lambda pair: pair[1].shape[0], reverse=True)
userSubsetGroup[:3]

[(178,        userId  movieId  rating
  16123     178        1     2.5
  16124     178        2     3.5
  16165     178      296     5.0
  16423     178     2571     4.0
  16460     178     2959     4.5
  17000     178    79132     4.5
  17018     178    92259     3.5
  17043     178   109487     5.0), (1204,         userId  movieId  rating
  110818    1204        1     4.5
  110819    1204        2     3.5
  110897    1204      296     4.5
  111384    1204     2571     4.5
  111456    1204     2959     5.0
  112087    1204    79132     4.0
  112117    1204    92259     3.5
  112146    1204   109487     4.5), (1848,         userId  movieId  rating
  169378    1848        1     5.0
  169379    1848        2     4.5
  169390    1848      296     4.0
  169495    1848     2571     4.0
  169511    1848     2959     4.5
  169901    1848    79132     4.5
  169969    1848    92259     3.5
  170040    1848   109487     3.0)]

In [17]:
#Store the Pearson Correlation in a dictionary, where the key is the user Id and the value is the coefficient
pearsonCorrelationDict = {}

#Sort the input ratings by movieId once, before the loop: the order is the same for every
#user, and each user's group is sorted the same way below so the two rating lists line up
inputMovies = inputMovies.sort_values(by='movieId')

#NOTE(review): a user who shares only two movies with the input scores exactly +1 or -1
#whenever both rating pairs vary, so low-overlap users can crowd the top of the ranking;
#consider requiring a minimum number of shared movies.

#For every user group in our subset
for name, group in userSubsetGroup:
    group = group.sort_values(by='movieId')
    #Get the N for the formula
    nRatings = len(group)
    #Get the review scores for the movies that they both have in common
    temp_df = inputMovies[inputMovies['movieId'].isin(group['movieId'].tolist())]
    #x: the input user's ratings, y: the current user's ratings, both ordered by movieId
    tempRatingList = temp_df['rating'].tolist()
    tempGroupList = group['rating'].tolist()
    sumX = sum(tempRatingList)
    sumY = sum(tempGroupList)
    #Now let's calculate the pearson correlation between two users, so called, x and y
    Sxx = sum(i**2 for i in tempRatingList) - pow(sumX, 2)/float(nRatings)
    Syy = sum(i**2 for i in tempGroupList) - pow(sumY, 2)/float(nRatings)
    Sxy = sum(i*j for i, j in zip(tempRatingList, tempGroupList)) - sumX*sumY/float(nRatings)

    #Only divide when the denominator is strictly positive. Testing the product (rather than
    #each factor against zero) also covers a product that is negative or underflows to zero,
    #either of which would otherwise raise in sqrt or in the division. Otherwise, 0 correlation.
    denominator = Sxx*Syy
    if denominator > 0:
        pearsonCorrelationDict[name] = Sxy/sqrt(denominator)
    else:
        pearsonCorrelationDict[name] = 0


In [18]:
#Turn the {userId: coefficient} mapping into a two-column frame with a fresh 0..n-1 index
pearsonDF = (
    pd.DataFrame.from_dict(pearsonCorrelationDict, orient='index')
    .rename(columns={0: 'similarityIndex'})
    .assign(userId=lambda frame: frame.index)
    .reset_index(drop=True)
)
pearsonDF.head(5)

Unnamed: 0,similarityIndex,userId
0,0.889296,178
1,0.334077,1204
2,-0.557278,1848
3,0.826788,2452
4,0.336011,2726


In [19]:
#Keep the 50 users with the highest similarity to the input user
topUsers = pearsonDF.sort_values(by='similarityIndex', ascending=False).head(50)
topUsers.head(5)

Unnamed: 0,similarityIndex,userId
40524,1.0,128266
49166,1.0,235435
41068,1.0,134853
45864,1.0,194587
40130,1.0,123552


In [20]:
#Attach every rating made by the selected users (inner join on userId)
topUsersRating = pd.merge(topUsers, ratings_df, on='userId', how='inner')
topUsersRating.head(5)

Unnamed: 0,similarityIndex,userId,movieId,rating
0,1.0,128266,50,5.0
1,1.0,128266,318,5.0
2,1.0,128266,527,4.5
3,1.0,128266,593,5.0
4,1.0,128266,858,5.0


In [21]:
#Weight each rating by the similarity of the user who gave it
topUsersRating['weightedRating'] = topUsersRating['rating'].mul(topUsersRating['similarityIndex'])
topUsersRating.head(5)

Unnamed: 0,similarityIndex,userId,movieId,rating,weightedRating
0,1.0,128266,50,5.0,5.0
1,1.0,128266,318,5.0,5.0
2,1.0,128266,527,4.5,4.5
3,1.0,128266,593,5.0,5.0
4,1.0,128266,858,5.0,5.0


In [22]:
#Per movie, total up the similarity scores and the similarity-weighted ratings
tempTopUsersRating = (
    topUsersRating
    .groupby('movieId')[['similarityIndex', 'weightedRating']]
    .sum()
    .rename(columns={
        'similarityIndex': 'sum_similarityIndex',
        'weightedRating': 'sum_weightedRating',
    })
)
tempTopUsersRating.head(5)

Unnamed: 0_level_0,sum_similarityIndex,sum_weightedRating
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,43.0,158.0
3,1.0,4.0
4,1.0,2.5
5,1.0,2.0
6,4.0,13.0


In [23]:
#Weighted average per movie: total of the weighted ratings divided by the total similarity
recommendation_df = (
    tempTopUsersRating['sum_weightedRating'] / tempTopUsersRating['sum_similarityIndex']
).to_frame('weighted average recommendation score')
#Keep the movieId available as a regular column in addition to the index
recommendation_df['movieId'] = recommendation_df.index
recommendation_df.head(5)

Unnamed: 0_level_0,weighted average recommendation score,movieId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,3.674419,1
3,4.0,3
4,2.5,4
5,2.0,5
6,3.25,6


In [24]:
#Order the movies from the highest to the lowest recommendation score
recommendation_df = recommendation_df.sort_values(
    'weighted average recommendation score',
    ascending=False,
)
recommendation_df.head(n=10)

Unnamed: 0_level_0,weighted average recommendation score,movieId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
154,5.0,154
110102,5.0,110102
6375,5.0,6375
27317,5.0,27317
4235,5.0,4235
115569,5.0,115569
54503,5.0,54503
1572,5.0,1572
2351,5.0,2351
1211,5.0,1211


In [25]:
#Look up the catalogue rows for the ten best-scoring movies (shown in catalogue order)
movies_df[movies_df['movieId'].isin(recommendation_df['movieId'].head(10))]

Unnamed: 0,movieId,title,year
152,154,Beauty of the Day (Belle de jour),1967
1185,1211,"Wings of Desire (Himmel über Berlin, Der)",1987
1521,1572,"Contempt (Mépris, Le)",1963
2267,2351,"Nights of Cabiria (Notti di Cabiria, Le)",1957
4142,4235,Amores Perros (Love's a Bitch),2000
6270,6375,Gigantic (A Tale of Two Johns),2002
9283,27317,Audition (Ôdishon),1999
12044,54503,Superbad,2007
23216,110102,Captain America: The Winter Soldier,2014
24581,115569,Nightcrawler,2014
