# Collaborative based Recommendation System

## Importing Libraries

In [1]:
#Dataframe manipulation library
import pandas as pd
#Math functions, we'll only need the sqrt function so let's import only that
from math import sqrt
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

## Processing

In [3]:
#Storing the movie information into a pandas dataframe
movies_df = pd.read_csv('movies.csv')
#Storing the user information into a pandas dataframe
ratings_df = pd.read_csv('ratings.csv')
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
#Using regular expressions to find a year stored between parentheses
#We specify the parantheses so we don't conflict with movies that have years in their titles
movies_df['year'] = movies_df.title.str.extract('(\(\d\d\d\d\))',expand=False)
#Removing the parentheses
movies_df['year'] = movies_df.year.str.extract('(\d\d\d\d)',expand=False)
#Removing the years from the 'title' column
movies_df['title'] = movies_df.title.str.replace('(\(\d\d\d\d\))', '', regex= True)
#Applying the strip function to get rid of any ending whitespace characters that may have appeared
movies_df['title'] = movies_df['title'].apply(lambda x: x.strip())

In [5]:
movies_df.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji,Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men,Comedy|Romance,1995
3,4,Waiting to Exhale,Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II,Comedy,1995


In [6]:
#Dropping the genres column
movies_df = movies_df.drop('genres', axis = 1)
movies_df.head()

Unnamed: 0,movieId,title,year
0,1,Toy Story,1995
1,2,Jumanji,1995
2,3,Grumpier Old Men,1995
3,4,Waiting to Exhale,1995
4,5,Father of the Bride Part II,1995


In [7]:
ratings_df = ratings_df.drop('timestamp', axis = 1)

In [8]:
ratings_df.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


<h2> Recommender System </h2>
The process for creating a User Based recommendation system is as follows:
<br>- Select a user with the movies the user has watched
<br>- Based on his rating to movies, find the top X neighbours 
<br>- Get the watched movie record of the user for each neighbour.
<br>- Calculate a similarity score using some formula
<br>- Recommend the items with the highest score


In [12]:
userInput = [
            {'title':'Mission: Impossible - Rogue Nation','rating':4.5},
            {'title':'Maze Runner: Scorch Trials', 'rating':4.5},
            {'title':'Waiting to Exhale', 'rating':4.0},
            {'title':'Grumpier Old Men', 'rating':2},
            {'title':'Father of the Bride Part II', 'rating':4.5}
         ] 
inputMovies = pd.DataFrame(userInput)
inputMovies

Unnamed: 0,title,rating
0,Mission: Impossible - Rogue Nation,4.5
1,Maze Runner: Scorch Trials,4.5
2,Waiting to Exhale,4.0
3,Grumpier Old Men,2.0
4,Father of the Bride Part II,4.5


In [13]:
#Filtering out the movies by title
inputId = movies_df[movies_df['title'].isin(inputMovies['title'].tolist())]
#Then merging it so we can get the movieId. It's implicitly merging it by title.
inputMovies = pd.merge(inputId, inputMovies)
#Dropping information we won't use from the input dataframe
inputMovies = inputMovies.drop('year', axis = 1)
#Final input dataframe
#If a movie you added in above isn't here, then it might not be in the original 
#dataframe or it might spelled differently, please check capitalisation.
inputMovies

Unnamed: 0,movieId,title,rating
0,3,Grumpier Old Men,2.0
1,4,Waiting to Exhale,4.0
2,5,Father of the Bride Part II,4.5
3,111781,Mission: Impossible - Rogue Nation,4.5
4,117895,Maze Runner: Scorch Trials,4.5


In [14]:
#Filtering out users that have watched movies that the input has watched and storing it
userSubset = ratings_df[ratings_df['movieId'].isin(inputMovies['movieId'].tolist())]
userSubset.head()

Unnamed: 0,userId,movieId,rating
1,1,3,4.0
561,6,3,5.0
562,6,4,3.0
563,6,5,5.0
1386,14,4,3.0


In [15]:
#Groupby creates several sub dataframes where they all have the same value in the column specified as the parameter
userSubsetGroup = userSubset.groupby(['userId'])

In [16]:
userSubsetGroup.get_group(6)

Unnamed: 0,userId,movieId,rating
561,6,3,5.0
562,6,4,3.0
563,6,5,5.0


In [17]:
#Sorting it so users with movie most in common with the input will have priority
userSubsetGroup = sorted(userSubsetGroup,  key=lambda x: len(x[1]), reverse=True)

In [18]:
userSubsetGroup[0:3]

[(448,
         userId  movieId  rating
  68657     448        3     3.0
  68658     448        5     3.0
  70238     448   111781     2.0
  70297     448   117895     1.0),
 (6,
       userId  movieId  rating
  561       6        3     5.0
  562       6        4     3.0
  563       6        5     5.0),
 (43,
        userId  movieId  rating
  6316      43        3     5.0
  6317      43        5     5.0)]

In [19]:
userSubsetGroup = userSubsetGroup[0:100]

In [20]:
#Store the Pearson Correlation in a dictionary, where the key is the user Id and the value is the coefficient
pearsonCorrelationDict = {}

#For every user group in our subset
for name, group in userSubsetGroup:
    #Let's start by sorting the input and current user group so the values aren't mixed up later on
    group = group.sort_values(by='movieId')
    inputMovies = inputMovies.sort_values(by='movieId')
    #Get the N for the formula
    nRatings = len(group)
    #Get the review scores for the movies that they both have in common
    temp_df = inputMovies[inputMovies['movieId'].isin(group['movieId'].tolist())]
    #And then store them in a temporary buffer variable in a list format to facilitate future calculations
    tempRatingList = temp_df['rating'].tolist()
    #Let's also put the current user group reviews in a list format
    tempGroupList = group['rating'].tolist()
    #Now let's calculate the pearson correlation between two users, so called, x and y
    Sxx = sum([i**2 for i in tempRatingList]) - pow(sum(tempRatingList),2)/float(nRatings)
    Syy = sum([i**2 for i in tempGroupList]) - pow(sum(tempGroupList),2)/float(nRatings)
    Sxy = sum( i*j for i, j in zip(tempRatingList, tempGroupList)) - sum(tempRatingList)*sum(tempGroupList)/float(nRatings)
    
    #If the denominator is different than zero, then divide, else, 0 correlation.
    if Sxx != 0 and Syy != 0:
        pearsonCorrelationDict[name] = Sxy/sqrt(Sxx*Syy)
    else:
        pearsonCorrelationDict[name] = 0


In [21]:
pearsonCorrelationDict.items()

dict_items([(448, -0.5222329678670935), (6, -0.3273268353539887), (43, 0), (58, 1.0), (68, 0), (84, 0), (103, 0), (117, 0), (150, 0), (169, 0), (249, 0), (269, -1.0), (270, 0), (288, -1.0), (321, 0), (337, 0), (414, -1.0), (456, 0), (470, 0), (492, -1.0), (501, -1.0), (561, 0), (567, 0), (590, -1.0), (599, 1.0), (600, 1.0), (1, 0), (14, 0), (19, 0), (21, 0), (31, 0), (32, 0), (42, 0), (44, 0), (45, 0), (51, 0), (63, 0), (64, 0), (66, 0), (91, 0), (100, 0), (102, 0), (107, 0), (111, 0), (116, 0), (119, 0), (120, 0), (121, 0), (147, 0), (151, 0), (162, 0), (170, 0), (179, 0), (181, 0), (184, 0), (200, 0), (217, 0), (226, 0), (229, 0), (240, 0), (262, 0), (276, 0), (289, 0), (294, 0), (302, 0), (305, 0), (307, 0), (308, 0), (328, 0), (330, 0), (339, 0), (353, 0), (368, 0), (380, 0), (389, 0), (402, 0), (408, 0), (410, 0), (411, 0), (418, 0), (437, 0), (451, 0), (458, 0), (474, 0), (477, 0), (480, 0), (483, 0), (489, 0), (490, 0), (509, 0), (521, 0), (534, 0), (544, 0), (552, 0), (555, 0),

In [22]:
pearsonDF = pd.DataFrame.from_dict(pearsonCorrelationDict, orient='index')
pearsonDF.columns = ['similarityIndex']
pearsonDF['userId'] = pearsonDF.index
pearsonDF.index = range(len(pearsonDF))
pearsonDF.head()

Unnamed: 0,similarityIndex,userId
0,-0.522233,448
1,-0.327327,6
2,0.0,43
3,1.0,58
4,0.0,68


In [23]:
topUsers=pearsonDF.sort_values(by='similarityIndex', ascending=False)[0:50]
topUsers.head()

Unnamed: 0,similarityIndex,userId
25,1.0,600
24,1.0,599
3,1.0,58
65,0.0,305
74,0.0,389


In [24]:
topUsersRating=topUsers.merge(ratings_df, left_on='userId', right_on='userId', how='inner')
topUsersRating.head()

Unnamed: 0,similarityIndex,userId,movieId,rating
0,1.0,600,1,2.5
1,1.0,600,2,4.0
2,1.0,600,4,1.5
3,1.0,600,5,2.5
4,1.0,600,7,3.5


In [25]:
#Multiplies the similarity by the user's ratings
topUsersRating['weightedRating'] = topUsersRating['similarityIndex']*topUsersRating['rating']
topUsersRating.head()

Unnamed: 0,similarityIndex,userId,movieId,rating,weightedRating
0,1.0,600,1,2.5,2.5
1,1.0,600,2,4.0,4.0
2,1.0,600,4,1.5,1.5
3,1.0,600,5,2.5,2.5
4,1.0,600,7,3.5,3.5


In [26]:
#Applies a sum to the topUsers after grouping it up by userId
tempTopUsersRating = topUsersRating.groupby('movieId').sum()[['similarityIndex','weightedRating']]
tempTopUsersRating.columns = ['sum_similarityIndex','sum_weightedRating']
tempTopUsersRating.head()

Unnamed: 0_level_0,sum_similarityIndex,sum_weightedRating
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,2.0,5.5
2,2.0,6.5
3,2.0,4.5
4,1.0,1.5
5,2.0,6.5


In [27]:
#Creates an empty dataframe
recommendation_df = pd.DataFrame()
#Now we take the weighted average
recommendation_df['weighted average recommendation score'] = tempTopUsersRating['sum_weightedRating']/tempTopUsersRating['sum_similarityIndex']
recommendation_df['movieId'] = tempTopUsersRating.index
recommendation_df.head()

Unnamed: 0_level_0,weighted average recommendation score,movieId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,2.75,1
2,3.25,2
3,2.25,3
4,1.5,4
5,3.25,5


In [28]:
recommendation_df = recommendation_df.sort_values(by='weighted average recommendation score', ascending=False)
recommendation_df.head(10)

Unnamed: 0_level_0,weighted average recommendation score,movieId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
27731,5.0,27731
222,5.0,222
475,5.0,475
5060,5.0,5060
293,5.0,293
3435,5.0,3435
1208,5.0,1208
280,5.0,280
1283,5.0,1283
1285,5.0,1285


## Results

In [29]:
movies_df.loc[movies_df['movieId'].isin(recommendation_df.head(10)['movieId'].tolist())]

Unnamed: 0,movieId,title,year
189,222,Circle of Friends,1995
242,280,Murder in the First,1995
254,293,Léon: The Professional (a.k.a. The Professiona...,1994
413,475,In the Name of the Father,1993
909,1208,Apocalypse Now,1979
982,1283,High Noon,1952
984,1285,Heathers,1989
2568,3435,Double Indemnity,1944
3673,5060,M*A*S*H (a.k.a. MASH),1970
5686,27731,"Cat Returns, The (Neko no ongaeshi)",2002
