In [31]:
"""
Date   : Nov 12 2017
@author: Archana Neelipalayam Masilamani
Project Description:
Implemented Nearest Neighbor Recommender System
 
The dataset used is MovieLens.

The task is to recommend movie to a user based on the ratings of the user and other users
K-Nearest Neighborrecommandation method is followed
 
"""



'\nDate   : Nov 12 2017\n@author: Archana Neelipalayam Masilamani\nProject Description:\nImplemented Nearest Neighbor Recommender System\n \nThe dataset used is MovieLens.\n\nThe task is to recommend movie to a user based on the ratings of the user and other users\nK-Nearest Neighborrecommandation method is followed\n \n'

In [32]:

import pandas as pd
import numpy as np 
from scipy.spatial.distance import correlation 

#Read the movie.csv and ratings.csv files

ratings_data = pd.read_csv("ml-latest-small/ratings.csv",
                           index_col=False)
movies_data  = pd.read_csv("ml-latest-small/movies.csv",
                           index_col=False, usecols=[0,1])

#Merging movie and rating data
data = ratings_data.merge(movies_data,left_on = 'movieId', right_on = 'movieId')

data.head()

Unnamed: 0,userId,movieId,rating,timestamp,title
0,1,31,2.5,1260759144,Dangerous Minds (1995)
1,7,31,3.0,851868750,Dangerous Minds (1995)
2,31,31,4.0,1273541953,Dangerous Minds (1995)
3,32,31,4.0,834828440,Dangerous Minds (1995)
4,36,31,3.0,847057202,Dangerous Minds (1995)


In [33]:
#Create a pivot table such that rows are UserId's, columns are all set of movies, and values
#are rating given by the user for that movie. If a rating is not available, then the
#value will be NaN

data_table = pd.pivot_table(data, values='rating',index ='userId',columns='movieId')

data_table.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,161084,161155,161594,161830,161918,161944,162376,162542,162672,163949
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,4.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,4.0,...,,,,,,,,,,
5,,,4.0,,,,,,,,...,,,,,,,,,,


In [34]:
# To find the K nearest neighbors, the similarity between
# two users has to be found. Correlation function from
# 'scipy' library can be used for this purpose

def similarity(user1,user2):   
# Normalize the ratings of user1 and user2 by mean ratings of user1
# for any movie
    user1 = np.array(user1) - np.nanmean(user1)
    user2 = np.array(user2) - np.nanmean(user2)
# The similarty between two users is found only using the rating data
# for movies that has been rated by both the users in common

    cominItemId  = [i for i in range(len(user1)) if user1[i]>0 and user2[i]>0]

    if len(cominItemId) == 0:
        return 0;
    else:
        user1 = np.array([user1[i] for i in cominItemId])
        user2 = np.array([user2[i] for i in cominItemId])
    return correlation(user1,user2)

# correlation(np.array(data_table.loc[1]),np.array(data_table.loc[2]))
 
    
# Function to find the ratings of the nearest neighbors

def kNearestNeighbor(K,currentuser):
    # Create a dataframe which will be used to hold the similarity between the
    # current user and all the other users -> Rows will be all users and 
    #column will be the similarity value between the current user and all users
    similarityDataframe = pd.DataFrame(index = data_table.index, columns = 
                                       ['SimilarityVal'])
    
    #Call the function Similarity between the current user and all the other users
    for i in data_table.index:
        similarityDataframe.loc[i] = similarity(data_table.loc[currentuser],data_table.loc[i])
        
    # Sort the similarity Dataframe    
    similarityDataframe = pd.DataFrame.sort_values(similarityDataframe,
                                          ['SimilarityVal'],ascending=[0])                                                             
    
    nearestNeighbors = similarityDataframe[:K]
    
    return nearestNeighbors
    

# The following script is to test the function for sample data where K is 3 and user id is 1
nearestNeighborList = kNearestNeighbor(3,1)
nearestNeighborList


Unnamed: 0_level_0,SimilarityVal
userId,Unnamed: 1_level_1
265,2
468,2
195,2


In [35]:
# This script is used in the below function 
neighborRatings = data_table.loc[nearestNeighborList.index]
neighborRatings.head()



movieId,1,2,3,4,5,6,7,8,9,10,...,161084,161155,161594,161830,161918,161944,162376,162542,162672,163949
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
265,,,,,,,,,,,...,,,,,,,,,,
468,4.0,2.0,2.5,,,3.5,,,,3.0,...,,,,,,,,,,
195,,,,,,4.0,,,,,...,,,,,,,,,,


In [36]:
# Function to find the predicted ratings based on the K-Nearest
# neighbors(by applying the formula). The weight in formula is nothing
# but the similarity value between the user and the neighbor

#First take all the ratings of the nearest neighbors
def predictedRatings(K,currentUser):    
    
    nearestNeighborList = kNearestNeighbor(K,currentUser)
    
    neighborRatings = data_table.loc[nearestNeighborList.index]
    neighborRatings.head()
    # Create a final predicted ratings matrix, where the 
    # rows will be the movies and column will be the predicted ratings

    predictedItemRating = pd.DataFrame(index = data_table.columns,
                                  columns = ['Rating'])
    # Refer formula
    # For every movie 
    for  i in data_table.columns:
        
    #Find the mean of rating of the current user(mean of 'ra'
    #as per the formula)
        currentUserMeanRating = np.nanmean(data_table.loc[currentUser])
        predictedRating = currentUserMeanRating
        ratingsSum = 0
        weightSum = 0
        #For every Neighbor
        for j in neighborRatings.index:
            weightSum += nearestNeighborList.loc[j,'SimilarityVal']
            #If the rating of that movie is given by the neighbor
            if neighborRatings.loc[j,i]>0:  
             # As per the formula, inside the summation, for every user,
             # we have to sum the product of weight(similarity) between the
             # currrent user & the neighbor and the difference between the 
             # rating of the neighbor to the mean of ratings of the neighbor
             # This is added to the mean ratings of current user for all movies
             # In the below formula, we are not normalizing with sum of similarity
             # or weights between the users. But that can be added if required
             ratingsSum += (neighborRatings.loc[j,i]                                   
                  - np.nanmean(neighborRatings.loc[j]))*nearestNeighborList.loc[j,'SimilarityVal']
        
            #Normalizing the weights
            predictedRating  = predictedRating + (ratingsSum/weightSum)           
        # Adding the predicted rating calculated to the predictedItemRating    
        predictedItemRating.loc[i] =  predictedRating
    
    return predictedItemRating

# The following script is to test the function for sample data where K is 4 and user id is 1
predictedRatingsR = predictedRatings(4,1)
pd.DataFrame.sort_values(predictedRatingsR,['Rating'],ascending=[0]).head()


Unnamed: 0_level_0,Rating
movieId,Unnamed: 1_level_1
923,8.26125
2186,7.01863
913,7.01863
1233,6.88625
608,6.59458


In [37]:
#This script gives the movies already watched by user 3. Its 
#used in the below function
data_table.loc[3].loc[data_table.loc[3]>0].head()


movieId
60     3.0
110    4.0
247    3.5
267    3.0
296    4.5
Name: 3, dtype: float64

In [41]:
# Function to use the ablove predicted ratings to recommend the
# top N movies for the user

def movieRecommendation(N,currentUser):
    # Here we take K = 10, for the number of nearest neighbors
    predictedRatingList = predictedRatings(10,currentUser)
    # Extract the list of movies already watched by the current user
    # since we do not have to include it for the recommendation
    moviesWatchedByCurrentUser = list(data_table.loc[currentUser]
                                     .loc[data_table.loc[currentUser]>0].index)
    # Remove the details of already watched movies from 
    # the predictedRatingList
    predictedRatingList = predictedRatingList.drop(moviesWatchedByCurrentUser)
    
    # Now sort the predictedRatingList to find the top N predicted ratings
    
    topMovies = pd.DataFrame.sort_values(predictedRatingList,['Rating'],ascending=[0])[:N]
    
    # Extract the movie titles for the top N movies
    topMoviesTitle = movies_data.loc[movies_data.movieId.isin(topMovies.index)]
    
    return topMoviesTitle
   
currentUser = 180
N = 10
result = movieRecommendation(N,currentUser)['title']

#Remove the index from the dataframe by converting it to a string
result = result.to_string(index=False)

print("The top ", N, " movies recommandation for the user ",currentUser, ":\n")


#Print to top N movies
for movie in result.split('\n'):
    print(movie.strip())


  dist = 1.0 - uv / np.sqrt(uu * vv)


The top  10  movies recommandation for the user  180 :

Sense and Sensibility (1995)
Shawshank Redemption, The (1994)
Piano, The (1993)
Silence of the Lambs, The (1991)
Fargo (1996)
Casablanca (1942)
Raiders of the Lost Ark (Indiana Jones and the...
Cool Hand Luke (1967)
Last Emperor, The (1987)
Lord of the Rings: The Fellowship of the Ring,...


In [39]:
# End