In [1]:
#Importing the libraries
import pandas as pd
from math import sqrt
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Load the two input tables from CSV files (paths are relative to the working directory).
movies_df = pd.read_csv('movies.csv')    # columns: movieId, title, genres, year
ratings_df = pd.read_csv('ratings.csv')  # columns: userId, movieId, rating, timestamp

In [2]:
# Drop the columns that the rest of the notebook does not use.
# `columns=` replaces the positional axis argument (`drop(label, 1)`), which
# was deprecated and is rejected by newer pandas releases.
movies_df = movies_df.drop(columns='genres')
ratings_df = ratings_df.drop(columns='timestamp')

In [3]:
# Select the target user: every rating row that belongs to TARGET_USER_ID.
# Change the constant to produce recommendations for a different user.
TARGET_USER_ID = 1
targetUser = ratings_df[ratings_df.userId == TARGET_USER_ID]

In [4]:
# Display the target user's rating rows.
targetUser

Unnamed: 0,userId,movieId,rating
0,1,169,2.5
1,1,2471,3.0
2,1,48516,5.0


In [5]:
# Remove the target user's rows from the ratings dataset, so only the other
# users' ratings remain for the steps below.
# Filtering with a boolean mask selects rows directly; it avoids the
# mask -> index labels -> drop round trip and does not rely on the index
# labels being unique.
targetUserIds = targetUser.userId.unique()
ratings_df = ratings_df[~ratings_df.userId.isin(targetUserIds)]

In [6]:
def similarUsers(targetUser, ratings=None):
    """Return the rating rows for movies that the target user has rated.

    Parameters
    ----------
    targetUser : pandas.DataFrame
        Ratings of the target user; only its ``movieId`` column is read.
    ratings : pandas.DataFrame, optional
        Ratings table to search (needs a ``movieId`` column). When omitted,
        the notebook-level ``ratings_df`` is used, as before.

    Returns
    -------
    pandas.DataFrame
        The rows of ``ratings`` whose ``movieId`` appears in ``targetUser``,
        keeping their original index labels.
    """
    if ratings is None:
        # Backward-compatible default: fall back to the notebook global.
        ratings = ratings_df
    ratedByTarget = targetUser.movieId.tolist()
    return ratings[ratings.movieId.isin(ratedByTarget)]

In [7]:
# Ratings (by the remaining users) of the movies the target user has rated.
# NOTE: this rebinds the name `similarUsers` from the function to its result,
# so re-running this cell without first re-running the cell that defines the
# function fails, because the result is not callable.
similarUsers = similarUsers(targetUser)

In [8]:
def targetUserPotentialMovies(targetUser, similarUsers, ratings=None):
    """Return ratings by the similar users for movies the target user has not rated.

    Parameters
    ----------
    targetUser : pandas.DataFrame
        Ratings of the target user; only its ``movieId`` column is read.
    similarUsers : pandas.DataFrame
        Rating rows of the similar users; only its ``userId`` column is read.
    ratings : pandas.DataFrame, optional
        Ratings table to search (needs ``userId`` and ``movieId`` columns).
        When omitted, the notebook-level ``ratings_df`` is used, as before.

    Returns
    -------
    pandas.DataFrame
        Rows of ``ratings`` whose ``userId`` appears in ``similarUsers`` and
        whose ``movieId`` does not appear in ``targetUser``, keeping their
        original index labels.
    """
    if ratings is None:
        # Backward-compatible default: fall back to the notebook global.
        ratings = ratings_df
    bySimilarUser = ratings.userId.isin(similarUsers.userId.tolist())
    alreadyRated = ratings.movieId.isin(targetUser.movieId.tolist())
    # One combined mask replaces the filter -> index labels -> drop sequence.
    return ratings[bySimilarUser & ~alreadyRated]

In [9]:
# Movies not rated by the target user, but rated by the similar users.
# NOTE: this rebinds the name `targetUserPotentialMovies` from the function to
# its result, so re-running this cell without first re-running the cell that
# defines the function fails, because the result is not callable.
targetUserPotentialMovies = targetUserPotentialMovies(targetUser, similarUsers)  

In [10]:
def similarityIndex(targetUser, similarUsers):
    """Compute a Pearson correlation score between the target user and each similar user.

    Parameters
    ----------
    targetUser : pandas.DataFrame
        Ratings of the target user; the ``movieId`` and ``rating`` columns are read.
    similarUsers : pandas.DataFrame
        Rating rows of the candidate users; the ``userId``, ``movieId`` and
        ``rating`` columns are read.

    Returns
    -------
    dict
        Maps each ``userId`` found in ``similarUsers`` to its correlation with
        the target user over the movies that user rated, or to ``0`` when
        either side has zero variance over those movies.
    """
    scores = {}
    # One group per user: each group holds that user's rating rows.
    for userId, theirRatings in similarUsers.groupby(similarUsers.userId):
        # Restrict the target user's ratings to the movies this user rated,
        # then order both sides by movieId so the two rating lists line up.
        sharedMovieIds = theirRatings['movieId'].tolist()
        mine = targetUser[targetUser.movieId.isin(sharedMovieIds)].sort_values('movieId')
        theirs = theirRatings.sort_values('movieId')

        # Rating lists used in the Pearson correlation formula.
        xs = mine['rating'].tolist()
        ys = theirs['rating'].tolist()
        n = float(len(xs))

        Sxy = sum(x*y for x, y in zip(xs, ys)) - (sum(xs)*sum(ys))/n
        Sxx = sum(x*x for x in xs) - pow(sum(xs), 2)/n
        Syy = sum(y*y for y in ys) - pow(sum(ys), 2)/n

        # Zero variance on either side leaves the correlation undefined; score it 0.
        if Sxx == 0 or Syy == 0:
            scores[userId] = 0
            continue
        scores[userId] = Sxy / sqrt(Sxx*Syy)
    return scores

In [11]:
# Determine the similarity index of the target user w.r.t. each similar user.
# NOTE: this rebinds the name `similarityIndex` from the function to the dict
# it returns, so re-running this cell without first re-running the cell that
# defines the function fails, because a dict is not callable.
similarityIndex = similarityIndex(targetUser, similarUsers)

In [12]:
# Preview the first rows of the similar users' ratings.
similarUsers.head()

Unnamed: 0,userId,movieId,rating
491,13,169,1.0
663,14,169,3.0
1298,17,169,1.0
2059,17,2471,2.0
3880,37,48516,5.0


In [13]:
# Turn the {userId: similarity} dict into a table with the columns
# (similarityIndex, userId) and a fresh 0..n-1 row index.
similarityIndexDF = pd.DataFrame.from_dict(similarityIndex, orient='index')
similarityIndexDF.columns = ['similarityIndex']
similarityIndexDF = (
    similarityIndexDF
    .assign(userId=similarityIndexDF.index)  # the dict keys become the userId column
    .reset_index(drop=True)
)

In [14]:
# Attach each similar user's similarity score to the ratings they gave.
# Dropping any previously merged columns first keeps this cell re-runnable:
# merging a second time would otherwise produce suffixed duplicate columns
# and the 'similarityIndex' lookup below would fail.
targetUserPotentialMovies = (
    targetUserPotentialMovies
    .drop(columns=['similarityIndex', 'weightedRating'], errors='ignore')
    .merge(similarityIndexDF, how='inner', on='userId')
)
targetUserPotentialMovies['weightedRating'] = (
    targetUserPotentialMovies['rating'] * targetUserPotentialMovies['similarityIndex']
)

# Per movie: sum of similarities and sum of similarity-weighted ratings.
# The two columns are selected before aggregating so that no other column is summed.
normalizedRating = (
    targetUserPotentialMovies
    .groupby('movieId')[['similarityIndex', 'weightedRating']]
    .sum()
)
normalizedRating.columns = ['SumSimilarityIndex', 'SumWeightedRating']
# A zero similarity sum would make the division below undefined; skip those movies.
normalizedRating = normalizedRating[normalizedRating.SumSimilarityIndex != 0]

# NOTE(review): the divisor is the signed sum of similarities. When the
# similarities for a movie have mixed signs the quotient can fall outside the
# rating scale -- confirm whether this is intended.
targetUserMovieRecommendations = (
    (normalizedRating['SumWeightedRating'] / normalizedRating['SumSimilarityIndex'])
    .rename('avg_weighted_rating')
    .reset_index()  # movieId moves from the index to a column; rows get a 0..n-1 index
    [['avg_weighted_rating', 'movieId']]
    .sort_values(by='avg_weighted_rating', ascending=False)
)

In [15]:
# Keep the ten highest-scoring rows (the table is already sorted in descending order).
top10TargetUserMovieRecommendations = targetUserMovieRecommendations.head(10)

In [16]:
# Look up the movie details for the recommended movies and keep them in ranked order.
# A plain `isin` filter returns rows in movies_df order, which discards the
# ranking computed above; the rows are therefore re-ordered by rank afterwards.
rankedMovieIds = top10TargetUserMovieRecommendations.movieId.tolist()
rankOf = {movieId: position for position, movieId in enumerate(rankedMovieIds)}
top10TargetUserMovieRecommendations = movies_df[movies_df.movieId.isin(rankedMovieIds)]
top10TargetUserMovieRecommendations = top10TargetUserMovieRecommendations.iloc[
    # Positions that sort the matched rows by their rank (stable for ties).
    top10TargetUserMovieRecommendations.movieId.map(rankOf).values.argsort(kind='stable')
]
top10TargetUserMovieRecommendations

Unnamed: 0,movieId,title,year
397,401,Mirage,1995.0
3045,3131,Broadway Damage,1997.0
4628,4722,All Over the Guy,2001.0
7793,8394,"Hi-Line, The",1999.0
8262,8944,"Dust Factory, The",2004.0
11249,47538,Crime Busters,1977.0
13335,65352,"Have Rocket, Will Travel",1959.0
13883,69485,Air Hawks,1935.0
15281,77852,Tales of Terror (Kaidan Shin Mimibukuro),2004.0
28339,131050,Stargate SG-1 Children of the Gods - Final Cut,2009.0


In [17]:
# Movie details for the movies the target user has rated.
ratedMovieIds = targetUser.movieId.tolist()
ratedMask = movies_df.movieId.isin(ratedMovieIds)
myMovies = movies_df[ratedMask]
myMovies

Unnamed: 0,movieId,title,year
167,169,Free Willy 2: The Adventure Home,1995.0
2387,2471,Crocodile Dundee II,1988.0
11372,48516,"Departed, The",2006.0
