In [1]:
import numpy as np
import pandas as pd

In [2]:
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')

In [3]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [5]:
ratings = ratings.drop('timestamp',1)

  ratings = ratings.drop('timestamp',1)


In [6]:
ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


# Creating the user profile

This function is for making the user profile to which the movies will be recommended to

In [7]:
user = [
    {'title':'Toy Story (1995)','rating':4},
    {'title':'Jumanji (1995)','rating':3.5},
    {'title':'Casino (1995)','rating':2},
    {'title':'Othello (1995)','rating':2.5},
    {'title':'Babe (1995)','rating':5}
]

input = pd.DataFrame(user)
input

Unnamed: 0,title,rating
0,Toy Story (1995),4.0
1,Jumanji (1995),3.5
2,Casino (1995),2.0
3,Othello (1995),2.5
4,Babe (1995),5.0


In [8]:
inputId = movies[movies['title'].isin(input['title'].tolist())]

input = pd.merge(inputId, input)

input = input.drop('genres',1)

input

  input = input.drop('genres',1)


Unnamed: 0,movieId,title,rating
0,1,Toy Story (1995),4.0
1,2,Jumanji (1995),3.5
2,16,Casino (1995),2.0
3,26,Othello (1995),2.5
4,34,Babe (1995),5.0


In [9]:
similarUser = ratings[ratings['movieId'].isin(input['movieId'].tolist())]
similarUser.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
516,5,1,4.0
518,5,34,4.0
560,6,2,4.0
571,6,16,4.0


In [10]:
usersGrp = similarUser.groupby(['userId'])

In [11]:
usersGrp.get_group(610)

Unnamed: 0,userId,movieId,rating
99534,610,1,5.0
99536,610,16,4.5


In [12]:
#Sorts the user subsets according to the highest priority of similarity to the input user
usersGrp = sorted(usersGrp, key=lambda x: len(x[1]), reverse=True)

In [13]:
#the userId who has watched the most number of common movies with the input user
usersGrp[0]

(68,
        userId  movieId  rating
 10360      68        1     2.5
 10361      68        2     2.5
 10368      68       16     3.5
 10373      68       26     3.0
 10375      68       34     1.0)

In [14]:
#Dataframe of the topmost priority user
usersGrp[0][1]

Unnamed: 0,userId,movieId,rating
10360,68,1,2.5
10361,68,2,2.5
10368,68,16,3.5
10373,68,26,3.0
10375,68,34,1.0


# Pearson Correlation for finding similarity

This function is used to find the similarity between the input user with a subset of common users

In [15]:
usersGrp = usersGrp[0:200]

In [16]:
pearsonCoDict = {}

for name, group in usersGrp:
    
    group = group.sort_values(by='movieId')
    input = input.sort_values(by='movieId')
    
    n = len(group)
    
    temp = input[input['movieId'].isin(group['movieId'].tolist())]
    
    rateList = temp['rating'].tolist()
    
    grpList = group['rating'].tolist()
    
    #scipy.stats.pearsonr(rateList, grpList)[0]
    
    Sxx = sum([i**2 for i in rateList]) - pow(sum(rateList),2)/float(n)
    Syy = sum([i**2 for i in grpList]) - pow(sum(grpList),2)/float(n)
    Sxy = sum(i*j for i, j in zip(rateList, grpList)) - sum(rateList)*sum(grpList)/float(n)
    
    if Sxx != 0 and Syy != 0:
        pearsonCoDict[name] = Sxy/np.sqrt(Sxx*Syy)
        
    else:
        pearsonCoDict[name] = 0

In [17]:
pearsonCoDict.items()

dict_items([(68, -0.9515190335342454), (474, 0.42246305606312334), (6, 0), (18, -0.9173460685716212), (103, -0.8666666666666667), (240, 0.8666666666666667), (274, -0.48989794855663565), (330, -0.06172133998483677), (357, 0.4098780306383839), (373, -0.11547005383792514), (380, 0.5222329678670935), (414, 0.8703882797784892), (470, 0.8320502943378437), (480, -0.11547005383792514), (483, 0), (599, 0.0), (608, -0.4359546962416407), (19, 0.7559289460184538), (27, 0.6546536707079778), (40, 0.9176629354822458), (64, -0.6546536707079773), (82, -0.9819805060619666), (91, -0.576556660197054), (93, 0.18898223650461524), (112, -0.7205766921228919), (117, -0.8029550685469663), (140, 0.6546536707079778), (144, -0.1889822365046185), (160, 0.9449111825230734), (169, -0.18898223650462054), (177, 0.8461538461538475), (182, -0.6933752452815394), (191, 0), (201, 0.9449111825230636), (217, 0.755928946018457), (226, -0.8386278693775367), (288, 0.6185895741317425), (298, -0.7751332793988399), (305, -0.9819805

In [18]:
pearson = pd.DataFrame.from_dict(pearsonCoDict, orient='index')
pearson.head()

Unnamed: 0,0
68,-0.951519
474,0.422463
6,0.0
18,-0.917346
103,-0.866667


In [19]:
pearson.columns = ['similarity']
pearson['userId'] = pearson.index
pearson.index = range(len(pearson))
pearson.head()

Unnamed: 0,similarity,userId
0,-0.951519,68
1,0.422463,474
2,0.0,6
3,-0.917346,18
4,-0.866667,103


# Top 50 similar users to input user

In [20]:
topUsers = pearson.sort_values(by='similarity', ascending = False)[0:50]
topUsers.head()

Unnamed: 0,similarity,userId
143,1.0,610
107,1.0,282
126,1.0,482
127,1.0,484
122,1.0,412


In [21]:
topUsersRating = topUsers.merge(ratings, left_on='userId', right_on='userId', how='inner')
topUsersRating.head()

Unnamed: 0,similarity,userId,movieId,rating
0,1.0,610,1,5.0
1,1.0,610,6,5.0
2,1.0,610,16,4.5
3,1.0,610,32,4.5
4,1.0,610,47,5.0


In [22]:
#Multiplies the similarity by the user's ratings
topUsersRating['weightedRating'] = topUsersRating['similarity']*topUsersRating['rating']
topUsersRating.head()

Unnamed: 0,similarity,userId,movieId,rating,weightedRating
0,1.0,610,1,5.0,5.0
1,1.0,610,6,5.0,5.0
2,1.0,610,16,4.5,4.5
3,1.0,610,32,4.5,4.5
4,1.0,610,47,5.0,5.0


In [23]:
#Applies a sum to the topUsers after grouping it up by userId
tempTopUsersRating = topUsersRating.groupby('movieId').sum()[['similarity','weightedRating']]
tempTopUsersRating.columns = ['sum_similarity','sum_weightedRating']
tempTopUsersRating.head()

Unnamed: 0_level_0,sum_similarity,sum_weightedRating
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,32.421653,135.673941
2,27.475583,93.764732
3,6.699553,21.942445
5,5.49942,13.375588
6,15.014486,54.964179


In [24]:
#Empty dataframe to store the recommendation score
recommendation = pd.DataFrame()
#Weighted average for calculating the weighted average recommendation score
recommendation['recommendation score'] = tempTopUsersRating['sum_weightedRating']/tempTopUsersRating['sum_similarity']
recommendation['movieId'] = tempTopUsersRating.index
recommendation.head()

Unnamed: 0_level_0,recommendation score,movieId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,4.184671,1
2,3.412657,2
3,3.27521,3
5,2.432182,5
6,3.660743,6


In [25]:
recommendation = recommendation.sort_values(by='recommendation score', ascending=False)
recommendation.head()

Unnamed: 0_level_0,recommendation score,movieId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
290,5.0,290
115727,5.0,115727
92420,5.0,92420
5867,5.0,5867
138632,5.0,138632


In [26]:
movies.loc[movies['movieId'].isin(recommendation.head(10)['movieId'].tolist())]

Unnamed: 0,movieId,title,genres
251,290,Once Were Warriors (1994),Crime|Drama
2543,3404,Titanic (1953),Action|Drama
4095,5867,Thief (1981),Crime|Drama|Thriller
4119,5909,Visitor Q (Bizita Q) (2001),Comedy|Drama|Horror
4572,6791,Babette's Feast (Babettes gæstebud) (1987),Drama
7174,72142,Love Exposure (Ai No Mukidashi) (2008),Action|Comedy|Drama|Romance
7611,86898,"Tree of Life, The (2011)",Drama
7807,92420,Chronicle (2012),Action|Sci-Fi|Thriller
8551,115727,Crippled Avengers (Can que) (Return of the 5 D...,Action|Adventure
8982,138632,Tokyo Tribe (2014),Action|Crime|Drama|Sci-Fi


In [29]:
import pickle
pickle.dump(recommendation, open('recommendation.csv','wb'))