In [5]:
import pandas as pd
import numpy as np


In [6]:
dataFile='u.data'
data=pd.read_csv(dataFile,sep="\t",header=None,
                 names=['userId','itemId','rating','timestamp'])

In [7]:
data.head()


Unnamed: 0,userId,itemId,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [8]:
movieInfoFile="u.item"
movieInfo=pd.read_csv(movieInfoFile,sep="|", header=None, index_col=False,
                     names=["itemId","title"], usecols=[0,1],encoding = 'latin')

In [9]:
movieInfo.head()


Unnamed: 0,itemId,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


In [10]:
data=pd.merge(data,movieInfo,left_on='itemId',right_on="itemId")

In [11]:
data.head()

Unnamed: 0,userId,itemId,rating,timestamp,title
0,196,242,3,881250949,Kolya (1996)
1,63,242,3,875747190,Kolya (1996)
2,226,242,5,883888671,Kolya (1996)
3,154,242,3,879138235,Kolya (1996)
4,306,242,5,876503793,Kolya (1996)


In [12]:
data.tail()

Unnamed: 0,userId,itemId,rating,timestamp,title
99995,840,1674,4,891211682,Mamma Roma (1962)
99996,655,1640,3,888474646,"Eighth Day, The (1996)"
99997,655,1637,3,888984255,Girls Town (1996)
99998,655,1630,3,887428735,"Silence of the Palace, The (Saimt el Qusur) (1..."
99999,655,1641,3,887427810,Dadetown (1995)


In [13]:
userIds=data.userId # a Pandas series object
userIds2=data[['userId']]

In [14]:
userIds.head()
userIds2.head()


Unnamed: 0,userId
0,196
1,63
2,226
3,154
4,306


In [15]:
userIds2.head()


Unnamed: 0,userId
0,196
1,63
2,226
3,154
4,306


In [16]:
data.loc[0:10,['userId']]

Unnamed: 0,userId
0,196
1,63
2,226
3,154
4,306
5,296
6,34
7,271
8,201
9,209


In [17]:
toyStoryUsers=data[data.title=="Toy Story (1995)"]
# This will give us a subset dataframe with only the users who have rated Toy Story
toyStoryUsers.head()

Unnamed: 0,userId,itemId,rating,timestamp,title
3397,308,1,4,887736532,Toy Story (1995)
3398,287,1,5,875334088,Toy Story (1995)
3399,148,1,4,877019411,Toy Story (1995)
3400,280,1,4,891700426,Toy Story (1995)
3401,66,1,3,883601324,Toy Story (1995)


In [18]:
data=pd.DataFrame.sort_values(data,['userId','itemId'],ascending=[0,1]) #Sorting

In [19]:
data.head()

Unnamed: 0,userId,itemId,rating,timestamp,title
23781,943,2,5,888639953,GoldenEye (1995)
65410,943,9,3,875501960,Dead Man Walking (1995)
35098,943,11,4,888639000,Seven (Se7en) (1995)
43773,943,12,5,888639093,"Usual Suspects, The (1995)"
57040,943,22,4,888639042,Braveheart (1995)


In [20]:
numUsers=max(data.userId)
numMovies=max(data.itemId)

In [21]:
numUsers


943

In [22]:
numMovies

1682

In [23]:
moviesPerUser=data.userId.value_counts()
usersPerMovie=data.title.value_counts()


In [24]:
usersPerMovie


Star Wars (1977)                                 583
Contact (1997)                                   509
Fargo (1996)                                     508
Return of the Jedi (1983)                        507
Liar Liar (1997)                                 485
English Patient, The (1996)                      481
Scream (1996)                                    478
Toy Story (1995)                                 452
Air Force One (1997)                             431
Independence Day (ID4) (1996)                    429
Raiders of the Lost Ark (1981)                   420
Godfather, The (1972)                            413
Pulp Fiction (1994)                              394
Twelve Monkeys (1995)                            392
Silence of the Lambs, The (1991)                 390
Jerry Maguire (1996)                             384
Chasing Amy (1997)                               379
Rock, The (1996)                                 378
Empire Strikes Back, The (1980)               

In [27]:
def favoriteMovies(activeUser,N):
    #1. subset the dataframe to have the rows corresponding to the active user
    # 2. sort by the rating in descending order
    # 3. pick the top N rows
    topMovies=pd.DataFrame.sort_values(
        data[data.userId==activeUser],['rating'],ascending=[0])[:N]
    return list(topMovies.title)

In [28]:
print(favoriteMovies(5,3))

['Men in Black (1997)', 'Blade Runner (1982)', 'Empire Strikes Back, The (1980)']


In [29]:
userItemRatingMatrix=pd.pivot_table(data, values='rating',
                                    index=['userId'], columns=['itemId'])

In [30]:
userItemRatingMatrix.head()

itemId,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,,,,,,,,,,
2,4.0,,,,,,,,,2.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,3.0,,,,,,,,,...,,,,,,,,,,


In [31]:
from scipy.spatial.distance import correlation 
def similarity(user1,user2):
    user1=np.array(user1)-np.nanmean(user1) # I am first normalizing user1 by 
    # the mean rating of user 1 for any movie
    user2=np.array(user2)-np.nanmean(user2)
    commonItemIds=[i for i in range(len(user1)) if user1[i]>0 and user2[i]>0]
    # Gives us movies for which both users have non NaN ratings 
    if len(commonItemIds)==0:
        # If there are no movies in common 
        return 0
    else:
        user1=np.array([user1[i] for i in commonItemIds])
        user2=np.array([user2[i] for i in commonItemIds])
        return correlation(user1,user2)

In [32]:

# Using this similarity function, let's find the nearest neighbours of the active user
def nearestNeighbourRatings(activeUser,K):
    # This function will find the K Nearest neighbours of the active user, then 
    # use their ratings to predict the activeUsers ratings for other movies 
    similarityMatrix=pd.DataFrame(index=userItemRatingMatrix.index,
                                  columns=['Similarity'])
    for i in userItemRatingMatrix.index:
        similarityMatrix.loc[i]=similarity(userItemRatingMatrix.loc[activeUser],
                                          userItemRatingMatrix.loc[i])
    similarityMatrix=pd.DataFrame.sort_values(similarityMatrix,
                                              ['Similarity'],ascending=[0])
    # Sort the similarity matrix in the descending order of similarity 
    nearestNeighbours=similarityMatrix[:K]
    neighbourItemRatings=userItemRatingMatrix.loc[nearestNeighbours.index]
    predictItemRating=pd.DataFrame(index=userItemRatingMatrix.columns, columns=['Rating'])
    for i in userItemRatingMatrix.columns:
        # for each item 
        predictedRating=np.nanmean(userItemRatingMatrix.loc[activeUser])
        # start with the average rating of the user
        for j in neighbourItemRatings.index:
            # for each neighbour in the neighbour list 
            if userItemRatingMatrix.loc[j,i]>0:
                # If the neighbour has rated that item
                # Add the rating of the neighbour for that item
                #    adjusted by 
                #    the average rating of the neighbour 
                #    weighted by 
                #    the similarity of the neighbour to the active user
                predictedRating += (userItemRatingMatrix.loc[j,i]
                                    -np.nanmean(userItemRatingMatrix.loc[j]))*nearestNeighbours.loc[j,'Similarity']
        # rating to the predicted Rating matrix
        predictItemRating.loc[i,'Rating']=predictedRating
    return predictItemRating

In [33]:
def topNRecommendations(activeUser,N):
    predictItemRating=nearestNeighbourRatings(activeUser,10)
    # Use the 10 nearest neighbours to find the predicted ratings
    moviesAlreadyWatched=list(userItemRatingMatrix.loc[activeUser]
                              .loc[userItemRatingMatrix.loc[activeUser]>0].index)
    # find the list of items whose ratings which are not NaN
    predictItemRating=predictItemRating.drop(moviesAlreadyWatched)
    topRecommendations=pd.DataFrame.sort_values(predictItemRating,
                                                ['Rating'],ascending=[0])[:N]
    # This will give us the list of itemIds which are the top recommendations 
    # Let's find the corresponding movie titles 
    topRecommendationTitles=(movieInfo.loc[movieInfo.itemId.isin(topRecommendations.index)])
    return list(topRecommendationTitles.title)

In [34]:
activeUser=5
print(favoriteMovies(activeUser,5),"\n",topNRecommendations(activeUser,3))

  dist = 1.0 - np.dot(um, vm) / (norm(um) * norm(vm))


['Men in Black (1997)', 'Blade Runner (1982)', 'Empire Strikes Back, The (1980)', 'Wrong Trousers, The (1993)', 'Blues Brothers, The (1980)'] 
 ['Truth About Cats & Dogs, The (1996)', 'Scream (1996)', 'First Wives Club, The (1996)']


In [35]:

def matrixFactorization(R, K, steps=10, gamma=0.001,lamda=0.02):
    N=len(R.index)# Number of users
    M=len(R.columns) # Number of items 
    P=pd.DataFrame(np.random.rand(N,K),index=R.index)
    # This is the user factor matrix i want to find. It will have N rows 
    Q=pd.DataFrame(np.random.rand(M,K),index=R.columns)
    # This is the product factor matrix i want to find. It will have M rows, 
    # one for each product/item/movie. 
    for step in range(steps):
        for i in R.index:
            for j in R.columns:
                if R.loc[i,j]>0:
                    # For each rating that exists in the training set 
                    eij=R.loc[i,j]-np.dot(P.loc[i],Q.loc[j])
                    P.loc[i]=P.loc[i]+gamma*(eij*Q.loc[j]-lamda*P.loc[i])
                    # The value in the brackets is the partial derivative of the 
                    # error function ie the slope. Lamda is the value of the 
                    # regularization parameter which penalizes the model for the 
                    # number of factors we are finding. 
                    Q.loc[j]=Q.loc[j]+gamma*(eij*P.loc[i]-lamda*Q.loc[j])
        # At the end of this i have looped through all the ratings once. 
        # Let's check the value of the error function to see if we have reached 
        # the threshold at which we want to stop, else we will repeat the process
        e=0
        for i in R.index:
            for j in R.columns:
                if R.loc[i,j]>0:
                    #Sum of squares of the errors in the rating
                    e= e + pow(R.loc[i,j]-np.dot(P.loc[i],Q.loc[j]),2)+lamda*(pow(np.linalg.norm(P.loc[i]),2)+pow(np.linalg.norm(Q.loc[j]),2))
        if e<0.001:
            break
        print(step)
    return P,Q

# Let's call this function now 
(P,Q)=matrixFactorization(userItemRatingMatrix.iloc[:100,:100],K=2,gamma=0.001,lamda=0.02, steps=100)
# This will be pretty expensive computationally. For now lets just do it over a 
# part of the rating matrix to see how it works. I've kept the steps to 100 too. 
  

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99


In [36]:
activeUser=1
predictItemRating=pd.DataFrame(np.dot(P.loc[activeUser],Q.T),index=Q.index,columns=['Rating'])
topRecommendations=pd.DataFrame.sort_values(predictItemRating,['Rating'],ascending=[0])[:3]
# I found the ratings of all movies by the active user and then sorted them to find the top 3 movies 
topRecommendationTitles=movieInfo.loc[movieInfo.itemId.isin(topRecommendations.index)]
print(list(topRecommendationTitles.title))

['Star Wars (1977)', 'Shawshank Redemption, The (1994)', 'Fargo (1996)']
