In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

## User & Item based Collaborative Filtering

### Load data

In [2]:
import pandas as pd
import numpy as np

In [3]:
ratings = pd.read_csv("ratings_sub.csv",encoding = "ISO-8859-1")

In [4]:
ratings.shape

(487469, 7)

In [5]:




ratings.userId=ratings.userId.astype(str)
ratings.movieId=ratings.movieId.astype(str)

In [6]:
ratings.columns

Index(['userId', 'movieId', 'rating', 'timestamp', 'title', 'genres', 'year'], dtype='object')

### Data Exploration & Transformation

<b> Find the top 10 most popular movies watched </b>


In [7]:
# Total unique users 
print("total unique users - ",len(ratings["userId"].unique()))

total unique users -  2827


In [8]:
ratings["title"].value_counts().head(10)

Lord of the Rings: The Fellowship of the Ring, The (2001)        2481
Shrek (2001)                                                     2356
Lord of the Rings: The Two Towers, The (2002)                    2338
Lord of the Rings: The Return of the King, The (2003)            2235
Gladiator (2000)                                                 2226
Spider-Man (2002)                                                2157
Pirates of the Caribbean: The Curse of the Black Pearl (2003)    2122
Ocean's Eleven (2001)                                            2110
Memento (2000)                                                   2082
Minority Report (2002)                                           2021
Name: title, dtype: int64

<b> Q: Who are the users with maximum no of movies watched? </b>

In [9]:
# Users with max no of movies watches
ratings["userId"].value_counts().head()

120378    200
83313     200
129585    200
40651     200
59148     200
Name: userId, dtype: int64

### Transforming data to surprise format

In [10]:
!pip install surprise



In [11]:
from surprise import Dataset,Reader
reader = Reader(rating_scale=(1, 5))

In [12]:
data = Dataset.load_from_df(ratings[['userId', 'title', 'rating']], reader)

In [13]:
data

<surprise.dataset.DatasetAutoFolds at 0x123370358>

In [14]:
# Split data to train and test
from surprise.model_selection import train_test_split
trainset, testset = train_test_split(data, test_size=.25,random_state=123)

# to build on full data
#trainset = data.build_full_trainset()

In [15]:
type(trainset)

surprise.trainset.Trainset

### Making sense of trainset 

Points to Note:
    
    1) Trainset is no longer a pandas dataframe. Rather, it's a specific datatypes defined by the Surprise library
    2) UserId and ItemId in the pandas dataframe can contain any value (either string/integer etc). However, Trainset convert these raw ids into numeric indexes called as "inner id"
    3) Methods are provided to convert rw id to inner id and vice verca

In [16]:
# user item rating data can be obtained as follows
user_records = trainset.ur
type(user_records)

collections.defaultdict

In [17]:
for keys in user_records.keys():
    print(keys)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
27

In [18]:
user_records[0]

[(0, 3.0),
 (195, 4.0),
 (1066, 3.5),
 (999, 3.5),
 (237, 3.0),
 (1577, 3.0),
 (932, 2.0),
 (247, 4.5),
 (2215, 3.0),
 (221, 4.0),
 (745, 3.0),
 (133, 3.0),
 (249, 3.0),
 (1065, 2.5),
 (255, 3.5),
 (167, 4.0),
 (586, 3.5),
 (1234, 4.0),
 (259, 4.5),
 (729, 2.5),
 (236, 3.5),
 (181, 3.5),
 (3245, 3.5),
 (1014, 3.0),
 (577, 5.0),
 (2789, 3.5),
 (91, 4.0),
 (10, 4.0),
 (19, 3.5),
 (274, 4.0),
 (2135, 3.0),
 (1419, 3.5),
 (695, 4.0),
 (1373, 3.5),
 (850, 3.0),
 (334, 4.0),
 (2759, 3.0),
 (222, 3.0),
 (37, 4.0),
 (380, 2.5),
 (544, 4.0),
 (542, 4.5),
 (1135, 5.0),
 (650, 5.0),
 (4625, 3.5),
 (341, 1.0),
 (780, 4.0),
 (2371, 3.0),
 (661, 4.0),
 (4742, 4.5),
 (1660, 3.5),
 (4189, 2.5),
 (110, 2.5),
 (2349, 3.0),
 (2285, 3.5),
 (2623, 3.0),
 (1001, 4.0),
 (1490, 3.0),
 (171, 4.0),
 (465, 4.0),
 (733, 5.0),
 (894, 3.0),
 (3771, 3.0),
 (933, 3.0),
 (1083, 3.0),
 (3003, 3.0),
 (11, 3.0),
 (756, 2.5),
 (604, 3.5),
 (258, 4.0),
 (725, 5.0),
 (320, 3.5),
 (1838, 5.0),
 (383, 3.0),
 (3977, 3.0),
 (19

In [19]:
# However the ids are the inner ids and not the raw ids
# raw ids can be obatined as follows

print(trainset.to_raw_uid(0))
print(trainset.to_raw_iid(1066))

248
Step Up 2 the Streets (2008)


In [20]:
user_records[0]

[(0, 3.0),
 (195, 4.0),
 (1066, 3.5),
 (999, 3.5),
 (237, 3.0),
 (1577, 3.0),
 (932, 2.0),
 (247, 4.5),
 (2215, 3.0),
 (221, 4.0),
 (745, 3.0),
 (133, 3.0),
 (249, 3.0),
 (1065, 2.5),
 (255, 3.5),
 (167, 4.0),
 (586, 3.5),
 (1234, 4.0),
 (259, 4.5),
 (729, 2.5),
 (236, 3.5),
 (181, 3.5),
 (3245, 3.5),
 (1014, 3.0),
 (577, 5.0),
 (2789, 3.5),
 (91, 4.0),
 (10, 4.0),
 (19, 3.5),
 (274, 4.0),
 (2135, 3.0),
 (1419, 3.5),
 (695, 4.0),
 (1373, 3.5),
 (850, 3.0),
 (334, 4.0),
 (2759, 3.0),
 (222, 3.0),
 (37, 4.0),
 (380, 2.5),
 (544, 4.0),
 (542, 4.5),
 (1135, 5.0),
 (650, 5.0),
 (4625, 3.5),
 (341, 1.0),
 (780, 4.0),
 (2371, 3.0),
 (661, 4.0),
 (4742, 4.5),
 (1660, 3.5),
 (4189, 2.5),
 (110, 2.5),
 (2349, 3.0),
 (2285, 3.5),
 (2623, 3.0),
 (1001, 4.0),
 (1490, 3.0),
 (171, 4.0),
 (465, 4.0),
 (733, 5.0),
 (894, 3.0),
 (3771, 3.0),
 (933, 3.0),
 (1083, 3.0),
 (3003, 3.0),
 (11, 3.0),
 (756, 2.5),
 (604, 3.5),
 (258, 4.0),
 (725, 5.0),
 (320, 3.5),
 (1838, 5.0),
 (383, 3.0),
 (3977, 3.0),
 (19

<b> In Class Assignment </b>

Confirm the raw to internal id mapping with original data, for a given user/item combination (uid - 0 & iid - 1066)


### Training the model

In [21]:
from surprise import KNNWithMeans
from surprise import accuracy

In [22]:
algo = KNNWithMeans(k=50, sim_options={'name': 'cosine', 'user_based': False})
algo.fit(trainset)


Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x112047d30>

### Find K most similiar items

<b> In-class assignment </b>

Which movies are most similiar to Finding Nemo? (Hint: Use <b> get_neighbors </b> method of the algo object)

In [23]:
sim_movies = algo.get_neighbors(trainset.to_inner_iid("Finding Nemo (2003)"), k=10)

In [24]:
[trainset.to_raw_iid(x) for x in sim_movies]

['New Kids Turbo (2010)',
 'Maze (2000)',
 'Lone Survivor (2013)',
 'Gangster No. 1 (2000)',
 'Spy Kids: All the Time in the World in 4D (2011)',
 'Darwin Awards, The (2006)',
 'Last Legion, The (2007)',
 'Man Who Copied, The (Homem Que Copiava, O) (2003)',
 'Bootmen (2000)',
 'Glee: The 3D Concert Movie (2011)']

### Evaluating Model Performance

In [25]:
len(testset)

121868

In [26]:
testset[0:5]

[('107317', 'Signs (2002)', 2.5),
 ('103061', 'Inconvenient Truth, An (2006)', 4.5),
 ('84115', 'Battlefield Earth (2000)', 2.5),
 ('130756',
  'Fast and the Furious: Tokyo Drift, The (Fast and the Furious 3, The) (2006)',
  2.0),
 ('24878', 'Drive (2011)', 4.5)]

In [27]:
# Evalute on test set
test_pred = algo.test(testset)

# compute RMSE
accuracy.rmse(test_pred)

RMSE: 0.8120


0.8120332922546626

In [28]:
# View a particular prediction
test_pred[12]

# To access a particular value, say estimate simply mention test_pred[12].est

Prediction(uid='7051', iid='Black Hawk Down (2001)', r_ui=5.0, est=4.017481906647013, details={'actual_k': 50, 'was_impossible': False})

In [29]:
test_pred[12].details["actual_k"]

50

In [30]:
# convert results to dataframe
test_pred_df = pd.DataFrame(test_pred)
test_pred_df["was_impossible"] = [x["was_impossible"] for x in test_pred_df["details"]]

In [31]:
test_pred_df.loc[test_pred_df.was_impossible].head(5)

Unnamed: 0,uid,iid,r_ui,est,details,was_impossible
159,36730,Grill Point (Halbe Treppe) (2002),3.5,3.511396,"{'was_impossible': True, 'reason': 'User and/o...",True
604,131040,Escape from Planet Earth (2013),2.0,3.511396,"{'was_impossible': True, 'reason': 'User and/o...",True
827,116349,No Good Deed (2014),3.5,3.511396,"{'was_impossible': True, 'reason': 'User and/o...",True
1865,124431,Films to Keep You Awake: The Christmas Tale (P...,0.5,3.511396,"{'was_impossible': True, 'reason': 'User and/o...",True
2020,21811,Insanitarium (2008),3.5,3.511396,"{'was_impossible': True, 'reason': 'User and/o...",True


<b> In class assignment </b>

What does <i>"was impossible": True indicate?</i>  

For how many cases in Test set, the predictions are set to "was_impossible"? And what could be the reasons for it?

In [32]:
test_pred_df["was_impossible"].iloc[0]

False

### Predictions

In [33]:
# Mkae prediction for a single user
algo.predict(uid="user_405",iid="Wrong Trousers, The (1993)")

Prediction(uid='user_405', iid='Wrong Trousers, The (1993)', r_ui=None, est=3.511396303620614, details={'was_impossible': True, 'reason': 'User and/or item is unkown.'})

### Generating top n recommendations

In [39]:
testset_new = trainset.build_anti_testset()

In [40]:
len(testset_new)

17308818

In [41]:
testset_new[0:5]

[('248', 'Disturbia (2007)', 3.511396303620614),
 ('248', 'Hamlet 2 (2008)', 3.511396303620614),
 ('248', 'Unbreakable (2000)', 3.511396303620614),
 ('248', 'Finding Neverland (2004)', 3.511396303620614),
 ('248', 'X2: X-Men United (2003)', 3.511396303620614)]

In [42]:
predictions = algo.test(testset_new[0:10000])

In [43]:
predictions_df = pd.DataFrame([[x.uid,x.est] for x in predictions])

In [44]:
predictions_df.columns = ["userId","est_rating"]
predictions_df.sort_values(by = ["userId", "est_rating"],ascending=False,inplace=True)

In [45]:
predictions_df.head()

Unnamed: 0,userId,est_rating
9147,45844,5.0
9524,45844,5.0
9539,45844,5.0
9877,45844,5.0
9978,45844,5.0


In [46]:
top_10_recos = predictions_df.groupby("userId").head(10).reset_index(drop=True)

## SVD Based Recommendation

In [47]:
# Lets exclude movies with very few ratings, say less than 5
movie_count = ratings["title"].value_counts(ascending=False)
pop_movie = movie_count.loc[movie_count.values > 200].index
len(pop_movie)


567

In [48]:
ratings = ratings.loc[ratings.title.isin(pop_movie)]
ratings.shape

(350710, 7)

In [49]:
from surprise import Dataset,Reader
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings[['userId', 'title', 'rating']], reader)

In [50]:
ratings.shape

(350710, 7)

In [51]:
# Split data to train and test
from surprise.model_selection import train_test_split
trainset, testset = train_test_split(data, test_size=.25,random_state=123)

# to build on full data
#trainset = data.build_full_trainset()

In [52]:
from surprise import SVD
from surprise import accuracy

In [53]:
svd_model = SVD(n_factors=50,biased=False)
svd_model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x12311b5c0>

In [54]:
test_pred = svd_model.test(testset)

In [55]:

# compute RMSE
accuracy.rmse(test_pred)

RMSE: 0.7698


0.7698340854126204

<b> In class assignment </b>

What is the impact of changing the number of SVD components on model accuracy?

<b> Recreating the SVD predictions using Matrix multiplcation of User and Item factors </b>

In [56]:
user_factors = svd_model.pu
user_factors.shape
item_factors = svd_model.qi
item_factors.shape

(2827, 50)

(567, 50)

In [57]:
pred = np.dot(user_factors,np.transpose(item_factors))

In [58]:
pred[1523,0:5]

array([3.54782208, 2.90553081, 3.99570312, 4.33002269, 3.784254  ])

In [59]:
svd_model.predict(uid = trainset.to_raw_uid(1523), iid = trainset.to_raw_iid(0))

Prediction(uid='19573', iid='Shanghai Noon (2000)', r_ui=None, est=3.547822084863256, details={'was_impossible': False})

<b> Parameter tuning of SVD Recommendation system </b>

In [60]:
from surprise.model_selection import GridSearchCV
param_grid = {'n_factors' : [5,10,15], "reg_all":[0.01,0.02]}
gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=3,refit = True)


In [61]:
gs.fit(data)

In [62]:
# get all parameter combinations
gs.param_combinations

[{'n_factors': 5, 'reg_all': 0.01},
 {'n_factors': 5, 'reg_all': 0.02},
 {'n_factors': 10, 'reg_all': 0.01},
 {'n_factors': 10, 'reg_all': 0.02},
 {'n_factors': 15, 'reg_all': 0.01},
 {'n_factors': 15, 'reg_all': 0.02}]

In [117]:
# get best parameters
gs.best_params

{'rmse': {'n_factors': 15, 'reg_all': 0.01}}

In [63]:
# Use the "best model" for prediction
gs.test(testset)

[Prediction(uid='7531', iid='You Can Count on Me (2000)', r_ui=5.0, est=4.683600861716436, details={'was_impossible': False}),
 Prediction(uid='45291', iid='Eternal Sunshine of the Spotless Mind (2004)', r_ui=4.0, est=3.1894743175380516, details={'was_impossible': False}),
 Prediction(uid='92026', iid='High Fidelity (2000)', r_ui=2.5, est=3.772299484920693, details={'was_impossible': False}),
 Prediction(uid='122811', iid='Curious Case of Benjamin Button, The (2008)', r_ui=3.0, est=3.4569342074962877, details={'was_impossible': False}),
 Prediction(uid='83105', iid='Bruce Almighty (2003)', r_ui=3.0, est=2.9812977798176865, details={'was_impossible': False}),
 Prediction(uid='38696', iid='Serenity (2005)', r_ui=5.0, est=4.117367113868462, details={'was_impossible': False}),
 Prediction(uid='43905', iid='Enemy at the Gates (2001)', r_ui=4.0, est=3.8067255441214796, details={'was_impossible': False}),
 Prediction(uid='118254', iid='Anchorman: The Legend of Ron Burgundy (2004)', r_ui=3.0, 

<b> Computing Similarity Matrix </b>

In [64]:
import numpy as np

In [65]:
item_factors

array([[-0.42054897,  0.505995  ,  0.01714511, ..., -0.55167957,
        -0.20947115,  0.12386859],
       [-0.37623268, -0.03012352,  0.34397297, ..., -0.63617886,
         0.07778947,  0.04204691],
       [-0.54191358,  0.29183657, -0.03637388, ..., -0.49208811,
        -0.33810358,  0.11381796],
       ...,
       [-0.52438769,  0.32267112,  0.0748484 , ..., -0.63865507,
        -0.075364  ,  0.24151644],
       [-0.37086969,  0.54219784,  0.01482208, ..., -0.3031639 ,
        -0.08093024,  0.14756645],
       [-0.50081359,  0.30660498,  0.05868327, ..., -0.47369769,
        -0.20930316,  0.1696412 ]])

In [66]:
item_sim = np.corrcoef(item_factors)
max_val = (-item_sim).argsort()

In [67]:
topk = pd.DataFrame(max_val[:,0:20])

In [68]:
# create item iid dictionary

all_movies = [trainset.to_raw_iid(x) for x in range(0,567)]
movie_iid_dict = dict(zip(range(0,567), all_movies))

In [69]:
topk = topk.replace(movie_iid_dict)

In [70]:
topk["movie"] = all_movies

In [71]:
topk.to_csv("sim_movies_svd.csv",index=False)