In [56]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [57]:
a = 2
b = 3
a
b

2

3

## User & Item based Collaborative Filtering

### Load data

In [58]:
import pandas as pd
import numpy as np

In [59]:
ratings = pd.read_csv("3 - ratings_sub.csv",encoding = "ISO-8859-1")

In [60]:
ratings.shape

(487469, 7)

In [61]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,year
0,3218,3889,1.0,1172532894,Highlander: Endgame (Highlander IV) (2000),Action|Adventure|Fantasy,2000.0
1,3663,3889,1.0,1044474348,Highlander: Endgame (Highlander IV) (2000),Action|Adventure|Fantasy,2000.0
2,3704,3889,3.0,971391538,Highlander: Endgame (Highlander IV) (2000),Action|Adventure|Fantasy,2000.0
3,8877,3889,1.0,1050744366,Highlander: Endgame (Highlander IV) (2000),Action|Adventure|Fantasy,2000.0
4,9599,3889,0.5,1378056755,Highlander: Endgame (Highlander IV) (2000),Action|Adventure|Fantasy,2000.0


In [62]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 487469 entries, 0 to 487468
Data columns (total 7 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     487469 non-null  int64  
 1   movieId    487469 non-null  int64  
 2   rating     487469 non-null  float64
 3   timestamp  487469 non-null  int64  
 4   title      487469 non-null  object 
 5   genres     487469 non-null  object 
 6   year       487469 non-null  float64
dtypes: float64(2), int64(3), object(2)
memory usage: 26.0+ MB


In [63]:
ratings.userId=ratings.userId.astype(str)
ratings.movieId=ratings.movieId.astype(str)

In [64]:
ratings.columns

Index(['userId', 'movieId', 'rating', 'timestamp', 'title', 'genres', 'year'], dtype='object')

### Data Exploration & Transformation

<b> Find the top 10 most popular movies watched </b>


In [65]:
# Total unique users 
print("total unique users - ",len(ratings["userId"].unique()))

total unique users -  2827


<b> Q: Who are the users with maximum no of movies watched? </b>

In [66]:
# Users with max no of movies watches
ratings["userId"].value_counts().head()

115822    200
11406     200
16501     200
65767     200
30241     200
Name: userId, dtype: int64

### Transforming data to surprise format

In [67]:
from surprise import Dataset,Reader
reader = Reader(rating_scale=(1, 5))

In [68]:
data = Dataset.load_from_df(ratings[['userId', 'title', 'rating']], reader)

In [69]:
data

<surprise.dataset.DatasetAutoFolds at 0x1ce0dbcc7c8>

In [70]:
# Split data to train and test
from surprise.model_selection import train_test_split
trainset, testset = train_test_split(data, test_size=.25,random_state=123)

# to build on full data
#trainset = data.build_full_trainset()

In [71]:
type(trainset)

surprise.trainset.Trainset

### Making sense of trainset 

Points to Note:
    
    1) Trainset is no longer a pandas dataframe. Rather, it's a specific datatypes defined by the Surprise library
    2) UserId and ItemId in the pandas dataframe can contain any value (either string/integer etc). However, Trainset convert these raw ids into numeric indexes called as "inner id"
    3) Methods are provided to convert rw id to inner id and vice verca

In [72]:
# user item rating data can be obtained as follows
user_records = trainset.ur
type(user_records)

collections.defaultdict

In [73]:
for keys in user_records.keys():
    print(keys)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
27

2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826


In [74]:
user_records[0]

[(0, 3.0),
 (195, 4.0),
 (1066, 3.5),
 (999, 3.5),
 (237, 3.0),
 (1577, 3.0),
 (932, 2.0),
 (247, 4.5),
 (2215, 3.0),
 (221, 4.0),
 (745, 3.0),
 (133, 3.0),
 (249, 3.0),
 (1065, 2.5),
 (255, 3.5),
 (167, 4.0),
 (586, 3.5),
 (1234, 4.0),
 (259, 4.5),
 (729, 2.5),
 (236, 3.5),
 (181, 3.5),
 (3245, 3.5),
 (1014, 3.0),
 (577, 5.0),
 (2789, 3.5),
 (91, 4.0),
 (10, 4.0),
 (19, 3.5),
 (274, 4.0),
 (2135, 3.0),
 (1419, 3.5),
 (695, 4.0),
 (1373, 3.5),
 (850, 3.0),
 (334, 4.0),
 (2759, 3.0),
 (222, 3.0),
 (37, 4.0),
 (380, 2.5),
 (544, 4.0),
 (542, 4.5),
 (1135, 5.0),
 (650, 5.0),
 (4625, 3.5),
 (341, 1.0),
 (780, 4.0),
 (2371, 3.0),
 (661, 4.0),
 (4742, 4.5),
 (1660, 3.5),
 (4189, 2.5),
 (110, 2.5),
 (2349, 3.0),
 (2285, 3.5),
 (2623, 3.0),
 (1001, 4.0),
 (1490, 3.0),
 (171, 4.0),
 (465, 4.0),
 (733, 5.0),
 (894, 3.0),
 (3771, 3.0),
 (933, 3.0),
 (1083, 3.0),
 (3003, 3.0),
 (11, 3.0),
 (756, 2.5),
 (604, 3.5),
 (258, 4.0),
 (725, 5.0),
 (320, 3.5),
 (1838, 5.0),
 (383, 3.0),
 (3977, 3.0),
 (19

In [75]:
# However the ids are the inner ids and not the raw ids
# raw ids can be obatined as follows

print(trainset.to_raw_uid(0))
print(trainset.to_raw_iid(1066))

248
Step Up 2 the Streets (2008)


In [79]:
user_records

defaultdict(list,
            {0: [(0, 3.0),
              (195, 4.0),
              (1066, 3.5),
              (999, 3.5),
              (237, 3.0),
              (1577, 3.0),
              (932, 2.0),
              (247, 4.5),
              (2215, 3.0),
              (221, 4.0),
              (745, 3.0),
              (133, 3.0),
              (249, 3.0),
              (1065, 2.5),
              (255, 3.5),
              (167, 4.0),
              (586, 3.5),
              (1234, 4.0),
              (259, 4.5),
              (729, 2.5),
              (236, 3.5),
              (181, 3.5),
              (3245, 3.5),
              (1014, 3.0),
              (577, 5.0),
              (2789, 3.5),
              (91, 4.0),
              (10, 4.0),
              (19, 3.5),
              (274, 4.0),
              (2135, 3.0),
              (1419, 3.5),
              (695, 4.0),
              (1373, 3.5),
              (850, 3.0),
              (334, 4.0),
              (2759, 3.0),
          

<b> In Class Assignment </b>

Confirm the raw to internal id mapping with original data, for a given user/item combination (uid - 0 & iid - 1066)


### Training the model

In [80]:
from surprise import KNNWithMeans
from surprise import accuracy
from surprise import Prediction

In [81]:
algo = KNNWithMeans(k=51, sim_options={'name': 'cosine', 'user_based': True})
algo.fit(trainset)


Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x1ce0c71e948>

### Find K most similiar items

<b> In-class assignment </b>

Which movies are most similiar to Finding Nemo? (Hint: Use <b> get_neighbors </b> method of the algo object)

### Evaluating Model Performance

In [82]:
len(testset)

121868

In [83]:
testset[0:5]

[('107317', 'Signs (2002)', 2.5),
 ('103061', 'Inconvenient Truth, An (2006)', 4.5),
 ('84115', 'Battlefield Earth (2000)', 2.5),
 ('130756',
  'Fast and the Furious: Tokyo Drift, The (Fast and the Furious 3, The) (2006)',
  2.0),
 ('24878', 'Drive (2011)', 4.5)]

In [84]:
# Evalute on test set
test_pred = algo.test(testset)

# compute RMSE
accuracy.rmse(test_pred)

RMSE: 0.8250


0.8250231668784066

In [85]:
# View a particular prediction
test_pred[12]

# To access a particular value, say estimate simply mention test_pred[12].est

Prediction(uid='7051', iid='Black Hawk Down (2001)', r_ui=5.0, est=4.029338108632621, details={'actual_k': 51, 'was_impossible': False})

In [86]:
test_pred[12].details["actual_k"]

51

In [87]:
# convert results to dataframe
test_pred_df = pd.DataFrame(test_pred)
test_pred_df["was_impossible"] = [x["was_impossible"] for x in test_pred_df["details"]]

In [88]:
test_pred_df.loc[test_pred_df.was_impossible].tail(5)

Unnamed: 0,uid,iid,r_ui,est,details,was_impossible
120560,41891,"Devil Dared Me To, The (2007)",4.0,3.511396,"{'was_impossible': True, 'reason': 'User and/o...",True
120594,101656,Beautiful City (Shah-re ziba) (2004),3.5,3.511396,"{'was_impossible': True, 'reason': 'User and/o...",True
120919,68172,"Firm, The (2009)",3.5,3.511396,"{'was_impossible': True, 'reason': 'User and/o...",True
121362,77816,Remember Me (Ricordati di me) (2003),4.0,3.511396,"{'was_impossible': True, 'reason': 'User and/o...",True
121672,133473,Grill Point (Halbe Treppe) (2002),4.0,3.511396,"{'was_impossible': True, 'reason': 'User and/o...",True


<b> In class assignment </b>

What does <i>"was impossible": True indicate?</i>  

For how many cases in Test set, the predictions are set to "was_impossible"? And what could be the reasons for it?

### Predictions

In [89]:
# Mkae prediction for a single user
algo.predict(uid="41891",iid="Wrong Trousers, The (1993)")

Prediction(uid='41891', iid='Wrong Trousers, The (1993)', r_ui=None, est=3.511396303620614, details={'was_impossible': True, 'reason': 'User and/or item is unknown.'})

### Generating top n recommendations

In [32]:
testset_new = trainset.build_anti_testset()

In [33]:
len(testset_new)

17308818

In [34]:
testset_new[0:5]

[('248', 'Disturbia (2007)', 3.511396303620614),
 ('248', 'Hamlet 2 (2008)', 3.511396303620614),
 ('248', 'Unbreakable (2000)', 3.511396303620614),
 ('248', 'Finding Neverland (2004)', 3.511396303620614),
 ('248', 'X2: X-Men United (2003)', 3.511396303620614)]

In [35]:
predictions = algo.test(testset_new[0:10000])

In [36]:
predictions_df = pd.DataFrame([[x.uid,x.iid,x.est] for x in predictions])

In [37]:
predictions_df.columns = ["userId","movie_name","est_rating"]
predictions_df.sort_values(by = ["userId", "est_rating"],ascending=False,inplace=True)

In [38]:
predictions_df.head(10)

Unnamed: 0,userId,movie_name,est_rating
8580,45844,Shifty (2008),5.0
8641,45844,BlinkyTM (2011),5.0
9152,45844,Phish: Bittersweet Motel (2000),5.0
9413,45844,Dog Pound (2010),5.0
9778,45844,Frozen Planet (2011),5.0
9802,45844,"Child I Never Was, The (Leben lang kurze Hosen...",5.0
9912,45844,Into Eternity (2010),5.0
9553,45844,"Like Father, Like Son (Soshite chichi ni naru)...",4.993386
9147,45844,Lucky Break (2001),4.971207
9945,45844,"Hollow Crown, The (2012)",4.960816


In [39]:
top_10_recos = predictions_df.groupby("userId").head(10).reset_index(drop=True)

In [40]:
top_10_recos

Unnamed: 0,userId,movie_name,est_rating
0,45844,Shifty (2008),5.0
1,45844,BlinkyTM (2011),5.0
2,45844,Phish: Bittersweet Motel (2000),5.0
3,45844,Dog Pound (2010),5.0
4,45844,Frozen Planet (2011),5.0
5,45844,"Child I Never Was, The (Leben lang kurze Hosen...",5.0
6,45844,Into Eternity (2010),5.0
7,45844,"Like Father, Like Son (Soshite chichi ni naru)...",4.993386
8,45844,Lucky Break (2001),4.971207
9,45844,"Hollow Crown, The (2012)",4.960816


## SVD Based Recommendation

In [41]:
# Lets exclude movies with very few ratings, say less than 5
movie_count = ratings["title"].value_counts(ascending=False)
pop_movie = movie_count.loc[movie_count.values > 200].index
len(pop_movie)


567

In [42]:
ratings = ratings.loc[ratings.title.isin(pop_movie)]
ratings.shape

(350710, 7)

In [43]:
from surprise import Dataset,Reader
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings[['userId', 'title', 'rating']], reader)

In [44]:
ratings.shape

(350710, 7)

In [45]:
# Split data to train and test
from surprise.model_selection import train_test_split
trainset, testset = train_test_split(data, test_size=.25,random_state=123)

# to build on full data
#trainset = data.build_full_trainset()

In [46]:
from surprise import SVD
from surprise import accuracy

In [117]:
svd_model = SVD(n_factors=10)
svd_model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1ce0bb90388>

In [118]:
test_pred = svd_model.test(testset)

In [119]:
test_pred_df = pd.DataFrame([[x.uid,x.iid,x.est] for x in test_pred])

In [120]:
test_pred_df.head()

Unnamed: 0,0,1,2
0,107317,Signs (2002),2.354282
1,103061,"Inconvenient Truth, An (2006)",3.876544
2,84115,Battlefield Earth (2000),1.290586
3,130756,"Fast and the Furious: Tokyo Drift, The (Fast a...",2.53192
4,24878,Drive (2011),4.382859


In [121]:
test_pred_df.columns = ["userId","movie_name","est_rating"]
test_pred_df.sort_values(by = ["userId", "est_rating"],ascending=False,inplace=True)

In [122]:
test_pred_df.head()

Unnamed: 0,userId,movie_name,est_rating
71364,99863,"Lord of the Rings: The Two Towers, The (2002)",3.838628
41462,99863,Big Fish (2003),3.519125
71884,99863,Batman Begins (2005),3.442793
94745,99863,Lost in Translation (2003),3.43786
24930,99863,Punch-Drunk Love (2002),3.420492


In [123]:
top_10_recos = test_pred_df.groupby("userId").head(10).reset_index(drop=True)

In [124]:
top_10_recos.head(30)

Unnamed: 0,userId,movie_name,est_rating
0,99863,"Lord of the Rings: The Two Towers, The (2002)",3.838628
1,99863,Big Fish (2003),3.519125
2,99863,Batman Begins (2005),3.442793
3,99863,Lost in Translation (2003),3.43786
4,99863,Punch-Drunk Love (2002),3.420492
5,99863,Pirates of the Caribbean: The Curse of the Bla...,3.130586
6,99863,Charlie and the Chocolate Factory (2005),3.128276
7,99863,Ocean's Eleven (2001),3.108931
8,99863,28 Weeks Later (2007),3.010552
9,99863,Phone Booth (2002),3.010147


In [125]:

# compute RMSE
accuracy.rmse(test_pred)

RMSE: 0.8079


0.8078613629020462