In [1]:
import pandas as pd 
import pickle
import numpy as np
from collections import Counter
from sklearn.utils import shuffle
from scipy.sparse import lil_matrix, csr_matrix, save_npz, load_npz
from sortedcontainers import SortedList


In [2]:
df1 = pd.read_csv('../data/ratings.csv')
df2 = pd.read_csv('../data/ratings2.csv')

In [3]:
df = pd.concat([df1, df2], axis=0)  

# Preprocessing

In [88]:
df.head()

Unnamed: 0,User,Movie,Rating,Timestamp
0,1,661,3,978302109
1,1,914,3,978301968
2,1,3408,4,978300275
3,1,2355,5,978824291
4,1,1287,5,978302039


In [89]:
df.drop(columns=['Timestamp'] ,inplace=True)

In [90]:
# make the user ids go from 0...N-1
df.User = df.User - 1

In [91]:
unique_User_ids = set(df.User.values)

In [92]:
print(len(unique_User_ids),max(unique_User_ids))

6040 6039


In [93]:
unique_movie_ids = set(df.Movie.values)

In [94]:
print(len(unique_movie_ids),max(unique_movie_ids))

3706 3952


In [95]:
# create a mapping for movie ids
movie2idx = {}
count = 0
for movie_id in unique_movie_ids:
  movie2idx[movie_id] = count
  count += 1

In [96]:
df['movie_idx'] = df.apply(lambda row: movie2idx[row.Movie], axis=1)

In [97]:
df.head()

Unnamed: 0,User,Movie,Rating,movie_idx
0,0,661,3,639
1,0,914,3,853
2,0,3408,4,3177
3,0,2355,5,2162
4,0,1287,5,1195


In [98]:
df.to_csv('../data/edited_rating.csv', index=False)

In [99]:
N = df.User.max() + 1 # number of users
M = df.movie_idx.max() + 1 # number of movies
print(N,M)

6040 3706


In [100]:
user_ids_count = Counter(df.User)
movie_ids_count = Counter(df.movie_idx)

In [101]:
# number of users and movies we would like to keep
n = 1000
m = 800

In [102]:
user_ids = [u for u, c in user_ids_count.most_common(n)]
movie_ids = [m for m, c in movie_ids_count.most_common(m)]

In [103]:
df_small = df[df.User.isin(user_ids) & df.movie_idx.isin(movie_ids)].copy()

In [104]:
# need to remake user ids and movie ids since they are no longer sequential
new_user_id_map = {}
i = 0
for old in user_ids:
  new_user_id_map[old] = i
  i += 1
print("i:", i)

i: 1000


In [105]:
new_movie_id_map = {}
j = 0
for old in movie_ids:
  new_movie_id_map[old] = j
  j += 1
print("j:", j)

j: 800


In [106]:
df_small.loc[:, 'userId'] = df_small.apply(lambda row: new_user_id_map[row.User], axis=1)
df_small.loc[:, 'movie_idx'] = df_small.apply(lambda row: new_movie_id_map[row.movie_idx], axis=1)

In [107]:
print("max user id:", df_small.userId.max())
print("max movie id:", df_small.movie_idx.max())

max user id: 999
max movie id: 799


In [108]:
print("small dataframe size:", len(df_small))

small dataframe size: 317003


In [109]:
df_small.to_csv('../data/small_rating.csv', index=False)

## work on small_rating 

In [110]:
df = pd.read_csv('../data/small_rating.csv')

N = df.userId.max() + 1 # number of users
M = df.movie_idx.max() + 1 # number of movies

In [111]:
# split into train and test
df = shuffle(df)
cutoff = int(0.8*len(df))
df_train = df.iloc[:cutoff]
df_test = df.iloc[cutoff:]

In [112]:
# a dictionary to tell us which users have rated which movies
user2movie = {}
# a dicationary to tell us which movies have been rated by which users
movie2user = {}
# a dictionary to look up ratings
usermovie2rating = {}

In [113]:
df_train.drop(columns=['User','Movie'],inplace=True)
df_test.drop(columns=['User','Movie'],inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train.drop(columns=['User','Movie'],inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test.drop(columns=['User','Movie'],inplace=True)


In [114]:
count = 0
def update_user2movie_and_movie2user(row):
  global count
  count += 1
  if count % 100000 == 0:
    print("processed: %.3f" % (float(count)/cutoff))

  i = int(row.userId)
  j = int(row.movie_idx)
  if i not in user2movie:
    user2movie[i] = [j]
  else:
    user2movie[i].append(j)

  if j not in movie2user:
    movie2user[j] = [i]
  else:
    movie2user[j].append(i)

  usermovie2rating[(i,j)] = row.Rating
df_train.apply(update_user2movie_and_movie2user, axis=1)


processed: 0.394
processed: 0.789


187151    None
86376     None
188228    None
296517    None
30029     None
          ... 
64802     None
116699    None
185754    None
291351    None
62121     None
Length: 253602, dtype: object

In [115]:
# test ratings dictionary
usermovie2rating_test = {}

count = 0
def update_usermovie2rating_test(row):
  global count
  count += 1
  if count % 100000 == 0:
    print("processed: %.3f" % (float(count)/len(df_test)))

  i = int(row.userId)
  j = int(row.movie_idx)
  usermovie2rating_test[(i,j)] = row.Rating
df_test.apply(update_usermovie2rating_test, axis=1)

68661     None
50670     None
301517    None
221931    None
231483    None
          ... 
215196    None
310211    None
135643    None
208265    None
267141    None
Length: 63401, dtype: object

In [None]:
with open('user2movie.pkl', 'wb') as f:
  pickle.dump(user2movie, f)

with open('movie2user.pkl', 'wb') as f:
  pickle.dump(movie2user, f)

with open('usermovie2rating.pkl', 'wb') as f:
  pickle.dump(usermovie2rating, f)

with open('usermovie2rating_test.pkl', 'wb') as f:
  pickle.dump(usermovie2rating_test, f)

### User Based

In [118]:
N = np.max(list(user2movie.keys())) + 1
# the test set may contain movies the train set doesn't have data on
m1 = np.max(list(movie2user.keys()))
m2 = np.max([m for (u, m), r in usermovie2rating_test.items()])
M = max(m1, m2) + 1
print("N:", N, "M:", M)

N: 1000 M: 800


In [119]:
K = 25 # number of neighbors we'd like to consider
limit = 5 # number of common movies users must have in common in order to consider
neighbors = [] # store neighbors in this list
averages = [] # each user's average rating for later use
deviations = [] # each user's deviation for later use

In [120]:
for i in range(N):
  # find the 25 closest users to user i
  movies_i = user2movie[i]
  movies_i_set = set(movies_i)

  # calculate avg and deviation
  ratings_i = { movie:usermovie2rating[(i, movie)] for movie in movies_i }
  avg_i = np.mean(list(ratings_i.values()))
  dev_i = { movie:(rating - avg_i) for movie, rating in ratings_i.items() }
  dev_i_values = np.array(list(dev_i.values()))
  sigma_i = np.sqrt(dev_i_values.dot(dev_i_values))

  # save these for later use
  averages.append(avg_i)
  deviations.append(dev_i)

  sl = SortedList()
  for j in range(N):
    # don't include yourself
    if j != i:
      movies_j = user2movie[j]
      movies_j_set = set(movies_j)
      common_movies = (movies_i_set & movies_j_set) # intersection
      if len(common_movies) > limit:
        # calculate avg and deviation
        ratings_j = { movie:usermovie2rating[(j, movie)] for movie in movies_j }
        avg_j = np.mean(list(ratings_j.values()))
        dev_j = { movie:(rating - avg_j) for movie, rating in ratings_j.items() }
        dev_j_values = np.array(list(dev_j.values()))
        sigma_j = np.sqrt(dev_j_values.dot(dev_j_values))

        # calculate correlation coefficient
        numerator = sum(dev_i[m]*dev_j[m] for m in common_movies)
        w_ij = numerator / (sigma_i * sigma_j)

        # insert into sorted list and truncate
        # negate weight, because list is sorted ascending
        # maximum value (1) is "closest"
        sl.add((-w_ij, j))
        if len(sl) > K:
          del sl[-1]

  # store the neighbors
  neighbors.append(sl)

  # print out useful things
  if i % 1 == 0:
    print(i)


0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
27

In [122]:
with open('neighbors.pkl', 'wb') as f:
    pickle.dump(neighbors, f)
with open('averages.pkl', 'wb') as f:
    pickle.dump(averages, f)
with open('deviations.pkl', 'wb') as f:
    pickle.dump(deviations, f)

In [125]:
def predict(i, m):
  # calculate the weighted sum of deviations
  numerator = 0
  denominator = 0
  for neg_w, j in neighbors[i]:
    # remember, the weight is stored as its negative
    # so the negative of the negative weight is the positive weight
    try:
      numerator += -neg_w * deviations[j][m]
      denominator += abs(neg_w)
    except KeyError:
      # neighbor may not have rated the same movie
      # don't want to do dictionary lookup twice
      # so just throw exception
      pass

  if denominator == 0:
    prediction = averages[i]
  else:
    prediction = numerator / denominator + averages[i]
  prediction = min(5, prediction)
  prediction = max(0.5, prediction) # min rating is 0.5
  return prediction

#### predict training data

In [126]:
train_predictions = []
train_targets = []
for (i, m), target in usermovie2rating.items():
  # calculate the prediction for this movie
  prediction = predict(i, m)

  # save the prediction and target
  train_predictions.append(prediction)
  train_targets.append(target)

test_predictions = []
test_targets = []

In [127]:
test_predictions = []
test_targets = []
# same thing for test set
for (i, m), target in usermovie2rating_test.items():
  # calculate the prediction for this movie
  prediction = predict(i, m)

  # save the prediction and target
  test_predictions.append(prediction)
  test_targets.append(target)

In [128]:
# calculate accuracy
def mse(p, t):
  p = np.array(p)
  t = np.array(t)
  return np.mean((p - t)**2)

print('train mse:', mse(train_predictions, train_targets))
print('test mse:', mse(test_predictions, test_targets))

train mse: 0.6231110092629362
test mse: 0.760474614393592


# single prediction 

In [5]:
with open('usermovie2rating.pkl', 'rb') as f:
  loaded_usermovie2rating = pickle.load(f)

with open('usermovie2rating_test.pkl', 'rb') as f:
  loaded_usermovie2rating_test = pickle.load(f)

with open('neighbors.pkl', 'rb') as f:
    loaded_neighbors = pickle.load(f)

with open('averages.pkl', 'rb') as f:
    loaded_averages = pickle.load(f)
    
with open('deviations.pkl', 'rb') as f:
    loaded_deviations = pickle.load(f)


In [8]:
def predict_exist(i, m,usermovie2rating_train,usermovie2rating_test):
  # calculate the weighted sum of deviations
  numerator = 0
  denominator = 0
  for neg_w, j in loaded_neighbors[i]:
    # remember, the weight is stored as its negative
    # so the negative of the negative weight is the positive weight
    try:
      numerator += -neg_w * loaded_deviations[j][m]
      denominator += abs(neg_w)
    except KeyError:
      # neighbor may not have rated the same movie
      # don't want to do dictionary lookup twice
      # so just throw exception
      pass

  if denominator == 0:
    prediction = loaded_averages[i]
  else:
    prediction = numerator / denominator + loaded_averages[i]
  prediction = min(5, prediction)
  prediction = max(0.5, prediction) # min rating is 0.5
  if (i, m) in usermovie2rating_train:
    print(f"{(i, m)} exists in training data with value : {usermovie2rating_train[(i, m)]} and the prediction value is : {prediction}")
  elif (i, m) in usermovie2rating_test: 
    print(f"{(i, m)} exists in test data with value : {usermovie2rating_test[(i, m)]} and the prediction value is : {prediction}")
  else:
    print(f"{(i, m)} does not exist in training data and the pridiction is :{prediction}") 

In [10]:
predict_exist(200,100,loaded_usermovie2rating,loaded_usermovie2rating_test)

(200, 100) exists in training data with value : 3 and the prediction value is : 3.8018745461967995


In [148]:
predict_exist(100,200,loaded_usermovie2rating,loaded_usermovie2rating_test)

(100, 200) exists in training data with value : 4 and the prediction value is : 4.129668753807558
