In [2]:
import csv

data = []

with open('/Users/cpwang/desktop/movielens/ml-latest-small/ratings.csv') as ratings:
    rating_reader = csv.reader(ratings, delimiter=',')
    line_count = 0
    for row in rating_reader:
        if line_count == 0:
            line_count += 1
            continue
        else:
            # row is a list of string with the form [userId, movieId, rating, timestamp]
            data.append(row)
            line_count += 1
    print('Processed ' + str(line_count) + ' rows.')

Processed 100837 rows.


In [61]:
from random import shuffle
from collections import defaultdict
shuffle(data)

training_set = data[0:len(data)//3]
validation_set = data[len(data)//3:len(data)//3*2]
testing_set = data[len(data)//3*2:len(data)]

allRatings = []
userRatings = defaultdict(list)

for l in training_set:
    user,movie = l[0],l[1]
    allRatings.append(float(l[2]))
    userRatings[user].append(float(l[2]))

globalAverage = sum(allRatings) / len(allRatings)
userAverage = {}
for u in userRatings:
    userAverage[u] = sum(userRatings[u]) / len(userRatings[u])

MSE = 0
# calculating prediction accuracy
for l in testing_set:
    u,m,r = l[0],l[1],float(l[2])
    if u in userAverage:
        prediction = userAverage[u]
    else:
        prediction = globalAverage
    MSE += pow(prediction-r, 2)
MSE = MSE / len(testing_set)
print('The performance of this predictior(in terms of MSE) on the testing set is ' + str(MSE))

The performance of this predictior(in terms of MSE) on the testing set is 0.886038345028


In [62]:
alpha = globalAverage
print("The alpha value is: " + str(alpha))

The alpha value is: 3.50218671903


In [63]:
import numpy as np
items = list(set(entry[1] for entry in training_set))
users = list(set(entry[0] for entry in training_set))
userID = dict(zip(users, range(len(users))))
itemID = dict(zip(items, range(len(items))))
Ratings = np.zeros((len(users), len(items)))

In [64]:
UI = defaultdict(list)
IU = defaultdict(list)

for d in training_set:
    user = userID[d[0]]
    item = itemID[d[1]]
    Ratings[user][item] = d[2]
    UI[d[0]].append(d[1])
    IU[d[1]].append(d[0])

In [65]:
from __future__ import division
beta_u = {}
beta_i = {}
for user in UI:
    bias = 0
    counter = 0
    for item in UI[user]:
        counter += 1
        bias += Ratings[userID[user]][itemID[item]] - alpha
    if counter == 0:
        beta_u[user] = 0
    else:
        beta_u[user] = bias / counter
for item in IU:
    bias = 0
    counter = 0
    for user in IU[item]:
        counter += 1
        bias += Ratings[userID[user]][itemID[item]] - alpha
    if counter == 0:
        beta_i[item] = 0
    else:
        beta_i[item] = bias / counter

In [66]:
def gradientDescent(alpha, beta_i, beta_u, training_set, iterations, lamda):
    curIter = 0
    validation_mse_list = []
    training_mse_list = []
    alpha_list = []
    beta_u_list = []
    beta_i_list = []
    while curIter < iterations:
        alpha = np.sum([Ratings[userID[d[0]]][itemID[d[1]]]] - (beta_u[d[0]] + beta_i[d[1]]) for d in training_set) / float(len(training_set))
        
        for user in UI:
            userIndex = userID[user]
            for item in UI[user]:
                itemIndex = itemID[item]
                beta_u[user] += (Ratings[userIndex][itemIndex] - alpha - beta_i[item])
            beta_u[user] = beta_u[user] / (lamda + float(len(UI[user])))
            
        for item in IU:
            itemIndex = itemID[item]
            for user in IU[item]:
                userIndex = userID[user]
                beta_i[item] += (Ratings[userIndex][itemIndex] - alpha - beta_u[user])
            beta_i[item] = beta_i[item] / (lamda + float(len(IU[item])))
            
        y_predict = []
        for d in validation_set:
            prediction = 0
            user = d[0]
            item = d[1]
            if user in userID:
                prediction += beta_u[user]
            if item in itemID:
                prediction += beta_i[item]
            prediction += alpha
            y_predict.append(prediction)
        
        validation_mse = sum([(float(validation_set[index][2]) - y_predict[index]) * (float(validation_set[index][2]) - y_predict[index]) for index in range(0, len(validation_set))]) / float(len(validation_set))
        
        training_mse = sum([(alpha + beta_u[training_set[index][0]] + beta_i[training_set[index][1]] - float(training_set[index][2])) * \
                            (alpha + beta_u[training_set[index][0]] + beta_i[training_set[index][1]] - float(training_set[index][2])) \
                            for index in range(0, len(training_set))]) / float(len(training_set))
        curIter += 1
        validation_mse_list.append(validation_mse)
        training_mse_list.append(training_mse)
        beta_u_list.append(beta_u)
        beta_i_list.append(beta_i)
        alpha_list.append(alpha)
        if (curIter % 10 == 0):
            print("# of iteration: " + str(curIter))
            print("MSE on training set: " + str(training_mse)) 
            print("MSE on validation set: " + str(validation_mse))
    return alpha_list, beta_u_list, beta_i_list, validation_mse_list, training_mse_list
    

In [67]:
alpha_list, beta_u_list, beta_i_list, validation_mse_list, training_mse_list = gradientDescent(alpha, beta_i, beta_u, training_set, 100, 10.0)

  if __name__ == '__main__':


# of iteration: 10
MSE on training set: [0.69771541]
MSE on validation set: [0.79637664]
# of iteration: 20
MSE on training set: [0.69787677]
MSE on validation set: [0.79656586]
# of iteration: 30
MSE on training set: [0.69794419]
MSE on validation set: [0.79664578]
# of iteration: 40
MSE on training set: [0.69796653]
MSE on validation set: [0.79667228]
# of iteration: 50
MSE on training set: [0.69797367]
MSE on validation set: [0.79668074]
# of iteration: 60
MSE on training set: [0.69797592]
MSE on validation set: [0.79668341]
# of iteration: 70
MSE on training set: [0.69797663]
MSE on validation set: [0.79668426]
# of iteration: 80
MSE on training set: [0.69797685]
MSE on validation set: [0.79668452]
# of iteration: 90
MSE on training set: [0.69797692]
MSE on validation set: [0.7966846]
# of iteration: 100
MSE on training set: [0.69797694]
MSE on validation set: [0.79668463]
