In [1]:
import csv

data = []

with open('./ml-latest-small/ratings.csv') as ratings:
    rating_reader = csv.reader(ratings, delimiter=',')
    line_count = 0
    for row in rating_reader:
        if line_count == 0:
            line_count += 1
            continue
        else:
            # row is a list of string with the form [userId, movieId, rating, timestamp]
            data.append(row)
            line_count += 1
    print('Processed ' + str(line_count) + ' rows.')

Processed 100837 rows.


In [2]:
from random import shuffle
from collections import defaultdict
shuffle(data)

training_set = data[0:len(data)//3]
validation_set = data[len(data)//3:len(data)//3*2]
testing_set = data[len(data)//3*2:len(data)]

allRatings = []
userRatings = defaultdict(list)
itemRatings = defaultdict(list)

for l in training_set:
    user,movie = l[0],l[1]
    allRatings.append(float(l[2]))
    userRatings[user].append(float(l[2]))
    itemRatings[movie].append(float(l[2]))

globalAverage = sum(allRatings) / len(allRatings)
userAverage = {}
itemAverage = {}
for u in userRatings:
    userAverage[u] = sum(userRatings[u]) / len(userRatings[u])

for i in itemRatings:
    itemAverage[i] = sum(itemRatings[i]) / len(itemRatings[i])

MSE = 0
# calculating prediction accuracy
for l in testing_set:
    u,m,r = l[0],l[1],float(l[2])
    if u in userAverage:
        prediction = userAverage[u]
    else:
        prediction = globalAverage
    MSE += pow(prediction-r, 2)

MSE = MSE / len(testing_set)

print('The performance of this predictior(in terms of MSE) on the testing set is ' + str(MSE))

The performance of this predictior(in terms of MSE) on the testing set is 0.888865416005


In [3]:
def converge(alpha_p, betau_p, betai_p, gammau, gammai, lam):
    alpha = alpha_p
    betau = betau_p
    betai = betai_p
    
    
    for x in range(10):
        alpha_upper = 0
        betau_upper = dict()
        betai_upper = dict()
        gammau_upper = dict()
        gammai_upper = dict()
        
        u_i = defaultdict(list)
        i_u = defaultdict(list)
        
        for l in training_set:
            u, i, r = l[0], l[1], float(l[2])
        
            alpha_upper += r - betau[u] - betai[i] - gammau[u]*gammai[i]
                
        alpha = alpha_upper / len(training_set)
            
        
        for l in training_set:
            u, i, r = l[0], l[1], float(l[2])
            if u in betau_upper:
                betau_upper[u] += r - alpha - betai[i] - gammau[u]*gammai[i]
            else:
                betau_upper[u] = r - alpha - betai[i] - gammau[u]*gammai[i]
            if u not in i_u:
                i_u[u] = [i]
            else:
                i_u[u].append(i)
            
        for u in betau_upper:
            betau[u] = betau_upper[u] / (lam + len(i_u[u]))
        
        for l in training_set:
            u, i, r = l[0], l[1], float(l[2])
            if i in betai_upper:
                betai_upper[i] += r - alpha - betau[u] - gammau[u]*gammai[i]
            else:
                betai_upper[i] = r - alpha - betau[u] - gammau[u]*gammai[i]
            if i not in u_i:
                u_i[i] = [u]
            else:
                u_i[i].append(u)
        
         
        for i in betai_upper:
            betai[i] = betai_upper[i] / (lam + len(u_i[i]))
        
        parai = dict()
        for l in training_set:
            u, i, r = l[0], l[1], float(l[2])
            if u in gammau_upper:
                gammau_upper[u] += r - alpha - betau[u] - betai[i]
            else:
                gammau_upper[u] = r - alpha - betau[u] - betai[i]
            parai[u] = parai.get(u, 0) + gammai[i]*gammai[i]
            
        
        for u in gammau_upper:
            gammau[u] = gammau_upper[u] / (lam + parai[u])
        
        parau = dict()
        for l in training_set:
            u, i, r = l[0], l[1], float(l[2])
            if i in gammai_upper:
                gammai_upper[i] += r - alpha - betau[u] - betai[i]
            else:
                gammai_upper[i] = r - alpha - betau[u] - betai[i]
            parau[i] = parau.get(i, 0) + gammau[u]*gammau[u]
            
        
        for i in gammai_upper:
            gammai[i] = gammai_upper[i] / (lam + parau[i])
            
    return alpha, betau, betai, gammau, gammai

In [4]:
def converge_base(alpha_p, betau_p, betai_p, lam):
    alpha = alpha_p
    betau = betau_p
    betai = betai_p
    
    
    for x in range(30):
        alpha_upper = 0
        betau_upper = dict()
        betai_upper = dict()
        
        u_i = defaultdict(list)
        i_u = defaultdict(list)
        
        for l in training_set:
            u, i, r = l[0], l[1], float(l[2])
        
            alpha_upper += r - betau[u] - betai[i]
                
        alpha = alpha_upper / len(training_set)
            
        
        for l in training_set:
            u, i, r = l[0], l[1], float(l[2])
            if u in betau_upper:
                betau_upper[u] += r - alpha - betai[i]
            else:
                betau_upper[u] = r - alpha - betai[i]
            if u not in i_u:
                i_u[u] = [i]
            else:
                i_u[u].append(i)
            
        for u in betau_upper:
            betau[u] = betau_upper[u] / (lam + len(i_u[u]))
        
        for l in training_set:
            u, i, r = l[0], l[1], float(l[2])
            if i in betai_upper:
                betai_upper[i] += r - alpha - betau[u]
            else:
                betai_upper[i] = r - alpha - betau[u]
            if i not in u_i:
                u_i[i] = [u]
            else:
                u_i[i].append(u)
        
         
        for i in betai_upper:
            betai[i] = betai_upper[i] / (lam + len(u_i[i]))
            
    return alpha, betau, betai

In [5]:
MSE = 0
count = 0
betau_initial = dict()
betai_initial = dict()
gammau_initial = dict()
gammai_initial = dict()
for l in training_set:
    u, i, r = l[0], l[1], float(l[2])
    betau_initial[u] = globalAverage - userAverage[u]
    betai_initial[i] = globalAverage - itemAverage[i]
    gammau_initial[u] = 0
    gammai_initial[i] = 0
alpha_base, betau_base, betai_base = converge_base(globalAverage, betau_initial, betai_initial, 6)
#alpha, betau, betai, gammau, gammai = converge(alpha_base, betau_base, betai_base, gammau_initial, gammai_initial, 35)

In [6]:
MSE = 0
# calculating prediction accuracy
for l in testing_set:
    u,m,r = l[0],l[1],float(l[2])
    if u in userAverage:
        #prediction = alpha + betau[u] + betai[i] + gammau[u] * gammai[i]
        prediction = alpha_base + betau_base[u] + betai_base[i]
    else:
        prediction = globalAverage
    MSE += pow(prediction-r, 2)

MSE = MSE / len(testing_set)

print('The performance of this predictior(in terms of MSE) on the testing set is ' + str(MSE))

The performance of this predictior(in terms of MSE) on the testing set is 0.958597554462


In [7]:
print(betau_base)

{'344': -0.1300545150863343, '345': 0.1428222796094614, '346': -0.02902497335206534, '347': 0.0626172128949458, '340': 0.19876610070549786, '341': 0.133901037131643, '342': -0.3294352656217439, '343': 0.040462453637185966, '348': 0.6135548846091438, '349': 0.02608738593079431, '595': 0.4811141745512672, '298': -1.1504869047038455, '299': 0.23612802190550058, '296': 0.2849342569986239, '297': -0.6927255957658061, '294': -0.6939469439174094, '295': 0.0710296312014417, '292': -0.25397792242385986, '293': -0.49008144541680687, '290': 0.3860062127122332, '291': 0.226254050973137, '591': -0.21069386013711205, '590': -0.2301029257152995, '593': -0.6028137264391854, '592': -8.562653784802211e-05, '199': -0.28582541122098737, '198': -0.021721311029558026, '597': 0.356171696186739, '596': -0.11344079651221897, '195': -0.1635034418151078, '598': -0.10950526425898013, '197': 0.06631465791461584, '196': -0.09552401289071902, '191': 0.1716712863230172, '190': 0.10692748446228832, '193': 0.1154131084