In [129]:
import numpy as np
import scipy.sparse as sp
import scipy.sparse.linalg
from matplotlib import pyplot as plt

filename = "frena297"
nr_users = 2000
nr_movies = 1500

def load_data(name):
    data = np.genfromtxt(name,delimiter=',',dtype=int)
    data[:,0:2] -= 1
    return data

def getA(data):
    nr_ratings = len(data)

    r = np.concatenate((np.arange(nr_ratings,dtype=int), np.arange(nr_ratings,dtype=int)))
    c = np.concatenate((data[:,0], data[:,1]+nr_users))
    d = np.ones((2*nr_ratings,))

    A = sp.csr_matrix((d,(r,c)),shape=(nr_ratings,nr_users+nr_movies))

    return A

def getR(data):
    nr_ratings = len(data)
    u = data[:,0]
    m = data[:,1]
    r = data[:,2]
    

    R = sp.csr_matrix((r,(u,m)),shape=(nr_users,nr_movies))

    return R

training_data = load_data(filename+'.training')
test_data = load_data(filename+'.test')

In [143]:
u = training_data[:,0]
m = training_data[:,1]
r = training_data[:,2]

r_average = r.sum()/r.size

rmatrix = getR(training_data).toarray()


In [145]:
A = getA(training_data)

    

c = r-r_average

At = A.transpose()

b = np.linalg.lstsq((At@A).toarray(), At@c)[0]

bu = b[:nr_users]
bi = b[nr_users:]


print(bu)


um = list(zip(u,m))
rhat = np.zeros((nr_users, nr_movies))

for user, movie in um:
    val = (r_average+bu[user]+bi[movie]).round(3)
    if(val>5):
        val = 5
    elif(val<1):
        val = 1
    rhat[user][movie] = val
    

def getRMSE(pairs,r, rhat):
    C = len(pairs)
    tmp = []
    for user, movie in pairs:
        tmp.append(((rmatrix[user][movie]-rhat[user][movie])**2)/C)
    
    RMSE = sum(tmp)**(1/2)
    return RMSE

print(getRMSE(um, rmatrix, rhat))


  b = np.linalg.lstsq((At@A).toarray(), At@c)[0]


[-0.15176171  0.04559473 -0.76555923 ...  0.23604297 -0.21201489
 -0.48848209]
0.8801027311014326


In [138]:
print(r.shape, rhat.shape)

(179341,) (2000, 1500)


In [23]:
test = list(set(zip(u, r)))
test.sort()
userRatings = {}

for user, rating in test:
    if(rating!= None):
        if(user in userRatings):
            userRatings[user].append(rating)
        else:
            userRatings[user] = [rating]

userAverageRating = {}
for user, ratings in userRatings.items():
    userAverageRating[user] = sum(ratings)/len(ratings)

bUu = {}

for user, averageRating in userAverageRating.items():
    bUu[user] = averageRating-r_average

In [16]:
test = list(set(zip(m, r)))
test.sort()
movieRatings = {}

for movie, rating in test:
    if(rating!= None):
        if(movie in movieRatings):
            movieRatings[movie].append(rating)
        else:
            movieRatings[movie] = [rating]
movieAverageRating = {}
for movie, ratings in movieRatings.items():
    movieAverageRating[movie] = sum(ratings)/len(ratings)

bMm = {}
for movie, averageRating in movieAverageRating.items():
    bMm[movie] = averageRating-r_average


In [24]:
test = list(set(zip(u, m)))
rum = {}

for user, movie in test:
    rum[(user,movie)] = r_average + bUu[user] + bMm[movie]


In [50]:



A = getA(training_data)

A.shape


(179341, 3500)

In [62]:
test = list(zip(m, u, r))
test.sort()
test


[(0, 55, 1),
 (0, 57, 5),
 (0, 70, 3),
 (0, 102, 3),
 (0, 104, 3),
 (0, 110, 3),
 (0, 126, 4),
 (0, 129, 3),
 (0, 136, 3),
 (0, 145, 4),
 (0, 148, 4),
 (0, 153, 3),
 (0, 155, 5),
 (0, 166, 3),
 (0, 176, 3),
 (0, 177, 5),
 (0, 184, 3),
 (0, 220, 5),
 (0, 261, 3),
 (0, 286, 4),
 (0, 289, 1),
 (0, 291, 3),
 (0, 299, 3),
 (0, 301, 4),
 (0, 302, 3),
 (0, 315, 5),
 (0, 327, 2),
 (0, 334, 2),
 (0, 350, 3),
 (0, 359, 2),
 (0, 369, 4),
 (0, 372, 2),
 (0, 381, 2),
 (0, 383, 1),
 (0, 394, 2),
 (0, 399, 3),
 (0, 421, 3),
 (0, 425, 3),
 (0, 449, 4),
 (0, 451, 4),
 (0, 474, 2),
 (0, 495, 3),
 (0, 498, 3),
 (0, 509, 3),
 (0, 512, 1),
 (0, 523, 4),
 (0, 525, 3),
 (0, 537, 3),
 (0, 560, 2),
 (0, 570, 3),
 (0, 583, 3),
 (0, 587, 2),
 (0, 594, 2),
 (0, 613, 3),
 (0, 629, 4),
 (0, 637, 3),
 (0, 642, 2),
 (0, 658, 2),
 (0, 662, 3),
 (0, 668, 3),
 (0, 686, 3),
 (0, 687, 1),
 (0, 689, 3),
 (0, 694, 3),
 (0, 703, 5),
 (0, 716, 4),
 (0, 726, 3),
 (0, 731, 2),
 (0, 743, 2),
 (0, 763, 4),
 (0, 779, 5),
 (0, 780,