In [2]:
from math import sqrt
from operator import itemgetter

import numpy as np
from scipy.linalg import svd
import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator
from sklearn import metrics

In [16]:
class MatrixFactorization:
    def __init__(self, records_train, records_test):
        records = np.vstack([records_train, records_test])
        self.n = len(np.unique(np.sort(records[:, 0])))
        self.m = len(np.unique(np.sort(records[:, 1])))

        # Initial R
        self.R = np.zeros([self.n, self.m], dtype=np.int32)

        for record in records_train:
            self.R[record[0], record[1]] = record[2]

        # Initial indicator
        y = np.where(self.R, 1, 0)
        y_user = np.sum(y, axis=1)
        y_item = np.sum(y, axis=0)

        # Global average of rating
        self.r = np.sum(self.R) / np.sum(y)

        # average rating of user
        self.r_u = np.where(y_user,
                            np.sum(self.R, axis=1) / y_user,
                            self.r)

        # average rating of item
        self.r_i = np.where(y_item,
                            np.sum(self.R, axis=0) / y_item,
                            self.r)

        # bias of user
        self.b_u = np.where(y_user,
                            np.sum(y * (self.R - self.r_i), axis=1) / y_user,
                            0)

        # bias of item
        self.b_i = np.where(y_item,
                            np.sum(y * (self.R - self.r_u.reshape(-1, 1)), axis=0) / y_item,
                            0)
        
    def alternative_least_square(self, n_iter=5):

        d = 20
        alpha = 0.01
        # Initialize
        U = (np.random.randint(0, 1, size=(self.n, d)) - 0.5) * 0.01
        V = (np.random.randint(0, 1, size=(self.m, d)) - 0.5) * 0.01

   
        for cur_iter in range(n_iter):
            print(cur_iter)

            for user in range(self.n):
                items = np.where(self.R[user, :] != 0)[0]
                A_u = np.zeros([d, d])
                for item in items:
                    A_u += V[item].T.dot(V[item]) + np.identity(d) * alpha
                U[user, :] = self.R[user, :].dot(V).dot(np.linalg.pinv(A_u))
                
            for item in range(self.m):
                users = np.where(self.R[:, item] != 0)[0]
                A_i = np.zeros([d, d])
                for user in users:
                    A_i += U[user].T.dot(U[user]) + np.identity(d) * alpha
                V[item, :] = self.R[:, item].dot(U).dot(np.linalg.pinv(A_i))
                
            ratings_predict_rsvd = performance(U, V, records_test)
            print(score(np.clip(ratings_predict_rsvd, 1, 5), ratings_test))

        return U, V

In [7]:
def score(ratings_test, ratings_predict):
    return [round(sqrt(metrics.mean_squared_error(ratings_test, ratings_predict)), 4),
            round(metrics.mean_absolute_error(ratings_test, ratings_predict), 4)]
def performance(U, V, records_test):
        return U.dot(V.T)[records_test[:, 0], records_test[:, 1]]

In [8]:
# Load the records
records_train = np.loadtxt('../data/ml-100k/u1.base', dtype=np.int32)
records_test = np.loadtxt('../data/ml-100k/u1.test', dtype=np.int32)

# Preprocess
records_train[:, :2] -= 1
records_test[:, :2] -= 1
ratings_test = records_test[:, 2]
records = np.vstack([records_train, records_test])


In [17]:
%%time
mf = MatrixFactorization(records_train, records_test)

mf.alternative_least_square(20)




0
[2.786, 2.5359]
1
[2.786, 2.5359]
2
[2.786, 2.5359]
3
[1.8614, 1.4617]
4
[1.8614, 1.4617]
5
[2.786, 2.5359]
6
[2.786, 2.5359]
7
[2.786, 2.5359]
8
[2.786, 2.5359]
9
[2.786, 2.5359]
10
[2.786, 2.5359]
11
[1.8614, 1.4617]
12


KeyboardInterrupt: 

In [11]:
tt = np.array([[0, 1, 3], [4, 1, 6]])
gg = np.array([[1, 2], [1, 2], [1, 2]])
print(tt[1, :].dot(gg))

[11 22]
