# Matrix Factorization

In [1]:
from __future__ import print_function 
import pandas as pd 
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse 

class MF(object):
    def __init__(self, Y, K, lam = 0.02, Xinit = None, Winit = None, 
                 learning_rate = 0.005, max_epoch = 20, print_every = 2):
        self.Y      = Y
        self.user   = Y[:, 0]
        self.item   = Y[:, 1]
        self.rating = Y[:, 2]
        self.n_users       = int(np.max(self.user)) + 1 
        self.n_items       = int(np.max(self.item)) + 1
        self.n_ratings     = Y.shape[0] # number of known ratings
        self.X = .1*np.random.randn(self.n_items, K) if Xinit is None else Xinit 
        self.W = .1*np.random.randn(self.n_users, K) if Winit is None else Winit 
        self.b = np.zeros(self.n_items) # item biases
        self.d = np.zeros(self.n_users) # user biases
        self.mu = np.mean(self.rating)
        self.K      = K    # 
        self.lam    = lam  # regularization parameter 
        self.learning_rate = learning_rate
        self.max_epoch      = max_epoch # maximum number of iterations 
        self.print_every   = print_every # print loss+ RMSE on training data after each ? iters 

    def _loss(self):
        L = 0 
        for n in range(self.n_ratings):
            # user_id, item_id, rating
            u, i, rating = self.user[n], self.item[n], self.rating[n]
            pred_rating = self.X[i].dot(self.W[u]) + self.b[i] + self.d[u] + self.mu 
            L += 0.5*(pred_rating - rating)**2 + .5*self.lam*(self.b[i]*2 + 
                    self.d[u]*2 + np.sum(self.X[i]**2) + np.sum(self.W[u]**2))
        
        return L/self.n_ratings
    
    def fit(self):
        for it in range(self.max_epoch):
            # mix data 
            idx = np.random.permutation(self.n_ratings)
            for n in idx:
                u, i, rating = self.user[n], self.item[n], self.rating[n]
                pred_rating = self.X[i].dot(self.W[u]) + self.b[i] + self.d[u] + self.mu 
                error = pred_rating - rating 
                self.b[i] -= self.learning_rate*(error + self.lam*self.b[i])
                self.d[u] -= self.learning_rate*(error + self.lam*self.d[u])
                self.X[i] -= self.learning_rate*(error*self.W[u] + self.lam*self.X[i])
                self.W[u] -= self.learning_rate*(error*self.X[i] + self.lam*self.W[u])

            if (it + 1) % self.print_every == 0:
                rmse_train = self.evaluate_RMSE(self.Y)
                print('iter = %d, loss = %.4f, RMSE train = %.4f'%(it + 1, self._loss(), rmse_train))
    
    def pred(self, u, i):
        """ 
        predict the rating of user u for item i 
        """
        u, i = int(u), int(i)
        pred = self.X[i].dot(self.W[u]) + self.b[i] + self.d[u] + self.mu# + bias
        return max(0, min(5, pred)) # pred should be between 0 and 5 in MoviesLen 
    
    def evaluate_RMSE(self, rate_test):
        n_tests = rate_test.shape[0] # number of test 
        SE = 0 # squared error
        for n in range(n_tests):
            pred = self.pred(rate_test[n, 0], rate_test[n, 1])
            SE += (pred - rate_test[n, 2])**2 

        RMSE = np.sqrt(SE/n_tests)
        return RMSE
        

# Áp dụng lên MovieLens 100k

In [2]:
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']

ratings_base = pd.read_csv('ml-100k/ua.base', sep='\t', names=r_cols)
ratings_test = pd.read_csv('ml-100k/ua.test', sep='\t', names=r_cols)

rate_train = ratings_base[r_cols].values
rate_test = ratings_test[r_cols].values

# indices start from 0
rate_train[:, :2] -= 1
rate_test[:, :2] -= 1

In [3]:
rs = MF(rate_train, K = 100, lam = .1, print_every = 1, learning_rate = 0.005, max_epoch = 50)
rs.fit()
# evaluate on test data
RMSE = rs.evaluate_RMSE(rate_test)
print('\nMatrix Factorization CF, RMSE = %.4f' %RMSE)

iter = 1, loss = 0.5714, RMSE train = 0.9851
iter = 2, loss = 0.5266, RMSE train = 0.9502
iter = 3, loss = 0.5016, RMSE train = 0.9325
iter = 4, loss = 0.4852, RMSE train = 0.9213
iter = 5, loss = 0.4730, RMSE train = 0.9135
iter = 6, loss = 0.4638, RMSE train = 0.9073
iter = 7, loss = 0.4567, RMSE train = 0.9026
iter = 8, loss = 0.4508, RMSE train = 0.8985
iter = 9, loss = 0.4458, RMSE train = 0.8950
iter = 10, loss = 0.4418, RMSE train = 0.8919
iter = 11, loss = 0.4384, RMSE train = 0.8889
iter = 12, loss = 0.4354, RMSE train = 0.8861
iter = 13, loss = 0.4322, RMSE train = 0.8833
iter = 14, loss = 0.4298, RMSE train = 0.8807
iter = 15, loss = 0.4271, RMSE train = 0.8777
iter = 16, loss = 0.4246, RMSE train = 0.8748
iter = 17, loss = 0.4229, RMSE train = 0.8720
iter = 18, loss = 0.4204, RMSE train = 0.8687
iter = 19, loss = 0.4185, RMSE train = 0.8656
iter = 20, loss = 0.4159, RMSE train = 0.8622
iter = 21, loss = 0.4144, RMSE train = 0.8591
iter = 22, loss = 0.4122, RMSE train = 0.85

In [4]:
rs = MF(rate_train, K = 100, lam = .1, print_every = 1, learning_rate = 0.005, max_epoch = 50, Xinit = rs.X, Winit = rs.W)
rs.fit()
# evaluate on test data
RMSE = rs.evaluate_RMSE(rate_test)
print('\nMatrix Factorization CF, RMSE = %.4f' %RMSE)

iter = 1, loss = 0.4397, RMSE train = 0.8358
iter = 2, loss = 0.4137, RMSE train = 0.8014
iter = 3, loss = 0.4015, RMSE train = 0.7846
iter = 4, loss = 0.3948, RMSE train = 0.7743
iter = 5, loss = 0.3903, RMSE train = 0.7665
iter = 6, loss = 0.3866, RMSE train = 0.7605
iter = 7, loss = 0.3840, RMSE train = 0.7557
iter = 8, loss = 0.3817, RMSE train = 0.7513
iter = 9, loss = 0.3798, RMSE train = 0.7475
iter = 10, loss = 0.3786, RMSE train = 0.7443
iter = 11, loss = 0.3772, RMSE train = 0.7413
iter = 12, loss = 0.3760, RMSE train = 0.7385
iter = 13, loss = 0.3748, RMSE train = 0.7359
iter = 14, loss = 0.3738, RMSE train = 0.7333
iter = 15, loss = 0.3728, RMSE train = 0.7310
iter = 16, loss = 0.3721, RMSE train = 0.7288
iter = 17, loss = 0.3714, RMSE train = 0.7267
iter = 18, loss = 0.3709, RMSE train = 0.7246
iter = 19, loss = 0.3701, RMSE train = 0.7226
iter = 20, loss = 0.3695, RMSE train = 0.7209
iter = 21, loss = 0.3687, RMSE train = 0.7189
iter = 22, loss = 0.3683, RMSE train = 0.71

In [5]:
rs.X.shape
rs.n_items

1682