# Low Rank Matrix Factorization

## Formulation
Let's assume that our system has $N_{user}$ users and $M_{movie}$ movies. We assign $L_{latent}$ features to each user and movie in the system. We can construct a matrix factorization as follows:

$$
\begin{bmatrix}
x_{0,0} & x_{0,1} & x_{0,2} & \dots & x_{0,L} \\
x_{1,0} & \dots   & \dots   & \dots & \dots   \\
x_{2,0} & \dots   & \dots   & \dots & \dots   \\
\dots   & \dots   & \dots   & \dots & \dots   \\
x_{N,0} & \dots   & \dots   & \dots & x_{N,L}
\end{bmatrix}
\begin{bmatrix}
\theta_{0,0} & \theta_{0,1} & \theta_{0,2} & \dots & \theta_{0,L} \\
\theta_{1,0} & \dots        & \dots        & \dots & \dots        \\
\theta_{2,0} & \dots        & \dots        & \dots & \dots        \\
\dots        & \dots        & \dots        & \dots & \dots        \\
\theta_{M,0} & \dots        & \dots        & \dots & \theta_{M,L}
\end{bmatrix}^{T}
=
\begin{bmatrix}
\hat{r}_{0,0} & \hat{r}_{0,1} & \hat{r}_{0,2} & \dots & \hat{r}_{0,M} \\
\hat{r}_{1,0} & \dots         & \dots         & \dots & \dots         \\
\hat{r}_{2,0} & \dots         & \dots         & \dots & \dots         \\
\dots         & \dots         & \dots         & \dots & \dots         \\
\hat{r}_{N,0} & \dots         & \dots         & \dots & \hat{r}_{N,M}
\end{bmatrix}
$$

$$
X\Theta^{T} = \hat{R}
$$

$X$ represents the latent feature matrix for all users in our system. $\Theta$ represents the latent feature matrix for all movies in our system. The matrix product of $X$ and $\Theta^{T}$ is the model's predicted rating matrix $\hat{R}$.

Let $R$ represent the actual ratings we received from the MovieLens dataset. Every missing value in $R$ is replaced with the average rating that movie received from the pool of users. We then define the loss function as follows:

$$
L_{X, \Theta} = \frac{1}{2}\sum_{i,j} \left((X\Theta^{T})_{i,j} - R_{i,j}\right)^{2} + \frac{\lambda}{2}\sum_{i,k} X_{i,k}^{2} + \frac{\lambda}{2}\sum_{j,k} \Theta_{j,k}^{2}
$$

In [9]:
import numpy as np


def loss(U, M, R, reg=0.0):
    """Return the regularised squared-error loss of the factorisation.

    U   -- user latent-feature matrix
    M   -- movie latent-feature matrix
    R   -- rating matrix that ``np.dot(U, M.T)`` is compared against
    reg -- weight applied to the sum of squared entries of U and of M

    The result is half the sum of squared residuals plus ``reg / 2`` times
    the sum of squared entries of each factor matrix.
    """
    residual = np.dot(U, M.T) - R
    total = 0.5 * np.sum(residual ** 2)
    # Add the penalties one factor at a time, users first then movies.
    for factors in (U, M):
        total += reg * np.sum(factors ** 2) / 2
    return total

def rel_error(x, y):
    """Return the largest element-wise relative error between x and y.

    Each entry's absolute difference is divided by ``|x| + |y|``; the
    divisor is floored at 1e-8 so that two zeros compare as equal instead
    of dividing by zero.
    """
    gap = np.abs(x - y)
    scale = np.maximum(1e-8, np.abs(x) + np.abs(y))
    return np.max(gap / scale)


def compute_grad(U, M, R, reg=0.0):
    """Return the analytic gradients of the factorisation loss.

    U   -- user latent-feature matrix, one row per user
    M   -- movie latent-feature matrix, one row per movie
    R   -- rating matrix that ``np.dot(U, M.T)`` is compared against
    reg -- regularisation weight

    Returns ``(u_grad, m_grad)``: float arrays with the shapes of U and M
    holding the derivative of
    ``0.5 * sum((U M^T - R)^2) + reg/2 * (sum(U^2) + sum(M^2))``
    with respect to each entry of U and M.
    """
    diff = np.dot(U, M.T) - R

    # Allocate float outputs up front so the result dtype does not depend on
    # the dtypes of the inputs.
    u_grad = np.zeros(U.shape)
    m_grad = np.zeros(M.shape)

    # Row i of diff.dot(M) is sum_j diff[i, j] * M[j], i.e. the per-user sum;
    # one matrix product replaces the Python-level loop over users.
    u_grad[...] = np.dot(diff, M) + reg * U
    # Likewise row j of diff.T.dot(U) is sum_i diff[i, j] * U[i].
    m_grad[...] = np.dot(diff.T, U) + reg * M

    return u_grad, m_grad


def _central_difference(param, evaluate, h):
    """Central-difference gradient of ``evaluate()`` w.r.t. each entry of param.

    ``param`` is perturbed in place one entry at a time and restored before
    moving on, so ``evaluate`` must read ``param`` on every call.
    """
    grad = np.zeros(param.shape)
    for idx in np.ndindex(*param.shape):
        saved = param[idx]

        param[idx] = saved + h
        f_plus = evaluate()

        param[idx] = saved - h
        f_minus = evaluate()

        param[idx] = saved
        grad[idx] = (f_plus - f_minus) / (2 * h)
    return grad


def compute_num_grad(U, M, R, loss_func, reg=0.0, h=1e-5):
    """Numerically estimate the gradient of ``loss_func`` w.r.t. U and M.

    U, M      -- factor matrices at which the gradient is estimated
    R         -- rating matrix, passed through to ``loss_func`` unchanged
    loss_func -- callable invoked as ``loss_func(U, M, R, reg)`` returning a
                 scalar
    reg       -- regularisation weight, passed through to ``loss_func``
    h         -- half-width of the central-difference step

    Returns ``(num_grad_u, num_grad_m)``, float arrays with the shapes of U
    and M.
    """
    # Perturb float copies rather than the caller's arrays: the inputs are
    # never modified, and integer-typed inputs no longer truncate the +/- h
    # perturbation when it is written back into the array.
    U_work = np.array(U, dtype=float)
    M_work = np.array(M, dtype=float)

    def evaluate():
        return loss_func(U_work, M_work, R, reg)

    num_grad_u = _central_difference(U_work, evaluate, h)
    num_grad_m = _central_difference(M_work, evaluate, h)

    return num_grad_u, num_grad_m
            

# Gradient check: compare the analytic gradients from compute_grad against
# central-difference estimates on a small random problem.
num_user = 3
num_movie = 3
lat_dim = 4
# NOTE: with reg = 0 the regularisation terms contribute nothing, so this
# run only checks the data-fit part of the gradient.
reg = 0

R = np.random.rand(num_user, num_movie)
U = np.random.rand(num_user, lat_dim)
M = np.random.randn(num_movie, lat_dim)

grad_u, grad_m = compute_grad(U, M, R, reg)
num_grad_u, num_grad_m = compute_num_grad(U, M, R, loss, reg)

# Single-argument print(...) behaves the same on Python 2 and Python 3.
print("Gradient of U relative error %s" % str(rel_error(grad_u, num_grad_u)))
print("Gradient of M relative error %s" % str(rel_error(grad_m, num_grad_m)))

Gradient of U relative error 3.4214177312233686e-11
Gradient of M relative error 3.469683656310976e-11


In [47]:
# Scratch check of the per-row gradient accumulation on random data, using a
# non-square problem so that user and movie dimensions cannot be confused.
num_user = 3
num_movie = 6
lat_dim = 2
U = np.random.randn(num_user, lat_dim)
M = np.random.randn(num_movie, lat_dim)
diff = np.random.randn(num_user, num_movie)

grad_u = np.zeros(U.shape)
grad_m = np.zeros(M.shape)
for i in range(num_user):
    # diff[i] as a (num_movie, 1) column scales each movie row; summing over
    # movies leaves a vector of dimension (lat_dim,).
    grad_u[i] = np.sum(diff[i].reshape(diff.shape[1], 1) * M, axis=0)

for j in range(num_movie):
    # Column j of diff as a (num_user, 1) column scales each user row.
    grad_m[j] = np.sum(diff.T[j].reshape(diff.shape[0], 1) * U, axis=0)

# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(grad_u)
print(grad_m)

[[ 0.66028601 -6.06697783]
 [ 1.45826095 -0.48272723]
 [ 0.76705498  1.47248015]]
[[ 2.16512191 -0.14728376]
 [-2.44554935  0.48069103]
 [-0.35991685 -2.06944839]
 [ 3.34090557  1.02299273]
 [ 1.25544474 -1.62462961]
 [-2.17379815  1.23300123]]
