<p style="font-family: Arial; font-size:3em;color:black;"> Session 6 - Matrix Factorization</p>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from numpy.linalg import svd

In [2]:
# Singular Value Decompostion Ex. 1
a = np.array([[1, 0, 0, 0, 2], [0, 0, 3, 0, 0], [0, 0, 0, 0, 0], [0, 2, 0, 0, 0]])
u, s, vh = np.linalg.svd(a)

print(a,'\n')
print(u.shape,  s.shape, vh.shape,'\n') # s is a 1D array of a’s singular values
print(u,'\n')
print(s,'\n')
print(vh,'\n')

[[1 0 0 0 2]
 [0 0 3 0 0]
 [0 0 0 0 0]
 [0 2 0 0 0]] 

(4, 4) (4,) (5, 5) 

[[ 0.  1.  0.  0.]
 [ 1.  0.  0.  0.]
 [ 0.  0.  0. -1.]
 [ 0.  0.  1.  0.]] 

[3.         2.23606798 2.         0.        ] 

[[-0.          0.          1.          0.          0.        ]
 [ 0.4472136   0.          0.          0.          0.89442719]
 [-0.          1.          0.          0.          0.        ]
 [ 0.          0.          0.          1.          0.        ]
 [-0.89442719  0.          0.          0.          0.4472136 ]] 



In [3]:
m, n = 4, 5
sigma = np.zeros((m, n)) # a m-by-n matrix of zero values
for i in range(min(m, n)):
    sigma[i, i] = s[i]   # transforming s into the diagonal matrix sigma 

print(sigma,'\n')
print(np.matmul(u, np.matmul(sigma, vh)))

[[3.         0.         0.         0.         0.        ]
 [0.         2.23606798 0.         0.         0.        ]
 [0.         0.         2.         0.         0.        ]
 [0.         0.         0.         0.         0.        ]] 

[[1. 0. 0. 0. 2.]
 [0. 0. 3. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 2. 0. 0. 0.]]


In [4]:
# Singular Value Decompostion Ex. 2
a = np.array([[1, 0, 0, 2], [0, 3, 2, 0], [1, 4, 3, 0]])
u, s, vh = np.linalg.svd(a)

m, n = 3, 4
sigma = np.zeros((m, n))
for i in range(min(m, n)):
    sigma[i, i] = s[i]


print(a,'\n')
print(u.shape,  s.shape, vh.shape,'\n')
print(u,'\n')
print(s,'\n')
print(vh,'\n')
print(sigma,'\n')
print(np.matmul(u, np.matmul(sigma, vh)))

[[1 0 0 2]
 [0 3 2 0]
 [1 4 3 0]] 

(3, 3) (3,) (4, 4) 

[[-0.02431531  0.99228208  0.12159371]
 [-0.57414244 -0.11343137  0.8108599 ]
 [-0.81839429  0.0500958  -0.57246938]] 

[6.21752084 2.24732851 0.5403232 ] 

[[-0.13553788 -0.80353643 -0.57956666 -0.00782155]
 [ 0.46382978 -0.06225655 -0.03407395  0.88307702]
 [-0.8344555   0.26410523 -0.17709463  0.4500777 ]
 [-0.26490647 -0.52981294  0.79471941  0.13245324]] 

[[6.21752084 0.         0.         0.        ]
 [0.         2.24732851 0.         0.        ]
 [0.         0.         0.5403232  0.        ]] 

[[ 1.00000000e+00  3.77658883e-16  3.03551481e-16  2.00000000e+00]
 [-1.20734153e-16  3.00000000e+00  2.00000000e+00 -8.05181830e-18]
 [ 1.00000000e+00  4.00000000e+00  3.00000000e+00 -1.43724366e-16]]


In [5]:
# Matrix Factorization with missing elements
M, N = 20, 10  # creating a 20-by-10 rating matrix
np.random.seed(15)
A_orig = np.random.randint(low=0.0, high=6.0, size=(M,N)).astype(np.float32)
print (pd.DataFrame(A_orig).head(20))

      0    1    2    3    4    5    6    7    8    9
0   0.0  5.0  4.0  5.0  0.0  4.0  3.0  3.0  5.0  5.0
1   1.0  5.0  0.0  2.0  4.0  1.0  5.0  3.0  4.0  5.0
2   4.0  5.0  0.0  2.0  1.0  1.0  0.0  5.0  2.0  2.0
3   3.0  2.0  1.0  0.0  5.0  1.0  2.0  0.0  0.0  0.0
4   4.0  3.0  4.0  2.0  2.0  0.0  5.0  3.0  0.0  5.0
5   4.0  5.0  3.0  1.0  0.0  0.0  5.0  5.0  4.0  0.0
6   4.0  1.0  2.0  0.0  4.0  3.0  2.0  1.0  3.0  5.0
7   5.0  2.0  2.0  2.0  2.0  0.0  5.0  2.0  0.0  4.0
8   4.0  4.0  3.0  4.0  5.0  0.0  1.0  0.0  4.0  0.0
9   3.0  2.0  4.0  2.0  4.0  2.0  2.0  1.0  3.0  3.0
10  4.0  3.0  5.0  4.0  3.0  0.0  3.0  3.0  0.0  1.0
11  2.0  2.0  1.0  4.0  4.0  4.0  0.0  2.0  2.0  3.0
12  3.0  5.0  5.0  2.0  4.0  5.0  4.0  3.0  4.0  4.0
13  3.0  5.0  0.0  1.0  4.0  1.0  4.0  4.0  3.0  1.0
14  0.0  4.0  5.0  0.0  5.0  3.0  1.0  3.0  0.0  4.0
15  0.0  3.0  1.0  1.0  2.0  4.0  5.0  3.0  3.0  4.0
16  3.0  0.0  3.0  3.0  1.0  1.0  3.0  5.0  3.0  5.0
17  1.0  4.0  0.0  1.0  4.0  0.0  3.0  2.0  4.

In [6]:
# Let's add some NaN(s) to the rating matrix A
A = A_orig.copy()
A[0, 0] = np.NaN
A[3, 1] = np.NaN
A[4, 2] = np.NaN
A[4, 8] = np.NaN
A[2, 5] = np.NaN
A[2, 3] = np.NAN

A_df = pd.DataFrame(A)
print (A_df.head())

     0    1    2    3    4    5    6    7    8    9
0  NaN  5.0  4.0  5.0  0.0  4.0  3.0  3.0  5.0  5.0
1  1.0  5.0  0.0  2.0  4.0  1.0  5.0  3.0  4.0  5.0
2  4.0  5.0  0.0  NaN  1.0  NaN  0.0  5.0  2.0  2.0
3  3.0  NaN  1.0  0.0  5.0  1.0  2.0  0.0  0.0  0.0
4  4.0  3.0  NaN  2.0  2.0  0.0  5.0  3.0  NaN  5.0


In [2]:
# Initializing P and Q matrices with K = 5 features
K = 5
P = np.abs(np.random.uniform(low=0, high=5, size=(M, K)))
Q = np.abs(np.random.uniform(low=0, high=5, size=(K, N)))
P = np.divide(P, K*P.max())
Q = np.divide(Q, K*Q.max())

NameError: name 'np' is not defined

In [4]:
P
Q

NameError: name 'P' is not defined

In [9]:
def matrix_factorization(Rating_Matrix, P, Q, K, steps, alpha=0.001, beta=0.002):
    Q = Q.T                                          # transposes Q
    for step in range(steps):                        # number of iterations
        for i in range(len(Rating_Matrix)):          # i varies in the range of the number of matrix rows
            for j in range(len(Rating_Matrix[i])):   # j varies in the range of the number of matrix columns
                if ~np.isnan(Rating_Matrix[i][j]):   # prforms the operation for any cell that is a number
                    eij = Rating_Matrix[i][j] - np.dot(P[i,:],Q[:,j])  # calculating the error
                    for k in range(K):
                        P[i][k] = P[i][k] + alpha * (2 * eij * Q[k][j] - beta * P[i][k])  # updating the matrices using gradients of the loss function including regularization term
                        Q[k][j] = Q[k][j] + alpha * (2 * eij * P[i][k] - beta * Q[k][j])
        eRating_Matrix = np.dot(P,Q)                 # estimating the Rating_Matrix 
        e = 0
        for i in range(len(Rating_Matrix)):
            for j in range(len(Rating_Matrix[i])):
                if ~np.isnan(Rating_Matrix[i][j]):
                    e = e + pow(Rating_Matrix[i][j] - np.dot(P[i,:],Q[:,j]), 2) # brings to the power of 2 to calculate the cost function
                    for k in range(K):
                        e = e + (beta/2) * (pow(P[i][k],2) + pow(Q[k][j],2))    # calculating the cost function including regularization term
        print("Total error at step", step, "is", e)
        if e < 0.0001:     # acceptable error threshold
            break
    return P, Q.T    # returns P and Q

In [13]:
eQ.T

array([[ 0.78627153,  0.72351264,  0.22514551,  1.58005806, -1.55000672,
         0.71747127,  0.86877658,  2.03081305,  1.17054179,  1.45621826],
       [ 1.75213195,  2.24222548,  0.6482283 ,  1.76867251,  1.64731249,
         0.03009043,  0.41722571,  0.89036675,  1.21038811, -0.63980365],
       [ 0.53248048,  1.21750954,  0.29766033, -0.92042997,  0.99493942,
        -0.49558366,  2.8380022 ,  0.9120025 ,  0.71238418,  0.70240034],
       [ 1.92788418, -0.33185242,  2.3721702 ,  0.95665096,  0.87043838,
         0.49528957,  0.73958502,  0.62391629, -0.53768326,  1.56508613],
       [-0.29492447,  1.24050234,  0.61063227,  0.30186518,  1.98372569,
         2.15994602,  0.40775411,  0.39502213,  1.37329261,  1.96542446]])

In [14]:
Q.T

array([[ 0.78627153,  1.75213195,  0.53248048,  1.92788418, -0.29492447],
       [ 0.72351264,  2.24222548,  1.21750954, -0.33185242,  1.24050234],
       [ 0.22514551,  0.6482283 ,  0.29766033,  2.3721702 ,  0.61063227],
       [ 1.58005806,  1.76867251, -0.92042997,  0.95665096,  0.30186518],
       [-1.55000672,  1.64731249,  0.99493942,  0.87043838,  1.98372569],
       [ 0.71747127,  0.03009043, -0.49558366,  0.49528957,  2.15994602],
       [ 0.86877658,  0.41722571,  2.8380022 ,  0.73958502,  0.40775411],
       [ 2.03081305,  0.89036675,  0.9120025 ,  0.62391629,  0.39502213],
       [ 1.17054179,  1.21038811,  0.71238418, -0.53768326,  1.37329261],
       [ 1.45621826, -0.63980365,  0.70240034,  1.56508613,  1.96542446]])

In [3]:
eP, eQ = matrix_factorization(A, P, Q.T, K, steps = 1000)
eA = np.matmul(eP, eQ.T)   # returns estimated Rating_Matrix
print(A,'\n')
print(eA)

NameError: name 'matrix_factorization' is not defined