In [2]:
import pandas as pd
import numpy as np
import pickle as pkl

# To create plots
import matplotlib.pyplot as plt

from scipy.sparse import csr_matrix, lil_matrix
from sklearn.model_selection import train_test_split

In [3]:
smp_train = pkl.load(open('./pkl/smp_train.pkl', 'rb'))
smp_test = pkl.load(open('./pkl/smp_test.pkl', 'rb'))
probe = pkl.load(open('./pkl/probe_.pkl', 'rb'))

In [4]:
smp_test_marked = smp_test.copy()
smp_test_marked.Rating = np.NaN
smp = pd.concat([smp_train, smp_test_marked])
pvt = smp.pivot_table(index='CustomerID', columns='MovieID', values='Rating')
print('Shape User-Item matrix:\t{}'.format(pvt.shape))
pvt.head()

Shape User-Item matrix:	(347500, 3572)


MovieID,8,17,18,26,28,30,33,44,46,52,...,17714,17724,17725,17730,17743,17756,17761,17762,17764,17769
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6,,,,,,,,,,,...,,,,,,,,,,
7,,,,,,5.0,,,,,...,,,,,,,,,,
8,,,,,,,,,,,...,,,,,,,,,,
10,,,,,,,,,,,...,,,,,,,,,,
42,,,,,,,,,,,...,,,,,,,,,,


In [5]:
print(f'there are {(~np.isnan(pvt.values)).sum()} element that is not nan')

there are 17417312 element that is not nan


In [6]:
"""
@INPUT:
    R     : a matrix to be factorized, dimension N x M
    P     : an initial matrix of dimension N x K
    Q     : an initial matrix of dimension k x M
    K     : the number of latent features
    steps : the maximum number of steps to perform the optimisation
    alpha : the learning rate
    beta  : the regularization parameter
@OUTPUT:
    the final matrices P and Q
"""
def matrix_factorization(R, P, Q, B, K, steps=5000, alpha=0.002, beta=1e-5):
    print(f'R:{R.shape}, P:{P.shape}, Q:{Q.shape}, B:{B.shape}')
    rows, cols = np.where(~np.isnan(R))
    narows, nacols =  np.where(np.isnan(R))
    N = rows.shape[0]
    
    Mask = np.zeros(R.shape)#lil_matrix(R.shape, )
    Mask[rows, cols] = 1
    R[narows, nacols] = 0
    errorHistory = []
    
    p = 0
    for step in range(steps):
        import time
        s = time.time()
        R_hat = B + np.dot(P, Q)
        E = R - R_hat
        gradP = -np.dot(E, Q.T) + beta*P
        gradQ = -np.dot(E.T, P).T + beta*Q
        gradB = -E + beta*B
        
        # SGD
        P = P - alpha * gradP
        Q = Q - alpha * gradQ
        B = B - alpha * gradB
        
        error = np.sqrt( np.square(
            np.multiply(E, Mask)).sum()/N 
                       )
        errorHistory.append(error)
        
        p += 1
        print(f'\nprogress {p} time:{time.time()-s:.2f} s step:{step} Error:{error} ')
        
        if error < 1e-5:
            break
    else:
        print(f'step:{step} Error:{error}')
    
    return P, Q, errorHistory

In [7]:
nLatentFactor = 20
R = pvt.values
P = np.random.rand(R.shape[0], nLatentFactor)
Q = np.random.rand(nLatentFactor, R.shape[1])
B = np.random.rand(R.shape[0], R.shape[1])

P_, Q_, H_ = matrix_factorization(R, P, Q, B, K=nLatentFactor, steps=3)

R:(347500, 3572), P:(347500, 20), Q:(20, 3572), B:(347500, 3572)

progress 1% time:1637.01 s step:0 Error:2.3806479327312613 

progress 2% time:1702.80 s step:1 Error:737017.9092495725 

progress 3% time:1166.34 s step:2 Error:2.100099774703387e+21 
step:2 Error:2.100099774703387e+21


In [393]:
P_

array([[0.46054652, 0.46054652, 0.46054652, 0.46054652, 0.46054652,
        0.46054652, 0.46054652, 0.46054652, 0.46054652, 0.46054652],
       [0.38673789, 0.38673789, 0.38673789, 0.38673789, 0.38673789,
        0.38673789, 0.38673789, 0.38673789, 0.38673789, 0.38673789],
       [0.4819484 , 0.4819484 , 0.4819484 , 0.4819484 , 0.4819484 ,
        0.4819484 , 0.4819484 , 0.4819484 , 0.4819484 , 0.4819484 ],
       [0.58778036, 0.58778036, 0.58778036, 0.58778036, 0.58778036,
        0.58778036, 0.58778036, 0.58778036, 0.58778036, 0.58778036],
       [0.60849309, 0.60849309, 0.60849309, 0.60849309, 0.60849309,
        0.60849309, 0.60849309, 0.60849309, 0.60849309, 0.60849309],
       [0.61576868, 0.61576868, 0.61576868, 0.61576868, 0.61576868,
        0.61576868, 0.61576868, 0.61576868, 0.61576868, 0.61576868],
       [0.75695852, 0.75695852, 0.75695852, 0.75695852, 0.75695852,
        0.75695852, 0.75695852, 0.75695852, 0.75695852, 0.75695852],
       [0.68255469, 0.68255469, 0.6825546

In [394]:
Q_

array([[ 0.04916235, -0.1957449 , -0.137971  ,  0.05929473],
       [ 0.04916235, -0.1957449 , -0.137971  ,  0.05929473],
       [ 0.04916235, -0.1957449 , -0.137971  ,  0.05929473],
       [ 0.04916235, -0.1957449 , -0.137971  ,  0.05929473],
       [ 0.04916235, -0.1957449 , -0.137971  ,  0.05929473],
       [ 0.04916235, -0.1957449 , -0.137971  ,  0.05929473],
       [ 0.04916235, -0.1957449 , -0.137971  ,  0.05929473],
       [ 0.04916235, -0.1957449 , -0.137971  ,  0.05929473],
       [ 0.04916235, -0.1957449 , -0.137971  ,  0.05929473],
       [ 0.04916235, -0.1957449 , -0.137971  ,  0.05929473]])

In [383]:
R

array([[1., 3., 0., 4.],
       [0., 1., 1., 0.],
       [2., 2., 2., 5.],
       [0., 0., 1., 4.],
       [5., 0., 2., 2.],
       [4., 0., 1., 2.],
       [3., 0., 0., 5.],
       [3., 0., 0., 3.],
       [2., 2., 0., 5.],
       [1., 1., 1., 0.],
       [0., 0., 1., 0.],
       [1., 1., 1., 5.],
       [2., 0., 0., 3.],
       [3., 0., 0., 0.],
       [4., 4., 4., 4.],
       [4., 0., 1., 4.],
       [3., 0., 0., 5.],
       [2., 0., 0., 3.],
       [5., 0., 1., 0.]])

In [384]:
P

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

In [385]:
Q

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.]])

In [364]:
np.square(
    np.multiply(E, Mask)).sum()/N 

ValueError: dimension mismatch

In [363]:
Error = np.sqrt( np.square(
    np.multiply(E, Mask)).sum()/N 
               )

ValueError: dimension mismatch

In [324]:
type(gradP)

numpy.matrix

In [319]:
alphaP = 0
alphaQ = 0
alphaB = 0
alphaP = alphaP+ gradP**2
alphaQ = gradQ**2
alphaB = gradB**2

LinAlgError: Last 2 dimensions of the array must be square

In [298]:
np.dot(P, Q)
R_hat = B + np.dot(P, Q)
E = R - R_hat
E.shape

(19, 4)

In [318]:
beta = 0.02
R_hat = B + np.dot(P, Q)
E = R - R_hat
#         gradP = -np.dot(E, Q.T) + beta*P
#         gradQ = -np.dot(E.T, P) + beta*Q
gradP = -E @ Q.T + beta*P
gradQ = (-(E.T) @ P).T + beta*Q
gradB = -E

8885000

In [52]:
P

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])