## Load Requirements

In [96]:
import numpy as np


## SVD for a Noisy Matrix


#### Matrix Generation

In [97]:
# Generate the matrix

N,M = 100,50

# Form 2 random vectors
u=np.random.rand(N,1)
v=np.random.rand(M,1)

# Compute the rank-1 matrix using outer product
R_f = u @ v.T

R_f_variannce = np.var(R_f)

noise_variance = 0.01 * R_f_variannce

noise = np.random.normal(0, np.sqrt(noise_variance), (N,M)) 

R = R_f + noise

# Display the shape of the matrix

print("The matrix Shape:", R.shape)


The matrix Shape: (100, 50)


#### SVD Decomposition

In [98]:
# Decompose the matrix using SVD

U, s, V = np.linalg.svd(R, full_matrices=False)

print("U Shape:", U.shape)

print("s Shape:", s.shape)

print("V Shape:", V.shape)


U Shape: (100, 50)
s Shape: (50,)
V Shape: (50, 50)


#### Matrix Reconstruction

In [99]:
# Reconstruct the matrix R using only the first singular value and the corresponidng singular vectors

R_reconstructed = s[0] * np.outer(U[:,0], V[0,:])

#### Analysis

In [100]:
# output the Reconstructed values of U and V

U_reconstructed, s_constructed, V_reconstructed = np.linalg.svd(R_reconstructed, full_matrices=False)

U_no_noise, s_no_noise, V_no_noise = np.linalg.svd(R_f, full_matrices=False)


print("Reconstructed U:", U_reconstructed)

print("Reconstructed V:", V_reconstructed)

Reconstructed U: [[-1.58853015e-01 -9.03305987e-01 -2.05647825e-02 ...  9.99250643e-04
   1.29078554e-02  3.90735679e-01]
 [-3.61870429e-02  1.82662444e-02 -6.84124603e-03 ... -3.86679075e-02
  -1.87485907e-01  2.28802546e-02]
 [-7.41074272e-02  4.82230489e-02  1.77721037e-01 ... -2.09417701e-01
   1.44309495e-01  9.01980436e-02]
 ...
 [-9.43860307e-02 -2.30633290e-02 -7.43519931e-02 ...  1.08672757e-01
   2.40216060e-01 -1.03382094e-01]
 [-1.04698953e-01  1.49053296e-02 -5.87016611e-02 ...  9.77849463e-02
  -1.25084030e-01  5.44218714e-03]
 [-1.29455555e-01  4.25009123e-04  2.74446929e-02 ... -1.27965451e-01
   8.77236311e-02 -7.99790072e-02]]
Reconstructed V: [[-1.90080103e-01 -1.39387274e-01 -1.48969266e-01 ... -7.55533737e-02
  -2.34619178e-01 -7.53913079e-02]
 [-4.55665366e-01 -1.46359149e-01  2.96844473e-02 ...  7.92121352e-02
   6.44868590e-02  3.51972772e-02]
 [ 0.00000000e+00 -2.08852729e-01  6.23098318e-02 ...  4.43360141e-03
  -1.77167039e-01  1.16023548e-01]
 ...
 [ 0.00000

In [101]:
# Computer the Root Mean Squared Error between the original matrix R and the reconstructed matrix R_reconstructed 

RMSE = np.sqrt(np.mean((R_f - R_reconstructed)**2))

print("RMSE:", RMSE)

# Computer the Root Mean Squared Errors of U and V

RMSE_U = np.sqrt(np.mean((U_no_noise - U_reconstructed)**2))

RMSE_V = np.sqrt(np.mean((V_no_noise - V_reconstructed)**2))

print("RMSE_U:", RMSE_U)
print("RMSE_V:", RMSE_V)

RMSE: 0.004113459642564886
RMSE_U: 0.1400753762639409
RMSE_V: 0.19980025146763342


**The impact of noise:** : 

1. The small RMSE between Rf and reconstructed R suggests that overall matrix structure remains well-preserved.  
2. The high RMSE values for u and v indicate that noise impacts the individual components significantly.  
3. This shows that though the matrix structure can be largely recovered, the exact original vectors u and v is more difficult due to the added noises.

## Matrix Factorization of an Imcomplete Matrix

#### Matrix Generation

In [102]:
## Generate the Matrix 

# Calculate the total number of elements and 30% of that
total_elements = N * M
num_missing = int(0.3 * total_elements)

# Randomly select indices to set as missing
missing_indices = np.unravel_index(
    np.random.choice(total_elements, num_missing, replace=False), (N, M)
)

# Create a copy of the original matrix and set the missing values to nan

R_missing = np.copy(R_f)

R_missing[missing_indices] = np.nan



#### Matrix Factorization

In [103]:
# SGD for matrix factorization

def SGD_factorization(learning_rate, regularization, num_epochs, R_missing, output, Rank):

    K= Rank
    U_factorized = np.random.rand(N,K)
    V_factorized = np.random.rand(M,K) # initialze the U and V matrices

    for epoch in range(num_epochs):

        for i in range(N):
            for j in range(M):
                
                # Only consider the observed values
                if not np.isnan(R_missing[i,j]):

                    prediction = np.dot(U_factorized[i,:], V_factorized[j,:])
                    error = R_missing[i,j] - prediction

                    # Update the U and V matrices
                    U_factorized[i,:] += learning_rate * (error * V_factorized[j,:] - regularization * U_factorized[i,:])
                    V_factorized[j,:] += learning_rate * (error * U_factorized[i,:] - regularization * V_factorized[j,:]) 
                    
        if output:
            if epoch % 10 == 0:
            
                # Calculatre the total loss on observed entrices
                observed_indices = ~np.isnan(R_missing)
                loss = np.sum((R_missing[observed_indices] - (U_factorized @ V_factorized.T)[observed_indices])**2)
                print("Epoch:", epoch, "Loss:", loss)
    
    return U_factorized, V_factorized

U_factorized,V_factorized = SGD_factorization(0.01, 0, 1000, R_missing,True,1)

Epoch: 0 Loss: 249.76260322806968
Epoch: 10 Loss: 22.78780964872795
Epoch: 20 Loss: 1.7995300314279858
Epoch: 30 Loss: 0.1710255187868417
Epoch: 40 Loss: 0.018692102559455345
Epoch: 50 Loss: 0.002294365948007942
Epoch: 60 Loss: 0.0003098107080645085
Epoch: 70 Loss: 4.501471812531755e-05
Epoch: 80 Loss: 6.896587272745627e-06
Epoch: 90 Loss: 1.0964224478645974e-06
Epoch: 100 Loss: 1.7881771314641975e-07
Epoch: 110 Loss: 2.968768157752536e-08
Epoch: 120 Loss: 4.9919760603810695e-09
Epoch: 130 Loss: 8.473464279519105e-10
Epoch: 140 Loss: 1.4487488658712918e-10
Epoch: 150 Loss: 2.4913198223509648e-11
Epoch: 160 Loss: 4.30453818095305e-12
Epoch: 170 Loss: 7.467341055041587e-13
Epoch: 180 Loss: 1.2999028249484452e-13
Epoch: 190 Loss: 2.2697534946680503e-14
Epoch: 200 Loss: 3.973964593635715e-15
Epoch: 210 Loss: 6.974734612706469e-16
Epoch: 220 Loss: 1.2268455395745962e-16
Epoch: 230 Loss: 2.162346511087704e-17
Epoch: 240 Loss: 3.818205581980371e-18
Epoch: 250 Loss: 6.753478689137359e-19
Epoch

#### Missing Data Imputation

In [104]:
# Use the factorized matrices U and V to reconstruct the matrix R

R_reconstructed_missing = U_factorized @ V_factorized.T



#### Analysis

In [105]:
# Output the reconstructed values of U and V

print(U_factorized,V_factorized.T)

[[0.8864491 ]
 [0.1963928 ]
 [0.42000427]
 [0.45195127]
 [0.28718183]
 [0.30574408]
 [0.7234049 ]
 [0.30136577]
 [0.60256873]
 [0.08108224]
 [0.42919782]
 [0.79379676]
 [0.5490156 ]
 [0.69604221]
 [0.03670088]
 [0.92669153]
 [0.2044386 ]
 [0.92576061]
 [0.34726032]
 [0.12036504]
 [0.40691797]
 [0.92566398]
 [0.8469623 ]
 [0.73724138]
 [0.24309767]
 [0.058216  ]
 [0.47377404]
 [0.96869281]
 [0.19863594]
 [0.85030231]
 [0.44825668]
 [0.49313405]
 [0.63655575]
 [0.4579314 ]
 [0.80337166]
 [0.50054596]
 [0.78505977]
 [0.58878291]
 [0.95976393]
 [0.36181498]
 [0.91886391]
 [0.6891091 ]
 [0.6787517 ]
 [0.79201201]
 [0.4693551 ]
 [0.33603296]
 [0.93186124]
 [0.08907638]
 [0.45543723]
 [0.85151821]
 [0.34808815]
 [0.32102142]
 [0.07009835]
 [0.03362056]
 [0.60284801]
 [0.32416284]
 [0.67396381]
 [0.90849333]
 [0.00660163]
 [0.01356489]
 [0.29448663]
 [0.76648865]
 [0.54734252]
 [0.19032889]
 [0.61415607]
 [0.88311911]
 [0.58189399]
 [0.38891028]
 [0.37842485]
 [0.25398771]
 [0.51377026]
 [0.33

In [106]:
# Compute the RMSE between the original matrix R and the reconstructed matrix R_reconstructed

RMSE_missing = np.sqrt(np.mean((R_f - R_reconstructed_missing)**2))

print("RMSE_missing:", RMSE_missing)

# Compute the RMSE of U and V

RMSE_U_missing = np.sqrt(np.mean((u - U_factorized)**2))

RMSE_V_missing = np.sqrt(np.mean((v - V_factorized)**2))

print("RMSE_U_missing:", RMSE_U_missing)
print("RMSE_V_missing:", RMSE_V_missing)

RMSE_missing: 2.1188915087976775e-15
RMSE_U_missing: 0.016151981573954834
RMSE_V_missing: 0.016973077671913125


In [107]:
# Discuss other missing proportions for data generation and discuss their impacts in the reconstruction process

# missing protions from 0.1 to 0.9

missing_proportions = [0.1*i for i in range(1,10)]

for missing_proportion in missing_proportions:

    num_missing = int(missing_proportion * total_elements)

    missing_indices = np.unravel_index(
        np.random.choice(total_elements, num_missing, replace=False), (N, M)
    )

    R_missing = np.copy(R_f)
    
    R_missing[missing_indices] = np.nan

    U_factorized,V_factorized = SGD_factorization(0.01, 0,1000, R_missing,False,1)

    R_reconstructed_missing = U_factorized @ V_factorized.T

    RMSE_missing = np.sqrt(np.mean((R_f - R_reconstructed_missing)**2))

    print("******* Missing Proportion:", round(missing_proportion,1),"********")
    print("RMSE_missing:", RMSE_missing)

    RMSE_U_missing = np.sqrt(np.mean((u - U_factorized)**2))

    RMSE_V_missing = np.sqrt(np.mean((v - V_factorized)**2))

    print("RMSE_U_missing:", RMSE_U_missing)

    print("RMSE_V_missing:", RMSE_V_missing)



******* Missing Proportion: 0.1 ********
RMSE_missing: 2.4052637664011314e-15
RMSE_U_missing: 0.002903838579411324
RMSE_V_missing: 0.002980951874122636
******* Missing Proportion: 0.2 ********
RMSE_missing: 2.9979264203027447e-15
RMSE_U_missing: 0.02272930551813848
RMSE_V_missing: 0.02416856307586616
******* Missing Proportion: 0.3 ********
RMSE_missing: 2.0339542859740246e-15
RMSE_U_missing: 0.019673670629673425
RMSE_V_missing: 0.020804598065549313
******* Missing Proportion: 0.4 ********
RMSE_missing: 2.5091924606756215e-15
RMSE_U_missing: 0.032227756013432245
RMSE_V_missing: 0.034866752088528304
******* Missing Proportion: 0.5 ********
RMSE_missing: 2.0882499147952527e-15
RMSE_U_missing: 0.008849151535654218
RMSE_V_missing: 0.009179326085041645
******* Missing Proportion: 0.6 ********
RMSE_missing: 5.634315082766128e-13
RMSE_U_missing: 0.004462408611580909
RMSE_V_missing: 0.004593396604888851
******* Missing Proportion: 0.7 ********
RMSE_missing: 3.241459235943753e-08
RMSE_U_missing

**Discussion**: 

**Reconstructed Matrix**:  
1. For the low and moderate missing value proportion such as 10% - 50%, the RMSE is very low, indicating that the factorization can approximate the matrix accuractely. 
2. For the moderate and high missing proportion such as 60% - 80%, the RMSE increases as the missing proportion increases, showing that a slight degradation in the matrix reconstruction quality. And the degradation becomes notable as the missing proportion reaches 80%.It shows that matrix factorization methods are resilient to a moderate amount of missing data.
3. For the very high missing proportion such as 90%, the RMSE rises significantly. The factorization struggles to approximate the original matrix since the data in the matrix becomes pretty sparse, and there is insufficient data to capture the underlying structure effectively.

**Reconstructed U and V**:
1. For the low missing proportions such as 10% - 30%, the RMSEs for U and V are very low at these missing proportions, indicating that the latent factors U and V are accurately estimated.  
2. For the moderate missing proportions such as 40% - 60%, the RMSEs remains very low too at these missing proportions, indicating that the latent factors U and V can be accurately estimated.    
3. For the high missing proportions such as 70% - 90%, the RMSEs for U and V becomes relatively high. It shows that for the sparse matrix, the latent factors are difficult to be estimated because the information becomes less and less.

In [108]:
# Generate Matrix with Rank 10

N,M = 100,50

full_rank_matrix = np.random.rand(N, M)
    
# Perform SVD decomposition
U_10, s_10, Vt_10 = np.linalg.svd(full_rank_matrix, full_matrices=False)

# Keep only the top `rank` singular values
s_10[10:] = 0
    
# Reconstruct the matrix

R_10 = U_10 @ np.diag(s_10) @ Vt_10

# choose a moderate missing data proportions
num_missing = int(0.3 * total_elements)

missing_indices = np.unravel_index(
    np.random.choice(total_elements, num_missing, replace=False), (N, M)
)

R_missing_10 = np.copy(R_10)

R_missing_10[missing_indices] = np.nan

# decompose the matrix using different number of singular components

singular_components = [1,2,3,4,5,6,7,8,9,10, 20, 30, 40, 50]

for K in singular_components:

    U_reconstructed, s_constructed, V_reconstructed = np.linalg.svd(R_10, full_matrices=False)
    
    s_constructed[K:] = 0

    R_reconstructed_k = U_reconstructed @ np.diag(s_constructed) @ V_reconstructed

    RMSE_missing = np.sqrt(np.mean((R_10 - R_reconstructed_k)**2))

    print("******* Singular Number:", K ,"********")
    print("RMSE_reconstruction:", RMSE_missing)
    

******* Singular Number: 1 ********
RMSE_reconstruction: 0.1785662344904074
******* Singular Number: 2 ********
RMSE_reconstruction: 0.16612216880883932
******* Singular Number: 3 ********
RMSE_reconstruction: 0.1537641110171347
******* Singular Number: 4 ********
RMSE_reconstruction: 0.1405963076679247
******* Singular Number: 5 ********
RMSE_reconstruction: 0.12698460343689363
******* Singular Number: 6 ********
RMSE_reconstruction: 0.112475592395448
******* Singular Number: 7 ********
RMSE_reconstruction: 0.09641838014146753
******* Singular Number: 8 ********
RMSE_reconstruction: 0.07733966356176836
******* Singular Number: 9 ********
RMSE_reconstruction: 0.05442717199982621
******* Singular Number: 10 ********
RMSE_reconstruction: 2.460319285765109e-15
******* Singular Number: 20 ********
RMSE_reconstruction: 2.4639862668001584e-15
******* Singular Number: 30 ********
RMSE_reconstruction: 2.4676066743274875e-15
******* Singular Number: 40 ********
RMSE_reconstruction: 2.4701214049

**Discussion**:   
I use the a rank-10 matrix with 30% missing values.
1. For the number of singular value smaller than or equal to the Rank of the matrix. As the number of singular value increases, the reconstruction effect increases because we use more features. And the MSE becomes very small as the number of singualr value is equal to the Rank of the matrix.
2. For the number of singular values bigger than the Rank of the matrix. As the number of singular value increases, the reconstruction effect remains stable because singular values after the 10th are quite small so there is very little information in those singular components of the matrix.