## Load Requirements

In [13]:
import numpy as np


# Part 1

## 1.2 SVD for a Noisy Matrix


#### Matrix Generation

In [14]:
# Generate the matrix

N,M = 100,50

# Form 2 random vectors
u=np.random.rand(N,1)
v=np.random.rand(M,1)

# Compute the rank-1 matrix using outer product
R_1 = u @ v.T

# Calculate the Frobenius norm of this matrix
R_f = np.linalg.norm(R_1, 'fro')

noise_variance = 0.01 * R_f

noise = np.random.normal(0, np.sqrt(noise_variance), (N,M)) 

R_ = R_1 + noise

# Display the shape of the matrix

print("The matrix Shape:", R_.shape)


The matrix Shape: (100, 50)


#### SVD Decomposition

In [15]:
# Decompose the matrix using SVD

U1, s1, V1 = np.linalg.svd(R_, full_matrices=False)

print("U Shape:", U1.shape)

print("s Shape:", s1.shape)

print("V Shape:", V1.shape)


U Shape: (100, 50)
s Shape: (50,)
V Shape: (50, 50)


#### Matrix Reconstruction

In [16]:
# Reconstruct the matrix R using only the first singular value and the corresponidng singular vectors

R_reconstructed = s1[0] * np.outer(U1[:,0], V1[0,:])

#### Analysis

In [17]:
# calculate and output the original values of U and V

U_orginal, s_original, V_original = np.linalg.svd(R_1, full_matrices=False)

print("U_original Shape:", U_orginal)

print("s_original Shape:", s_original)

print("V_original Shape:", V_original)

U_original Shape: [[-0.08465639 -0.72666733 -0.00614252 ... -0.00125815 -0.01037617
   0.67820294]
 [-0.11146218  0.02206445  0.0461166  ... -0.00514486 -0.01868697
   0.00619947]
 [-0.1382825  -0.18979262 -0.06258836 ... -0.11249273 -0.18736701
  -0.25258603]
 ...
 [-0.14352204  0.09731988  0.14556701 ...  0.05277509  0.01800429
   0.09337201]
 [-0.18286614  0.04777537  0.56919572 ... -0.10889249 -0.01610159
   0.02145267]
 [-0.03709111  0.00093313 -0.03299787 ... -0.0451366   0.09285429
  -0.00253355]]
s_original Shape: [2.15300713e+01 2.15062500e-15 2.15062500e-15 2.15062500e-15
 2.15062500e-15 2.15062500e-15 2.15062500e-15 2.15062500e-15
 2.15062500e-15 2.15062500e-15 2.15062500e-15 2.15062500e-15
 2.15062500e-15 2.15062500e-15 2.15062500e-15 2.15062500e-15
 2.15062500e-15 2.15062500e-15 2.15062500e-15 2.15062500e-15
 2.15062500e-15 2.15062500e-15 2.15062500e-15 2.15062500e-15
 2.15062500e-15 2.15062500e-15 2.15062500e-15 2.15062500e-15
 2.15062500e-15 2.15062500e-15 2.15062500e-15

In [18]:
# calculate and output the Reconstructed values of U and V

U_reconstructed, s_constructed, V_reconstructed = np.linalg.svd(R_reconstructed, full_matrices=False)

print("Reconstructed U:", U_reconstructed)

print("Reconstructed V:", V_reconstructed)

print("Reconstructed s:", s_constructed)

Reconstructed U: [[-6.50665695e-02  2.09492012e-01  1.19701387e-02 ...  6.98470640e-03
   2.82974468e-03 -9.68128566e-01]
 [-1.05575840e-01  7.48150215e-02  4.74005384e-02 ...  1.00900811e-01
  -1.72478829e-02  2.48344094e-02]
 [-1.02932253e-01  3.14728178e-02  3.53124619e-02 ...  2.52801136e-01
   4.07280328e-03  3.63305367e-02]
 ...
 [-1.10313373e-01  8.75014350e-02 -5.45398955e-02 ... -1.11684883e-01
  -1.75853630e-01 -8.61989518e-04]
 [-1.95308088e-01 -2.10994990e-01  1.01905919e-01 ...  2.55047448e-02
   3.36976179e-01 -4.46585724e-02]
 [-8.42752198e-02 -2.49296178e-01 -1.16643015e-01 ... -3.38866142e-02
  -8.50586960e-02 -4.11574822e-02]]
Reconstructed V: [[-1.62238013e-02 -1.28815215e-01 -3.31239772e-02 ... -7.12497626e-02
   2.54461296e-02 -8.82746726e-02]
 [ 0.00000000e+00 -7.67619751e-02 -1.95218306e-02 ... -6.29389348e-02
  -6.14522381e-03  1.31958618e-01]
 [ 0.00000000e+00  5.89378368e-02  8.56414617e-03 ...  1.67823013e-01
  -2.60590486e-02 -8.59844055e-03]
 ...
 [ 0.00000

In [19]:
# Computer the Root Mean Squared Error between the original matrix R and the reconstructed matrix R_reconstructed 

RMSE = np.sqrt(np.mean((R_1 - R_reconstructed)**2))

print("RMSE:", RMSE)

# Compute the Root Mean Squared Errors of U and V 
# From the value of U_original, V_original and U_reconstructed, V_reconstructed, there is no need to adjust the sign of the singular vectors when calculating the RMSE

RMSE_U = np.sqrt(np.mean((U_orginal - U_reconstructed)**2))

RMSE_V = np.sqrt(np.mean((V_original- V_reconstructed)**2))

print("RMSE_U:", RMSE_U)
print("RMSE_V:", RMSE_V)

RMSE: 0.08540524039884285
RMSE_U: 0.1423131211660104
RMSE_V: 0.19817176210417836


**The impact of noise:**

1. Effect on the Matrix Reconstruction: The RMSE for the reconstructed matrix is pretty small, indicating that the reconstruction closely approximates the original matrix despite the added noise. This is potentially because the reconstruction uses only the first singular value and the corresponding singular vectors, which contains most information of the matrix. This filters out higher-order singular values, which captures a portion of the noise, reducing the effect of noise on the reconstructed matrix.  
2. Effect on the Singular Vectors: The RMSE for the singular vectors suggests that the noise has introduced some discrepancy between the original and reconstructed singular vectors. Singular vectors associated with significant singular values are typically less affected by noise because they represent more prominent structural components of the matrix. But the loss indicates that noise might alter the direction slightly, resulting the overall discrepancies. 


## 1.3 Matrix Factorization of an Imcomplete Matrix

#### Matrix Generation

In [20]:
# Generate the Matrix 

# Calculate the total number of elements and 30% of that
total_elements = N * M
num_missing = int(0.3 * total_elements)

# Randomly select indices to set as missing
missing_indices = np.unravel_index(
    np.random.choice(total_elements, num_missing, replace=False), (N, M)
)

# Create a copy of the original matrix and set the missing values to nan

R_missing = np.copy(R_1)

R_missing [missing_indices] = np.nan



#### Matrix Factorization

In [21]:
# SGD for matrix factorization

def SGD_factorization(learning_rate, regularization, num_epochs, R_missing, output):

    U_factorized = np.random.rand(N,1)
    V_factorized = np.random.rand(M,1) # initialze the U and V matrices

    for epoch in range(num_epochs):

        for i in range(N):
            for j in range(M):
                
                # Only consider the observed values
                if not np.isnan(R_missing[i,j]):

                    prediction = np.dot(U_factorized[i], V_factorized[j])
                    error = R_missing[i,j] - prediction

                    # Update the U and V matrices
                    U_factorized[i] += learning_rate * (error * V_factorized[j] - regularization * U_factorized[i])
                    V_factorized[j] += learning_rate * (error * U_factorized[i] - regularization * V_factorized[j]) 
                    
        if output:
            if epoch % 10 == 0:
            
                # Calculatre the total loss on observed entrices
                observed_indices = ~np.isnan(R_missing)
                loss = np.sum((R_missing[observed_indices] - (U_factorized @ V_factorized.T)[observed_indices])**2)
                print("Epoch:", epoch, "Loss:", loss)
    
    return U_factorized, V_factorized

U_factorized,V_factorized = SGD_factorization(0.01, 0, 200, R_missing,True)

Epoch: 0 Loss: 199.13892867024185
Epoch: 10 Loss: 26.514630769703217
Epoch: 20 Loss: 3.8713736755834796
Epoch: 30 Loss: 0.5818592065935589
Epoch: 40 Loss: 0.09068988128701874
Epoch: 50 Loss: 0.014664976023926586
Epoch: 60 Loss: 0.002457359750247406
Epoch: 70 Loss: 0.0004260976587323207
Epoch: 80 Loss: 7.636194278034394e-05
Epoch: 90 Loss: 1.4129479257189108e-05
Epoch: 100 Loss: 2.6965645672952626e-06
Epoch: 110 Loss: 5.301398896761745e-07
Epoch: 120 Loss: 1.0719106204163478e-07
Epoch: 130 Loss: 2.2244196449008695e-08
Epoch: 140 Loss: 4.726050062296373e-09
Epoch: 150 Loss: 1.0252470167123422e-09
Epoch: 160 Loss: 2.264641346552285e-10
Epoch: 170 Loss: 5.0797292295502243e-11
Epoch: 180 Loss: 1.1541751781420576e-11
Epoch: 190 Loss: 2.650540858492429e-12


#### Missing Data Imputation

In [22]:
# Use the factorized matrices U and V to reconstruct the matrix R

R_reconstructed_missing = U_factorized @ V_factorized.T



#### Analysis

In [23]:
# Compute the RMSE between the original matrix R and the reconstructed matrix R_reconstructed

RMSE_missing = np.sqrt(np.mean((R_1 - R_reconstructed_missing)**2))

print("RMSE_missing:", RMSE_missing)

RMSE_missing: 1.662674717443701e-08


In [24]:
# Discuss other missing proportions for data generation and discuss their impacts in the reconstruction process

# missing protions from 0.1 to 0.9

missing_proportions = [0.1*i for i in range(1,10)]

missing_proportions.extend([0.95,0.97,0.99])

for missing_proportion in missing_proportions:

    num_missing = int(missing_proportion * total_elements)

    missing_indices = np.unravel_index(
        np.random.choice(total_elements, num_missing, replace=False), (N, M)
    )

    R_missing = np.copy(R_1)
    
    R_missing[missing_indices] = np.nan

    U_factorized,V_factorized = SGD_factorization(0.01, 0,1000, R_missing,False)

    R_reconstructed_missing = U_factorized @ V_factorized.T

    RMSE_missing = np.sqrt(np.mean((R_1 - R_reconstructed_missing)**2))

    print("******* Missing Proportion:", round(missing_proportion,2),"********")
    print("RMSE of recoverd matrix:", RMSE_missing)



******* Missing Proportion: 0.1 ********
RMSE of recoverd matrix: 2.0498713390486303e-15
******* Missing Proportion: 0.2 ********
RMSE of recoverd matrix: 1.4673931766634846e-15
******* Missing Proportion: 0.3 ********
RMSE of recoverd matrix: 1.959739858559067e-15
******* Missing Proportion: 0.4 ********
RMSE of recoverd matrix: 2.3659949996473913e-15
******* Missing Proportion: 0.5 ********
RMSE of recoverd matrix: 2.0342106441211844e-15
******* Missing Proportion: 0.6 ********
RMSE of recoverd matrix: 3.2093130276906625e-15
******* Missing Proportion: 0.7 ********
RMSE of recoverd matrix: 6.53815675203914e-10
******* Missing Proportion: 0.8 ********
RMSE of recoverd matrix: 2.426424989553833e-06
******* Missing Proportion: 0.9 ********
RMSE of recoverd matrix: 0.012485523144217336
******* Missing Proportion: 0.95 ********
RMSE of recoverd matrix: 0.08137264496784324
******* Missing Proportion: 0.97 ********
RMSE of recoverd matrix: 0.1682469591988202
******* Missing Proportion: 0.99

**Discussion impact of the missing value proportion on reconstructed matrix**: 

1. For the low and moderate missing value proportion such as 10% - 70%, the RMSE is very low, indicating that the factorization can approximate the missing value and the observed value accurately.  
2. For the very high missing proportion such as 80% - 99%, the RMSE continues to increase. For the missing value proportion above 90%, the discrepancy of original and reconstructed matrix is notable, indicating the matrix factorization struggles to impute the missing value of the matrix when the matrix is extremely sparse.


In [25]:
# Generate Matrix with Rank 10

N,M = 100,50

full_rank_matrix = np.random.rand(N, M)
    
# Perform SVD decomposition
U_10, s_10, Vt_10 = np.linalg.svd(full_rank_matrix, full_matrices=False)

# Keep only the top `rank` singular values
s_10[10:] = 0
    
# Reconstruct the matrix

R_10 = U_10 @ np.diag(s_10) @ Vt_10

# choose a moderate missing data proportions
num_missing = int(0.3 * total_elements)

missing_indices = np.unravel_index(
    np.random.choice(total_elements, num_missing, replace=False), (N, M)
)

R_missing_10 = np.copy(R_10)

R_missing_10[missing_indices] = np.nan

# decompose the matrix using different number of singular components

singular_components = [1,2,3,4,5,6,7,8,9,10, 20, 30, 40, 50]

for K in singular_components:

    U_reconstructed, s_constructed, V_reconstructed = np.linalg.svd(R_10, full_matrices=False)
    
    s_constructed[K:] = 0

    R_reconstructed_k = U_reconstructed @ np.diag(s_constructed) @ V_reconstructed

    RMSE_missing = np.sqrt(np.mean((R_10 - R_reconstructed_k)**2))

    print("******* Singular Number:", K ,"********")
    print("RMSE of recovered matrix:", RMSE_missing)
    

******* Singular Number: 1 ********
RMSE of recovered matrix: 0.1789345727631257
******* Singular Number: 2 ********
RMSE of recovered matrix: 0.1657988250842047
******* Singular Number: 3 ********
RMSE of recovered matrix: 0.15250822927225222
******* Singular Number: 4 ********
RMSE of recovered matrix: 0.13887584136248748
******* Singular Number: 5 ********
RMSE of recovered matrix: 0.12449781589669884
******* Singular Number: 6 ********
RMSE of recovered matrix: 0.10956572965749556
******* Singular Number: 7 ********
RMSE of recovered matrix: 0.09305250762048164
******* Singular Number: 8 ********
RMSE of recovered matrix: 0.0745090721377148
******* Singular Number: 9 ********
RMSE of recovered matrix: 0.05167785365512812
******* Singular Number: 10 ********
RMSE of recovered matrix: 6.304889345978013e-16
******* Singular Number: 20 ********
RMSE of recovered matrix: 6.426193871953939e-16
******* Singular Number: 30 ********
RMSE of recovered matrix: 6.547843211891478e-16
******* Si

**Discussion**:   
I use the a rank-10 matrix with 30% missing values.
1. For the number of singular components smaller than or equal to the Rank of the matrix. As the number of singular components increases, the reconstruction effect increases because for the kth singular compoents (k<=10), the singular components contain part information of the matrix. And the MSE becomes very small as the number of singualr components is equal to the Rank of the matrix because we use nearly all useful information to reconstruct the matrix.  
2. For the number of singular values bigger than the Rank of the matrix. As the number of singular value increases, the reconstruction effect remains stable because singular values after the 10th are quite small so there is little information in those singular components of the matrix. So there is no improvements for reconstruction using k th singular components (k>10).  

## 1.4 Matrix Factorization with Regularization

#### Generate Matrix

In [26]:
# Calculate the total number of elements and 30% of that
total_elements = N * M

num_missing = int(0.8 * total_elements)

# Randomly select indices to set as missing
missing_indices = np.unravel_index(
    np.random.choice(total_elements, num_missing, replace=False), (N, M)
)

# Create a copy of the original matrix and set the missing values to nan

R_missing = np.copy(R_1)

R_missing[missing_indices] = np.nan

In [27]:
np.isnan(R_missing).sum()

np.int64(4000)

#### Rank-one factorization using regularization



In [28]:
# The result of Rank-one factorization using regularization

# set regularization to 0.1 as an example

U_factorized,V_factorized = SGD_factorization(0.01,1e-5, 1000, R_missing,False)

R_reconstructed_missing = U_factorized @ V_factorized.T

RMSE_missing = np.sqrt(np.mean((R_1 - R_reconstructed_missing)**2))

print("R_reconstrcuion", R_reconstructed_missing)

print("R_original",R_1)

print("RMSE of recovered matrix:", RMSE_missing)


R_reconstrcuion [[0.04518783 0.25290654 0.11041299 ... 0.19522499 0.03464141 0.13322169]
 [0.05949556 0.3329838  0.14537282 ... 0.25703866 0.04560984 0.17540339]
 [0.07381104 0.41310444 0.1803516  ... 0.31888582 0.05658422 0.21760794]
 ...
 [0.07660827 0.42875993 0.18718641 ... 0.33097069 0.0587286  0.22585466]
 [0.09760982 0.5463011  0.2385021  ... 0.42170371 0.07482858 0.28777094]
 [0.0197984  0.11080736 0.04837586 ... 0.08553502 0.01517763 0.05836916]]
R_original [[0.04518875 0.25291064 0.11040776 ... 0.19522902 0.03464269 0.1332241 ]
 [0.05949741 0.33299283 0.14536751 ... 0.25704677 0.04561203 0.17540847]
 [0.07381383 0.41311843 0.18034622 ... 0.31889804 0.05658732 0.21761572]
 ...
 [0.07661065 0.42877155 0.18717957 ... 0.33098113 0.05873142 0.22586121]
 [0.09761214 0.5463119  0.23849163 ... 0.42171392 0.07483163 0.28777718]
 [0.01979887 0.11080956 0.04837374 ... 0.08553709 0.01517825 0.05837043]]
RMSE of recovered matrix: 3.285609057185665e-05


#### Discussion the selection of lamda on the reconstruction accuracy

In [32]:
lamda_all= [1e-7* i for i in range(0,11)]

for lamda in lamda_all:
    
    U_factorized,V_factorized = SGD_factorization(0.01, lamda, 1000, R_missing,False)

    R_reconstructed_missing = U_factorized @ V_factorized.T

    RMSE_missing = np.sqrt(np.mean((R_1 - R_reconstructed_missing)**2))

    print("******* Regularization:", lamda ,"********")
    print("RMSE of recovered matrix:", RMSE_missing)

    

******* Regularization: 0.0 ********
RMSE of recovered matrix: 1.903031548464582e-05
******* Regularization: 1e-07 ********
RMSE of recovered matrix: 1.2540066835041894e-05
******* Regularization: 2e-07 ********
RMSE of recovered matrix: 5.130881445049577e-06
******* Regularization: 3e-07 ********
RMSE of recovered matrix: 5.496409683986112e-07
******* Regularization: 4e-07 ********
RMSE of recovered matrix: 5.344890136547401e-06
******* Regularization: 5e-07 ********
RMSE of recovered matrix: 1.0730243046580843e-05
******* Regularization: 6e-07 ********
RMSE of recovered matrix: 2.1021034504679266e-05
******* Regularization: 7e-07 ********
RMSE of recovered matrix: 9.258896347500448e-06
******* Regularization: 8e-07 ********
RMSE of recovered matrix: 9.1804264710524e-06
******* Regularization: 9e-07 ********
RMSE of recovered matrix: 6.120092060473727e-06
******* Regularization: 1e-06 ********
RMSE of recovered matrix: 3.3281929747496266e-06


**Discussion**:  
I select lambda from 1e-7 to 1e-6.  
1. When the lambda is small such as 1e-7 and 2e-7, the regularization term has minimal effect, allowing U and V to fit the observed entries of R more closely. This increases the risk of overfitting. When the lambda is below 3e-7, the effect of reconstruction with regularization is similar to the effect of no regularization reconstruction.  
2. When the lambda is moderate such as 3e-7, the regularization term could balance the generalization ability of learning to the observed entries and fittiing performance on the observed entries. So the RMSE is smaller than RMSE without regularization.  
3. When the lambda is large such as bigger than 4e-7, the generalization ability arises but the regularization term dominates the objective function, causing the values in U and V to be smaller in magnitude. So the factorization suffers a underfitting and impact the overall effect of missing value imputation and factorization reconstruction.

#### Calculate and compare the RMSE

I compared the RMSE of recovered matrix, U and V between using regularization and no regularization. We use the regularization parameter as 3e-7. 

In [33]:
# Calculate the RMSE of the recovere matrix

U_factorized,V_factorized = SGD_factorization(0.01,3e-7, 1000, R_missing,False)

R_reconstructed_missing = U_factorized @ V_factorized.T

RMSE_missing = np.sqrt(np.mean((R_1 - R_reconstructed_missing)**2))

print("RMSE of the recovered matrix with regulrization:", RMSE_missing)

print("RMSE of the U with regularization", np.sqrt(np.mean((u - U_factorized)**2)))

print("RMSE of the V with regularization", np.sqrt(np.mean((v - V_factorized)**2)))

RMSE of the recovered matrix with regulrization: 2.1122483841265654e-06
RMSE of the U with regularization 0.003884774457684544
RMSE of the V with regularization 0.0040542834453287544


In [37]:
U_factorized_noreg,V_factorized_noreg = SGD_factorization(0.01,0, 1000, R_missing,False)

R_reconstructed_missing_noreg = U_factorized_noreg @ V_factorized_noreg.T

RMSE_missing = np.sqrt(np.mean((R_1 - R_reconstructed_missing_noreg)**2))

print("RMSE of the recovered matrix with no regularization:", RMSE_missing)

print("RMSE of the U with no regularization", np.sqrt(np.mean((u - U_factorized_noreg)**2)))

print("RMSE of the V with no regularization", np.sqrt(np.mean((v - V_factorized_noreg)**2)))

RMSE of the recovered matrix with no regularization: 7.29834679081841e-06
RMSE of the U with no regularization 0.00865926886323159
RMSE of the V with no regularization 0.008830485668457294


**Discussion**:  
1. Regularization effect on the recoverd matrix. For the moderate regularization hyperparameter, the RMSE of regularization on the recovered matrix is lower than with no regularization. It shows that moderate regularization can fit the observed data well with good generalizing ability for the reconstruction on matrix.  
2. Regularization effect on the recoverd singular vector. For the moderate regularization hyperparemeter, the RMSE of regularization on the recovered singular vector is smaller than with no regularization. I shows that moderate regularization can fit the observed data well though it constrains the magnitude of the recovered singular vector. It predicts the singular vectors well and as a result, the multiplication of those singular vectors predicts the overall matrix well.

# Part2 

## Load Data

In [39]:
import pandas as pd
import numpy as np
ratings_list = [i.strip().split("::") for i in open('ratings.dat', 'r').readlines()]
users_list = [i.strip().split("::") for i in open('users.dat', 'r').readlines()]
movies_list = [i.strip().split("::") for i in open('movies.dat', 'r', encoding="ISO-8859-1").readlines()]
ratings = np.array(ratings_list)
users = np.array(users_list)
movies = np.array(movies_list)
ratings_df = pd.DataFrame(ratings_list, columns = ['UserID', 'MovieID','Rating', 'Timestamp'])
movies_df = pd.DataFrame(movies_list, columns = ['MovieID', 'Title', 'Genres'])
movies_df['MovieID'] = movies_df['MovieID'].apply(pd.to_numeric)

In [40]:
ratings_df.head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [41]:
movies_df.head()

Unnamed: 0,MovieID,Title,Genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [42]:
R_df = ratings_df.pivot(index = 'UserID', columns ='MovieID', values = 'Rating').fillna(0)
R_df.head()

MovieID,1,10,100,1000,1002,1003,1004,1005,1006,1007,...,99,990,991,992,993,994,996,997,998,999
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
100,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1000,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1001,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## 2.1 Recover rating matrix

#### Data Processing

In [43]:
# Step 1: Create a dictionary for quick lookup of actual ratings
ratings_dict = {(row['UserID'], row['MovieID']): row['Rating'] for _, row in ratings_df.iterrows()}

# Step 2: Define a function to replace 0s in R_df with values from ratings_dict if they exist
def impute_rating(user_id, movie_id, rating):
    # Check if the current rating is 0 and if there is an actual rating in ratings_dict
    if rating == 0:
        return ratings_dict.get((user_id, movie_id), rating)
    return rating

# Step 3: Apply the impute function to each element in R_df
R_df2 = R_df.apply(lambda row: [impute_rating(row.name, col, row[col]) for col in R_df.columns], axis=1, result_type='broadcast')

# Convert R_df back to a DataFrame with the correct columns
R_df2 = pd.DataFrame(R_df2, index=R_df.index, columns=R_df.columns)

# Display the first few rows of the imputed R_df
R_df2.head()


MovieID,1,10,100,1000,1002,1003,1004,1005,1006,1007,...,99,990,991,992,993,994,996,997,998,999
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
100,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1000,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1001,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
