In [3]:
import pandas as pd
import numpy as np
import os
import sys
import sklearn
import missingno as msno

In [4]:
#!pip install missingno

In [6]:
raw_df = pd.read_csv('./mimic_cut_3d.csv').set_index('stay_id').drop(columns=['Unnamed: 0'])
raw_df.head(n=10)

Unnamed: 0_level_0,creatinine,fraction_inspired_oxygen,lactate,urine_output,diastolic_blood_pressure,systolic_blood_pressure,gcs,partial_pressure_of_oxygen,heart_rate,temperature,respiratory_rate
stay_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
33751102.0,1.15,0.88,6.14,280.0,57.058333,111.058333,15.0,98.5625,112.104167,36.303175,25.824811
33751102.0,1.15,0.833333,6.65,257.142857,56.162963,109.051852,15.0,97.714286,110.759259,36.465278,22.955387
33751102.0,1.15,0.833333,7.5,232.5,55.746667,107.546667,15.0,103.166667,109.683333,36.558025,23.659848
33751102.0,1.166667,0.833333,8.2,232.5,55.587879,106.951515,15.0,103.166667,108.712121,36.558025,24.236226
33751102.0,1.166667,0.8,8.2,231.666667,55.538889,107.288889,15.0,103.166667,107.486111,36.572222,24.71654
33751102.0,1.15,0.766667,8.2,219.0,54.319444,106.736111,15.0,108.6,105.569444,36.719444,25.340278
35520609.0,1.575,0.5,1.966667,55.181818,76.717172,120.684343,15.0,62.75,130.594017,38.074074,22.940171
35520609.0,1.633333,0.533333,1.7,53.7,76.194444,120.638889,15.0,72.25,129.388889,37.977778,23.388889
35520609.0,1.633333,0.533333,1.7,57.2,76.833333,122.083333,15.0,72.25,128.416667,37.977778,23.833333
35520609.0,1.633333,0.533333,1.7,59.7,74.5,119.833333,15.0,69.333333,127.333333,38.018519,23.75


In [7]:
def mask_random_data(data, mask_fraction=0.1):
    """
    Randomly masks a given fraction of the data in a DataFrame.

    :param data: Pandas DataFrame to be masked.
    :param mask_fraction: Fraction of the data to be masked. Default is 0.1 (10%).
    :return: DataFrame with masked data.
    """
    # Flatten the DataFrame to work with individual elements
    flattened_data = data.to_numpy().flatten()

    # Calculate the number of elements to mask
    num_to_mask = int(len(flattened_data) * mask_fraction)

    # Randomly select indices to mask
    mask_indices = np.random.choice(len(flattened_data), num_to_mask, replace=False)

    # Mask the selected indices
    for idx in mask_indices:
        flattened_data[idx] = np.nan

    # Reshape the data back to its original shape and create a new DataFrame
    return pd.DataFrame(flattened_data.reshape(data.shape), columns=data.columns)

In [8]:
mu, std = raw_df.mean(axis=0).values, raw_df.std(axis=0).values
normalized_df = (raw_df - mu)/std
masked_df = mask_random_data(normalized_df)
masked_df.head(n=10)

Unnamed: 0,creatinine,fraction_inspired_oxygen,lactate,urine_output,diastolic_blood_pressure,systolic_blood_pressure,gcs,partial_pressure_of_oxygen,heart_rate,temperature,respiratory_rate
0,,1.732144,0.701248,2.429507,0.108413,0.43236,0.264455,-0.579785,0.955194,-0.728371,0.953613
1,-0.54357,1.463702,0.857793,2.149416,0.005184,0.24989,0.264455,-0.594241,0.886037,-0.541851,0.362902
2,-0.54357,,1.118701,,-0.042812,0.113009,0.264455,-0.501313,0.830711,,0.507925
3,-0.53245,1.463702,1.333567,1.847442,-0.061119,0.058886,0.264455,-0.501313,0.78077,-0.435134,0.626581
4,-0.53245,1.271958,1.333567,1.83723,-0.066767,0.089567,0.264455,,0.717727,-0.418798,0.72546
5,-0.54357,,1.333567,1.682013,-0.207359,0.039297,0.264455,-0.40871,0.61917,-0.249401,
6,-0.260024,-0.453738,-0.57976,-0.325415,2.374915,1.307749,0.264455,-1.190158,1.905966,1.309268,0.35977
7,-0.221106,-0.261994,-0.661613,-0.343573,2.314649,1.303615,0.264455,-1.028244,1.843997,,0.452144
8,-0.221106,-0.261994,-0.661613,-0.300684,2.388307,1.434973,0.264455,-1.028244,1.794004,1.198468,0.54364
9,-0.221106,-0.261994,-0.661613,-0.270049,2.119293,1.230358,0.264455,-1.077955,,1.245345,0.526484


In [14]:
import numpy as np

def svd_imputation(matrix, k=5, max_iter=100, tol=1e-4):
    # Replace NaN values with mean
    matrix = np.nan_to_num(matrix, nan=np.nanmean(matrix))

    for _ in range(max_iter):
        # Perform SVD
        U, S, Vt = np.linalg.svd(matrix, full_matrices=False)

        # Truncate to the top k singular values
        U_k = U[:, :k]
        S_k = np.diag(S[:k])
        Vt_k = Vt[:k, :]

        # Reconstruct matrix with the low-rank approximation
        matrix_approx = np.dot(U_k, np.dot(S_k, Vt_k))

        # Update missing values in the original matrix with the imputed values
        matrix[np.isnan(matrix)] = matrix_approx[np.isnan(matrix)]

        # Check for convergence
        if np.linalg.norm(matrix_approx - matrix) < tol:
            break

    return matrix

# Sample data
#imputed_matrix = svd_imputation(masked_df)

In [None]:
def svd_per_patient():
    

In [15]:
def RMSD_global(y, pred, na_indices):
    results = []
    assert y.shape == pred.shape
    assert y.shape == na_indices.shape
    for i in range(y.shape[1]):
        y_i, pred_i = y[:, i][na_indices[:, i]], pred[:, i][na_indices[:, i]]
        var = np.square(y_i - pred_i) 
        rmsd = np.sum(var) / y_i.shape[0]
        results.append(rmsd)
    return np.array(results)



def RMSD_local(y, pred, na_indices):
    results = []
    assert y.shape == pred.shape
    assert y.shape == na_indices.shape
    for i in range(y.shape[1]):
        y_i, pred_i = y[:, i][na_indices[:, i]], pred[:, i][na_indices[:, i]]
        diff = (y_i - pred_i) / (np.max(y_i) - np.min(y_i))
        var = np.square(diff) 
        rmsd = np.sum(var) / y_i.shape[0]
        results.append(rmsd)
    return np.array(results)
        
        

In [16]:
na_indices = masked_df.isna().to_numpy()
y = normalized_df.to_numpy()
rmsd_global = RMSD_global(y, imputed_matrix, na_indices)
na_indices = masked_df.isna().to_numpy()
y = normalized_df.to_numpy()
rmsd_local = RMSD_local(y, imputed_matrix, na_indices)

In [17]:
print(np.mean(rmsd_global))
print(np.mean(rmsd_local))

0.9872264002758844
0.030115236838752665
