In [1]:
import pandas as pd
import numpy as np
import os
import sys
import sklearn
import missingno as msno

In [2]:
raw_df = pd.read_csv('./mimic_cut_3d.csv').set_index('stay_id').drop(columns=['Unnamed: 0'])

In [3]:
raw_df.head(n=10)

Unnamed: 0_level_0,creatinine,fraction_inspired_oxygen,lactate,urine_output,diastolic_blood_pressure,systolic_blood_pressure,gcs,partial_pressure_of_oxygen,heart_rate,temperature,respiratory_rate
stay_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
33751102.0,1.15,0.88,6.14,280.0,57.058333,111.058333,15.0,98.5625,112.104167,36.303175,25.824811
33751102.0,1.15,0.833333,6.65,257.142857,56.162963,109.051852,15.0,97.714286,110.759259,36.465278,22.955387
33751102.0,1.15,0.833333,7.5,232.5,55.746667,107.546667,15.0,103.166667,109.683333,36.558025,23.659848
33751102.0,1.166667,0.833333,8.2,232.5,55.587879,106.951515,15.0,103.166667,108.712121,36.558025,24.236226
33751102.0,1.166667,0.8,8.2,231.666667,55.538889,107.288889,15.0,103.166667,107.486111,36.572222,24.71654
33751102.0,1.15,0.766667,8.2,219.0,54.319444,106.736111,15.0,108.6,105.569444,36.719444,25.340278
35520609.0,1.575,0.5,1.966667,55.181818,76.717172,120.684343,15.0,62.75,130.594017,38.074074,22.940171
35520609.0,1.633333,0.533333,1.7,53.7,76.194444,120.638889,15.0,72.25,129.388889,37.977778,23.388889
35520609.0,1.633333,0.533333,1.7,57.2,76.833333,122.083333,15.0,72.25,128.416667,37.977778,23.833333
35520609.0,1.633333,0.533333,1.7,59.7,74.5,119.833333,15.0,69.333333,127.333333,38.018519,23.75


In [4]:
raw_df.columns

Index(['creatinine', 'fraction_inspired_oxygen', 'lactate', 'urine_output',
       'diastolic_blood_pressure', 'systolic_blood_pressure', 'gcs',
       'partial_pressure_of_oxygen', 'heart_rate', 'temperature',
       'respiratory_rate'],
      dtype='object')

In [5]:
def mask_random_data(data, mask_fraction=0.1):
    """
    Randomly masks a given fraction of the data in a DataFrame.

    :param data: Pandas DataFrame to be masked.
    :param mask_fraction: Fraction of the data to be masked. Default is 0.1 (10%).
    :return: DataFrame with masked data.
    """
    # Flatten the DataFrame to work with individual elements
    flattened_data = data.to_numpy().flatten()

    # Calculate the number of elements to mask
    num_to_mask = int(len(flattened_data) * mask_fraction)

    # Randomly select indices to mask
    mask_indices = np.random.choice(len(flattened_data), num_to_mask, replace=False)

    # Mask the selected indices
    for idx in mask_indices:
        flattened_data[idx] = np.nan

    # Reshape the data back to its original shape and create a new DataFrame
    return pd.DataFrame(flattened_data.reshape(data.shape), columns=data.columns)

In [6]:
#mu, std = raw_df.mean(axis=0).values, raw_df.std(axis=0).values
#normalized_df = (raw_df - mu)/std
#masked_df = mask_random_data(normalized_df)
#masked_df.head(n=30)

In [7]:
# Normalize to [0, 1] range
df_min, df_max = raw_df.min(axis=0), raw_df.max(axis=0)
normalized_df = (raw_df - df_min)/(df_max - df_min)

In [8]:
#!pip install scikit-learn==1.3.2

In [9]:
from sklearn.decomposition import NMF, non_negative_factorization
import sklearn
print(sklearn.__version__)
def NMF_impute_per_patient(masked_df):
    imputed_matrix = np.array([]).reshape(0, 11)
    for i in range(len(masked_df) // 6):
        patient_df = masked_df.iloc[i:i+6]
        old_patient_matrix = patient_df.to_numpy()

        na_indices = patient_df.isna().to_numpy()
        patient_df = patient_df.fillna(method='ffill').fillna(patient_df.mean())
        patient_matrix = patient_df.to_numpy()

        W, H, n_iter = non_negative_factorization(patient_matrix, init='random', random_state=0, max_iter=500, tol=1e-4, alpha_W=0.5)
        #print(f'Number of iterations: {n_iter}')
        #W = model.fit_transform(patient_matrix)
        #H = model.components_
        pred = W @ H
        old_patient_matrix[na_indices] = pred[na_indices]
        imputed_matrix = np.concatenate((imputed_matrix, old_patient_matrix), axis=0)
        
    return imputed_matrix

1.3.2


In [10]:
masked_df = mask_random_data(normalized_df)
imputed_matrix = NMF_impute_per_patient(masked_df)

In [11]:
imputed_matrix.shape

(3066, 11)

In [12]:
def RMSD_global(y, pred, na_indices):
    results = []
    assert y.shape == pred.shape
    assert y.shape == na_indices.shape
    for i in range(y.shape[1]):
        y_i, pred_i = y[:, i][na_indices[:, i]], pred[:, i][na_indices[:, i]]
        var = np.square(y_i - pred_i) 
        rmsd = np.sum(var) / y_i.shape[0]
        results.append(rmsd)
    return np.array(results)



def RMSD_local(y, pred, na_indices):
    results = []
    assert y.shape == pred.shape
    assert y.shape == na_indices.shape
    for i in range(y.shape[1]):
        y_i, pred_i = y[:, i][na_indices[:, i]], pred[:, i][na_indices[:, i]]
        diff = (y_i - pred_i) / (np.max(y_i) - np.min(y_i))
        var = np.square(diff) 
        rmsd = np.sum(var) / y_i.shape[0]
        results.append(rmsd)
    return np.array(results)
        
        

In [13]:
na_indices = masked_df.isna().to_numpy()
y = normalized_df.to_numpy()
rmsd_global = RMSD_global(y, imputed_matrix, na_indices)

In [14]:
na_indices = masked_df.isna().to_numpy()
y = normalized_df.to_numpy()
rmsd_local = RMSD_local(y, imputed_matrix, na_indices)

In [15]:
print(np.mean(rmsd_global))
print(np.mean(rmsd_local))

0.060897377136205535
0.08398389835057431
