# Preparing the data for model input

In [1]:
# DONE : Imports
from _Setup import *

In [3]:
# Directory holding the raw train/test CSVs. Set the SEPSIS_DATA_DIR environment
# variable to point somewhere else; the fallback is the original local path, so
# the notebook behaves exactly as before when the variable is unset.
import os

RAW_DATA_DIR = os.environ.get(
    "SEPSIS_DATA_DIR",
    r"C:\Users\Marin\OneDrive\Documents\Documents\Angelo\Important\Homework\R code",
)

# low_memory=False makes pandas read each file in one pass instead of chunk by
# chunk, which avoids mixed-dtype columns on large files.
train_data = pd.read_csv(os.path.join(RAW_DATA_DIR, "train.csv"), low_memory=False)
test_data = pd.read_csv(os.path.join(RAW_DATA_DIR, "test.csv"), low_memory=False)

In [5]:
def backward_fill_first_na(df, patient_id_col='patient'):
    """Fill gaps in each patient's first row with that patient's next observed value.

    For every patient, each NaN in the first row (first in DataFrame order) is
    replaced by the first non-null value that the same patient has in that
    column. All other rows are left untouched, so a later per-patient forward
    fill can propagate the seeded values. Columns for which a patient has no
    observation at all stay NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; it is not modified.
    patient_id_col : str, default 'patient'
        Column identifying which patient each row belongs to.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the first row of every patient back-filled.

    Notes
    -----
    Rows whose patient id is missing are returned unchanged. "First row" is
    decided by position in ``df``, not by comparing index labels.
    """
    df_imputed = df.copy()

    patient_ids = df_imputed[patient_id_col]
    # True on the first occurrence of each (non-missing) patient id.
    is_first_row = (patient_ids.notna() & ~patient_ids.duplicated()).to_numpy()

    # GroupBy.first() returns, per patient and column, the first non-null value,
    # which is exactly what a backward fill would place in that patient's first row.
    # One pass over the data replaces the former full-frame scan per patient.
    first_observed = df_imputed.groupby(patient_id_col, sort=False).first()

    for col in first_observed.columns:
        needs_fill = is_first_row & df_imputed[col].isna().to_numpy()
        if not needs_fill.any():
            continue
        fill_values = patient_ids[needs_fill].map(first_observed[col])
        # Positional mask + raw values: no index alignment is involved.
        df_imputed.loc[needs_fill, col] = fill_values.to_numpy()

    return df_imputed

In [None]:
# Seed each patient's first row in the training data via backward_fill_first_na
# (uses the default 'patient' id column).
train_back_fill = backward_fill_first_na(train_data) 

In [None]:
def forward_fill_by_patient(df, patient_id_col='patient', keep_patient_col=False):
    """Forward-fill missing values within each patient's rows.

    Each NaN is replaced by the most recent earlier non-null value of the same
    column for the same patient. Values never cross patient boundaries, and a
    NaN with no earlier observation for that patient stays NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; it is not modified.
    patient_id_col : str, default 'patient'
        Column identifying which patient each row belongs to.
    keep_patient_col : bool, default False
        ``groupby(...).ffill()`` omits the grouping column from its result. With
        the default the returned frame therefore has no ``patient_id_col``
        column (the original behaviour). Pass True to put it back, with the
        columns in the same order as in ``df``.

    Returns
    -------
    pandas.DataFrame
        A new frame, indexed like ``df``, with the forward-filled values.
    """
    # groupby().ffill() already returns a new object, so no defensive copy is needed.
    filled = df.groupby(patient_id_col).ffill()

    if keep_patient_col:
        filled[patient_id_col] = df[patient_id_col]
        filled = filled[list(df.columns)]

    return filled

In [None]:
# Forward-fill within each patient, starting from the back-filled training frame.
train_filled = forward_fill_by_patient(train_back_fill)

## Note on backward_fill function:
This code was run outside of this notebook; its results are imported here through the `tensor_decomp_pre_knn.pkl` file.

# Using K Means Imputation

In [31]:
# DONE : Apply initial imputation to train data
# train_data_imputed = backward_fill_first_na(train_data)
# train_data_imputed.head(5)

In [4]:
# TODO : Separate Train/Test
# Load the pre-imputed training frame from the pickle path held in
# tensorDecompTrainFilePickle (not defined in the visible cells; presumably it
# comes from the _Setup star import - confirm).
# NOTE(review): read_pickle can execute arbitrary code, so only load trusted files.
pre_imputed_train = pd.read_pickle(tensorDecompTrainFilePickle)
pre_imputed_train.head(5)

Unnamed: 0,time,HR,O2Sat,Temp,SBP,MAP,DBP,Resp,EtCO2,BaseExcess,...,WBC,Fibrinogen,Platelets,Age,Gender,Unit1,Unit2,HospAdmTime,ICULOS,SepsisLabel
0,1.0,78.0,99.5,35.25,108.5,84.5,69.0,12.0,,0.0,...,14.2,,162.0,57.03,1.0,0.0,1.0,-4.25,1.0,0.0
1,2.0,78.0,99.5,35.25,108.5,84.5,69.0,12.0,,0.0,...,14.2,,162.0,57.03,1.0,0.0,1.0,-4.25,2.0,0.0
2,3.0,80.0,99.0,36.2,113.0,87.0,73.0,12.0,,0.0,...,14.2,,162.0,57.03,1.0,0.0,1.0,-4.25,3.0,0.0
3,4.0,79.0,100.0,36.5,112.0,83.0,68.0,12.0,,-2.5,...,14.2,,162.0,57.03,1.0,0.0,1.0,-4.25,4.0,0.0
4,5.0,73.0,100.0,36.5,115.0,80.0,64.0,11.0,,-2.5,...,14.2,,162.0,57.03,1.0,0.0,1.0,-4.25,5.0,0.0


# Scale data
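The data is split into the features and the `SepsisLabel` target; the features are scaled with `StandardScaler` and then recombined with the target. (Note: this is a feature/label split, not a train/test split.)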

In [33]:
# DONE : Track initial null positions to be imputed
# The frames rebuilt from numpy arrays below get a fresh 0..n-1 index. The mask is
# reset to the same positional index so that row i of `indices` always refers to
# row i of `scaled_train`, whatever index the pickled frame carried.
indices = pre_imputed_train.isna().reset_index(drop=True)

# DONE : Create simple imputer to fill NaN values with the column mean
# This is only a provisional fill so that scaling and clustering can run; the
# cells recorded in `indices` are overwritten with cluster means later on.
imputer = SimpleImputer(strategy='mean')
imputed_train = pd.DataFrame(imputer.fit_transform(pre_imputed_train), columns=pre_imputed_train.columns)

# TODO : Separate train/test before scaling
# Split off the target so it is not standardised along with the features.
imputed_train_x_unscaled = imputed_train.drop(columns="SepsisLabel")
train_y = imputed_train['SepsisLabel']

# DONE : Scale the variables
scaler = StandardScaler()
scaled_train_x = pd.DataFrame(scaler.fit_transform(imputed_train_x_unscaled), columns=imputed_train_x_unscaled.columns)

# DONE : Put back together for full dataframe
# Both pieces share the same 0..n-1 index, so the join is row-for-row.
scaled_train = scaled_train_x.join(train_y)
scaled_train.head()

Unnamed: 0,time,HR,O2Sat,Temp,SBP,MAP,DBP,Resp,EtCO2,BaseExcess,...,WBC,Fibrinogen,Platelets,Age,Gender,Unit1,Unit2,HospAdmTime,ICULOS,SepsisLabel
0,-0.868997,-0.366247,0.747819,-2.225465,-0.654227,0.108104,0.372134,-1.285029,6.868005e-15,0.05456,...,0.429484,-1.828037e-15,-0.44147,-0.299265,0.885411,-1.2722,1.2722,0.312758,-0.887791,0.0
1,-0.835162,-0.366247,0.747819,-2.225465,-0.654227,0.108104,0.372134,-1.285029,6.868005e-15,0.05456,...,0.429484,-1.828037e-15,-0.44147,-0.299265,0.885411,-1.2722,1.2722,0.312758,-0.853849,0.0
2,-0.801326,-0.251474,0.58859,-0.91134,-0.460376,0.26025,0.680747,-1.285029,6.868005e-15,0.05456,...,0.429484,-1.828037e-15,-0.44147,-0.299265,0.885411,-1.2722,1.2722,0.312758,-0.819907,0.0
3,-0.767491,-0.30886,0.907047,-0.496353,-0.503454,0.016817,0.294981,-1.285029,6.868005e-15,-0.931047,...,0.429484,-1.828037e-15,-0.44147,-0.299265,0.885411,-1.2722,1.2722,0.312758,-0.785964,0.0
4,-0.733655,-0.653182,0.907047,-0.496353,-0.37422,-0.165758,-0.013631,-1.478158,6.868005e-15,-0.931047,...,0.429484,-1.828037e-15,-0.44147,-0.299265,0.885411,-1.2722,1.2722,0.312758,-0.752022,0.0


In [34]:
# DONE : Use KMeans clustering
k = 8
# A fixed random_state makes the centroid initialisation, and therefore the
# cluster assignments and the values imputed from them, reproducible across
# runs. 100 matches the seed this notebook uses for SMOTE.
k_means_class = KMeans(n_clusters=k, max_iter=300, random_state=100)

# NOTE(review): scaled_train still contains SepsisLabel, so the label takes part
# in the clustering - confirm this is intended.
clusters = k_means_class.fit_predict(scaled_train)

In [35]:
# DONE : Impute the missing values based upon means
# Every cell that was NaN before the provisional mean fill is overwritten with the
# mean of its column within the row's K-Means cluster. Boolean masks replace the
# former per-cell .at lookups (rows x columns Python iterations).

# Positional masks of the originally-missing cells, one array per column.
originally_missing = {col: indices[col].to_numpy() for col in scaled_train.columns}

for cluster in range(k):
    in_cluster = clusters == cluster
    # Computed once per cluster, before any of that cluster's cells are overwritten.
    # NOTE(review): these means include the provisionally mean-filled cells.
    cluster_mean = scaled_train.loc[in_cluster].mean()

    for col in scaled_train.columns:
        to_fill = in_cluster & originally_missing[col]
        if to_fill.any():
            scaled_train.loc[to_fill, col] = cluster_mean[col]

In [36]:
# DONE : Ensure the filtering worked
# Prints the number of NaN cells remaining in each column of scaled_train.
print(scaled_train.isna().sum())

time                0
HR                  0
O2Sat               0
Temp                0
SBP                 0
MAP                 0
DBP                 0
Resp                0
EtCO2               0
BaseExcess          0
HCO3                0
FiO2                0
pH                  0
PaCO2               0
SaO2                0
AST                 0
BUN                 0
Alkalinephos        0
Calcium             0
Chloride            0
Creatinine          0
Bilirubin_direct    0
Glucose             0
Lactate             0
Magnesium           0
Phosphate           0
Potassium           0
Bilirubin_total     0
TroponinI           0
Hct                 0
Hgb                 0
PTT                 0
WBC                 0
Fibrinogen          0
Platelets           0
Age                 0
Gender              0
Unit1               0
Unit2               0
HospAdmTime         0
ICULOS              0
SepsisLabel         0
dtype: int64


# Using Experimental Iterative Imputations

In [21]:
# DONE : Try the experimental imputer
# NOTE: THIS TAKES FOREVER, DON'T RUN UNLESS 100% NECESSARY
#imputer = IterativeImputer(random_state=0)
#train_data_imputed = imputer.fit(train_data)
# train_data_imputed = imputer.transform(train_data)
# train_data_imputed_df = pd.DataFrame(train_data_imputed, columns = imputer.get_feature_names_out())#, columns = imputer.get_feature_names_out())
# train_data_imputed_df.head(5)
# train_data_imputed_df.to_pickle("../Data/imputedData/iterative_imputed.pkl", index = False)

# SMOTE to increase proportion of Sepsis cases

In [38]:
# Separate the target from the features for resampling
y = scaled_train['SepsisLabel']
X = scaled_train.drop('SepsisLabel', axis=1)

# Synthesise additional SepsisLabel == 1 rows with SMOTE (seeded for repeatability)
sm = SMOTE(random_state=100, k_neighbors=5, sampling_strategy=0.3)
X_train_syn, y_train_syn = sm.fit_resample(X, y)

# Summary statistics of the first four columns, before and after resampling
print(scaled_train.iloc[:, :4].describe())
print(X_train_syn.iloc[:, :4].describe())

               time            HR         O2Sat          Temp
count  1.088197e+06  1.088197e+06  1.088197e+06  1.088197e+06
mean  -5.307211e-17 -1.045374e-05  4.793531e-06  6.158938e-04
std    1.000000e+00  1.000003e+00  1.000002e+00  1.000160e+00
min   -8.689972e-01 -3.694685e+00 -2.456949e+01 -4.960314e+01
25%   -5.644774e-01 -7.105684e-01 -3.667800e-01 -6.346818e-01
50%   -2.261220e-01 -7.931298e-02  2.701335e-01 -8.136589e-02
75%    2.475755e-01  6.093293e-01  5.885903e-01  6.102790e-01
max    1.347727e+01  7.266204e+00  9.070471e-01  1.817806e+01
               time            HR         O2Sat          Temp
count  1.389147e+06  1.389147e+06  1.389147e+06  1.389147e+06
mean   2.158691e-01  7.793527e-02 -1.044757e-02  8.165705e-02
std    1.367046e+00  1.029218e+00  1.021742e+00  1.084604e+00
min   -8.689972e-01 -3.694685e+00 -2.456949e+01 -4.960314e+01
25%   -5.634181e-01 -6.531815e-01 -3.667800e-01 -5.793502e-01
50%   -1.584510e-01  3.774459e-03  2.701335e-01  4.313019e-02
75%    3

In [39]:
# Summary statistics of the label before and after SMOTE resampling.
print(scaled_train['SepsisLabel'].describe())
print(y_train_syn.describe())

count    1.088197e+06
mean     1.803166e-02
std      1.330659e-01
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      0.000000e+00
max      1.000000e+00
Name: SepsisLabel, dtype: float64
count    1.389147e+06
mean     2.307690e-01
std      4.213250e-01
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      0.000000e+00
max      1.000000e+00
Name: SepsisLabel, dtype: float64


In [40]:
# DONE : Convert to pickle
# Writes scaled_train (the K-Means-imputed frame) to the path in kmeansTrainFilePickle.
# NOTE(review): the SMOTE output (X_train_syn / y_train_syn) is not saved here -
# confirm that is intended.
scaled_train.to_pickle(kmeansTrainFilePickle)

 # Test Data preparation 


In [None]:
# Preparing test data: apply the same first-row backward fill used on the
# training data (default 'patient' id column).

test_backward = backward_fill_first_na(test_data)