# Preparing the data for model input

In [9]:
# DONE : Imports
from _Setup import *

In [2]:
# DONE : Input Angelo and Pin's Code
def backward_fill_first_na(df, patient_id_col='patient'):
    """Fill each patient's first NaN-containing row from that patient's later rows.

    For every patient (rows sharing a value in ``patient_id_col``) the first
    row holding at least one NaN is located, and each missing cell in that
    row is replaced by the next non-missing value of the same column found
    further down in the same patient's rows. Only that single row is
    touched; later NaN rows are left as they are, and a cell stays NaN when
    the patient has no later valid value for that column.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. It is not modified.
    patient_id_col : str, default 'patient'
        Name of the column that identifies a patient.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the described cells filled.

    Raises
    ------
    KeyError
        If ``patient_id_col`` is not a column of ``df``.
    """
    # Work on a copy so the caller's frame is never modified.
    df_imputed = df.copy()

    # Group the untouched input rather than the frame being written to, so the
    # iteration never observes partially updated data.
    for _, patient_data in df.groupby(patient_id_col):
        rows_with_na = patient_data.isna().any(axis=1)

        # A patient without any missing value needs no work. The explicit
        # check is required: `.index.min()` on an empty index yields NaN
        # (never None), which would otherwise slip through to the slice below.
        if not rows_with_na.any():
            continue

        first_na_index = patient_data.index[rows_with_na.to_numpy()].min()

        # Back-fill from the first NaN row onwards; the first row of the
        # result then carries the next valid observation for each column.
        next_value = patient_data.loc[first_na_index:].bfill().iloc[0]
        df_imputed.loc[first_na_index] = df_imputed.loc[first_na_index].fillna(next_value)

    return df_imputed

In [3]:
# DONE : Import train data
# Read the raw training CSV from the path held in `rawTrainFile` (not defined
# in this notebook — presumably supplied by `from _Setup import *`) and preview
# the first five rows.
train_data = pd.read_csv(rawTrainFile)
train_data.head(5)

Unnamed: 0,patient,time,HR,O2Sat,Temp,SBP,MAP,DBP,Resp,EtCO2,...,WBC,Fibrinogen,Platelets,Age,Gender,Unit1,Unit2,HospAdmTime,ICULOS,SepsisLabel
0,16426,1,,,,,,,,,...,,,,57.03,1.0,0.0,1.0,-4.25,1.0,0.0
1,16426,2,78.0,99.5,35.25,108.5,84.5,69.0,12.0,,...,14.2,,,57.03,1.0,0.0,1.0,-4.25,2.0,0.0
2,16426,3,80.0,99.0,36.2,113.0,87.0,73.0,12.0,,...,,,,57.03,1.0,0.0,1.0,-4.25,3.0,0.0
3,16426,4,79.0,100.0,36.5,112.0,83.0,68.0,12.0,,...,,,,57.03,1.0,0.0,1.0,-4.25,4.0,0.0
4,16426,5,73.0,100.0,36.5,115.0,80.0,64.0,11.0,,...,,,,57.03,1.0,0.0,1.0,-4.25,5.0,0.0


# Using KMeans (cluster-mean) and KNN Imputation

In [4]:
# DONE : Apply initial imputation to train data
# train_data_imputed = backward_fill_first_na(train_data)
# train_data_imputed.head(5)

In [5]:
# TODO : Separate Train/Test
# Load the training CSV referenced by `tensorDecompTrainFile` and preview it.
# NOTE(review): judging by the variable names this is data that was already
# partially imputed upstream — confirm; no train/test split happens in this cell.
pre_imputed_train = pd.read_csv(tensorDecompTrainFile)
pre_imputed_train.head(5)

Unnamed: 0,time,HR,O2Sat,Temp,SBP,MAP,DBP,Resp,EtCO2,BaseExcess,...,WBC,Fibrinogen,Platelets,Age,Gender,Unit1,Unit2,HospAdmTime,ICULOS,SepsisLabel
0,1.0,78.0,99.5,35.25,108.5,84.5,69.0,12.0,,0.0,...,14.2,,162.0,57.03,1.0,0.0,1.0,-4.25,1.0,0.0
1,2.0,78.0,99.5,35.25,108.5,84.5,69.0,12.0,,0.0,...,14.2,,162.0,57.03,1.0,0.0,1.0,-4.25,2.0,0.0
2,3.0,80.0,99.0,36.2,113.0,87.0,73.0,12.0,,0.0,...,14.2,,162.0,57.03,1.0,0.0,1.0,-4.25,3.0,0.0
3,4.0,79.0,100.0,36.5,112.0,83.0,68.0,12.0,,-2.5,...,14.2,,162.0,57.03,1.0,0.0,1.0,-4.25,4.0,0.0
4,5.0,73.0,100.0,36.5,115.0,80.0,64.0,11.0,,-2.5,...,14.2,,162.0,57.03,1.0,0.0,1.0,-4.25,5.0,0.0


In [6]:
# Save the loaded frame as a pickle at the path held in `tensorDecompTrainFilePickle`.
pre_imputed_train.to_pickle(tensorDecompTrainFilePickle)

In [15]:
# DONE : Remember which cells were originally missing; these are the cells
# that get re-imputed per cluster further down.
indices = pre_imputed_train.isna()

# DONE : Create a simple imputer that fills NaN values with the column mean.
# This is only a placeholder fill so that scaling and clustering can run on
# complete data.
imputer = SimpleImputer(strategy='mean')
imputed_train = pd.DataFrame(
    data=imputer.fit_transform(pre_imputed_train),
    columns=pre_imputed_train.columns,
)

# DONE : Scale the variables
# NOTE(review): every column of the frame is scaled, including `SepsisLabel`
# (the saved output shows non-binary values for it) — confirm that
# standardizing the label and clustering on it is intended.
scaler = StandardScaler()
scaled_train = pd.DataFrame(
    data=scaler.fit_transform(imputed_train),
    columns=imputed_train.columns,
)

In [16]:
# DONE : Use KMeans clustering
# Number of clusters used for the cluster-mean imputation below.
k = 8
# random_state is fixed so the cluster assignments — and therefore the imputed
# values derived from them — are reproducible across kernel restarts.
k_means_class = KMeans(n_clusters= k, max_iter=300, random_state=0)

# One cluster label per row of `scaled_train`.
clusters = k_means_class.fit_predict(scaled_train)
# Debugging leftovers (reference an `attributes` name not defined in this notebook):
#print(pre_imputed_train.filter(attributes).isna().sum())
#print(pre_imputed_train[pre_imputed_train["HR"].isna()])

In [20]:
# DONE : Impute the missing values based upon means
# Every cell that was NaN in `pre_imputed_train` (tracked in `indices`) is
# replaced by the mean of that column over the rows sharing its KMeans
# cluster. The means are taken on the standardized frame, which still holds
# the global-mean placeholder in the originally missing cells.
#
# Vectorized replacement for a per-cluster / per-column / per-row Python loop:
# same result, without touching the frame one cell at a time.
# NOTE: re-running this cell recomputes the means from the already-imputed
# frame, so re-run the scaling cell first to start from the same state.

# Per-cluster column means; reindex so row i always corresponds to cluster i.
cluster_means = scaled_train.groupby(clusters).mean().reindex(range(k))

# One row of cluster means per sample, aligned positionally with `scaled_train`.
row_cluster_means = cluster_means.to_numpy()[clusters]

# Originally-missing cells, with columns in the same order as `scaled_train`.
missing_mask = indices[scaled_train.columns].to_numpy()

scaled_train = pd.DataFrame(
    np.where(missing_mask, row_cluster_means, scaled_train.to_numpy()),
    index=scaled_train.index,
    columns=scaled_train.columns,
)

In [23]:
# TODO : Ensure the filtering worked
# Every per-column count printed here should be 0; the assertion stops the
# cell before an incomplete frame gets written to disk.
print(scaled_train.isna().sum())
assert not scaled_train.isna().any().any(), "NaN values remain after KMeans imputation"

# TODO : Upload to csv for ease of use
# index=False keeps the row index out of the file. Without it the index is
# written as an unnamed first column, which shows up as an extra `Unnamed: 0`
# column when the CSV is read back.
scaled_train.to_csv("../Data/imputedData/kmeans_imputed.csv", index=False)

time                0
HR                  0
O2Sat               0
Temp                0
SBP                 0
MAP                 0
DBP                 0
Resp                0
EtCO2               0
BaseExcess          0
HCO3                0
FiO2                0
pH                  0
PaCO2               0
SaO2                0
AST                 0
BUN                 0
Alkalinephos        0
Calcium             0
Chloride            0
Creatinine          0
Bilirubin_direct    0
Glucose             0
Lactate             0
Magnesium           0
Phosphate           0
Potassium           0
Bilirubin_total     0
TroponinI           0
Hct                 0
Hgb                 0
PTT                 0
WBC                 0
Fibrinogen          0
Platelets           0
Age                 0
Gender              0
Unit1               0
Unit2               0
HospAdmTime         0
ICULOS              0
SepsisLabel         0
dtype: int64


In [None]:
# Wrap the KNN-imputed array in a DataFrame, taking the column names from the
# imputer, and preview the first five rows.
# NOTE(review): neither `imputed_data_knn` nor `knnImputer` is assigned in any
# visible cell, so this raises NameError on a fresh kernel unless `_Setup`
# provides them — TODO confirm or remove.
train_data_knn_df = pd.DataFrame(imputed_data_knn, columns = knnImputer.get_feature_names_out())
train_data_knn_df.head(5)

In [None]:
# Write the KNN-imputed frame to CSV without the row index.
# NOTE(review): `imputed_train_knn` is not assigned in any visible cell (the
# preceding cell builds `train_data_knn_df`) — confirm which frame is meant.
imputed_train_knn.to_csv("../Data/imputedData/knn_imputed.csv", index = False)

# Using Experimental Iterative Imputations

In [13]:
# DONE : Try the experimental imputer
# NOTE: THIS TAKES FOREVER, DON'T RUN UNLESS 100% NECESSARY
#imputer = IterativeImputer(random_state=0)
#train_data_imputed = imputer.fit(train_data)

KeyboardInterrupt: 

In [14]:
# Apply the imputer currently bound to `imputer` to the raw training data.
# NOTE(review): the IterativeImputer construction/fit in the cell above is
# commented out, so on a fresh top-to-bottom run `imputer` is the mean
# SimpleImputer fitted on `pre_imputed_train` earlier in the notebook —
# confirm this cell still produces the intended iterative imputation.
train_data_imputed = imputer.transform(train_data)

In [16]:
# Turn the imputer's array output back into a labelled DataFrame, taking the
# column names from the imputer itself, then preview the first five rows.
train_data_imputed_df = pd.DataFrame(
    data=train_data_imputed,
    columns=imputer.get_feature_names_out(),
)
train_data_imputed_df.head()

Unnamed: 0,patient,time,HR,O2Sat,Temp,SBP,MAP,DBP,Resp,EtCO2,...,WBC,Fibrinogen,Platelets,Age,Gender,Unit1,Unit2,HospAdmTime,ICULOS,SepsisLabel
0,16426.0,1.0,84.183402,97.322764,37.092722,122.885858,82.873563,63.662701,17.769149,32.934801,...,12.827441,320.558297,176.572396,57.03,1.0,0.0,1.0,-4.25,1.0,0.0
1,16426.0,2.0,78.0,99.5,35.25,108.5,84.5,69.0,12.0,39.872276,...,14.2,132.82622,104.498612,57.03,1.0,0.0,1.0,-4.25,2.0,0.0
2,16426.0,3.0,80.0,99.0,36.2,113.0,87.0,73.0,12.0,26.633213,...,12.356886,273.348615,181.854264,57.03,1.0,0.0,1.0,-4.25,3.0,0.0
3,16426.0,4.0,79.0,100.0,36.5,112.0,83.0,68.0,12.0,31.926064,...,14.177234,290.367439,170.629748,57.03,1.0,0.0,1.0,-4.25,4.0,0.0
4,16426.0,5.0,73.0,100.0,36.5,115.0,80.0,64.0,11.0,26.165961,...,13.427547,279.265355,177.97198,57.03,1.0,0.0,1.0,-4.25,5.0,0.0


In [25]:
# Save the iteratively imputed frame as a pickle.
# DataFrame.to_pickle() accepts no `index` argument (a pickle always stores the
# whole frame, index included); passing index=False raises TypeError.
train_data_imputed_df.to_pickle("../Data/imputedData/iterative_imputed.pkl")

NameError: name 'train_data_imputed_df' is not defined

In [7]:
# Reload two previously saved imputed training sets from CSV; the paths come
# from `imputedTrainFile` and `kmeansTrainFile` (not defined in this notebook —
# presumably supplied by `_Setup`).
# NOTE(review): the name `iter` shadows Python's built-in `iter()` from here
# on; renaming it also requires updating the later cells that use it.
iter = pd.read_csv(imputedTrainFile)
kmeans = pd.read_csv(kmeansTrainFile)

In [13]:
# Preview the first rows of the frame loaded from `imputedTrainFile`.
iter.head()

Unnamed: 0,patient,time,HR,O2Sat,Temp,SBP,MAP,DBP,Resp,EtCO2,...,WBC,Fibrinogen,Platelets,Age,Gender,Unit1,Unit2,HospAdmTime,ICULOS,SepsisLabel
0,16426.0,1.0,84.183402,97.322764,37.092722,122.885858,82.873563,63.662701,17.769149,32.934801,...,12.827441,320.558297,176.572396,57.03,1.0,0.0,1.0,-4.25,1.0,0.0
1,16426.0,2.0,78.0,99.5,35.25,108.5,84.5,69.0,12.0,39.872276,...,14.2,132.82622,104.498612,57.03,1.0,0.0,1.0,-4.25,2.0,0.0
2,16426.0,3.0,80.0,99.0,36.2,113.0,87.0,73.0,12.0,26.633213,...,12.356886,273.348615,181.854264,57.03,1.0,0.0,1.0,-4.25,3.0,0.0
3,16426.0,4.0,79.0,100.0,36.5,112.0,83.0,68.0,12.0,31.926064,...,14.177234,290.367439,170.629748,57.03,1.0,0.0,1.0,-4.25,4.0,0.0
4,16426.0,5.0,73.0,100.0,36.5,115.0,80.0,64.0,11.0,26.165961,...,13.427547,279.265355,177.97198,57.03,1.0,0.0,1.0,-4.25,5.0,0.0


In [11]:
# Save both reloaded frames as pickle files.
# NOTE(review): the first path comes from the `imputedTrainFilePickle`
# constant while the KMeans path is hard-coded — consider a matching constant.
iter.to_pickle(imputedTrainFilePickle)
kmeans.to_pickle("../Data/imputedData/kmeans_imputed.pkl")

In [12]:
# Preview the first rows of the frame loaded from `kmeansTrainFile`.
kmeans.head()

Unnamed: 0.1,Unnamed: 0,time,HR,O2Sat,Temp,SBP,MAP,DBP,Resp,EtCO2,...,WBC,Fibrinogen,Platelets,Age,Gender,Unit1,Unit2,HospAdmTime,ICULOS,SepsisLabel
0,0,-0.868997,-0.366247,0.747819,-2.225465,-0.654227,0.108104,0.372134,-1.285029,-0.03065,...,0.429484,-0.15979,-0.44147,-0.299265,0.885411,-1.2722,1.2722,0.312758,-0.887791,-0.135509
1,1,-0.835162,-0.366247,0.747819,-2.225465,-0.654227,0.108104,0.372134,-1.285029,-0.03065,...,0.429484,-0.15979,-0.44147,-0.299265,0.885411,-1.2722,1.2722,0.312758,-0.853849,-0.135509
2,2,-0.801326,-0.251474,0.58859,-0.91134,-0.460376,0.26025,0.680747,-1.285029,-0.03065,...,0.429484,-0.15979,-0.44147,-0.299265,0.885411,-1.2722,1.2722,0.312758,-0.819907,-0.135509
3,3,-0.767491,-0.30886,0.907047,-0.496353,-0.503454,0.016817,0.294981,-1.285029,-0.03065,...,0.429484,-0.15979,-0.44147,-0.299265,0.885411,-1.2722,1.2722,0.312758,-0.785964,-0.135509
4,4,-0.733655,-0.653182,0.907047,-0.496353,-0.37422,-0.165758,-0.013631,-1.478158,-0.03065,...,0.429484,-0.15979,-0.44147,-0.299265,0.885411,-1.2722,1.2722,0.312758,-0.752022,-0.135509


In [16]:
# Load the KMeans-imputed pickle stored under the `testImputedData` folder and
# preview its first rows.
# NOTE(review): the preview recorded for this cell is identical, row for row,
# to the training-set KMeans preview above — verify this file really holds
# test data.
# NOTE: unpickling can execute arbitrary code; only load pickle files that were
# produced by this project.
testPickle = pd.read_pickle("../Data/testImputedData/imputedData/kmeans_imputed.pkl")
testPickle.head()

Unnamed: 0.1,Unnamed: 0,time,HR,O2Sat,Temp,SBP,MAP,DBP,Resp,EtCO2,...,WBC,Fibrinogen,Platelets,Age,Gender,Unit1,Unit2,HospAdmTime,ICULOS,SepsisLabel
0,0,-0.868997,-0.366247,0.747819,-2.225465,-0.654227,0.108104,0.372134,-1.285029,-0.03065,...,0.429484,-0.15979,-0.44147,-0.299265,0.885411,-1.2722,1.2722,0.312758,-0.887791,-0.135509
1,1,-0.835162,-0.366247,0.747819,-2.225465,-0.654227,0.108104,0.372134,-1.285029,-0.03065,...,0.429484,-0.15979,-0.44147,-0.299265,0.885411,-1.2722,1.2722,0.312758,-0.853849,-0.135509
2,2,-0.801326,-0.251474,0.58859,-0.91134,-0.460376,0.26025,0.680747,-1.285029,-0.03065,...,0.429484,-0.15979,-0.44147,-0.299265,0.885411,-1.2722,1.2722,0.312758,-0.819907,-0.135509
3,3,-0.767491,-0.30886,0.907047,-0.496353,-0.503454,0.016817,0.294981,-1.285029,-0.03065,...,0.429484,-0.15979,-0.44147,-0.299265,0.885411,-1.2722,1.2722,0.312758,-0.785964,-0.135509
4,4,-0.733655,-0.653182,0.907047,-0.496353,-0.37422,-0.165758,-0.013631,-1.478158,-0.03065,...,0.429484,-0.15979,-0.44147,-0.299265,0.885411,-1.2722,1.2722,0.312758,-0.752022,-0.135509
