# Preparing the data for model input

In [1]:
# DONE : Imports
from _Setup import *

In [2]:
# Load the raw train/test CSVs from one configurable directory instead of
# repeating an absolute path. The default is the original author's folder;
# set the SEPSIS_DATA_DIR environment variable to run on another machine.
import os
from pathlib import Path

RAW_CSV_DIR = Path(os.environ.get(
    "SEPSIS_DATA_DIR",
    r"C:\Users\Marin\OneDrive\Documents\Documents\Angelo\Important\Homework\R code",
))

# low_memory=False: parse each file in a single pass rather than in chunks
train_data = pd.read_csv(RAW_CSV_DIR / "train.csv", low_memory=False)
test_data = pd.read_csv(RAW_CSV_DIR / "test.csv", low_memory=False)

In [3]:
def forward_fill_by_patient(df, patient_id_col='patient'):
    """Forward-fill missing values separately within each patient's rows.

    Rows are grouped by ``patient_id_col`` and every other column is
    forward-filled in row order inside its group, so a value is never
    carried over from one patient to the next.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is copied, not modified.
    patient_id_col : str, default 'patient'
        Name of the column used as the grouping key.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` whose non-ID columns are forward-filled per group.
        Gaps with no earlier value in the same group remain NaN.
    """
    result = df.copy()

    # The grouped fill yields the non-key columns only, so write it back
    # into exactly those columns and leave the ID column as it was.
    value_columns = result.columns != patient_id_col
    filled_values = result.groupby(patient_id_col).ffill()
    result.loc[:, value_columns] = filled_values

    return result

In [4]:
# Carry each patient's last observed value forward over later gaps in the
# training data, then preview the first rows.
train_forward = forward_fill_by_patient(train_data, patient_id_col='patient')
train_forward.head()

Unnamed: 0,patient,time,HR,O2Sat,Temp,SBP,MAP,DBP,Resp,EtCO2,...,WBC,Fibrinogen,Platelets,Age,Gender,Unit1,Unit2,HospAdmTime,ICULOS,SepsisLabel
0,16426,1,,,,,,,,,...,,,,57.03,1.0,0.0,1.0,-4.25,1.0,0.0
1,16426,2,78.0,99.5,35.25,108.5,84.5,69.0,12.0,,...,14.2,,,57.03,1.0,0.0,1.0,-4.25,2.0,0.0
2,16426,3,80.0,99.0,36.2,113.0,87.0,73.0,12.0,,...,14.2,,,57.03,1.0,0.0,1.0,-4.25,3.0,0.0
3,16426,4,79.0,100.0,36.5,112.0,83.0,68.0,12.0,,...,14.2,,,57.03,1.0,0.0,1.0,-4.25,4.0,0.0
4,16426,5,73.0,100.0,36.5,115.0,80.0,64.0,11.0,,...,14.2,,,57.03,1.0,0.0,1.0,-4.25,5.0,0.0


In [5]:
def backward_fill_by_patient(df, patient_id_col='patient'):
    """Backward-fill missing values separately within each patient's rows.

    Rows are grouped by ``patient_id_col`` and every other column is
    backward-filled in row order inside its group, so a value is never
    pulled in from a different patient.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is copied, not modified.
    patient_id_col : str, default 'patient'
        Name of the column used as the grouping key.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` whose non-ID columns are backward-filled per group.
        Gaps with no later value in the same group remain NaN.
    """
    result = df.copy()

    # The grouped fill yields the non-key columns only, so write it back
    # into exactly those columns and leave the ID column as it was.
    value_columns = result.columns != patient_id_col
    filled_values = result.groupby(patient_id_col).bfill()
    result.loc[:, value_columns] = filled_values

    return result

In [6]:
# Fill the gaps that forward-filling could not reach (no earlier value for
# that patient) from the patient's next observed value, then preview.
pre_imputed_train = backward_fill_by_patient(train_forward, patient_id_col='patient')
pre_imputed_train.head()

Unnamed: 0,patient,time,HR,O2Sat,Temp,SBP,MAP,DBP,Resp,EtCO2,...,WBC,Fibrinogen,Platelets,Age,Gender,Unit1,Unit2,HospAdmTime,ICULOS,SepsisLabel
0,16426,1,78.0,99.5,35.25,108.5,84.5,69.0,12.0,,...,14.2,,162.0,57.03,1.0,0.0,1.0,-4.25,1.0,0.0
1,16426,2,78.0,99.5,35.25,108.5,84.5,69.0,12.0,,...,14.2,,162.0,57.03,1.0,0.0,1.0,-4.25,2.0,0.0
2,16426,3,80.0,99.0,36.2,113.0,87.0,73.0,12.0,,...,14.2,,162.0,57.03,1.0,0.0,1.0,-4.25,3.0,0.0
3,16426,4,79.0,100.0,36.5,112.0,83.0,68.0,12.0,,...,14.2,,162.0,57.03,1.0,0.0,1.0,-4.25,4.0,0.0
4,16426,5,73.0,100.0,36.5,115.0,80.0,64.0,11.0,,...,14.2,,162.0,57.03,1.0,0.0,1.0,-4.25,5.0,0.0


## Note on backward_fill function:
This code was run outside of this notebook; its results are imported through the `tensor_decomp_pre_knn.pkl` file.

# Using K Means Imputation

# Scale data
The columns to be scaled are separated from the excluded columns and the target, scaled with `StandardScaler`, and then recombined into a single DataFrame.

In [7]:
# DONE : Track initial null indexes to be imputed
#indices = pre_imputed_train.isna()

# DONE : Create simple imputer to fill NaN values with the mean
#imputer = SimpleImputer(strategy='mean')
#imputed_train = pd.DataFrame(imputer.fit_transform(pre_imputed_train), columns = pre_imputed_train.columns)

# TODO : Separate train/test before scaling
#imputed_train_x_unscaled = imputed_train.drop("SepsisLabel", axis = 1, inplace = False)
#train_y = imputed_train['SepsisLabel']

# DONE : Scale the variables
#scaler = StandardScaler()
#scaled_train_x = pd.DataFrame(scaler.fit_transform(imputed_train_x_unscaled), columns = imputed_train_x_unscaled.columns)

# DONE : Put back together for full dataframe
#scaled_train = scaled_train_x.join(train_y)
#scaled_train.head()

In [8]:
# Scaling while omitting some variables
# Remember which cells were missing before the mean imputation below
indices = pre_imputed_train.isna()

# Replace the remaining NaN values with each column's mean
imputer = SimpleImputer(strategy='mean')
imputed_train = pd.DataFrame(
    imputer.fit_transform(pre_imputed_train),
    columns=pre_imputed_train.columns,
)

# Columns that are left unscaled (the target, SepsisLabel, is left unscaled too)
exclude_cols = ['patient', 'time', 'Age', 'Gender', 'Unit1', 'Unit2', 'HospAdmTime', 'ICULOS']

# Every column that is neither excluded nor the target gets standardized
cols_to_scale = imputed_train.columns.drop(exclude_cols + ['SepsisLabel'])
scaled_data = imputed_train[cols_to_scale]

scaler = StandardScaler()
scaled_scaled_data = pd.DataFrame(scaler.fit_transform(scaled_data), columns=cols_to_scale)

# Reassemble: scaled columns first, then the unscaled columns, then the target
scaled_train = pd.concat(
    [scaled_scaled_data, imputed_train[exclude_cols + ['SepsisLabel']]],
    axis=1,
)

# Show the first rows of the result
print(scaled_train.head())

         HR     O2Sat      Temp       SBP       MAP       DBP      Resp  \
0 -0.366247  0.747819 -2.225465 -0.654227  0.108104  0.372134 -1.285029   
1 -0.366247  0.747819 -2.225465 -0.654227  0.108104  0.372134 -1.285029   
2 -0.251474  0.588590 -0.911340 -0.460376  0.260250  0.680747 -1.285029   
3 -0.308860  0.907047 -0.496353 -0.503454  0.016817  0.294981 -1.285029   
4 -0.653182  0.907047 -0.496353 -0.374220 -0.165758 -0.013631 -1.478158   

          EtCO2  BaseExcess     HCO3  ...  Platelets  patient  time    Age  \
0  6.868005e-15    0.054560 -0.45495  ...   -0.44147  16426.0   1.0  57.03   
1  6.868005e-15    0.054560 -0.45495  ...   -0.44147  16426.0   2.0  57.03   
2  6.868005e-15    0.054560 -0.45495  ...   -0.44147  16426.0   3.0  57.03   
3  6.868005e-15   -0.931047 -0.45495  ...   -0.44147  16426.0   4.0  57.03   
4  6.868005e-15   -0.931047 -0.45495  ...   -0.44147  16426.0   5.0  57.03   

   Gender  Unit1  Unit2  HospAdmTime  ICULOS  SepsisLabel  
0     1.0    0.0    

In [9]:
# DONE : Use KMeans clustering
# NOTE(review): scaled_train still holds the unscaled 'patient'/'time' columns and
# the SepsisLabel target, so they take part in the distance computation —
# confirm this is intended.
k = 8
# random_state pins the centroid initialisation so reruns give the same
# clusters (same seed value as the SMOTE step); n_init is set explicitly
# because its default differs between scikit-learn releases.
k_means_class = KMeans(n_clusters=k, max_iter=300, n_init=10, random_state=100)

clusters = k_means_class.fit_predict(scaled_train)

In [None]:
# DONE : Impute the missing values based upon means
# For every cluster, overwrite the cells that were originally missing
# (tracked in `indices`) with that cluster's column mean. Boolean masks
# replace the per-cell .at lookups, so the work is done per (cluster, column)
# instead of once per cell in Python.
for cluster in range(k):
    in_cluster = clusters == cluster  # row mask for this cluster
    # Means are taken before any cell of this cluster is overwritten
    cluster_mean = scaled_train.loc[in_cluster].mean()

    for col in scaled_train.columns:
        # Rows of this cluster whose value in `col` was originally NaN
        to_fill = in_cluster & indices[col].to_numpy()
        if to_fill.any():
            scaled_train.loc[to_fill, col] = cluster_mean[col]

In [None]:
# DONE : Ensure the filtering worked
# Prints, for each column of scaled_train, how many values are still NaN.
print(scaled_train.isna().sum())

# Using Experimental Iterative Imputations

In [None]:
# DONE : Try the experimental imputer
# NOTE: THIS TAKES FOREVER, DON'T RUN UNLESS 100% NECESSARY
#imputer = IterativeImputer(random_state=0)
#train_data_imputed = imputer.fit(train_data)
# train_data_imputed = imputer.transform(train_data)
# train_data_imputed_df = pd.DataFrame(train_data_imputed, columns = imputer.get_feature_names_out())#, columns = imputer.get_feature_names_out())
# train_data_imputed_df.head(5)
# train_data_imputed_df.to_pickle("../Data/imputedData/iterative_imputed.pkl", index = False)

# SMOTE to increase proportion of Sepsis cases

In [None]:
# Split the target from the features for resampling
y = scaled_train['SepsisLabel']
X = scaled_train.drop(columns=['SepsisLabel'])

# Resample with SMOTE (sampling_strategy=0.3, 5 neighbours, fixed seed)
sm = SMOTE(sampling_strategy=0.3, k_neighbors=5, random_state=100)
X_train_syn, y_train_syn = sm.fit_resample(X, y)

# Summary statistics of the first four columns before and after resampling
print(scaled_train.iloc[:, :4].describe())
print(X_train_syn.iloc[:, :4].describe())

In [None]:
# Summary statistics of the label before (scaled_train) and after (y_train_syn) resampling
print(scaled_train['SepsisLabel'].describe())
print(y_train_syn.describe())

In [None]:
# Merged data set: the resampled features plus the resampled label.
# assign() returns a new frame, so the label column is never added to
# X_train_syn itself (pd.DataFrame(X_train_syn) may share its data with it).
resampled_train = pd.DataFrame(X_train_syn).assign(SepsisLabel=y_train_syn)

In [None]:
# DONE : Convert to pickle
#resampled_train.to_pickle(kmeansTrainFilePickle)

 # Test Data preparation 


In [None]:
# forward fill
# Convert HR to a numeric dtype first; errors='coerce' turns any value that cannot be parsed into NaN.
test_data['HR'] = pd.to_numeric(test_data['HR'], errors='coerce')

# Forward-fill within each patient, then preview the first rows.
test_filled = forward_fill_by_patient(test_data)
test_filled.head()

In [None]:
# Backward fill: cover the gaps forward-filling could not reach from each
# patient's next observed value, then preview the first rows.
pre_imputed_test = backward_fill_by_patient(test_filled, patient_id_col='patient')
pre_imputed_test.head()

In [None]:
# NOTE(review): the imputer and scaler below are fit on the test data itself
# rather than reusing the ones fit on the training data — confirm this is intended.

# Replace the remaining NaN values with each column's mean
imputer = SimpleImputer(strategy='mean')
imputed_test = pd.DataFrame(
    imputer.fit_transform(pre_imputed_test),
    columns=pre_imputed_test.columns,
)

# Columns that are left unscaled
exclude_cols = ['patient', 'time', 'Age', 'Gender', 'Unit1', 'Unit2', 'HospAdmTime', 'ICULOS']

# Every column that is not excluded gets standardized
cols_to_scale = imputed_test.columns.drop(exclude_cols)
scaled_data = imputed_test[cols_to_scale]

scaler = StandardScaler()
scaled_scaled_data = pd.DataFrame(scaler.fit_transform(scaled_data), columns=cols_to_scale)

# Reassemble: scaled columns first, then the unscaled columns
scaled_test = pd.concat([scaled_scaled_data, imputed_test[exclude_cols]], axis=1)

# Show the first rows of the result
print(scaled_test.head())

In [None]:
# DONE : Use KMeans clustering
# NOTE(review): scaled_test still holds the unscaled 'patient'/'time' columns,
# so they take part in the distance computation — confirm this is intended.
k = 8
# random_state pins the centroid initialisation so reruns give the same
# clusters; n_init is set explicitly because its default differs between
# scikit-learn releases.
k_means_class = KMeans(n_clusters=k, max_iter=300, n_init=10, random_state=100)

clusters = k_means_class.fit_predict(scaled_test)

In [None]:
# DONE : Impute the missing values based upon means
# Mask of the cells that were missing in the *test* data before mean
# imputation. The training mask `indices` describes the training rows and
# must not be used to pick test cells.
test_missing = pre_imputed_test.isna()

# For every cluster, overwrite the originally-missing test cells with that
# cluster's column mean, using boolean masks instead of per-cell lookups.
for cluster in range(k):
    in_cluster = clusters == cluster  # row mask for this cluster
    # Means are taken before any cell of this cluster is overwritten
    cluster_mean = scaled_test.loc[in_cluster].mean()

    for col in scaled_test.columns:
        # Rows of this cluster whose value in `col` was originally NaN
        to_fill = in_cluster & test_missing[col].to_numpy()
        if to_fill.any():
            scaled_test.loc[to_fill, col] = cluster_mean[col]

In [None]:
# DONE : Ensure the filtering worked
# Prints, for each column of scaled_test, how many values are still NaN.
print(scaled_test.isna().sum())

In [None]:
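# DONE : Convert to pickle
# NOTE(review): the path variable below is named for the *train* pickle — confirm the intended test output path before re-enabling this line.
#scaled_test.to_pickle(kmeansTrainFilePickle)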