In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [8]:
import numpy as np
import pandas as pd

%matplotlib inline 
# %config InlineBackend.figure_format = 'retina' ## This is preferable for retina display. 

import warnings

# Silence every warning category for the rest of the session so library
# warnings do not clutter the cell outputs.
warnings.filterwarnings(action="ignore")

In [9]:
# Locations of the two CSV files: X is later used as the model input table
# and y as the target table.
X_PATH, Y_PATH = "./data/Train.csv", "./data/y.csv"

# Read both files with the default CSV parser settings.
X, y = (pd.read_csv(csv_path) for csv_path in (X_PATH, Y_PATH))

# Print the number of rows in X, then preview its first rows.
print(X.shape[0])
X.head()

4000


Unnamed: 0,ALP_first,ALP_last,ALT_first,ALT_last,AST_first,AST_last,Age,Albumin_first,Albumin_last,BUN_first,...,TroponinT_first,TroponinT_last,UrineOutputSum,WBC_first,WBC_last,Weight,Weight_first,Weight_last,pH_first,pH_last
0,,,,,,,54.0,,,13.0,...,,,,11.2,9.4,,,,,
1,,,,,,,76.0,,,16.0,...,,,5.0,7.4,13.3,76.0,80.6,81.6,7.45,7.37
2,127.0,105.0,91.0,75.0,235.0,164.0,44.0,2.7,2.3,8.0,...,,,14.0,4.2,6.2,56.7,56.7,56.7,7.51,7.47
3,105.0,105.0,12.0,12.0,15.0,15.0,68.0,4.4,4.4,23.0,...,,,,11.5,7.9,84.6,84.6,84.6,,
4,,,,,,,88.0,3.3,3.3,45.0,...,,,,3.8,4.8,,,,,


In [10]:
# Tally how many rows carry each distinct label value in y; this shows how
# balanced the target is.
y.value_counts()

In-hospital_death
0                    3446
1                     554
dtype: int64

### Adding the missing ICUType column (MICU)

In [14]:
# A row is flagged MICU (1.0) exactly when none of the three existing
# ICU-type columns is truthy for it; otherwise it gets 0.0.
other_icu_flags = X[["CCU", "CSRU", "SICU"]].astype(bool)
X["MICU"] = (~other_icu_flags.any(axis=1)).astype(float)

In [15]:
# Number of rows equal to 1 in each ICU-type column, in CCU, CSRU, MICU, SICU order.
tuple(X[unit].eq(1).sum() for unit in ("CCU", "CSRU", "MICU", "SICU"))

(577, 874, 1481, 1068)

### Train-Test split

In [16]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. The fixed seed makes the split (and
# every metric computed from it) reproducible across kernel restarts, and
# stratifying on y keeps the rarer label equally represented in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, random_state=42, stratify=y
)

## Print Metrics Function

In [17]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, auc, roc_curve, matthews_corrcoef

def print_metrics(y_true, y_pred):
    """Print the confusion matrix and summary scores for binary predictions.

    Args:
        y_true: Ground-truth class labels.
        y_pred: Predicted hard class labels (not probabilities), aligned
            with ``y_true``.

    Returns:
        None. Everything is written to stdout.

    Note:
        The AUC printed here is computed from the hard labels in ``y_pred``,
        so it describes a single operating point rather than the ranking
        quality of predicted probabilities.
    """
    conf_matrix = confusion_matrix(y_true, y_pred)
    if conf_matrix.shape == (1, 1):
        # Only one class occurs across both inputs, so the matrix collapsed
        # to 1x1 and cannot be unpacked into four cells. Rebuild it with an
        # explicit 0/1 label order to get the full 2x2 layout.
        conf_matrix = confusion_matrix(y_true, y_pred, labels=[0, 1])
    tn, fp, fn, tp = conf_matrix.ravel()
    fpr, tpr, _ = roc_curve(y_true, y_pred)
    mcc = matthews_corrcoef(y_true, y_pred)

    def _safe_ratio(numerator, denominator):
        # An undefined ratio (zero denominator, e.g. no positive predictions)
        # is reported as 0.0 instead of dividing by zero.
        return numerator / denominator if denominator else 0.0

    print("Confusion Matrix")
    print(f"{conf_matrix}\n")
    print(f"Accuracy: {_safe_ratio(tp + tn, tp + tn + fn + fp)}")
    print(f"Precision: {_safe_ratio(tp, tp + fp)}")
    print(f"Recall: {_safe_ratio(tp, tp + fn)}")
    print(f"AUC score: {auc(fpr, tpr)}")
    print(f"MCC score: {mcc}")
    print(f"F1-Score: {_safe_ratio(tp, tp + (fp + fn) / 2)}\n")

# Preprocessing

In [21]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

def preprocess_data(data, drop_threshold=0.5, reference=None):
    """Drop sparse columns, then median-impute and standardize the rest.

    Args:
        data: DataFrame to transform. It is not modified; a new frame is
            returned.
        drop_threshold: Columns whose fraction of missing values is strictly
            greater than this are dropped.
        reference: Optional DataFrame with the same columns as ``data``.
            When given, the columns to drop, the imputation medians and the
            scaling statistics are all learned from ``reference`` and then
            applied to ``data``. When omitted they are learned from ``data``
            itself.

    Returns:
        A new DataFrame without the dropped columns, whose non-object
        columns have been imputed and standardized.

    Raises:
        ValueError: If ``reference`` and ``data`` do not share the same
            columns, if missing values remain after imputation, or if the
            number of processed columns does not match the selection.
    """
    fit_frame = data if reference is None else reference
    if set(fit_frame.columns) != set(data.columns):
        raise ValueError("`reference` must have the same columns as `data`.")

    # Drop columns with nan ratio > threshold. The decision is taken on the
    # fitting frame so that every frame processed against the same reference
    # ends up with the same set of columns.
    nan_counts = fit_frame.isna().sum()
    cols_to_drop = fit_frame.columns[nan_counts / len(fit_frame) > drop_threshold]
    data = data.drop(columns=cols_to_drop)
    fit_frame = fit_frame.drop(columns=cols_to_drop)

    # Select numeric columns (everything that is not object dtype)
    numeric_columns = fit_frame.select_dtypes(exclude=['object']).columns

    # Impute missing values with the per-column median of the fitting frame
    imputer = SimpleImputer(strategy="median")
    fit_imputed = imputer.fit_transform(fit_frame[numeric_columns])
    if reference is None:
        data_imputed = fit_imputed
    else:
        data_imputed = imputer.transform(data[numeric_columns])
    data[numeric_columns] = data_imputed

    # Check for any remaining missing values after imputation
    if data[numeric_columns].isnull().any().any():
        raise ValueError("There are still missing values after imputation.")

    # Standardize numeric columns using statistics of the imputed fitting frame
    scaler = StandardScaler().fit(fit_imputed)
    data[numeric_columns] = scaler.transform(data_imputed)

    # Check if the number of columns after scaling matches the original numeric columns
    if len(numeric_columns) != data[numeric_columns].shape[1]:
        raise ValueError("The number of columns after scaling doesn't match the original numeric columns.")

    return data

# Learn the dropped columns, medians and scaling statistics from the training
# split only and reuse them for the test split. Preprocessing each split on
# its own would leak test-set statistics into the evaluation and could leave
# the two splits with different columns.
X_train_raw = X_train
X_train = preprocess_data(X_train_raw)
X_test = preprocess_data(X_test, reference=X_train_raw)

## Imbalance handling models

In [None]:
from imblearn.ensemble import RUSBoostClassifier, BalancedRandomForestClassifier


brf = BalancedRandomForestClassifier(
    n_estimators=500, random_state=42, sampling_strategy="all", replacement=True,
    bootstrap=True,
)

# y_train / y_test are single-column frames; flatten them to 1-D label arrays
# so the estimator and the metric functions receive plain label vectors.
y_train_labels = y_train.to_numpy().ravel()
y_test_labels = y_test.to_numpy().ravel()

brf.fit(X_train, y_train_labels)
y_pred_prob = brf.predict_proba(X_test)[:, 1]

# Classify as positive when the predicted probability exceeds the threshold.
threshold = 0.45
y_pred = (y_pred_prob > threshold).astype(int)

# Evaluate against the held-out test labels.
print_metrics(y_test_labels, y_pred)

# ROC Curve, built from the predicted probabilities rather than the
# thresholded labels, so its AUC can differ from the one printed above.
fpr, tpr, _ = roc_curve(y_test_labels, y_pred_prob)
roc_auc = auc(fpr, tpr)
plt.figure(figsize=(8, 8))
plt.plot(fpr, tpr, label=f'AUC = {roc_auc:.2f}')
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate (FPR)')
plt.ylabel('True Positive Rate (TPR)')
plt.title('ROC Curve')
plt.legend()
plt.show()
