In [1]:
import numpy as np
import pandas as pd

%matplotlib inline 
# %config InlineBackend.figure_format = 'retina' ## This is preferable for retina display. 

import warnings ## importing warnings library. 
warnings.filterwarnings('ignore') ## Ignore warning

In [2]:
X_PATH = "./data/Train.csv"
Y_PATH = "./data/y.csv"

X = pd.read_csv(X_PATH)
y = pd.read_csv(Y_PATH).values.ravel()

print(len(X))
X.head()

4000


Unnamed: 0,ALP_first,ALP_last,ALT_first,ALT_last,AST_first,AST_last,Age,Albumin_first,Albumin_last,BUN_first,...,TroponinT_first,TroponinT_last,UrineOutputSum,WBC_first,WBC_last,Weight,Weight_first,Weight_last,pH_first,pH_last
0,,,,,,,54.0,,,13.0,...,,,,11.2,9.4,,,,,
1,,,,,,,76.0,,,16.0,...,,,5.0,7.4,13.3,76.0,80.6,81.6,7.45,7.37
2,127.0,105.0,91.0,75.0,235.0,164.0,44.0,2.7,2.3,8.0,...,,,14.0,4.2,6.2,56.7,56.7,56.7,7.51,7.47
3,105.0,105.0,12.0,12.0,15.0,15.0,68.0,4.4,4.4,23.0,...,,,,11.5,7.9,84.6,84.6,84.6,,
4,,,,,,,88.0,3.3,3.3,45.0,...,,,,3.8,4.8,,,,,


### Adding the missing ICUType column

In [3]:
X["MICU"] = (~(X["CCU"].astype(bool) | X["CSRU"].astype(bool) | X["SICU"].astype(bool))).astype(float)

In [4]:
(X["CCU"] == 1).sum(), (X["CSRU"] == 1).sum(), (X["MICU"] == 1).sum(), (X["SICU"] == 1).sum()

(577, 874, 1481, 1068)

### Train-Test split

In [5]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, shuffle = True)

## Print Metrics Function

In [6]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, auc, roc_curve, matthews_corrcoef

def print_metrics(y_true, y_pred):
    conf_matrix = confusion_matrix(y_true, y_pred)
    tn, fp, fn, tp = conf_matrix.ravel()
    fpr, tpr, thresholds = roc_curve(y_true, y_pred)
    mcc = matthews_corrcoef(y_true, y_pred)    

    print(f"Confusion Matrix")
    print(f"{conf_matrix}\n")
    print(f"Accuracy: {(tp + tn)/(tp + tn + fn + fp)}")
    print(f"Precision: {tp / (tp + fp)}")
    print(f"Recall: {tp / (tp + fn)}")
    print(f"AUC score: {auc(fpr, tpr)}")
    print(f"MCC score: {mcc}")
    print(f"F1-Score: {(tp)/(tp+(fp+fn)/2)}\n")

# Preprocessing

## 1. Drop columns with too many missing values

In [7]:
drop_threshold = 0.5

count = X_train.isna().sum()
cols_to_drop = X_train.columns[count / len(X) > drop_threshold]

X_train = X_train.drop(columns=cols_to_drop)
X_test = X_test.drop(columns=cols_to_drop)

## 2. Imputing Missing Values

We implement the ``` get_imputer ``` function to quickly get the desired imputer handling method during training.

In [8]:
from sklearn.impute import SimpleImputer, KNNImputer

def get_imputer(name: str, kwargs = None):
    if name == "median":        
        return SimpleImputer(strategy = "median")
    elif name == "most-frequent":
        return SimpleImputer(strategy = "most-frequent")
    elif name == "knn":
        #TODO kwargs
        return KNNImputer()
    else:
        return None

### Standard Imputing

In [9]:
imputer = get_imputer("median")

X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)

### Imputing based on ```IcuType``` feature

In [None]:
def IcuType_impute(strategy: str, X_train, X_test = None, return_imputers: bool = False, **kwargs):
    imputers = {"CCU": get_imputer(strategy, kwargs), "CSRU": get_imputer(strategy, kwargs), 
                "MICU": get_imputer(strategy, kwargs), "SICU": get_imputer(strategy, kwargs)}
    
    X_train = X_train.copy()
    
    if X_test is not None:
        X_test = X_test.copy()
    
    for type, imputer in imputers.items():
        X_train.loc[X_train[type] == 1, :] = imputer.fit_transform(X_train.loc[X_train[type] == 1, :])
        X_test.loc[X_test[type] == 1, :] = imputer.transform(X_test.loc[X_test[type] == 1, :])
    
    ret_val = X_train
    if X_test is not None:
        ret_val = (ret_val, X_test)
    if return_imputers:
        ret_val = (*ret_val, imputers)    
    
    return ret_val

In [None]:
X_train, X_test = IcuType_impute("median", X_train, X_test)

## 3. Handling Class Imbalance

Let's see how much the data is imbalanced:

In [10]:
print(f"Class1: {np.sum(y == 0)}")
print(f"Class2: {np.sum(y == 1)}")

Class1: 3446
Class2: 554


We implement the ``` get_imbalance_handler ``` function to quickly get the desired imbalance handling method during training.

In [13]:
from Imbalanced_learn.over_sampling import RandomOverSampler, SMOTE, BorderlineSMOTE

def get_imbalance_handler(name: str, random_state: int = 42):
    if name == "RandomOverSampler":
        return RandomOverSampler(random_state=random_state)
    elif name == "SMOTE":
        return SMOTE(random_state=42)
    elif name == "BorderlineSMOTE":
        return BorderlineSMOTE(random_state=random_state)
    else:
        return None

ModuleNotFoundError: No module named 'Imbalanced_learn'

In [None]:
imbalance_handler = get_imbalance_handler("BorderlineSMOTE")

X_train, y_train = imbalance_handler.fit_resample(X_train, y_train)

Let's see the class counts now

In [None]:
print(f"Class1: {np.sum(y == 0)}")
print(f"Class2: {np.sum(y == 1)}")

## 4. Removing Outliers

Taken from https://www.kaggle.com/code/jonaspalucibarbosa/removing-outliers-within-a-pipeline

In [None]:
def CustomSampler_IQR (X, y):
    
    features = X.columns
    df = X.copy()
    df['Outcome'] = y
    
    indices = [x for x in df.index]    
    out_indexlist = []
        
    for col in features:
       
        #Using nanpercentile instead of percentile because of nan values
        Q1 = np.nanpercentile(df[col], 25.)
        Q3 = np.nanpercentile(df[col], 75.)
        
        cut_off = (Q3 - Q1) * 1.5
        upper, lower = Q3 + cut_off, Q1 - cut_off
                
        outliers_index = df[col][(df[col] < lower) | (df[col] > upper)].index.tolist()
        outliers = df[col][(df[col] < lower) | (df[col] > upper)].values        
        out_indexlist.extend(outliers_index)
        
    #using set to remove duplicates
    out_indexlist = list(set(out_indexlist))
    
    clean_data = np.setdiff1d(indices,out_indexlist)

    return X.loc[clean_data].values, y[clean_data]

## 5. Feature Scaling

We implement the ``` get_scaler ``` function to quickly get the desired feature scaling method during training.

In [None]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

def get_scaler(name: str):
    if name == "MinMaxScaler":
        return MinMaxScaler()
    elif name == "StandardScaler":
        return StandardScaler()
    else:
        return None

In [None]:
scaler = get_scaler("StandardScaler")

X_train = scaler.fit_transform(X_train) 
X_test = scaler.transform(X_test)

# AdaBoost Model

In [None]:
from sklearn.ensemble import AdaBoostClassifier

## Evaluation on Test set

In [None]:
clf = AdaBoostClassifier(n_estimators=300, algorithm="SAMME.R", random_state = 42)
clf.fit(X_train, y_train)

y_train_pred = clf.predict(X_train)
y_test_pred = clf.predict(X_test)

print("Train Set Results: ")
print_metrics(y_train, y_train_pred)

print("Test Set Results: ")
print_metrics(y_test, y_test_pred)