In [1]:
import numpy as np
import pandas as pd

X_PATH = "./data/Train.csv"
Y_PATH = "./data/y.csv"

# Train.csv appears to have been saved together with its row index, which
# pandas reads back as a column named "Unnamed: 0". It is a row counter, not a
# measurement, so drop it before it reaches the imputer, scaler and model.
# errors="ignore" keeps the cell working if the CSV is re-exported without it.
X = pd.read_csv(X_PATH).drop(columns="Unnamed: 0", errors="ignore")
y = pd.read_csv(Y_PATH).values.ravel()

# Features and labels are matched purely by position in the cells below.
assert len(X) == len(y), f"X has {len(X)} rows but y has {len(y)} labels"

print(len(X))
X.head()

4000


Unnamed: 0,ALP_first,ALP_last,ALT_first,ALT_last,AST_first,AST_last,Age,Albumin_first,Albumin_last,BUN_first,...,TroponinT_first,TroponinT_last,UrineOutputSum,WBC_first,WBC_last,Weight,Weight_first,Weight_last,pH_first,pH_last
0,,,,,,,54.0,,,13.0,...,,,,11.2,9.4,,,,,
1,,,,,,,76.0,,,16.0,...,,,5.0,7.4,13.3,76.0,80.6,81.6,7.45,7.37
2,127.0,105.0,91.0,75.0,235.0,164.0,44.0,2.7,2.3,8.0,...,,,14.0,4.2,6.2,56.7,56.7,56.7,7.51,7.47
3,105.0,105.0,12.0,12.0,15.0,15.0,68.0,4.4,4.4,23.0,...,,,,11.5,7.9,84.6,84.6,84.6,,
4,,,,,,,88.0,3.3,3.3,45.0,...,,,,3.8,4.8,,,,,


In [2]:
# Keep the DataFrame's row labels and column names so a DataFrame can be
# rebuilt later, then continue with the bare NumPy array.
X_index, X_cols = X.index, X.columns
X = X.values

## 1. Imputing Missing Values

### Method1: Median

In [3]:
from sklearn.impute import SimpleImputer

# Median imputation; NaN is the (default) marker for a missing entry.
imputer = SimpleImputer(missing_values=np.nan, strategy="median")
# Empty hyperparameter grid for this imputer.
param_grid = dict()

### Method2: KNNImputer

In [None]:
from sklearn.impute import KNNImputer

# KNN imputation with the library defaults written out explicitly.
imputer = KNNImputer(n_neighbors=5, weights="uniform")

## 2. Handling Class Imbalance

Let's see how imbalanced the data is:

In [4]:
# Count the samples carrying label 0 and label 1.
n_label_0 = (y == 0).sum()
n_label_1 = (y == 1).sum()
print(f"Class1: {n_label_0}")
print(f"Class2: {n_label_1}")

Class1: 3446
Class2: 554


### Method1: Oversampling

In [6]:
from imblearn.over_sampling import RandomOverSampler

# Random oversampling with the default "auto" strategy made explicit;
# the seed is fixed for reproducibility.
imbalance_handler = RandomOverSampler(sampling_strategy="auto", random_state=42)

ImportError: cannot import name '_MissingValues' from 'sklearn.utils._param_validation' (C:\ProgramData\anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py)

### Method2: SMOTE

In [None]:
from imblearn.over_sampling import SMOTE

# SMOTE with its default neighbourhood size written out; seed fixed for
# reproducibility.
imbalance_handler = SMOTE(k_neighbors=5, random_state=42)

### Method3: BorderlineSMOTE

In [5]:
from imblearn.over_sampling import BorderlineSMOTE

# BorderlineSMOTE in its default "borderline-1" variant; seed fixed for
# reproducibility.
imbalance_handler = BorderlineSMOTE(kind="borderline-1", random_state=42)

ImportError: cannot import name '_MissingValues' from 'sklearn.utils._param_validation' (C:\ProgramData\anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py)

## 3. Removing Outliers

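Adapted from https://www.kaggle.com/code/jonaspalucibarbosa/removing-outliers-within-a-pipeline: a row is dropped when any of its features lies more than 1.5 × IQR below the first quartile or above the third quartile of that feature.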

In [None]:
def CustomSampler_IQR(X, y):
    """Drop every row that has an IQR outlier in at least one feature.

    For each column of ``X`` the quartiles Q1 and Q3 are computed while
    ignoring NaNs; a value is an outlier when it lies outside
    ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]``. NaN entries never count as
    outliers, so a row with missing values is kept unless one of its
    observed features is out of range.

    Parameters
    ----------
    X : pandas.DataFrame
        Numeric feature table; may contain NaNs. It is not modified.
    y : numpy.ndarray
        Labels aligned with the rows of ``X`` by position.

    Returns
    -------
    tuple
        ``(X_clean, y_clean)``: ``X_clean`` is a NumPy array with the
        retained rows in their original order, and ``y_clean`` is ``y``
        indexed with the same boolean row mask.
    """
    # Track rows by position rather than by index label, so X and y stay
    # aligned even when X does not carry a default 0..n-1 index.
    keep = np.ones(len(X), dtype=bool)

    for col in X.columns:
        values = X[col].to_numpy()

        # nanpercentile instead of percentile because of NaN values.
        Q1 = np.nanpercentile(values, 25.)
        Q3 = np.nanpercentile(values, 75.)

        cut_off = (Q3 - Q1) * 1.5
        lower, upper = Q1 - cut_off, Q3 + cut_off

        # Comparisons against NaN are False, so missing values are never
        # flagged as outliers here.
        keep &= ~((values < lower) | (values > upper))

    return X.loc[keep].values, y[keep]

In [None]:
# Rebuild a labelled DataFrame from the raw array so the sampler can address
# columns by name, then overwrite X and y with the filtered data.
X_frame = pd.DataFrame(X, index=X_index, columns=X_cols)
X, y = CustomSampler_IQR(X_frame, y)

The number of samples left:

In [None]:
# Number of rows currently in X.
X.shape[0]

Class balance:

In [None]:
# Print the number of samples for label 0, then for label 1.
for label in (0, 1):
    print(np.sum(y == label))

## 4. Feature Scaling

### Method1: MinMaxScaler

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scaling to the default [0, 1] range, written out explicitly.
scaler = MinMaxScaler(feature_range=(0, 1))

### Method2: StandardScaler

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardisation with the default centring and unit-variance scaling
# written out explicitly.
scaler = StandardScaler(with_mean=True, with_std=True)

## Baseline Evaluation

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, auc, roc_curve, matthews_corrcoef

# Baseline: 20-fold cross-validated logistic regression. Every preprocessing
# step is fitted on the training fold only and then applied to the test fold.
model = LogisticRegression(penalty = "l2", max_iter = 500)
kf = KFold(n_splits=20, shuffle=True, random_state=42)
scores = np.zeros(kf.get_n_splits(X))

for i, (train_index, test_index) in enumerate(kf.split(X)):
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]

    # Impute and scale BEFORE resampling: the raw features contain NaNs, which
    # the SMOTE-based samplers cannot handle, and their nearest-neighbour
    # search should run on features that share a common scale.
    X_train = scaler.fit_transform(imputer.fit_transform(X_train))
    X_test = scaler.transform(imputer.transform(X_test))
    # Only the training fold is resampled; the test fold keeps its original
    # class distribution.
    X_train, y_train = imbalance_handler.fit_resample(X_train, y_train)
    model.fit(X_train, y_train)

    # Predict once and reuse the result for every metric.
    y_pred = model.predict(X_test)
    # ROC/AUC need a continuous score; with hard 0/1 labels the curve collapses
    # to a single operating point. Column 1 is the probability of label 1.
    y_score = model.predict_proba(X_test)[:, 1]

    # labels=[0, 1] guarantees a 2x2 matrix even if a fold lacks one class.
    conf_matrix = confusion_matrix(y_test, y_pred, labels=[0, 1])
    tn, fp, fn, tp = conf_matrix.ravel()
    fpr, tpr, thresholds = roc_curve(y_test, y_score)
    scores[i] = model.score(X_test, y_test)

    print(f"Fold {i + 1}: \n")
    print(f"Confusion Matrix")
    print(f"{conf_matrix}\n")
    print(f"Accuracy: {(tp + tn)/(tp + tn + fn + fp)}")
    print(f"Precision: {tp / (tp + fp)}")
    print(f"Recall: {tp / (tp + fn)}")
    print(f"AUC score: {auc(fpr, tpr)}")
    print(f"MCC score: {matthews_corrcoef(y_test, y_pred)}")
    print(f"F1-Score: {(tp)/(tp+(fp+fn)/2)}\n")
    print(scores[i])

# Mean accuracy over all folds.
np.mean(scores)

## Hyperparameter Tuning with GridSearchCV

In [None]:
from sklearn.impute import KNNImputer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

scoring_metric = "f1"
# Both searched parameters belong to KNNImputer.
param_grid = {"imputer__n_neighbors" : [1, 3, 7], "imputer__weights": ["uniform", "distance"] }

model = LogisticRegression(penalty = "l2", max_iter = 500)
# Build the KNNImputer here instead of reusing the global `imputer`: that name
# may hold a SimpleImputer (whichever imputation cell ran last), which has no
# n_neighbors/weights parameters and would make the grid search fail.
pipe = Pipeline([("imputer", KNNImputer()), ('scaler', StandardScaler()), ('model', model)])
grid = GridSearchCV(pipe, param_grid, scoring = scoring_metric, verbose = 3)
grid.fit(X, y)