In [4]:
from google.colab import drive
import pandas as pd
import numpy as np

# Make Google Drive visible to this Colab runtime so the CSV can be read.
drive.mount('/content/drive')

# Columns kept for modelling. '180_OUTCOME' is later split off as the target;
# every other column is used as a feature.
selected_columns = [
    'BILIRUBIN_SLOPE', 'ALBUMIN_SLOPE', 'SERUM_SODIUM_SLOPE', 'INR_SLOPE',
    'SERUM_CREAT_SLOPE', 'AGE', 'BMI_CALC', 'HGT_CM_CALC', 'INIT_WGT_KG',
    'INIT_ALBUMIN', 'FINAL_ALBUMIN', 'INIT_BILIRUBIN', 'FINAL_BILIRUBIN',
    'INIT_ENCEPH', 'FINAL_ENCEPH', 'INIT_INR', 'FINAL_INR',
    'INIT_SERUM_CREAT', 'FINAL_SERUM_CREAT', 'INIT_SERUM_SODIUM',
    'FINAL_SERUM_SODIUM', 'INIT_MELD_PELD_LAB_SCORE', 'MELD_PELD_LAB_SCORE',
    'DAYSWAIT_CHRON', 'ABO', 'HCC_DIAGNOSIS_TCR', 'GENDER',
    'PREV_AB_SURG_TCR', 'CREAT', 'HBSAG', 'HBV_CORE', 'HBV_DNA', 'HCV_NAT',
    'HCV_SEROLOGY', 'HIV_NAT', 'HIV_SEROLOGY', 'TBILI', '180_OUTCOME',
]

# Read the raw table (first CSV column is the index), then keep only the selected columns.
final_180 = pd.read_csv(
    "/content/drive/MyDrive/Borealis AI/Dataset/final_180 (2).csv",
    index_col=0,
)
final_180 = final_180[selected_columns]

# One-hot encode the non-numeric columns; numeric columns pass through unchanged.
df_dummies = pd.get_dummies(final_180)
df_dummies.info()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
<class 'pandas.core.frame.DataFrame'>
Index: 33120 entries, 0 to 33119
Data columns (total 66 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   BILIRUBIN_SLOPE           33095 non-null  float64
 1   ALBUMIN_SLOPE             32335 non-null  float64
 2   SERUM_SODIUM_SLOPE        33095 non-null  float64
 3   INR_SLOPE                 33095 non-null  float64
 4   SERUM_CREAT_SLOPE         33095 non-null  float64
 5   AGE                       33120 non-null  float64
 6   BMI_CALC                  33113 non-null  float64
 7   HGT_CM_CALC               33120 non-null  float64
 8   INIT_WGT_KG               33120 non-null  float64
 9   INIT_ALBUMIN              33108 non-null  float64
 10  FINAL_ALBUMIN             33117 non-null  float64
 11  INIT_BILIRUBIN            33108 non-null  float

After reading the data file and selecting only the relevant features, we one-hot encode the categorical columns to obtain dummy variables (done in the cell above).

We then split the data into training and test sets and scale both, fitting the scaler on the training set only.

We also use a KNN imputer to fill in missing values in the data.

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
# TBILI is missing for some patients, so keep only the patients for whom it was recorded.
tmp = df_dummies[df_dummies['TBILI'].notna()]
X = tmp.drop(columns=['180_OUTCOME'])
y = tmp['180_OUTCOME']

# Stratify on the outcome so train and test keep the same class ratio.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Standardize BEFORE imputing. KNNImputer picks neighbours by distance, so on
# unscaled data the columns with the largest numeric range would dominate the
# neighbour search. StandardScaler ignores NaNs when fitting and leaves them in
# place when transforming, so the missing entries are still there to impute.
# Both transformers are fitted on the training split only to avoid test leakage.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

imputer = KNNImputer(n_neighbors = 5)
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)

In [6]:
# Show how many training rows fall in each outcome class.
y_train.value_counts()

180_OUTCOME
0    8962
1     438
Name: count, dtype: int64

We also apply SMOTE to the training set to improve the ratio of 1s to 0s in y_train, which is currently heavily imbalanced.

In [7]:
from imblearn.over_sampling import SMOTE
from collections import Counter
from sklearn.utils import shuffle

# Oversample the minority class of the TRAINING data only, up to a
# minority/majority ratio of 0.1. The test split is left untouched.
# NOTE: this cell overwrites X_train / y_train, so re-run the split cell
# before running it a second time.
sm = SMOTE(sampling_strategy = 0.1, random_state=42)
X_train, y_train = sm.fit_resample(X_train, y_train)

# The resampled arrays come back with the synthetic minority rows grouped after
# the original rows. Consumers that slice without shuffling (unshuffled CV
# folds, Keras `validation_split`) would then see almost only synthetic
# positives in their last slice, so shuffle features and labels together.
X_train, y_train = shuffle(X_train, y_train, random_state=42)
print('Resampled dataset shape %s' % Counter(y_train))

Resampled dataset shape Counter({0: 8962, 1: 896})


Now that we have X_train, y_train, X_test and y_test, we can build several machine learning models:
1. Neural networks
2. Logistic Regression
3. Decision Tree
4. Random Forest
5. XGBoost

### 1. Neural Networks


In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.metrics import Precision, Recall, AUC
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.metrics import f1_score

input_dim = X_train.shape[1]
# Give the positive class extra weight in the loss so the network is penalised
# more for missing a positive than for a false alarm.
weights = {0: 1, 1: 20}

# Hold out a shuffled, stratified validation set explicitly.
# Keras' `validation_split` takes the LAST fraction of the arrays without
# shuffling, which after oversampling is not representative of the training data.
# (`train_test_split` is imported in the preprocessing cell.)
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, stratify=y_train, random_state=42)

# Three hidden ReLU layers with dropout; a sigmoid output gives P(outcome == 1).
model = Sequential([
    Dense(256, input_dim=input_dim, activation='relu'),
    Dropout(0.3),
    Dense(128, activation='relu'),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer=Adam(learning_rate=0.0005),
              loss='binary_crossentropy',
              metrics=[Precision(name='precision'), Recall(name='recall'), AUC(name='auroc', curve='ROC')])

# Stop once validation loss has not improved for 10 epochs and roll back to the best weights.
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
# The AUC metric is registered as 'auroc', so its validation log key is
# 'val_auroc'. Monitoring a key that is never logged means no checkpoint is saved.
model_checkpoint = ModelCheckpoint('best_model.h5', monitor='val_auroc', mode='max', save_best_only=True)

history = model.fit(X_fit, y_fit, epochs=50, batch_size=32, class_weight=weights,
                    validation_data=(X_val, y_val), callbacks=[early_stopping, model_checkpoint])

# `evaluate` returns [loss, precision, recall, auroc], in the order given to `compile`.
results = model.evaluate(X_test, y_test, verbose=0)
print("Neural Networks: ")
print(f"Precision: {results[1]}")
print(f"Recall: {results[2]}")
print(f"AUROC: {results[3]}")

# Threshold the predicted probabilities at 0.5 and flatten (n, 1) -> (n,) for scikit-learn.
y_pred = (model.predict(X_test) > 0.5).astype("int32").ravel()

f1 = f1_score(y_test, y_pred)
print(f"F1 Score: {f1}")

### 2. Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import precision_score, recall_score, roc_auc_score, make_scorer


# liblinear supports both L1 and L2 penalties; 'balanced' reweights classes
# inversely to their frequency in the training labels.
classifier = LogisticRegression(solver='liblinear', class_weight='balanced')


# Regularisation strengths and penalty types to search over.
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2']
}
# Use scikit-learn's built-in scorer names rather than
# `make_scorer(..., needs_proba=True)`: the `needs_proba` argument is deprecated
# and removed in recent scikit-learn releases, while the string names work on
# old and new versions alike. The dict keys are unchanged, so
# `refit='roc_auc_score'` and any later reuse of `scorers` keep working.
scorers = {
    'precision_score': 'precision',
    'recall_score': 'recall',
    'roc_auc_score': 'roc_auc'
}

# 3-fold grid search; the final model is refitted with the best ROC AUC setting.
clf = GridSearchCV(classifier, param_grid, scoring=scorers, refit='roc_auc_score', return_train_score=True, cv=3)
clf.fit(X_train, y_train)
best_model = clf.best_estimator_

# Probabilities of the positive class feed ROC AUC; hard labels feed precision/recall/F1.
y_probs = best_model.predict_proba(X_test)[:, 1]
y_pred = best_model.predict(X_test)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_probs)

print("Logistic Regression: ")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"ROC AUC Score: {roc_auc}")
print(f"F1 Score: {f1}")

### 3. Decision tree

In [None]:
from sklearn.tree import DecisionTreeClassifier


# Base estimator for the grid search. (A stray RandomForestClassifier
# instantiation was removed from this cell: that class is only imported in the
# Random Forest section below, so it raised NameError on a fresh kernel.)
tree = DecisionTreeClassifier(random_state = 42)

# Tree depth, split criterion and minimum split size to search over.
param_grid_tree = [
    {
        'max_depth': list(range(1, 10)),
        'criterion': ['gini', 'entropy'],
        'min_samples_split': [2, 5, 10]
    }
]

# 5-fold grid search optimising F1; refit=True retrains the best setting on all training data.
gcv_tree = GridSearchCV(estimator=tree,
                       param_grid=param_grid_tree,
                       scoring='f1',
                       n_jobs=-1,
                       cv=5,
                       verbose=0,
                       refit=True)
gcv_tree.fit(X_train, y_train)
tree = gcv_tree.best_estimator_

# Probabilities of the positive class feed ROC AUC; hard labels feed precision/recall/F1.
y_probs = tree.predict_proba(X_test)[:, 1]
y_pred = tree.predict(X_test)

precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_probs)

print("Decision Tree:")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"ROC AUC Score: {roc_auc}")
print(f"F1 Score: {f1}")


### 4. Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier(random_state = 42)
new_param_grid_forest = [
    {
        'n_estimators': [5, 10, 25],      # Small forests keep the grid search affordable
        # 'auto' was dropped from the options: for classifiers it was an alias
        # of 'sqrt', and it has been removed from recent scikit-learn releases,
        # where it makes every fit that uses it fail inside the grid search.
        'max_features': ['sqrt'],
        'max_depth': [None, 10, 30],      # None lets trees grow until leaves are pure
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'bootstrap': [True, False],       # With and without bootstrap sampling of rows
        'class_weight': [None, 'balanced', 'balanced_subsample']  # Options for handling class imbalance
    }
]

# 5-fold grid search optimising F1; refit=True retrains the best setting on all training data.
new_gcv_forest = GridSearchCV(estimator=forest,
                          param_grid=new_param_grid_forest,
                          scoring='f1',
                          n_jobs=-1,
                          cv=5,
                          verbose=0,
                          refit=True)

new_gcv_forest.fit(X_train, y_train)
forest = new_gcv_forest.best_estimator_

# Probabilities of the positive class feed ROC AUC; hard labels feed precision/recall/F1.
y_probs = forest.predict_proba(X_test)[:, 1]
y_pred = forest.predict(X_test)

precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_probs)

print("Random Forest:")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"ROC AUC Score: {roc_auc}")
print(f"F1 Score: {f1}")

### 5. XGBoost

In [None]:
import xgboost as xgb


# scale_pos_weight=10 up-weights the positive class during boosting.
# The `use_label_encoder` argument was dropped: it is deprecated, and current
# XGBoost releases ignore it while emitting a warning on every fit of the grid search.
xgb_model = xgb.XGBClassifier(scale_pos_weight=10, eval_metric='logloss')
# Tree depth, shrinkage, number of boosting rounds and row subsampling to search over.
param_grid = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [100, 200, 300],
    'subsample': [0.8, 0.9, 1.0]
}

# Reuses the `scorers` dict defined in the Logistic Regression section;
# the final model is refitted with the best ROC AUC setting.
clf = GridSearchCV(xgb_model, param_grid, scoring=scorers, refit='roc_auc_score', return_train_score=True, cv=3)

clf.fit(X_train, y_train)

best_model = clf.best_estimator_

y_probs = best_model.predict_proba(X_test)[:, 1]  # Get probabilities for the positive class
y_pred = best_model.predict(X_test)

precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_probs)



print("XGBoost: ")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"ROC AUC Score: {roc_auc}")
print(f"F1 Score: {f1}")

You can combine these results to compare the accuracy / performance of different ML models.