In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import RidgeClassifier
from sklearn.model_selection import KFold, cross_val_score

In [3]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import IsolationForest

In [10]:
df = pd.read_csv('dataset/delirium/cytokine.csv')
df = df.drop(df.columns[-1], axis=1)
df.head()

Unnamed: 0,CAM_ICU_positive,BDNFL,CXC13L,CXCL9L,EGFL,EOTL,FGFL,GCSFL,GMCSFL,GROAL,...,MIPAL,MIPBL,NGFL,PDGFL,PLFGL,RANTL,SCFL,SDFAL,TNFAL,VEGFAL
0,No_Delirium,156,159,61,16,65,4,22,18,3,...,10,81,8,145,350,26,37,498,12,731
1,No_Delirium,12,391,42,15,63,117,542,93,3,...,6,229,10,103,709,61,59,1781,184,650
2,No_Delirium,9,161,92,3,45,4,22,18,3,...,3,82,8,57,131,12,13,449,12,376
3,No_Delirium,388,25,5,106,42,4,22,18,3,...,4,49,8,735,17,10,13,399,12,347
4,No_Delirium,153,240,174,3,149,4,417,18,3,...,11,132,8,130,42,15,44,465,31,818


In [11]:
df.columns = ['y'] + ['x' + str(i) for i in range(1, len(df.columns))]
df.head()

Unnamed: 0,y,x1,x2,x3,x4,x5,x6,x7,x8,x9,...,x32,x33,x34,x35,x36,x37,x38,x39,x40,x41
0,No_Delirium,156,159,61,16,65,4,22,18,3,...,10,81,8,145,350,26,37,498,12,731
1,No_Delirium,12,391,42,15,63,117,542,93,3,...,6,229,10,103,709,61,59,1781,184,650
2,No_Delirium,9,161,92,3,45,4,22,18,3,...,3,82,8,57,131,12,13,449,12,376
3,No_Delirium,388,25,5,106,42,4,22,18,3,...,4,49,8,735,17,10,13,399,12,347
4,No_Delirium,153,240,174,3,149,4,417,18,3,...,11,132,8,130,42,15,44,465,31,818


In [12]:
df['y'] = df['y'].map({'No Delirium': 0, 'Delirium': 1, 'No_Delirium': 0})
df

Unnamed: 0,y,x1,x2,x3,x4,x5,x6,x7,x8,x9,...,x32,x33,x34,x35,x36,x37,x38,x39,x40,x41
0,0,156,159,61,16,65,4,22,18,3,...,10,81,8,145,350,26,37,498,12,731
1,0,12,391,42,15,63,117,542,93,3,...,6,229,10,103,709,61,59,1781,184,650
2,0,9,161,92,3,45,4,22,18,3,...,3,82,8,57,131,12,13,449,12,376
3,0,388,25,5,106,42,4,22,18,3,...,4,49,8,735,17,10,13,399,12,347
4,0,153,240,174,3,149,4,417,18,3,...,11,132,8,130,42,15,44,465,31,818
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57,1,100,466,602,24,82,4,22,18,3,...,20,267,8,58,66,51,93,3117,12,3220
58,1,162,355,25,23,60,4,22,18,3,...,8,89,8,87,2,14,18,1143,12,790
59,1,15,238,13,79,17,4,22,18,5,...,2,75,8,95,2,6,2,582,12,2032
60,1,7,96,20,3,11,4,22,18,3,...,2,9,8,14,42,2,2,451,12,412


In [13]:
def scale(df):
    scaler = StandardScaler().fit(df.drop('y', axis=1))
    scaled_data = scaler.transform(df.drop('y', axis=1))
    scaled_df = pd.DataFrame(scaled_data, columns=df.columns[1:])
    scaled_df.insert(0, 'y', df['y'])
    return scaled_df

In [14]:
iso = IsolationForest(contamination=0.02)
yhat = iso.fit_predict(df.drop('y', axis=1).values)
mask = yhat != -1

In [15]:
print("Outliers detected by Isolation Forest:", len(df) - len(df[mask]))

Outliers detected by Isolation Forest: 2


In [16]:
df = scale(df)
X = df.drop('y', axis=1).values
y = df['y'].values

In [17]:
print('Class 0:', sum(y == 0))
print('Class 1:', sum(y == 1))

Class 0: 23
Class 1: 39


In [18]:
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score, StratifiedKFold

In [19]:
outer_kf = StratifiedKFold(n_splits=20, random_state=3, shuffle=True)
outer_optimal_lambdas = []
outer_accuracies = []
outer_aucs = []

for train_index, test_index in outer_kf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    inner_kf = KFold(n_splits=10, shuffle=True, random_state=1)
    inner_lambdas = np.linspace(0.1, 100, 100)
    inner_scores = {}
    
    for l in inner_lambdas:
        model = RidgeClassifier(alpha=l, class_weight='balanced')
        scores = cross_val_score(model, X_train, y_train, cv=inner_kf, scoring='neg_mean_squared_error')
        inner_scores[l] = np.mean(scores)
    
    # Select the best lambda for this outer fold
    best_lambda = max(inner_scores, key=inner_scores.get)
    outer_optimal_lambdas.append(best_lambda)

    # Train the model with the best lambda
    best_model = RidgeClassifier(alpha=best_lambda, class_weight='balanced')
    best_model.fit(X_train, y_train)
    y_pred = best_model.predict(X_test)
    y_pred_proba = best_model.decision_function(X_test)
    
    # Compute accuracy
    accuracy = accuracy_score(y_test, y_pred)
    outer_accuracies.append(accuracy)
    
    # Compute AUC
    auc = roc_auc_score(y_test, y_pred_proba)
    outer_aucs.append(auc)

In [20]:
outer_optimal_lambdas

[np.float64(49.54545454545455),
 np.float64(54.590909090909086),
 np.float64(46.518181818181816),
 np.float64(83.85454545454544),
 np.float64(2.118181818181818),
 np.float64(3.1272727272727274),
 np.float64(48.53636363636364),
 np.float64(7.163636363636363),
 np.float64(0.1),
 np.float64(17.254545454545454),
 np.float64(5.145454545454545),
 np.float64(20.28181818181818),
 np.float64(4.136363636363636),
 np.float64(1.1090909090909091),
 np.float64(82.84545454545453),
 np.float64(5.145454545454545),
 np.float64(15.236363636363635),
 np.float64(5.145454545454545),
 np.float64(29.363636363636363),
 np.float64(30.37272727272727)]

In [21]:
outer_accuracies

[0.5,
 0.25,
 0.0,
 0.3333333333333333,
 0.6666666666666666,
 0.3333333333333333,
 0.6666666666666666,
 0.6666666666666666,
 0.6666666666666666,
 1.0,
 0.6666666666666666,
 0.6666666666666666,
 0.6666666666666666,
 0.3333333333333333,
 1.0,
 1.0,
 1.0,
 0.3333333333333333,
 0.6666666666666666,
 1.0]

In [22]:
outer_aucs

[np.float64(0.25),
 np.float64(0.5),
 np.float64(0.0),
 np.float64(0.5),
 np.float64(1.0),
 np.float64(0.5),
 np.float64(1.0),
 np.float64(1.0),
 np.float64(0.5),
 np.float64(1.0),
 np.float64(1.0),
 np.float64(0.0),
 np.float64(0.5),
 np.float64(0.5),
 np.float64(1.0),
 np.float64(1.0),
 np.float64(1.0),
 np.float64(0.5),
 np.float64(0.5),
 np.float64(1.0)]

In [23]:
# After running, average the outer_accuracies and outer_aucs to get a more stable performance estimate
average_accuracy = np.mean(outer_accuracies)
average_auc = np.mean(outer_aucs)

print("Average Accuracy:", average_accuracy)
print("Average AUC:", average_auc)

Average Accuracy: 0.6208333333333333
Average AUC: 0.6625


In [24]:
from sklearn.model_selection import StratifiedKFold, KFold, cross_val_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
import numpy as np
from tqdm import tqdm

outer_kf = StratifiedKFold(n_splits=20, random_state=3, shuffle=True)
outer_accuracies = []
outer_aucs = []

# Outer loop with progress bar
for train_index, test_index in tqdm(outer_kf.split(X, y), total=outer_kf.get_n_splits(), desc="Outer Loop"):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    inner_kf = KFold(n_splits=5, shuffle=True, random_state=1)
    inner_scores = {}
    
    learning_rates = np.linspace(0.01, 0.2, 10)
    n_estimators = [50, 100, 150]
    best_score = -np.inf
    best_params = {}

    # Inner loop with progress bar
    for lr in learning_rates:
        for ne in n_estimators:
            model = GradientBoostingClassifier(learning_rate=lr, n_estimators=ne)
            scores = cross_val_score(model, X_train, y_train, cv=inner_kf, scoring='accuracy')
            mean_score = np.mean(scores)
            inner_scores[(lr, ne)] = mean_score
            if mean_score > best_score:
                best_score = mean_score
                best_params = {'learning_rate': lr, 'n_estimators': ne}
    
    # Select the best parameters for this outer fold
    best_learning_rate = best_params['learning_rate']
    best_n_estimators = best_params['n_estimators']
    
    # Train the model with the best parameters
    best_model = GradientBoostingClassifier(learning_rate=best_learning_rate, n_estimators=best_n_estimators)
    best_model.fit(X_train, y_train)
    y_pred = best_model.predict(X_test)
    y_pred_proba = best_model.predict_proba(X_test)[:, 1]
    
    # Compute accuracy
    accuracy = accuracy_score(y_test, y_pred)
    outer_accuracies.append(accuracy)
    
    # Compute AUC
    auc = roc_auc_score(y_test, y_pred_proba)
    outer_aucs.append(auc)

# Print the results
print("Accuracies:", outer_accuracies)
print("AUCs:", outer_aucs)
print("Mean Accuracy:", np.mean(outer_accuracies))
print("Mean AUC:", np.mean(outer_aucs))

Outer Loop: 100%|██████████| 20/20 [02:04<00:00,  6.22s/it]

Accuracies: [0.25, 0.75, 0.6666666666666666, 0.3333333333333333, 1.0, 1.0, 0.6666666666666666, 0.6666666666666666, 1.0, 0.6666666666666666, 0.6666666666666666, 0.6666666666666666, 0.6666666666666666, 0.6666666666666666, 1.0, 1.0, 0.6666666666666666, 0.3333333333333333, 0.3333333333333333, 0.6666666666666666]
AUCs: [np.float64(0.5), np.float64(1.0), np.float64(0.5), np.float64(0.5), np.float64(1.0), np.float64(1.0), np.float64(0.5), np.float64(1.0), np.float64(1.0), np.float64(1.0), np.float64(1.0), np.float64(1.0), np.float64(1.0), np.float64(1.0), np.float64(1.0), np.float64(1.0), np.float64(1.0), np.float64(0.5), np.float64(0.5), np.float64(1.0)]
Mean Accuracy: 0.6833333333333333
Mean AUC: 0.85





In [25]:
from sklearn.model_selection import StratifiedKFold, KFold, cross_val_score
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, roc_auc_score
import numpy as np
from tqdm import tqdm

outer_kf = StratifiedKFold(n_splits=20, random_state=3, shuffle=True)
outer_accuracies = []
outer_aucs = []

# Outer loop with progress bar
for train_index, test_index in tqdm(outer_kf.split(X, y), total=outer_kf.get_n_splits(), desc="Outer Loop"):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    inner_kf = KFold(n_splits=5, shuffle=True, random_state=1)
    inner_scores = {}
    
    C_values = np.logspace(-2, 2, 5)
    kernel_types = ['linear', 'rbf']
    best_score = -np.inf
    best_params = {}

    # Inner loop with progress bar
    for C in C_values:
        for kernel in kernel_types:
            model = SVC(C=C, kernel=kernel, probability=True)
            scores = cross_val_score(model, X_train, y_train, cv=inner_kf, scoring='accuracy')
            mean_score = np.mean(scores)
            inner_scores[(C, kernel)] = mean_score
            if mean_score > best_score:
                best_score = mean_score
                best_params = {'C': C, 'kernel': kernel}
    
    # Select the best parameters for this outer fold
    best_C = best_params['C']
    best_kernel = best_params['kernel']
    
    # Train the model with the best parameters
    best_model = SVC(C=best_C, kernel=best_kernel, probability=True)
    best_model.fit(X_train, y_train)
    y_pred = best_model.predict(X_test)
    y_pred_proba = best_model.predict_proba(X_test)[:, 1]
    
    # Compute accuracy
    accuracy = accuracy_score(y_test, y_pred)
    outer_accuracies.append(accuracy)
    
    # Compute AUC
    auc = roc_auc_score(y_test, y_pred_proba)
    outer_aucs.append(auc)

# Print the results
print("Accuracies:", outer_accuracies)
print("AUCs:", outer_aucs)
print("Mean Accuracy:", np.mean(outer_accuracies))
print("Mean AUC:", np.mean(outer_aucs))

Outer Loop: 100%|██████████| 20/20 [00:01<00:00, 15.45it/s]

Accuracies: [0.5, 0.5, 0.3333333333333333, 0.3333333333333333, 1.0, 0.3333333333333333, 0.6666666666666666, 1.0, 0.6666666666666666, 1.0, 1.0, 0.6666666666666666, 1.0, 0.6666666666666666, 0.6666666666666666, 0.6666666666666666, 1.0, 0.3333333333333333, 0.3333333333333333, 0.6666666666666666]
AUCs: [np.float64(0.25), np.float64(0.5), np.float64(0.0), np.float64(0.5), np.float64(1.0), np.float64(0.5), np.float64(0.0), np.float64(1.0), np.float64(1.0), np.float64(1.0), np.float64(1.0), np.float64(0.0), np.float64(1.0), np.float64(0.5), np.float64(0.0), np.float64(0.0), np.float64(1.0), np.float64(0.5), np.float64(0.0), np.float64(1.0)]
Mean Accuracy: 0.6666666666666667
Mean AUC: 0.5375





In [26]:
from sklearn.model_selection import StratifiedKFold, KFold, cross_val_score
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
import numpy as np
from tqdm import tqdm

outer_kf = StratifiedKFold(n_splits=20, random_state=3, shuffle=True)
outer_accuracies = []
outer_aucs = []

# Outer loop with progress bar
for train_index, test_index in tqdm(outer_kf.split(X, y), total=outer_kf.get_n_splits(), desc="Outer Loop"):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    inner_kf = KFold(n_splits=5, shuffle=True, random_state=1)
    inner_scores = {}
    
    learning_rates = [0.01, 0.1, 0.2]
    n_estimators = [50, 100, 150]
    best_score = -np.inf
    best_params = {}

    # Inner loop with progress bar
    for lr in learning_rates:
        for ne in n_estimators:
            model = XGBClassifier(learning_rate=lr, n_estimators=ne, use_label_encoder=False, eval_metric='logloss')
            scores = cross_val_score(model, X_train, y_train, cv=inner_kf, scoring='accuracy')
            mean_score = np.mean(scores)
            inner_scores[(lr, ne)] = mean_score
            if mean_score > best_score:
                best_score = mean_score
                best_params = {'learning_rate': lr, 'n_estimators': ne}
    
    # Select the best parameters for this outer fold
    best_learning_rate = best_params['learning_rate']
    best_n_estimators = best_params['n_estimators']
    
    # Train the model with the best parameters
    best_model = XGBClassifier(learning_rate=best_learning_rate, n_estimators=best_n_estimators, use_label_encoder=False, eval_metric='logloss')
    best_model.fit(X_train, y_train)
    y_pred = best_model.predict(X_test)
    y_pred_proba = best_model.predict_proba(X_test)[:, 1]
    
    # Compute accuracy
    accuracy = accuracy_score(y_test, y_pred)
    outer_accuracies.append(accuracy)
    
    # Compute AUC
    auc = roc_auc_score(y_test, y_pred_proba)
    outer_aucs.append(auc)

# Print the results
print("Accuracies:", outer_accuracies)
print("AUCs:", outer_aucs)
print("Mean Accuracy:", np.mean(outer_accuracies))
print("Mean AUC:", np.mean(outer_aucs))

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Accuracies: [0.5, 0.75, 0.3333333333333333, 0.6666666666666666, 1.0, 0.6666666666666666, 0.3333333333333333, 0.6666666666666666, 0.6666666666666666, 0.6666666666666666, 0.3333333333333333, 0.6666666666666666, 1.0, 1.0, 0.6666666666666666, 1.0, 1.0, 1.0, 0.3333333333333333, 0.6666666666666666]
AUCs: [np.float64(0.5), np.float64(0.5), np.float64(0.5), np.float64(0.5), np.float64(1.0), np.float64(1.0), np.float64(0.5), np.float64(1.0), np.float64(1.0), np.float64(1.0), np.float64(0.5), np.float64(0.0), np.float64(1.0), np.float64(1.0), np.float64(1.0), np.float64(1.0), np.float64(1.0), np.float64(1.0), np.float64(0.5), np.float64(0.0)]
Mean Accuracy: 0.6958333333333333
Mean AUC: 0.725



