In [12]:
# !pip install sdv

In [13]:
import os
import sys
import pandas as pd
import numpy as np
from scipy import stats
from sdv.metadata import SingleTableMetadata
from sdv.single_table import GaussianCopulaSynthesizer
from sdv.evaluation.single_table import evaluate_quality, run_diagnostic, get_column_plot
from google.colab import drive
import matplotlib.pyplot as plt
import logging
import warnings
# from pycaret.classification import *
from sklearn.model_selection import train_test_split
drive.mount('/content/gdrive')
warnings.filterwarnings('ignore')
logging.basicConfig(level=logging.INFO)
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import log_loss
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sdv.evaluation.single_table import get_column_plot


Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [14]:
# Data cleaning: convert from 5th column to be float, replacing non number values with Nan, then drop. Then I convert disease in numeric format and round values to 2.

df = pd.read_excel('/content/gdrive/MyDrive/Colab Notebooks/Database_Features_ML.xlsx')
print(df.shape)
df[df.columns[4:]] = df[df.columns[4:]].apply(pd.to_numeric, errors='coerce')
print(df.shape)
df.replace(["#!NULL", ""], np.nan, inplace=True)
df.drop(columns=['ID Paziente', 'Età', 'Sesso'], inplace=True)
df = df.dropna()
print(df.shape)
df['Disease'] = df['Disease'].replace({'ALS': 1, 'HEALTHY': 0})
df = df.round(2)

df_als = df[df['Disease'] == 1]
df_hc = df[df['Disease'] == 0]

print(f"Number of ALS: {len(df_als)} (N.B. after dropping!)")
print(f"Number of HC: {len(df_hc)}")
print('-'*50)
print(f"DF dimensions: {df.shape}")
print(df.columns)

(134, 38)
(134, 38)
(118, 35)
Number of ALS: 97 (N.B. after dropping!)
Number of HC: 21
--------------------------------------------------
DF dimensions: (118, 35)
Index(['Disease', 'GCSF', 'IFNgamma', 'IL10', 'IL15', 'IL17A', 'IL1beta',
       'IL2', 'IL4', 'IL6', 'IL8', 'MCP1', 'MIP1alfa', 'TNFalfa', 'VEGF',
       'TTVlog', 'TTVcopies', 'acetic', 'Propionic', 'Butyric', 'isoButyric',
       'isoValeric', '@MethylButyric', 'valeric', 'Hexanoic', 'Heptanoic',
       'Nonanoic', '@EthylHexanoic', 'Octanoic', 'Decanoic', 'Benzoic',
       'Dodecanoic', 'Tetradecanoic', 'Hexadecanoic', 'Octadecanoic'],
      dtype='object')


In [15]:
df_als_downsized = df_als.sample(n=21, random_state=42)
df_downsized_for_synthesis = pd.concat([df_als_downsized, df_hc], ignore_index=True)
print(df_downsized_for_synthesis.shape)

(42, 35)


In [16]:
# Detect metadata and ensure 'Disease' is categorical
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(df_downsized_for_synthesis)
metadata.update_column('Disease', sdtype='categorical')

# validate
metadata.validate()
metadata.validate_data(data=df_downsized_for_synthesis)

# Initialize and fit the synthesizer
synthesizer_GC = GaussianCopulaSynthesizer(
        metadata,  # required
        enforce_min_max_values=True,
        enforce_rounding=False,
        default_distribution='gaussian_kde'
        )
synthesizer_GC.fit(df_downsized_for_synthesis)

# Generate synthetic data
# Sample 1
synthetic_data = synthesizer_GC.sample(num_rows=420)
# Sample 2
# synthetic_data = synthesizer_GC.sample(num_rows=840)
print(synthetic_data.shape)

(420, 35)


In [17]:
quality_report = evaluate_quality(df_downsized_for_synthesis, synthetic_data, metadata)
print(quality_report)

Generating report ...

(1/2) Evaluating Column Shapes: |██████████| 35/35 [00:00<00:00, 724.61it/s]|
Column Shapes Score: 79.85%

(2/2) Evaluating Column Pair Trends: |██████████| 595/595 [00:04<00:00, 137.71it/s]|
Column Pair Trends Score: 95.25%

Overall Score (Average): 87.55%

<sdmetrics.reports.single_table.quality_report.QualityReport object at 0x7fdc9825f790>


In [18]:
# Combine real and synthetic data for training
# training_data = pd.concat([df_downsized_for_synthesis, synthetic_data], ignore_index=True)
training_data = synthetic_data

# training_data = synthetic_data
X_train = training_data.drop(columns=['Disease']).values  # Features
y_train = training_data['Disease'].values  # Target

# Use all real data as the test set
X_test = df.drop(columns=['Disease']).values  # Features
y_test = df['Disease'].values  # Target

In [19]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(420, 34)
(420,)
(118, 34)
(118,)


In [20]:
# keras
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (
    accuracy_score, log_loss, f1_score, confusion_matrix, roc_auc_score, precision_score, recall_score
)
from sklearn.preprocessing import StandardScaler
from sklearn.utils.class_weight import compute_class_weight
import tensorflow as tf

# Scale the data for improved training stability
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Compute class weights to handle class imbalance
class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
class_weight_dict = dict(enumerate(class_weights))

# Define the deep learning model
def create_model(input_dim):
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(96, activation='relu', input_shape=(input_dim,)),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(0.3),
        tf.keras.layers.Dense(32, activation='relu'),
        # tf.keras.layers.Dropout(0.2),
        # tf.keras.layers.Dense(32, activation='relu'),
        # tf.keras.layers.Dropout(0.1),
        # tf.keras.layers.Dense(16, activation='relu'),
        tf.keras.layers.Dense(1, activation='sigmoid')  # Binary classification
    ])
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001),
        loss='binary_crossentropy',
        metrics=['accuracy']
    )
    return model

# Cross-validation setup
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
fold = 1

# Metrics containers
val_accuracies = []
val_log_losses = []
val_f1_scores = []
val_precisions = []
val_recalls = []
val_specificities = []
val_aurocs = []

print("Starting cross-validation...")

for train_index, val_index in kf.split(X_train, y_train):
    print(f"\nTraining fold {fold}...")

    # Split training data into train and validation sets
    X_train_fold, X_val_fold = X_train[train_index], X_train[val_index]
    y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]

    # Create and train the model
    model = create_model(X_train_fold.shape[1])

    # Define callbacks
    early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
    lr_scheduler = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=3, min_lr=1e-5)

    history = model.fit(
        X_train_fold, y_train_fold,
        validation_data=(X_val_fold, y_val_fold),
        epochs=100,
        batch_size=32,
        verbose=1,
        callbacks=[early_stopping, lr_scheduler],
        class_weight=class_weight_dict
    )

    # Evaluate the model on the validation set
    y_val_pred_proba = model.predict(X_val_fold)
    y_val_pred_classes = (y_val_pred_proba > 0.5).astype(int)

    # Calculate metrics
    val_accuracy = accuracy_score(y_val_fold, y_val_pred_classes)
    val_log_loss = log_loss(y_val_fold, y_val_pred_proba)
    val_f1 = f1_score(y_val_fold, y_val_pred_classes)
    val_precision = precision_score(y_val_fold, y_val_pred_classes)
    val_recall = recall_score(y_val_fold, y_val_pred_classes)
    val_auroc = roc_auc_score(y_val_fold, y_val_pred_proba)

    # Specificity calculation
    cm = confusion_matrix(y_val_fold, y_val_pred_classes)
    tn, fp, fn, tp = cm.ravel()
    val_specificity = tn / (tn + fp)

    # Append metrics
    val_accuracies.append(val_accuracy)
    val_log_losses.append(val_log_loss)
    val_f1_scores.append(val_f1)
    val_precisions.append(val_precision)
    val_recalls.append(val_recall)
    val_specificities.append(val_specificity)
    val_aurocs.append(val_auroc)

    print(f"Fold {fold} - Val Accuracy: {val_accuracy:.4f}, Val Log Loss: {val_log_loss:.4f}, "
          f"Val F1: {val_f1:.4f}, Val Precision: {val_precision:.4f}, Val Recall: {val_recall:.4f}, "
          f"Val Specificity: {val_specificity:.4f}, Val AUROC: {val_auroc:.4f}")
    fold += 1

# Final Cross-Validation Results
print("\nCross-Validation Results:")
print(f"Average Val Accuracy: {np.mean(val_accuracies):.4f}")
print(f"Average Val Log Loss: {np.mean(val_log_losses):.4f}")
print(f"Average Val F1 Score: {np.mean(val_f1_scores):.4f}")
print(f"Average Val Precision: {np.mean(val_precisions):.4f}")
print(f"Average Val Recall: {np.mean(val_recalls):.4f}")
print(f"Average Val Specificity: {np.mean(val_specificities):.4f}")
print(f"Average Val AUROC: {np.mean(val_aurocs):.4f}")

# Final Test Evaluation
print("\nFinal Test Evaluation...")
y_test_pred_proba = model.predict(X_test)
y_test_pred_classes = (y_test_pred_proba > 0.5).astype(int)

test_accuracy = accuracy_score(y_test, y_test_pred_classes)
test_log_loss = log_loss(y_test, y_test_pred_proba)
test_f1 = f1_score(y_test, y_test_pred_classes)
test_precision = precision_score(y_test, y_test_pred_classes)
test_recall = recall_score(y_test, y_test_pred_classes)
test_auroc = roc_auc_score(y_test, y_test_pred_proba)

cm_test = confusion_matrix(y_test, y_test_pred_classes)
tn_test, fp_test, fn_test, tp_test = cm_test.ravel()
test_specificity = tn_test / (tn_test + fp_test)

print(f"Test Accuracy: {test_accuracy:.4f}")
print(f"Test Log Loss: {test_log_loss:.4f}")
print(f"Test F1 Score: {test_f1:.4f}")
print(f"Test Precision: {test_precision:.4f}")
print(f"Test Recall: {test_recall:.4f}")
print(f"Test Specificity: {test_specificity:.4f}")
print(f"Test AUROC: {test_auroc:.4f}")


Starting cross-validation...

Training fold 1...
Epoch 1/100
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 35ms/step - accuracy: 0.5051 - loss: 0.9056 - val_accuracy: 0.5238 - val_loss: 0.6866 - learning_rate: 1.0000e-04
Epoch 2/100
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.5708 - loss: 0.8235 - val_accuracy: 0.5476 - val_loss: 0.6696 - learning_rate: 1.0000e-04
Epoch 3/100
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.5419 - loss: 0.8453 - val_accuracy: 0.5476 - val_loss: 0.6523 - learning_rate: 1.0000e-04
Epoch 4/100
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.5877 - loss: 0.7758 - val_accuracy: 0.5714 - val_loss: 0.6351 - learning_rate: 1.0000e-04
Epoch 5/100
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.6244 - loss: 0.7279 - val_accuracy: 0.5476 - val_loss: 0.6179 - learning_rate: 1.0000e-04
Epoch

In [21]:
# AdaBoostClassifier
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import AdaBoostClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score, log_loss, f1_score, confusion_matrix, roc_auc_score, precision_score, recall_score
)

# Scale the data for improved stability
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Define the AdaBoost model
model = AdaBoostClassifier(n_estimators=25, learning_rate=0.1, random_state=42)

# Cross-validation setup
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
fold = 1

# Metrics containers
accuracies = []
log_losses = []
f1_scores = []
precisions = []
recalls = []
specificities = []
aurocs = []

print("Starting cross-validation...")

for train_index, val_index in kf.split(X_train, y_train):
    print(f"\nTraining fold {fold}...")

    # Split training data into train and validation sets
    X_train_fold, X_val_fold = X_train[train_index], X_train[val_index]
    y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]

    # Train the AdaBoost classifier
    model.fit(X_train_fold, y_train_fold)

    # Make predictions on the validation set
    y_val_pred_proba = model.predict_proba(X_val_fold)[:, 1]
    y_val_pred_classes = model.predict(X_val_fold)

    # Calculate validation metrics
    accuracy = accuracy_score(y_val_fold, y_val_pred_classes)
    log_loss_value = log_loss(y_val_fold, y_val_pred_proba)
    f1 = f1_score(y_val_fold, y_val_pred_classes)
    precision = precision_score(y_val_fold, y_val_pred_classes)
    recall = recall_score(y_val_fold, y_val_pred_classes)
    auroc = roc_auc_score(y_val_fold, y_val_pred_proba)

    # Specificity calculation
    cm = confusion_matrix(y_val_fold, y_val_pred_classes)
    tn, fp, fn, tp = cm.ravel()
    specificity = tn / (tn + fp)

    # Append metrics
    accuracies.append(accuracy)
    log_losses.append(log_loss_value)
    f1_scores.append(f1)
    precisions.append(precision)
    recalls.append(recall)
    specificities.append(specificity)
    aurocs.append(auroc)

    print(f"Fold {fold} - Accuracy: {accuracy:.4f}, Log Loss: {log_loss_value:.4f}, F1: {f1:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, Specificity: {specificity:.4f}, AUROC: {auroc:.4f}")
    fold += 1

# Final aggregated cross-validation results
print("\nCross-Validation Results:")
print(f"Average Accuracy: {np.mean(accuracies):.4f}")
print(f"Average Log Loss: {np.mean(log_losses):.4f}")
print(f"Average F1 Score: {np.mean(f1_scores):.4f}")
print(f"Average Precision: {np.mean(precisions):.4f}")
print(f"Average Recall: {np.mean(recalls):.4f}")
print(f"Average Specificity: {np.mean(specificities):.4f}")
print(f"Average AUROC: {np.mean(aurocs):.4f}")

# Final evaluation on the test set
print("\nFinal Test Evaluation...")
model.fit(X_train, y_train)  # Train the model on the full training data

# Test set predictions
y_test_pred_proba = model.predict_proba(X_test)[:, 1]
y_test_pred_classes = model.predict(X_test)

# Calculate test set metrics
test_accuracy = accuracy_score(y_test, y_test_pred_classes)
test_log_loss = log_loss(y_test, y_test_pred_proba)
test_f1 = f1_score(y_test, y_test_pred_classes)
test_precision = precision_score(y_test, y_test_pred_classes)
test_recall = recall_score(y_test, y_test_pred_classes)
test_auroc = roc_auc_score(y_test, y_test_pred_proba)

# Specificity calculation
cm_test = confusion_matrix(y_test, y_test_pred_classes)
tn, fp, fn, tp = cm_test.ravel()
test_specificity = tn / (tn + fp)

# Print test results
print(f"Test Accuracy: {test_accuracy:.4f}")
print(f"Test Log Loss: {test_log_loss:.4f}")
print(f"Test F1 Score: {test_f1:.4f}")
print(f"Test Precision: {test_precision:.4f}")
print(f"Test Recall: {test_recall:.4f}")
print(f"Test Specificity: {test_specificity:.4f}")
print(f"Test AUROC: {test_auroc:.4f}")


Starting cross-validation...

Training fold 1...
Fold 1 - Accuracy: 0.9048, Log Loss: 0.3640, F1: 0.8947, Precision: 0.8947, Recall: 0.8947, Specificity: 0.9130, AUROC: 0.9588

Training fold 2...
Fold 2 - Accuracy: 0.7619, Log Loss: 0.4642, F1: 0.7500, Precision: 0.7143, Recall: 0.7895, Specificity: 0.7391, AUROC: 0.8650

Training fold 3...
Fold 3 - Accuracy: 0.8095, Log Loss: 0.4556, F1: 0.8000, Precision: 0.8000, Recall: 0.8000, Specificity: 0.8182, AUROC: 0.9057

Training fold 4...
Fold 4 - Accuracy: 0.7857, Log Loss: 0.4640, F1: 0.7568, Precision: 0.8235, Recall: 0.7000, Specificity: 0.8636, AUROC: 0.8705

Training fold 5...
Fold 5 - Accuracy: 0.7619, Log Loss: 0.4456, F1: 0.7619, Precision: 0.7273, Recall: 0.8000, Specificity: 0.7273, AUROC: 0.8864

Training fold 6...
Fold 6 - Accuracy: 0.7619, Log Loss: 0.4593, F1: 0.7619, Precision: 0.7273, Recall: 0.8000, Specificity: 0.7273, AUROC: 0.8886

Training fold 7...
Fold 7 - Accuracy: 0.8571, Log Loss: 0.3947, F1: 0.8636, Precision: 0

In [22]:
# LGBMClassifier
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from lightgbm import LGBMClassifier
from sklearn.metrics import (
    accuracy_score, log_loss, f1_score, precision_score, recall_score, confusion_matrix, roc_auc_score
)

# Define model parameters
model = LGBMClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=-1,
    random_state=42
)

# Cross-validation setup
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Metrics containers
cv_accuracies = []
cv_log_losses = []
cv_f1_scores = []
cv_precisions = []
cv_recalls = []
cv_specificities = []
cv_aurocs = []

print("Starting cross-validation...")

fold = 1
for train_index, val_index in kf.split(X_train, y_train):
    print(f"\nTraining fold {fold}...")

    # Split data into train and validation sets
    X_train_fold, X_val_fold = X_train[train_index], X_train[val_index]
    y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]

    # Scale the data within the fold
    scaler = StandardScaler()
    X_train_fold = scaler.fit_transform(X_train_fold)
    X_val_fold = scaler.transform(X_val_fold)

    # Train the model
    model.fit(X_train_fold, y_train_fold)

    # Predict on the validation set
    y_val_pred_proba = model.predict_proba(X_val_fold)[:, 1]
    y_val_pred_classes = model.predict(X_val_fold)

    # Calculate validation metrics
    accuracy = accuracy_score(y_val_fold, y_val_pred_classes)
    log_loss_value = log_loss(y_val_fold, y_val_pred_proba)
    f1 = f1_score(y_val_fold, y_val_pred_classes)
    precision = precision_score(y_val_fold, y_val_pred_classes)
    recall = recall_score(y_val_fold, y_val_pred_classes)
    auroc = roc_auc_score(y_val_fold, y_val_pred_proba)

    # Specificity calculation
    cm = confusion_matrix(y_val_fold, y_val_pred_classes)
    tn, fp, fn, tp = cm.ravel()
    specificity = tn / (tn + fp)

    # Append metrics
    cv_accuracies.append(accuracy)
    cv_log_losses.append(log_loss_value)
    cv_f1_scores.append(f1)
    cv_precisions.append(precision)
    cv_recalls.append(recall)
    cv_specificities.append(specificity)
    cv_aurocs.append(auroc)

    print(f"Fold {fold} - Accuracy: {accuracy:.4f}, Log Loss: {log_loss_value:.4f}, F1: {f1:.4f}, "
          f"Precision: {precision:.4f}, Recall: {recall:.4f}, Specificity: {specificity:.4f}, AUROC: {auroc:.4f}")
    fold += 1

# Final Cross-Validation Results
print("\nCross-Validation Results:")
print(f"Average Accuracy: {np.mean(cv_accuracies):.4f}")
print(f"Average Log Loss: {np.mean(cv_log_losses):.4f}")
print(f"Average F1 Score: {np.mean(cv_f1_scores):.4f}")
print(f"Average Precision: {np.mean(cv_precisions):.4f}")
print(f"Average Recall: {np.mean(cv_recalls):.4f}")
print(f"Average Specificity: {np.mean(cv_specificities):.4f}")
print(f"Average AUROC: {np.mean(cv_aurocs):.4f}")

# Final Test Evaluation
print("\nFinal Test Evaluation...")

# Scale the test set using the entire training set
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train model on the entire training set
model.fit(X_train_scaled, y_train)

# Evaluate on the test set
y_test_pred_proba = model.predict_proba(X_test_scaled)[:, 1]
y_test_pred_classes = model.predict(X_test_scaled)

test_accuracy = accuracy_score(y_test, y_test_pred_classes)
test_log_loss = log_loss(y_test, y_test_pred_proba)
test_f1 = f1_score(y_test, y_test_pred_classes)
test_precision = precision_score(y_test, y_test_pred_classes)
test_recall = recall_score(y_test, y_test_pred_classes)
test_auroc = roc_auc_score(y_test, y_test_pred_proba)

cm_test = confusion_matrix(y_test, y_test_pred_classes)
tn, fp, fn, tp = cm_test.ravel()
test_specificity = tn / (tn + fp)

print(f"Test Accuracy: {test_accuracy:.4f}")
print(f"Test Log Loss: {test_log_loss:.4f}")
print(f"Test F1 Score: {test_f1:.4f}")
print(f"Test Precision: {test_precision:.4f}")
print(f"Test Recall: {test_recall:.4f}")
print(f"Test Specificity: {test_specificity:.4f}")
print(f"Test AUROC: {test_auroc:.4f}")


Starting cross-validation...

Training fold 1...
[LightGBM] [Info] Number of positive: 179, number of negative: 199
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000349 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3619
[LightGBM] [Info] Number of data points in the train set: 378, number of used features: 34
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.473545 -> initscore=-0.105919
[LightGBM] [Info] Start training from score -0.105919
Fold 1 - Accuracy: 0.8333, Log Loss: 0.4327, F1: 0.8108, Precision: 0.8333, Recall: 0.7895, Specificity: 0.8696, AUROC: 0.9291

Training fold 2...
[LightGBM] [Info] Number of positive: 179, number of negative: 199
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000304 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3614
[LightGBM] [Info] Number of data points in the train 