In [None]:
# XGBoost with SHAP Values
# Author: Amber Fedynak
# Date: March 1, 2025

import numpy as np
import pandas as pd
import xgboost as xgb  # XGBoost for classification
import shap  # SHAP for model explainability
import matplotlib.pyplot as plt  # For figures

# For preprocessing and model evaluation
from sklearn.model_selection import (
    train_test_split, cross_val_score, cross_val_predict, StratifiedKFold
)
from sklearn.metrics import make_scorer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    accuracy_score, roc_auc_score, confusion_matrix,
    classification_report, roc_curve, precision_recall_curve
)
import shap

In [None]:
# Data Preprocessing
# Loads the master sheet, selects the feature matrix X and the target y, and
# builds the (unfitted) preprocessing transformer used by the model pipeline.
# NOTE(review): absolute, machine-specific path; consider a configurable data directory.
df = pd.read_csv("/Users/afedynak/master_sheet.csv")
df.head()

target_col = 'week10_remission_flag'

columns = [
    'age', 'madrs', 'phq9', 'gender', 'race', 'ethnicity', 'education_level',
    'medication_group', 'remission_status', 'AIS_01', 'LIS_01', 'MDMIS_01', 'MVCIS_01',
    'IMIS_01', 'IL.6', 'gp130', 'IL.8.CXCL.8', 'MIF', 'CCL.2.MCP.1', 'IL.1beta.IL.1F2',
    'CCL.20.MIP.3alpha', 'CCL.4.MIP.1beta', 'GM.CSF',
    'BMI', 'CRP', 'fasting_glucose', 'insulin_resistance', 'lipid_profile', 'total_cholesterol',
    'LDL_cholesterol', 'HDL_cholesterol', 'triglycerides', 'leptin', 'adiponectin', 'visceral_fat',
    'eGFR', 'kidney_function', 'systolic_blood_pressure', 'diastolic_blood_pressure', 'heart_rate',
    'oxygen_saturation', 'pulmonary_function', 'FVC', 'FEV1', 'peak_flow', 'albumin_to_creatinine_ratio',
    'proinflammatory_cytokines', 'autoimmune_marker', 'neurotrophins'
]
# NOTE(review): 'remission_status' is used as a predictor of a remission flag;
# confirm it is measured before the outcome, otherwise it leaks the target.

# Explicit feature groupings by name (kept for reference; the transformer below
# selects columns by dtype instead).
categorical = ['gender', 'race', 'ethnicity', 'medication_group', 'remission_status', 'education_level']
numerical = [col for col in columns if col not in categorical]

# Fail early with a readable message if the sheet lacks an expected column.
missing_cols = [col for col in columns + [target_col] if col not in df.columns]
if missing_cols:
    raise KeyError(f"Columns missing from the master sheet: {missing_cols}")

# Feature matrix and target. These must exist before the dtype-based column
# selection below and before the train/test split in the next cell.
X = df[columns]
y = df[target_col]

categorical_features = X.select_dtypes(include=['object']).columns  # Categorical columns
numerical_features = X.select_dtypes(exclude=['object']).columns  # Numerical columns

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        # handle_unknown='ignore': a category absent from a training fold is
        # encoded as all zeros at transform time instead of raising.
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

In [None]:
# Train-test split (80-20 split)
# stratify=y keeps the class proportions of the target the same in both
# partitions, matching the stratified folds used for cross-validation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Check the shape of train and test sets
print(f"Training set size: {X_train.shape}, Test set size: {X_test.shape}")

In [None]:
# XGBoost Model with pipeline
# The target is scored as a binary outcome downstream (predict_proba[:, 1],
# ROC-AUC), so the binary log-loss metric is used rather than the multiclass
# 'mlogloss'. The deprecated `use_label_encoder` argument is no longer passed.
model = xgb.XGBClassifier(
    eval_metric='logloss',
    random_state=42)

# Creating a pipeline: preprocessing is fitted together with the classifier, so
# scaling/encoding statistics come only from the data passed to fit().
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model)
])

# Fit the model
pipeline.fit(X_train, y_train)

In [None]:
# Model Prediction and Evaluation
# Hard class predictions and positive-class probabilities on the held-out test set.
y_pred = pipeline.predict(X_test)
y_pred_proba = pipeline.predict_proba(X_test)[:, 1]

# Evaluate the model (each metric computed once)
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)  # uses probabilities, not hard labels

# Print evaluation metrics
print(f"Accuracy: {accuracy:.4f}")
print(f"Confusion Matrix:\n{conf_matrix}")
print(f"Classification Report:\n{class_report}")
print(f"ROC-AUC: {roc_auc:.4f}")

In [None]:
# SHAP Analysis
# The explainer wraps the bare XGBoost model, which was trained on the
# preprocessor's output (scaled numerics + one-hot columns), not on the raw
# frame. The test set is therefore pushed through the fitted preprocessor first.
fitted_preprocessor = pipeline.named_steps['preprocessor']
X_test_transformed = fitted_preprocessor.transform(X_test)

# The transformer may return a sparse matrix when one-hot columns dominate;
# densify so it can be wrapped in a labelled DataFrame for plotting.
if hasattr(X_test_transformed, "toarray"):
    X_test_transformed = X_test_transformed.toarray()

X_test_transformed = pd.DataFrame(
    X_test_transformed,
    columns=fitted_preprocessor.get_feature_names_out(),
    index=X_test.index,
)

explainer = shap.TreeExplainer(pipeline.named_steps['model'])

shap_values = explainer.shap_values(X_test_transformed)

shap.summary_plot(shap_values, X_test_transformed)

In [None]:
# Stratified K-Folds Cross-Validation
# Each fold refits a clone of the whole pipeline, so preprocessing statistics
# are learned from that fold's training portion only.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

cv_results = cross_val_score(pipeline, X, y, cv=cv, scoring='accuracy')

print(f"Cross-validation accuracy scores: {cv_results}")
print(f"Mean accuracy: {cv_results.mean():.3f}")
print(f"Standard deviation of accuracy: {cv_results.std():.3f}")

# Out-of-fold predictions: every row is predicted by a model that did not see
# it during fitting.
y_pred_cv = cross_val_predict(pipeline, X, y, cv=cv)
y_proba_cv = cross_val_predict(pipeline, X, y, cv=cv, method='predict_proba')[:, 1]

accuracy = accuracy_score(y, y_pred_cv)
# AUC is computed from out-of-fold probabilities; scoring the already-fitted
# pipeline on all of X would evaluate it on rows it was trained on.
auc = roc_auc_score(y, y_proba_cv)

print(f"Out-of-fold accuracy: {accuracy:.3f}")
print(f"Out-of-fold ROC-AUC: {auc:.3f}")
