In [1]:
import pandas as pd
from sksurv.util import Surv
import numpy as np
from sksurv.metrics import concordance_index_ipcw
from sksurv.ensemble import GradientBoostingSurvivalAnalysis

# Route pandas `.plot()` calls through the plotly backend.
pd.set_option("plotting.backend", "plotly")

# Load the training table and the evaluation table from the shared data folder.
# NOTE(review): `eval` shadows Python's built-in `eval`; the name is kept because
# the submission cell further down reads it.
df = pd.read_csv('../../data/train_enhanced.csv')
eval = pd.read_csv('../../data/eval_enhanced.csv')

In [2]:
# Outcome columns: OS_STATUS is passed as the event indicator and OS_YEARS as
# the observed time when building the structured survival array.
target = ["OS_STATUS", "OS_YEARS"]

# Everything except the outcome columns and the row identifier is a feature.
non_feature_cols = [*target, "ID"]
X = df.drop(columns=non_feature_cols)

y = Surv.from_dataframe(event=target[0], time=target[1], data=df[target])

In [3]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed keeps the split stable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [4]:


# Fit a gradient-boosted survival model on the hold-out training split.
# The estimator is GradientBoostingSurvivalAnalysis, not a Cox proportional
# hazards regression; the `cox*` variable names are kept only so that any other
# cell referring to them keeps working.
# random_state is fixed so that re-running the cell reproduces the same scores.
cox = GradientBoostingSurvivalAnalysis(random_state=42)
cox.fit(X_train, y_train)

# Evaluate with the IPCW concordance index truncated at tau=7.
# The first argument (y_train in both calls) is the sample used to estimate the
# censoring distribution; element [0] of the returned tuple is the index itself.
cox_cindex_train = concordance_index_ipcw(y_train, y_train, cox.predict(X_train), tau=7)[0]
cox_cindex_test = concordance_index_ipcw(y_train, y_test, cox.predict(X_test), tau=7)[0]
print(f"Gradient Boosting Survival Model Concordance Index IPCW on train: {cox_cindex_train:.4f}")
print(f"Gradient Boosting Survival Model Concordance Index IPCW on test: {cox_cindex_test:.4f}")

Cox Proportional Hazard Model Concordance Index IPCW on train: 0.7739
Cox Proportional Hazard Model Concordance Index IPCW on test: 0.7161


In [5]:

from sklearn.impute import SimpleImputer
from sklearn.model_selection import KFold

# 5-fold cross-validation of the default gradient-boosted survival model.
# `kf` is reused by the tuning objective and the tuned-model evaluation, so the
# baseline and tuned scores are computed on identical folds.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Per-fold IPCW concordance indices
cv_scores_train = []
cv_scores_test = []

for fold, (train_idx, test_idx) in enumerate(kf.split(X), 1):
    # Split data (features are used as-is; no imputation is performed here)
    X_train_fold = X.iloc[train_idx]
    X_test_fold = X.iloc[test_idx]
    y_train_fold = y[train_idx]
    y_test_fold = y[test_idx]

    # Train the gradient boosting survival model.
    # random_state is fixed so the baseline scores are reproducible and match
    # the seed used for the tuned models.
    cox_fold = GradientBoostingSurvivalAnalysis(random_state=42)
    cox_fold.fit(X_train_fold, y_train_fold)

    # Evaluate with the IPCW concordance index truncated at tau=7.
    # The full `y` is passed as the sample used to estimate the censoring
    # distribution, the same way the tuning objective scores its folds.
    train_score = concordance_index_ipcw(y, y_train_fold, cox_fold.predict(X_train_fold), tau=7)[0]
    test_score = concordance_index_ipcw(y, y_test_fold, cox_fold.predict(X_test_fold), tau=7)[0]

    cv_scores_train.append(train_score)
    cv_scores_test.append(test_score)

    print(f"Fold {fold} - Train C-Index IPCW: {train_score:.4f}, Test C-Index IPCW: {test_score:.4f}")

# Print average scores
print(f"\nAverage Train C-Index IPCW: {np.mean(cv_scores_train):.4f} (+/- {np.std(cv_scores_train):.4f})")
print(f"Average Test C-Index IPCW: {np.mean(cv_scores_test):.4f} (+/- {np.std(cv_scores_test):.4f})")


Fold 1 - Train C-Index IPCW: 0.7642, Test C-Index IPCW: 0.7273
Fold 2 - Train C-Index IPCW: 0.7736, Test C-Index IPCW: 0.6854
Fold 3 - Train C-Index IPCW: 0.7640, Test C-Index IPCW: 0.7307
Fold 4 - Train C-Index IPCW: 0.7659, Test C-Index IPCW: 0.7164
Fold 5 - Train C-Index IPCW: 0.7694, Test C-Index IPCW: 0.6989

Average Train C-Index IPCW: 0.7674 (+/- 0.0036)
Average Test C-Index IPCW: 0.7117 (+/- 0.0172)


## Hyperparameter Tuning with Optuna

Using Optuna to optimize GradientBoostingSurvivalAnalysis hyperparameters with cross-validation.

In [None]:
import optuna
from optuna.visualization import plot_optimization_history, plot_param_importances


def objective(trial):
    """Score one Optuna trial by cross-validated IPCW concordance.

    Samples a GradientBoostingSurvivalAnalysis configuration from ``trial``,
    fits it on the training part of every split that the notebook-level ``kf``
    produces over ``X``/``y``, and scores the held-out part with
    ``concordance_index_ipcw`` (``tau=7``, censoring estimated from the full ``y``).

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Mean of the per-fold held-out concordance indices.
    """
    # Search space; the seed is fixed rather than searched.
    gb_params = dict(
        n_estimators=trial.suggest_int('n_estimators', 50, 300),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        max_depth=trial.suggest_int('max_depth', 2, 8),
        min_samples_split=trial.suggest_int('min_samples_split', 2, 20),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 1, 10),
        max_features=trial.suggest_categorical('max_features', ['sqrt', 'log2', None]),
        subsample=trial.suggest_float('subsample', 0.5, 1.0),
        dropout_rate=trial.suggest_float('dropout_rate', 0.0, 0.3),
        random_state=42,
    )

    fold_scores = []
    for fit_rows, holdout_rows in kf.split(X):
        estimator = GradientBoostingSurvivalAnalysis(**gb_params)
        estimator.fit(X.iloc[fit_rows], y[fit_rows])

        # Only the held-out score drives the optimisation.
        holdout_risk = estimator.predict(X.iloc[holdout_rows])
        ipcw_result = concordance_index_ipcw(y, y[holdout_rows], holdout_risk, tau=7)
        fold_scores.append(ipcw_result[0])

    return np.mean(fold_scores)


# Seed the sampler so the sequence of suggested hyperparameters, and therefore
# the selected best_params, is the same on every run. TPESampler is the sampler
# Optuna uses by default; only the seed is new.
sampler = optuna.samplers.TPESampler(seed=42)
study = optuna.create_study(
    direction='maximize',
    study_name='gb_survival_optimization',
    sampler=sampler,
)

# Each trial runs a full 5-fold cross-validation inside `objective`, so this cell is slow.
study.optimize(objective, n_trials=100, show_progress_bar=True)

print(f"\nBest C-Index: {study.best_value:.4f}")
print("\nBest hyperparameters:")
for key, value in study.best_params.items():
    print(f"  {key}: {value}")

[I 2025-11-12 22:48:52,608] A new study created in memory with name: gb_survival_optimization


  0%|          | 0/100 [00:00<?, ?it/s]

In [13]:
# Show how the objective value evolved over the trials of the finished study.
plot_optimization_history(study)

In [14]:
# Show Optuna's estimate of how much each hyperparameter influenced the objective.
plot_param_importances(study)

In [15]:
# Re-score the best Optuna configuration on the same folds as the baseline.
# The seed is not part of the search space, so it is added back here.
# NOTE(review): these folds are the ones the search optimised against, so the
# tuned test scores are likely optimistic.
best_params = {**study.best_params, 'random_state': 42}

cv_scores_train_tuned, cv_scores_test_tuned = [], []

for fold, (train_idx, test_idx) in enumerate(kf.split(X), start=1):
    X_train_fold, X_test_fold = X.iloc[train_idx], X.iloc[test_idx]
    y_train_fold, y_test_fold = y[train_idx], y[test_idx]

    model_tuned = GradientBoostingSurvivalAnalysis(**best_params)
    model_tuned.fit(X_train_fold, y_train_fold)

    # Risk scores for both parts of the split, then IPCW concordance at tau=7.
    risk_train = model_tuned.predict(X_train_fold)
    risk_test = model_tuned.predict(X_test_fold)
    train_score = concordance_index_ipcw(y, y_train_fold, risk_train, tau=7)[0]
    test_score = concordance_index_ipcw(y, y_test_fold, risk_test, tau=7)[0]

    cv_scores_train_tuned.append(train_score)
    cv_scores_test_tuned.append(test_score)

    print(f"Fold {fold} - Train C-Index: {train_score:.4f}, Test C-Index: {test_score:.4f}")

# Summary statistics; the baseline numbers come from the earlier cross-validation cell.
tuned_train_mean, tuned_train_std = np.mean(cv_scores_train_tuned), np.std(cv_scores_train_tuned)
tuned_test_mean, tuned_test_std = np.mean(cv_scores_test_tuned), np.std(cv_scores_test_tuned)
baseline_test_mean, baseline_test_std = np.mean(cv_scores_test), np.std(cv_scores_test)

print(f"\nTuned Model - Average Train C-Index: {tuned_train_mean:.4f} (+/- {tuned_train_std:.4f})")
print(f"Tuned Model - Average Test C-Index: {tuned_test_mean:.4f} (+/- {tuned_test_std:.4f})")
print(f"\nBaseline Model - Average Test C-Index: {baseline_test_mean:.4f} (+/- {baseline_test_std:.4f})")
print(f"Improvement: {tuned_test_mean - baseline_test_mean:.4f}")

Fold 1 - Train C-Index: 0.7606, Test C-Index: 0.7241
Fold 2 - Train C-Index: 0.7650, Test C-Index: 0.6885
Fold 3 - Train C-Index: 0.7561, Test C-Index: 0.7296
Fold 4 - Train C-Index: 0.7577, Test C-Index: 0.7097
Fold 5 - Train C-Index: 0.7619, Test C-Index: 0.6988

Tuned Model - Average Train C-Index: 0.7603 (+/- 0.0031)
Tuned Model - Average Test C-Index: 0.7101 (+/- 0.0153)

Baseline Model - Average Test C-Index: 0.7049 (+/- 0.0154)
Improvement: 0.0052


In [16]:
# Refit the tuned configuration on every training row.
cox_tuned = GradientBoostingSurvivalAnalysis(**best_params)
cox_tuned.fit(X, y)

# Score the evaluation table; "ID" is an identifier, not a feature.
# NOTE(review): `eval` here is the evaluation DataFrame loaded in the first cell,
# which shadows Python's built-in `eval`.
eval_features = eval.drop(columns=["ID"])
prediction_tuned = cox_tuned.predict(eval_features)

# One risk score per evaluation ID, written out as the submission file.
submission_tuned = pd.Series(prediction_tuned, index=eval['ID'], name='risk_score')
submission_tuned.to_csv('../../submissions/gb_survival_tuned.csv')
print("Tuned model predictions saved to submissions/gb_survival_tuned.csv")


Tuned model predictions saved to submissions/gb_survival_tuned.csv
