## Rewriting notebook to use CoxPH instead of CoxNet

In [1]:
from sksurv.linear_model import CoxPHSurvivalAnalysis
import pandas as pd
from sksurv.util import Surv
import numpy as np
from sksurv.metrics import concordance_index_ipcw

# Make DataFrame/Series `.plot` calls render with plotly.
pd.options.plotting.backend = "plotly"

# Training data (features + survival targets) and the evaluation set that is
# scored for the submission. Each file is read exactly once.
df = pd.read_csv('../../data/train_enhanced.csv', sep=',')
# NOTE(review): `eval` shadows the Python builtin of the same name; the name is
# kept because a later cell reads it when building the submission.
eval = pd.read_csv('../../data/eval_enhanced.csv', sep=',')

# Event indicator and time columns; everything else except "ID" is a feature.
target = ["OS_STATUS", "OS_YEARS"]
X = df.drop(columns=target + ["ID"])
# Survival target built from the (event, time) columns named in `target`.
y = Surv.from_dataframe(*target, df[target])
from sklearn.model_selection import train_test_split

# 70/30 hold-out split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [2]:
# Baseline: Cox PH model with default settings, fitted on the hold-out training split.
cox = CoxPHSurvivalAnalysis()
cox.fit(X_train, y_train)

# Risk scores for both splits, computed once and reused below.
risk_scores_train = cox.predict(X_train)
risk_scores_test = cox.predict(X_test)

# Element 0 of the returned tuple is the concordance index itself.
# `y_train` is passed as the reference sample for both evaluations;
# tau=7 truncates the evaluation window (presumably years, given OS_YEARS — confirm).
cox_cindex_train = concordance_index_ipcw(
    y_train, y_train, risk_scores_train, tau=7
)[0]
cox_cindex_test = concordance_index_ipcw(
    y_train, y_test, risk_scores_test, tau=7
)[0]

print(f"Cox Proportional Hazard Model Concordance Index IPCW on train: {cox_cindex_train:.4f}")
print(f"Cox Proportional Hazard Model Concordance Index IPCW on test: {cox_cindex_test:.4f}")

Cox Proportional Hazard Model Concordance Index IPCW on train: 0.7105
Cox Proportional Hazard Model Concordance Index IPCW on test: 0.7079


In [3]:
from sklearn.model_selection import KFold

# 5-fold cross-validation of the default (untuned) Cox PH model on the full
# training data. Shuffling is seeded so the folds are identical on every run.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Per-fold IPCW concordance indices, in fold order.
cv_scores_train = []
cv_scores_test = []

for fold, (train_idx, test_idx) in enumerate(kf.split(X), 1):
    # `.iloc` already returns DataFrames that keep X's columns and the selected
    # index labels, so the slices are used directly without re-wrapping.
    X_train_fold = X.iloc[train_idx]
    X_test_fold = X.iloc[test_idx]
    y_train_fold = y[train_idx]
    y_test_fold = y[test_idx]

    cox_fold = CoxPHSurvivalAnalysis()
    cox_fold.fit(X_train_fold, y_train_fold)

    # Element 0 of the returned tuple is the concordance index.
    # NOTE(review): the reference sample passed as first argument is the full `y`
    # (all folds), not `y_train_fold` — confirm this is intended.
    train_score = concordance_index_ipcw(y, y_train_fold, cox_fold.predict(X_train_fold), tau=7)[0]
    test_score = concordance_index_ipcw(y, y_test_fold, cox_fold.predict(X_test_fold), tau=7)[0]

    cv_scores_train.append(train_score)
    cv_scores_test.append(test_score)

    print(f"Fold {fold} - Train C-Index IPCW: {train_score:.4f}, Test C-Index IPCW: {test_score:.4f}")

print(f"\nAverage Train C-Index IPCW: {np.mean(cv_scores_train):.4f} (+/- {np.std(cv_scores_train):.4f})")
print(f"Average Test C-Index IPCW: {np.mean(cv_scores_test):.4f} (+/- {np.std(cv_scores_test):.4f})")

Fold 1 - Train C-Index IPCW: 0.7098, Test C-Index IPCW: 0.7117
Fold 2 - Train C-Index IPCW: 0.7147, Test C-Index IPCW: 0.6940
Fold 3 - Train C-Index IPCW: 0.7071, Test C-Index IPCW: 0.7267
Fold 4 - Train C-Index IPCW: 0.7102, Test C-Index IPCW: 0.7104
Fold 5 - Train C-Index IPCW: 0.7124, Test C-Index IPCW: 0.6943

Average Train C-Index IPCW: 0.7108 (+/- 0.0026)
Average Test C-Index IPCW: 0.7074 (+/- 0.0123)


In [4]:
def objective(trial):
    """Optuna objective: mean 5-fold test C-index (IPCW) of a Cox PH model.

    Parameters
    ----------
    trial
        Object exposing ``suggest_float`` / ``suggest_int``; used to sample
        ``alpha`` and ``tol`` (both on a log scale) and ``n_iter``.

    Returns
    -------
    float
        Average over the folds of the test-fold concordance index, evaluated
        with ``tau=7``.
    """
    # Sample in the order alpha -> n_iter -> tol.
    alpha = trial.suggest_float('alpha', 1e-6, 10.0, log=True)
    n_iter = trial.suggest_int('n_iter', 100, 1000)
    tol = trial.suggest_float('tol', 1e-9, 1e-3, log=True)

    # Same seeded splitter for every trial, so all trials see identical folds.
    splitter = KFold(n_splits=5, shuffle=True, random_state=42)
    fold_scores = []

    for fit_idx, val_idx in splitter.split(X):
        model = CoxPHSurvivalAnalysis(alpha=alpha, n_iter=n_iter, tol=tol, verbose=0)
        model.fit(X.iloc[fit_idx], y[fit_idx])

        # Only the held-out fold is scored; the full `y` is the reference sample.
        risk = model.predict(X.iloc[val_idx])
        fold_scores.append(concordance_index_ipcw(y, y[val_idx], risk, tau=7)[0])

    return np.mean(fold_scores)

In [6]:
import optuna

# Maximise the cross-validated C-index returned by `objective`.
# The sampler is seeded so that the search proposes the same sequence of
# trials (and therefore the same best parameters) on every fresh run.
study = optuna.create_study(
    direction='maximize',
    study_name='CoxPHSurvivalAnalysis_tuning',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=250, show_progress_bar=True)

[I 2025-10-01 18:20:28,265] A new study created in memory with name: CoxPHSurvivalAnalysis_tuning


  0%|          | 0/250 [00:00<?, ?it/s]

[I 2025-10-01 18:20:29,192] Trial 0 finished with value: 0.7074499362781255 and parameters: {'alpha': 0.03812383095508565, 'n_iter': 178, 'tol': 1.3782344441786312e-06}. Best is trial 0 with value: 0.7074499362781255.
[I 2025-10-01 18:20:30,062] Trial 1 finished with value: 0.7074978453218979 and parameters: {'alpha': 0.30414077974576414, 'n_iter': 327, 'tol': 1.6985444474720268e-06}. Best is trial 1 with value: 0.7074978453218979.
[I 2025-10-01 18:20:30,943] Trial 2 finished with value: 0.7075755336985853 and parameters: {'alpha': 6.104674321138969, 'n_iter': 624, 'tol': 7.491007392433392e-08}. Best is trial 2 with value: 0.7075755336985853.
[I 2025-10-01 18:20:31,819] Trial 3 finished with value: 0.707429923979807 and parameters: {'alpha': 1.0583818340262218e-05, 'n_iter': 198, 'tol': 1.148214965181371e-07}. Best is trial 2 with value: 0.7075755336985853.
[I 2025-10-01 18:20:32,731] Trial 4 finished with value: 0.707429923979807 and parameters: {'alpha': 1.6685190264824868e-05, 'n_it

In [7]:
from optuna.visualization import plot_optimization_history, plot_param_importances

# Summarise the winning trial: its objective value, then each tuned parameter.
best_trial = study.best_trial
print(f"Best trial value (C-Index IPCW): {best_trial.value:.4f}")
print("\nBest hyperparameters:")
for param_name, param_value in study.best_params.items():
    print(f"  {param_name}: {param_value}")

Best trial value (C-Index IPCW): 0.7077

Best hyperparameters:
  alpha: 9.560874711411728
  n_iter: 570
  tol: 3.921651247591744e-08


In [8]:
# Plot the optimization history of the study and display the figure.
fig = plot_optimization_history(study)
fig.show()

In [9]:
# Plot the hyperparameter importances of the study and display the figure.
fig = plot_param_importances(study)
fig.show()

## Rerun Cross-Validation with Best Hyperparameters

In [10]:
# Same seeded 5-fold split as the baseline CV, so per-fold scores are comparable.
kf_best = KFold(n_splits=5, shuffle=True, random_state=42)

# Per-fold IPCW concordance indices for the tuned model, in fold order.
cv_scores_train_best = []
cv_scores_test_best = []

print(f"Running CV with best parameters: {study.best_params}\n")

for fold, (train_idx, test_idx) in enumerate(kf_best.split(X), 1):
    # `.iloc` already returns DataFrames that keep X's columns and the selected
    # index labels, so the slices are used directly without re-wrapping.
    X_train_fold = X.iloc[train_idx]
    X_test_fold = X.iloc[test_idx]
    y_train_fold = y[train_idx]
    y_test_fold = y[test_idx]

    # Refit on each fold with the best hyperparameters found by the study.
    cox_best = CoxPHSurvivalAnalysis(**study.best_params)
    cox_best.fit(X_train_fold, y_train_fold)

    # Element 0 of the returned tuple is the concordance index.
    # NOTE(review): the reference sample passed as first argument is the full `y`
    # (all folds), not `y_train_fold` — confirm this is intended.
    train_score = concordance_index_ipcw(y, y_train_fold, cox_best.predict(X_train_fold), tau=7)[0]
    test_score = concordance_index_ipcw(y, y_test_fold, cox_best.predict(X_test_fold), tau=7)[0]

    cv_scores_train_best.append(train_score)
    cv_scores_test_best.append(test_score)

    print(f"Fold {fold} - Train C-Index IPCW: {train_score:.4f}, Test C-Index IPCW: {test_score:.4f}")

print(f"\nAverage Train C-Index IPCW: {np.mean(cv_scores_train_best):.4f} (+/- {np.std(cv_scores_train_best):.4f})")
print(f"Average Test C-Index IPCW: {np.mean(cv_scores_test_best):.4f} (+/- {np.std(cv_scores_test_best):.4f})")

Running CV with best parameters: {'alpha': 9.560874711411728, 'n_iter': 570, 'tol': 3.921651247591744e-08}

Fold 1 - Train C-Index IPCW: 0.7099, Test C-Index IPCW: 0.7120
Fold 2 - Train C-Index IPCW: 0.7147, Test C-Index IPCW: 0.6941
Fold 3 - Train C-Index IPCW: 0.7071, Test C-Index IPCW: 0.7271
Fold 4 - Train C-Index IPCW: 0.7102, Test C-Index IPCW: 0.7108
Fold 5 - Train C-Index IPCW: 0.7126, Test C-Index IPCW: 0.6945

Average Train C-Index IPCW: 0.7109 (+/- 0.0026)
Average Test C-Index IPCW: 0.7077 (+/- 0.0124)


In [11]:
# One column per score list, one row per CV fold. The number of fold labels
# follows the length of the score lists instead of being hard-coded.
fold_scores = pd.DataFrame({
    'Baseline Train': cv_scores_train,
    'Baseline Test': cv_scores_test,
    'Tuned Train': cv_scores_train_best,
    'Tuned Test': cv_scores_test_best
}, index=[f'Fold {i + 1}' for i in range(len(cv_scores_train))])

# Both summary rows are computed from the fold rows only, so the 'Mean' row
# never feeds into 'Std'. ddof=0 (population std) matches the np.std values
# printed by the cross-validation cells.
cv_results_comparison = fold_scores.copy()
cv_results_comparison.loc['Mean'] = fold_scores.mean()
cv_results_comparison.loc['Std'] = fold_scores.std(ddof=0)

cv_results_comparison

Unnamed: 0,Baseline Train,Baseline Test,Tuned Train,Tuned Test
Fold 1,0.709768,0.711707,0.709916,0.712015
Fold 2,0.714671,0.694003,0.714688,0.694078
Fold 3,0.707065,0.726746,0.707116,0.727132
Fold 4,0.710176,0.710404,0.710189,0.710844
Fold 5,0.712428,0.694289,0.712567,0.694463
Mean,0.710822,0.70743,0.710895,0.707706
Std,0.002571,0.012274,0.002566,0.012385


In [12]:
# Refit on the complete training data with the tuned hyperparameters.
cox_final = CoxPHSurvivalAnalysis(**study.best_params)
cox_final.fit(X, y)

# Score the evaluation set; "ID" is an identifier, not a feature.
eval_features = eval.drop(columns=["ID"])
prediction = cox_final.predict(eval_features)

# One risk score per evaluation ID, written as a CSV indexed by ID.
submission = pd.Series(data=prediction, index=eval['ID'], name='risk_score')
submission.to_csv('../../submissions/coxph_enhanced_tuned.csv')

In [13]:
# Confirmation message; a plain string, since there is nothing to interpolate.
print("Tuned model predictions saved to: coxph_enhanced_tuned.csv")


Tuned model predictions saved to: coxph_enhanced_tuned.csv
