## CoxPH survival model: baseline, cross-validation and Optuna tuning (rewritten from the CoxNet notebook)

In [1]:
from sksurv.linear_model import CoxPHSurvivalAnalysis
import pandas as pd
from sksurv.util import Surv
import numpy as np
from sksurv.metrics import concordance_index_ipcw

from sklearn.model_selection import train_test_split

# Route DataFrame.plot calls through plotly.
pd.options.plotting.backend = "plotly"

# Training table (features + survival targets) and the evaluation table that
# is scored for the submission at the end of the notebook. Each file is read
# exactly once.
df = pd.read_csv('../../data/train_enhanced.csv', sep=',')
# NOTE(review): `eval` shadows the Python builtin of the same name. The name is
# kept here because the final prediction cell reads it; rename both together.
eval = pd.read_csv('../../data/eval_enhanced.csv', sep=',')

# Survival target built from the OS_STATUS (event) and OS_YEARS (time) columns;
# the features are every remaining column except the ID.
target = ["OS_STATUS", "OS_YEARS"]
X = df.drop(columns=target + ["ID"])
y = Surv.from_dataframe(*target, df[target])

# 70/30 hold-out split with a fixed seed so the baseline scores are repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [2]:
# Baseline: CoxPH with default settings, fitted on the hold-out training split.
cox = CoxPHSurvivalAnalysis()
cox.fit(X_train, y_train)

# Risk scores for both partitions of the hold-out split.
risk_train = cox.predict(X_train)
risk_test = cox.predict(X_test)

# y_train is the first argument in both calls: it is the sample the IPCW
# censoring distribution is estimated from. Scores are truncated at tau=7.
cox_cindex_train = concordance_index_ipcw(y_train, y_train, risk_train, tau=7)[0]
cox_cindex_test = concordance_index_ipcw(y_train, y_test, risk_test, tau=7)[0]

print(f"Cox Proportional Hazard Model Concordance Index IPCW on train: {cox_cindex_train:.4f}")
print(f"Cox Proportional Hazard Model Concordance Index IPCW on test: {cox_cindex_test:.4f}")

Cox Proportional Hazard Model Concordance Index IPCW on train: 0.7170
Cox Proportional Hazard Model Concordance Index IPCW on test: 0.7072


In [3]:
from sklearn.model_selection import KFold

# 5-fold cross-validation of the default CoxPH model over the full training table.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Per-fold scores; read again later by the baseline-vs-tuned comparison table.
cv_scores_train = []
cv_scores_test = []

for fold, (train_idx, test_idx) in enumerate(kf.split(X), 1):
    # Positional selection of the fold rows. `.iloc` already returns DataFrames
    # that carry X's columns and index, so no re-wrapping is required.
    X_train_fold, X_test_fold = X.iloc[train_idx], X.iloc[test_idx]
    y_train_fold, y_test_fold = y[train_idx], y[test_idx]

    cox_fold = CoxPHSurvivalAnalysis()
    cox_fold.fit(X_train_fold, y_train_fold)

    # NOTE(review): the first argument (the sample used to estimate the IPCW
    # censoring distribution) is the full `y`, i.e. it includes the held-out
    # fold, whereas the hold-out evaluation above passes only y_train.
    # Left unchanged so the reported scores stay comparable - confirm intent.
    train_score = concordance_index_ipcw(y, y_train_fold, cox_fold.predict(X_train_fold), tau=7)[0]
    test_score = concordance_index_ipcw(y, y_test_fold, cox_fold.predict(X_test_fold), tau=7)[0]

    cv_scores_train.append(train_score)
    cv_scores_test.append(test_score)

    print(f"Fold {fold} - Train C-Index IPCW: {train_score:.4f}, Test C-Index IPCW: {test_score:.4f}")

# np.std uses ddof=0, i.e. the population standard deviation over the folds.
print(f"\nAverage Train C-Index IPCW: {np.mean(cv_scores_train):.4f} (+/- {np.std(cv_scores_train):.4f})")
print(f"Average Test C-Index IPCW: {np.mean(cv_scores_test):.4f} (+/- {np.std(cv_scores_test):.4f})")

Fold 1 - Train C-Index IPCW: 0.7152, Test C-Index IPCW: 0.7142
Fold 2 - Train C-Index IPCW: 0.7227, Test C-Index IPCW: 0.6856
Fold 3 - Train C-Index IPCW: 0.7120, Test C-Index IPCW: 0.7283
Fold 4 - Train C-Index IPCW: 0.7152, Test C-Index IPCW: 0.7214
Fold 5 - Train C-Index IPCW: 0.7214, Test C-Index IPCW: 0.6867

Average Train C-Index IPCW: 0.7173 (+/- 0.0041)
Average Test C-Index IPCW: 0.7072 (+/- 0.0178)


In [4]:
def objective(trial):
    """Optuna objective: mean held-out IPCW C-index of a CoxPH model over 5 folds.

    Samples `alpha`, `n_iter` and `tol` from the trial, fits one
    CoxPHSurvivalAnalysis per fold on the notebook-level `X` / `y`, and scores
    each held-out fold with `concordance_index_ipcw` truncated at tau=7.

    Args:
        trial: Optuna trial used to draw the hyperparameters.

    Returns:
        The mean of the five held-out fold scores.
    """
    # Keyword order fixes the order in which the trial is asked for values.
    hyperparams = dict(
        alpha=trial.suggest_float('alpha', 1e-6, 10.0, log=True),
        n_iter=trial.suggest_int('n_iter', 100, 1000),
        tol=trial.suggest_float('tol', 1e-9, 1e-3, log=True),
        verbose=0,
    )

    # Same seeded splitter in every trial, so all trials see identical folds.
    splitter = KFold(n_splits=5, shuffle=True, random_state=42)

    def _held_out_score(fit_rows, score_rows):
        # Fit on one part of the data, score the complementary part.
        model = CoxPHSurvivalAnalysis(**hyperparams)
        model.fit(X.iloc[fit_rows], y[fit_rows])
        risk = model.predict(X.iloc[score_rows])
        # The censoring distribution is estimated from the full `y`.
        return concordance_index_ipcw(y, y[score_rows], risk, tau=7)[0]

    fold_scores = [
        _held_out_score(fit_rows, score_rows)
        for fit_rows, score_rows in splitter.split(X)
    ]
    return np.mean(fold_scores)

In [5]:
import optuna

# Seed the sampler so that Restart & Run All explores the same sequence of
# trials and reports the same best hyperparameters. TPESampler is the sampler
# Optuna uses by default for single-objective studies, so only the
# randomness is pinned, not the search algorithm.
sampler = optuna.samplers.TPESampler(seed=42)

study = optuna.create_study(
    direction='maximize',  # the objective returns a C-index: higher is better
    study_name='CoxPHSurvivalAnalysis_tuning',
    sampler=sampler,
)
study.optimize(objective, n_trials=250, show_progress_bar=True)

[I 2025-11-12 22:52:11,136] A new study created in memory with name: CoxPHSurvivalAnalysis_tuning


  0%|          | 0/250 [00:00<?, ?it/s]

[I 2025-11-12 22:52:12,031] Trial 0 finished with value: 0.7073327512927179 and parameters: {'alpha': 0.0004711303956407106, 'n_iter': 452, 'tol': 0.0006973663048411936}. Best is trial 0 with value: 0.7073327512927179.
[I 2025-11-12 22:52:13,244] Trial 1 finished with value: 0.7072406332019049 and parameters: {'alpha': 0.0014962890959301174, 'n_iter': 952, 'tol': 8.130812200544664e-08}. Best is trial 0 with value: 0.7073327512927179.
[I 2025-11-12 22:52:14,324] Trial 2 finished with value: 0.7072360703618775 and parameters: {'alpha': 1.9848794152111897e-06, 'n_iter': 376, 'tol': 1.8907424687044996e-06}. Best is trial 0 with value: 0.7073327512927179.
[I 2025-11-12 22:52:15,640] Trial 3 finished with value: 0.7086167654743966 and parameters: {'alpha': 9.09888843450772, 'n_iter': 217, 'tol': 1.720358298098987e-09}. Best is trial 3 with value: 0.7086167654743966.
[I 2025-11-12 22:52:16,980] Trial 4 finished with value: 0.7072412768040713 and parameters: {'alpha': 0.0039091440126614654, 'n

In [6]:
from optuna.visualization import plot_optimization_history, plot_param_importances

# Report the winning trial: its objective value (mean held-out C-index)
# first, then every tuned hyperparameter on its own line.
best_score = study.best_trial.value
print(f"Best trial value (C-Index IPCW): {best_score:.4f}")

print("\nBest hyperparameters:")
for param_name, param_value in study.best_params.items():
    print(f"  {param_name}: {param_value}")

Best trial value (C-Index IPCW): 0.7088

Best hyperparameters:
  alpha: 6.348312310430074
  n_iter: 629
  tol: 2.349756261901387e-06


In [7]:
# Plot the study's optimization history to see how the search progressed
# across trials.
fig = plot_optimization_history(study)
fig.show()

In [8]:
# Plot Optuna's estimate of how much each tuned hyperparameter influenced the
# objective value.
fig = plot_param_importances(study)
fig.show()

## Rerun Cross-Validation with Best Hyperparameters

In [9]:
# Same seeded 5-fold split as the baseline cross-validation, so fold k here is
# directly comparable with fold k there.
kf_best = KFold(n_splits=5, shuffle=True, random_state=42)

# Per-fold scores of the tuned model; read later by the comparison table.
cv_scores_train_best = []
cv_scores_test_best = []

print(f"Running CV with best parameters: {study.best_params}\n")

for fold, (train_idx, test_idx) in enumerate(kf_best.split(X), 1):
    # Positional selection of the fold rows. `.iloc` already returns DataFrames
    # that carry X's columns and index, so no re-wrapping is required.
    X_train_fold, X_test_fold = X.iloc[train_idx], X.iloc[test_idx]
    y_train_fold, y_test_fold = y[train_idx], y[test_idx]

    cox_best = CoxPHSurvivalAnalysis(**study.best_params)
    cox_best.fit(X_train_fold, y_train_fold)

    # NOTE(review): the IPCW censoring distribution is estimated from the full
    # `y` (first argument), which includes the held-out fold. Kept as is so the
    # numbers match what the tuning objective optimised - confirm intent.
    train_score = concordance_index_ipcw(y, y_train_fold, cox_best.predict(X_train_fold), tau=7)[0]
    test_score = concordance_index_ipcw(y, y_test_fold, cox_best.predict(X_test_fold), tau=7)[0]

    cv_scores_train_best.append(train_score)
    cv_scores_test_best.append(test_score)

    print(f"Fold {fold} - Train C-Index IPCW: {train_score:.4f}, Test C-Index IPCW: {test_score:.4f}")

# np.std uses ddof=0, i.e. the population standard deviation over the folds.
print(f"\nAverage Train C-Index IPCW: {np.mean(cv_scores_train_best):.4f} (+/- {np.std(cv_scores_train_best):.4f})")
print(f"Average Test C-Index IPCW: {np.mean(cv_scores_test_best):.4f} (+/- {np.std(cv_scores_test_best):.4f})")

Running CV with best parameters: {'alpha': 6.348312310430074, 'n_iter': 629, 'tol': 2.349756261901387e-06}

Fold 1 - Train C-Index IPCW: 0.7156, Test C-Index IPCW: 0.7156
Fold 2 - Train C-Index IPCW: 0.7229, Test C-Index IPCW: 0.6871
Fold 3 - Train C-Index IPCW: 0.7127, Test C-Index IPCW: 0.7309
Fold 4 - Train C-Index IPCW: 0.7155, Test C-Index IPCW: 0.7215
Fold 5 - Train C-Index IPCW: 0.7212, Test C-Index IPCW: 0.6888

Average Train C-Index IPCW: 0.7176 (+/- 0.0038)
Average Test C-Index IPCW: 0.7088 (+/- 0.0177)


In [10]:
# Side-by-side table of the per-fold scores of the baseline and tuned models.
# Row labels follow the number of folds actually scored.
fold_labels = [f'Fold {i + 1}' for i in range(len(cv_scores_train))]
fold_scores = pd.DataFrame({
    'Baseline Train': cv_scores_train,
    'Baseline Test': cv_scores_test,
    'Tuned Train': cv_scores_train_best,
    'Tuned Test': cv_scores_test_best
}, index=fold_labels)

# Both summary rows are computed from the fold rows only. Taking .std() after
# the 'Mean' row has been appended would silently include that row in the
# calculation. ddof=0 matches the np.std values printed by the CV cells.
cv_results_comparison = fold_scores.copy()
cv_results_comparison.loc['Mean'] = fold_scores.mean()
cv_results_comparison.loc['Std'] = fold_scores.std(ddof=0)

cv_results_comparison

Unnamed: 0,Baseline Train,Baseline Test,Tuned Train,Tuned Test
Fold 1,0.715175,0.714174,0.715587,0.715644
Fold 2,0.722729,0.685648,0.722914,0.687118
Fold 3,0.712002,0.728276,0.712742,0.730883
Fold 4,0.715166,0.721431,0.715486,0.72147
Fold 5,0.721376,0.686674,0.72119,0.688824
Mean,0.71729,0.707241,0.717584,0.708788
Std,0.00408,0.017783,0.003827,0.017687


In [11]:
# Refit on the complete training table with the tuned hyperparameters.
cox_final = CoxPHSurvivalAnalysis(**study.best_params)
cox_final.fit(X, y)

# Score the evaluation table; its ID column is not a feature, it only labels
# the rows of the submission.
eval_features = eval.drop(columns=["ID"])
prediction = cox_final.predict(eval_features)

# One risk score per evaluation ID, written as a two-column CSV.
submission = pd.Series(data=prediction, index=eval['ID'], name='risk_score')
submission_path = '../../submissions/coxph_enhanced_tuned.csv'
submission.to_csv(submission_path)

In [12]:
# Confirmation message naming the submission file written by the previous cell.
print("Tuned model predictions saved to: coxph_enhanced_tuned.csv")


Tuned model predictions saved to: coxph_enhanced_tuned.csv
