In [1]:
import pandas as pd
from sksurv.util import Surv
import numpy as np
from sksurv.metrics import concordance_index_ipcw
from sksurv.linear_model import CoxnetSurvivalAnalysis

pd.options.plotting.backend = "plotly"

df = pd.read_csv('../../data/train_enhanced.csv', sep =',')
eval = pd.read_csv('../../data/eval_enhanced.csv', sep =',')

In [2]:
target = ["OS_STATUS", "OS_YEARS"]
X = df.drop(columns = target + ["ID"])
y = Surv.from_dataframe(*target, df[target])

In [3]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [4]:
# Initialize and train the Cox Proportional Hazards model
cox = CoxnetSurvivalAnalysis()
cox.fit(X_train, y_train)

# Evaluate the model using Concordance Index IPCW
cox_cindex_train = concordance_index_ipcw(y_train, y_train, cox.predict(X_train), tau=7)[0]
cox_cindex_test = concordance_index_ipcw(y_train, y_test, cox.predict(X_test), tau=7)[0]
print(f"Cox Proportional Hazard Model Concordance Index IPCW on train: {cox_cindex_train:.4f}")
print(f"Cox Proportional Hazard Model Concordance Index IPCW on test: {cox_cindex_test:.4f}")

Cox Proportional Hazard Model Concordance Index IPCW on train: 0.7174
Cox Proportional Hazard Model Concordance Index IPCW on test: 0.7078


In [5]:

from sklearn.impute import SimpleImputer
from sklearn.model_selection import KFold

# Initialize K-Fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Store cross-validation scores
cv_scores_train = []
cv_scores_test = []

# Perform K-Fold cross-validation
for fold, (train_idx, test_idx) in enumerate(kf.split(X), 1):
    # Split data
    X_train_fold = X.iloc[train_idx]
    X_test_fold = X.iloc[test_idx]
    y_train_fold = y[train_idx]
    y_test_fold = y[test_idx]

    # Impute missing values
    X_train_fold = pd.DataFrame(
        X_train_fold,
        columns=X_train_fold.columns,
        index=X_train_fold.index
    )
    X_test_fold = pd.DataFrame(
        X_test_fold,
        columns=X_test_fold.columns,
        index=X_test_fold.index
    )

    # Train Cox model
    cox_fold = CoxnetSurvivalAnalysis()
    cox_fold.fit(X_train_fold, y_train_fold)

    # Evaluate
    train_score = concordance_index_ipcw(y, y_train_fold, cox_fold.predict(X_train_fold), tau=7)[0]
    test_score = concordance_index_ipcw(y, y_test_fold, cox_fold.predict(X_test_fold), tau=7)[0]

    cv_scores_train.append(train_score)
    cv_scores_test.append(test_score)

    print(f"Fold {fold} - Train C-Index IPCW: {train_score:.4f}, Test C-Index IPCW: {test_score:.4f}")

# Print average scores
print(f"\nAverage Train C-Index IPCW: {np.mean(cv_scores_train):.4f} (+/- {np.std(cv_scores_train):.4f})")
print(f"Average Test C-Index IPCW: {np.mean(cv_scores_test):.4f} (+/- {np.std(cv_scores_test):.4f})")


Fold 1 - Train C-Index IPCW: 0.7156, Test C-Index IPCW: 0.7148
Fold 2 - Train C-Index IPCW: 0.7229, Test C-Index IPCW: 0.6862
Fold 3 - Train C-Index IPCW: 0.7127, Test C-Index IPCW: 0.7301
Fold 4 - Train C-Index IPCW: 0.7156, Test C-Index IPCW: 0.7213
Fold 5 - Train C-Index IPCW: 0.7213, Test C-Index IPCW: 0.6886

Average Train C-Index IPCW: 0.7176 (+/- 0.0038)
Average Test C-Index IPCW: 0.7082 (+/- 0.0177)


In [6]:
import optuna
from optuna.visualization import plot_optimization_history, plot_param_importances

In [7]:
def objective(trial):
    # Define hyperparameters to tune
    params = {
        'l1_ratio': trial.suggest_float('l1_ratio', 0.0, 1.0),
        'alpha_min_ratio': trial.suggest_float('alpha_min_ratio', 1e-8, 0.1, log=True),
        'n_alphas': trial.suggest_int('n_alphas', 50, 200),
        'max_iter': trial.suggest_int('max_iter', 100000, 1000000),
        'tol': trial.suggest_float('tol', 1e-8, 1e-4, log=True)
    }

    # K-Fold cross-validation
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    cv_scores = []

    for train_idx, test_idx in kf.split(X):
        X_train_fold = X.iloc[train_idx]
        X_test_fold = X.iloc[test_idx]
        y_train_fold = y[train_idx]
        y_test_fold = y[test_idx]

        # Train model with trial parameters
        cox_trial = CoxnetSurvivalAnalysis(**params)
        cox_trial.fit(X_train_fold, y_train_fold)

        # Evaluate
        test_score = concordance_index_ipcw(y, y_test_fold, cox_trial.predict(X_test_fold), tau=7)[0]
        cv_scores.append(test_score)

    return np.mean(cv_scores)

In [8]:
study = optuna.create_study(direction='maximize', study_name='CoxnetSurvivalAnalysis_tuning')
study.optimize(objective, n_trials=250, show_progress_bar=True)

[I 2025-11-12 22:52:32,580] A new study created in memory with name: CoxnetSurvivalAnalysis_tuning


  0%|          | 0/250 [00:00<?, ?it/s]

[I 2025-11-12 22:52:33,040] Trial 0 finished with value: 0.7106958427813157 and parameters: {'l1_ratio': 0.8156159753776379, 'alpha_min_ratio': 0.007658031130790273, 'n_alphas': 146, 'max_iter': 293008, 'tol': 2.3728841829608086e-08}. Best is trial 0 with value: 0.7106958427813157.
[I 2025-11-12 22:52:33,251] Trial 1 finished with value: 0.7084885016655733 and parameters: {'l1_ratio': 0.9213917161094937, 'alpha_min_ratio': 0.0011377742436759213, 'n_alphas': 135, 'max_iter': 202452, 'tol': 1.4927023325311156e-05}. Best is trial 0 with value: 0.7106958427813157.
[I 2025-11-12 22:52:33,454] Trial 2 finished with value: 0.7093234752938047 and parameters: {'l1_ratio': 0.6921567951937886, 'alpha_min_ratio': 0.06547995429651271, 'n_alphas': 167, 'max_iter': 798571, 'tol': 6.419323299358893e-05}. Best is trial 0 with value: 0.7106958427813157.
[I 2025-11-12 22:52:33,612] Trial 3 finished with value: 0.7081087161352588 and parameters: {'l1_ratio': 0.5986839343880617, 'alpha_min_ratio': 4.669519

In [9]:
print(f"Best trial value (C-Index IPCW): {study.best_trial.value:.4f}")
print("\nBest hyperparameters:")
for key, value in study.best_params.items():
    print(f"  {key}: {value}")

Best trial value (C-Index IPCW): 0.7119

Best hyperparameters:
  l1_ratio: 0.8928717284248087
  alpha_min_ratio: 0.015539411990938067
  n_alphas: 50
  max_iter: 649436
  tol: 1.4827818294657812e-05


In [10]:
fig = plot_optimization_history(study)
fig.show()

In [11]:
fig = plot_param_importances(study)
fig.show()

## Rerun Cross-Validation with Best Hyperparameters

In [12]:
# Initialize K-Fold cross-validation with best parameters
kf_best = KFold(n_splits=5, shuffle=True, random_state=42)

cv_scores_train_best = []
cv_scores_test_best = []

print(f"Running CV with best parameters: {study.best_params}\n")

for fold, (train_idx, test_idx) in enumerate(kf_best.split(X), 1):
    X_train_fold = X.iloc[train_idx]
    X_test_fold = X.iloc[test_idx]
    y_train_fold = y[train_idx]
    y_test_fold = y[test_idx]

    X_train_fold = pd.DataFrame(
        X_train_fold,
        columns=X_train_fold.columns,
        index=X_train_fold.index
    )
    X_test_fold = pd.DataFrame(
        X_test_fold,
        columns=X_test_fold.columns,
        index=X_test_fold.index
    )

    cox_best = CoxnetSurvivalAnalysis(**study.best_params)
    cox_best.fit(X_train_fold, y_train_fold)

    train_score = concordance_index_ipcw(y, y_train_fold, cox_best.predict(X_train_fold), tau=7)[0]
    test_score = concordance_index_ipcw(y, y_test_fold, cox_best.predict(X_test_fold), tau=7)[0]

    cv_scores_train_best.append(train_score)
    cv_scores_test_best.append(test_score)

    print(f"Fold {fold} - Train C-Index IPCW: {train_score:.4f}, Test C-Index IPCW: {test_score:.4f}")

print(f"\nAverage Train C-Index IPCW: {np.mean(cv_scores_train_best):.4f} (+/- {np.std(cv_scores_train_best):.4f})")
print(f"Average Test C-Index IPCW: {np.mean(cv_scores_test_best):.4f} (+/- {np.std(cv_scores_test_best):.4f})")

Running CV with best parameters: {'l1_ratio': 0.8928717284248087, 'alpha_min_ratio': 0.015539411990938067, 'n_alphas': 50, 'max_iter': 649436, 'tol': 1.4827818294657812e-05}

Fold 1 - Train C-Index IPCW: 0.7149, Test C-Index IPCW: 0.7177
Fold 2 - Train C-Index IPCW: 0.7217, Test C-Index IPCW: 0.6945
Fold 3 - Train C-Index IPCW: 0.7122, Test C-Index IPCW: 0.7346
Fold 4 - Train C-Index IPCW: 0.7157, Test C-Index IPCW: 0.7187
Fold 5 - Train C-Index IPCW: 0.7200, Test C-Index IPCW: 0.6939

Average Train C-Index IPCW: 0.7169 (+/- 0.0035)
Average Test C-Index IPCW: 0.7119 (+/- 0.0156)


In [13]:
cv_results_comparison = pd.DataFrame({
    'Baseline Train': cv_scores_train,
    'Baseline Test': cv_scores_test,
    'Tuned Train': cv_scores_train_best,
    'Tuned Test': cv_scores_test_best
}, index=[f'Fold {i + 1}' for i in range(5)])

cv_results_comparison.loc['Mean'] = cv_results_comparison.mean()
cv_results_comparison.loc['Std'] = cv_results_comparison.std()

cv_results_comparison


Unnamed: 0,Baseline Train,Baseline Test,Tuned Train,Tuned Test
Fold 1,0.715575,0.714791,0.714884,0.717696
Fold 2,0.722896,0.68623,0.721742,0.694456
Fold 3,0.712713,0.730122,0.712184,0.734604
Fold 4,0.715649,0.721341,0.715667,0.718732
Fold 5,0.721337,0.688555,0.720039,0.693919
Mean,0.717634,0.708208,0.716903,0.711881
Std,0.003842,0.017693,0.003497,0.015642


In [14]:
cox_final = CoxnetSurvivalAnalysis(**study.best_params)
cox_final.fit(X, y)
prediction = cox_final.predict(eval.drop(columns=["ID"]))
submission = pd.Series(prediction, index=eval['ID'], name='risk_score')
submission.to_csv('../../submissions/coxph_enhanced_tuned.csv')

In [15]:
print(f"Tuned model predictions saved to: coxph_enhanced_tuned.csv")


Tuned model predictions saved to: coxph_enhanced_tuned.csv
