In [1]:
import pandas as pd
from sksurv.util import Surv
import numpy as np
from sksurv.metrics import concordance_index_ipcw
from sksurv.linear_model import CoxnetSurvivalAnalysis

# Route pandas `.plot()` calls through plotly.
pd.set_option("plotting.backend", "plotly")

# Training and evaluation tables, read from paths relative to this notebook.
# NOTE(review): `eval` shadows the Python builtin of the same name; the name is
# kept because the submission cell reads the evaluation frame through it.
df = pd.read_csv('../../data/train_enhanced.csv')
eval = pd.read_csv('../../data/eval_enhanced.csv')

In [2]:
# Outcome columns, in the positional order Surv.from_dataframe takes them:
# event indicator first, observed time second.
target = ["OS_STATUS", "OS_YEARS"]
event_col, time_col = target

# Features are everything except the outcome columns and the row identifier.
X = df.drop(columns=[*target, "ID"])
y = Surv.from_dataframe(event_col, time_col, df[target])

In [3]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [4]:
# Fit a Coxnet model with its default settings on the training split.
cox = CoxnetSurvivalAnalysis()
cox.fit(X_train, y_train)

# Risk scores for both splits, computed once and reused below.
risk_train = cox.predict(X_train)
risk_test = cox.predict(X_test)

# Concordance Index IPCW with tau=7. `y_train` is passed as the first argument
# for both scores; only the evaluated sample changes. The metric returns a
# tuple whose first element is the index itself.
cox_cindex_train, *_ = concordance_index_ipcw(y_train, y_train, risk_train, tau=7)
cox_cindex_test, *_ = concordance_index_ipcw(y_train, y_test, risk_test, tau=7)

print(f"Cox Proportional Hazard Model Concordance Index IPCW on train: {cox_cindex_train:.4f}")
print(f"Cox Proportional Hazard Model Concordance Index IPCW on test: {cox_cindex_test:.4f}")

Cox Proportional Hazard Model Concordance Index IPCW on train: 0.7108
Cox Proportional Hazard Model Concordance Index IPCW on test: 0.7084


In [5]:

from sklearn.impute import SimpleImputer
from sklearn.model_selection import KFold

# Initialize K-Fold cross-validation (shuffled, fixed seed so folds are repeatable)
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Per-fold scores, appended in fold order
cv_scores_train = []
cv_scores_test = []

# Perform K-Fold cross-validation
for fold, (train_idx, test_idx) in enumerate(kf.split(X), 1):
    # Split data. No imputation or other preprocessing happens here: the fold
    # frames are handed to the model exactly as sliced from X.
    X_train_fold, X_test_fold = X.iloc[train_idx], X.iloc[test_idx]
    y_train_fold, y_test_fold = y[train_idx], y[test_idx]

    # Train Cox model on this fold's training rows only
    cox_fold = CoxnetSurvivalAnalysis()
    cox_fold.fit(X_train_fold, y_train_fold)

    # Evaluate. The first argument of concordance_index_ipcw is the sample the
    # censoring distribution is estimated from; the full `y` is passed here,
    # not just the training fold.
    train_score = concordance_index_ipcw(y, y_train_fold, cox_fold.predict(X_train_fold), tau=7)[0]
    test_score = concordance_index_ipcw(y, y_test_fold, cox_fold.predict(X_test_fold), tau=7)[0]

    cv_scores_train.append(train_score)
    cv_scores_test.append(test_score)

    print(f"Fold {fold} - Train C-Index IPCW: {train_score:.4f}, Test C-Index IPCW: {test_score:.4f}")

# Print average scores (np.std is the population standard deviation, ddof=0)
print(f"\nAverage Train C-Index IPCW: {np.mean(cv_scores_train):.4f} (+/- {np.std(cv_scores_train):.4f})")
print(f"Average Test C-Index IPCW: {np.mean(cv_scores_test):.4f} (+/- {np.std(cv_scores_test):.4f})")


Fold 1 - Train C-Index IPCW: 0.7101, Test C-Index IPCW: 0.7125
Fold 2 - Train C-Index IPCW: 0.7148, Test C-Index IPCW: 0.6940
Fold 3 - Train C-Index IPCW: 0.7071, Test C-Index IPCW: 0.7271
Fold 4 - Train C-Index IPCW: 0.7104, Test C-Index IPCW: 0.7109
Fold 5 - Train C-Index IPCW: 0.7127, Test C-Index IPCW: 0.6944

Average Train C-Index IPCW: 0.7110 (+/- 0.0026)
Average Test C-Index IPCW: 0.7078 (+/- 0.0124)


In [6]:
import optuna
from optuna.visualization import plot_optimization_history, plot_param_importances

In [16]:
def objective(trial):
    """Optuna objective: mean 5-fold test C-Index IPCW for one Coxnet setup.

    Args:
        trial: the Optuna trial used to sample the hyperparameters.

    Returns:
        The mean, over the five folds, of the test-fold Concordance Index IPCW
        (tau=7) of a CoxnetSurvivalAnalysis model built with the sampled
        hyperparameters. Reads the notebook-level `X` and `y`.
    """
    # Hyperparameters sampled for this trial (sampling order is preserved).
    params = {
        'l1_ratio': trial.suggest_float('l1_ratio', 0.0, 1.0),
        'alpha_min_ratio': trial.suggest_float('alpha_min_ratio', 1e-8, 0.1, log=True),
        'n_alphas': trial.suggest_int('n_alphas', 50, 200),
        'max_iter': trial.suggest_int('max_iter', 100000, 1000000),
        'tol': trial.suggest_float('tol', 1e-8, 1e-4, log=True)
    }

    def score_fold(fit_rows, held_out_rows):
        """Fit on `fit_rows` and return the C-Index IPCW on `held_out_rows`."""
        model = CoxnetSurvivalAnalysis(**params)
        model.fit(X.iloc[fit_rows], y[fit_rows])
        risk = model.predict(X.iloc[held_out_rows])
        # The full `y` is passed as the first argument, as in the other CV cells.
        return concordance_index_ipcw(y, y[held_out_rows], risk, tau=7)[0]

    # Same shuffled, seeded 5-fold split on every trial so trials are comparable.
    splitter = KFold(n_splits=5, shuffle=True, random_state=42)
    fold_scores = [score_fold(fit_rows, held_out_rows) for fit_rows, held_out_rows in splitter.split(X)]

    return np.mean(fold_scores)

In [17]:
# Seed the sampler explicitly so that a fresh-kernel re-run proposes the same
# sequence of trials and therefore reports the same best hyperparameters.
# TPESampler is the sampler Optuna uses when none is given; only the seed is new.
sampler = optuna.samplers.TPESampler(seed=42)
study = optuna.create_study(
    direction='maximize',
    study_name='CoxnetSurvivalAnalysis_tuning',
    sampler=sampler,
)
study.optimize(objective, n_trials=250, show_progress_bar=True)

[I 2025-10-01 17:53:13,033] A new study created in memory with name: CoxnetSurvivalAnalysis_tuning


  0%|          | 0/250 [00:00<?, ?it/s]

[I 2025-10-01 17:53:13,311] Trial 0 finished with value: 0.7079602554144129 and parameters: {'l1_ratio': 0.6863188870325085, 'alpha_min_ratio': 0.05668302233470934, 'n_alphas': 194, 'max_iter': 117811, 'tol': 1.3194274253714184e-08}. Best is trial 0 with value: 0.7079602554144129.
[I 2025-10-01 17:53:13,427] Trial 1 finished with value: 0.7078417975072496 and parameters: {'l1_ratio': 0.966730521562906, 'alpha_min_ratio': 6.591676150697592e-06, 'n_alphas': 158, 'max_iter': 686913, 'tol': 4.6415993212515835e-05}. Best is trial 0 with value: 0.7079602554144129.
[I 2025-10-01 17:53:13,567] Trial 2 finished with value: 0.707909991489031 and parameters: {'l1_ratio': 0.4640055073612167, 'alpha_min_ratio': 0.00016309909179907314, 'n_alphas': 161, 'max_iter': 789634, 'tol': 8.33512261926104e-06}. Best is trial 0 with value: 0.7079602554144129.
[I 2025-10-01 17:53:13,792] Trial 3 finished with value: 0.7076585947078142 and parameters: {'l1_ratio': 0.8256160977469136, 'alpha_min_ratio': 3.4730580

In [18]:
# Summarise the best trial: its objective value, then each hyperparameter.
best_trial = study.best_trial
print(f"Best trial value (C-Index IPCW): {best_trial.value:.4f}")
print("\nBest hyperparameters:")
for key, value in best_trial.params.items():
    print(f"  {key}: {value}")

Best trial value (C-Index IPCW): 0.7088

Best hyperparameters:
  l1_ratio: 0.7773238493438969
  alpha_min_ratio: 0.007495756210143564
  n_alphas: 52
  max_iter: 717302
  tol: 7.904569805860203e-06


In [19]:
# Build Optuna's optimization-history figure for the study and render it.
fig = plot_optimization_history(study)
fig.show()

In [20]:
# Build Optuna's hyperparameter-importance figure for the study and render it.
fig = plot_param_importances(study)
fig.show()

## Rerun Cross-Validation with Best Hyperparameters

In [21]:
# Initialize K-Fold cross-validation with best parameters. The splitter uses
# the same settings (5 folds, shuffled, random_state=42) as the baseline CV.
kf_best = KFold(n_splits=5, shuffle=True, random_state=42)

# Per-fold scores of the tuned model, appended in fold order
cv_scores_train_best = []
cv_scores_test_best = []

print(f"Running CV with best parameters: {study.best_params}\n")

for fold, (train_idx, test_idx) in enumerate(kf_best.split(X), 1):
    # Slice the fold; the frames go to the model as-is, with no preprocessing.
    X_train_fold, X_test_fold = X.iloc[train_idx], X.iloc[test_idx]
    y_train_fold, y_test_fold = y[train_idx], y[test_idx]

    # Fit a fresh model with the tuned hyperparameters on the training rows
    cox_best = CoxnetSurvivalAnalysis(**study.best_params)
    cox_best.fit(X_train_fold, y_train_fold)

    # The first argument of concordance_index_ipcw is the sample the censoring
    # distribution is estimated from; the full `y` is passed here.
    train_score = concordance_index_ipcw(y, y_train_fold, cox_best.predict(X_train_fold), tau=7)[0]
    test_score = concordance_index_ipcw(y, y_test_fold, cox_best.predict(X_test_fold), tau=7)[0]

    cv_scores_train_best.append(train_score)
    cv_scores_test_best.append(test_score)

    print(f"Fold {fold} - Train C-Index IPCW: {train_score:.4f}, Test C-Index IPCW: {test_score:.4f}")

# Averages over folds (np.std is the population standard deviation, ddof=0)
print(f"\nAverage Train C-Index IPCW: {np.mean(cv_scores_train_best):.4f} (+/- {np.std(cv_scores_train_best):.4f})")
print(f"Average Test C-Index IPCW: {np.mean(cv_scores_test_best):.4f} (+/- {np.std(cv_scores_test_best):.4f})")

Running CV with best parameters: {'l1_ratio': 0.7773238493438969, 'alpha_min_ratio': 0.007495756210143564, 'n_alphas': 52, 'max_iter': 717302, 'tol': 7.904569805860203e-06}

Fold 1 - Train C-Index IPCW: 0.7101, Test C-Index IPCW: 0.7118
Fold 2 - Train C-Index IPCW: 0.7146, Test C-Index IPCW: 0.6964
Fold 3 - Train C-Index IPCW: 0.7068, Test C-Index IPCW: 0.7282
Fold 4 - Train C-Index IPCW: 0.7101, Test C-Index IPCW: 0.7115
Fold 5 - Train C-Index IPCW: 0.7128, Test C-Index IPCW: 0.6959

Average Train C-Index IPCW: 0.7109 (+/- 0.0026)
Average Test C-Index IPCW: 0.7088 (+/- 0.0120)


In [None]:
# One row per fold, one column per (model, split) combination. The row labels
# follow the number of recorded folds instead of assuming exactly five.
fold_scores = pd.DataFrame({
    'Baseline Train': cv_scores_train,
    'Baseline Test': cv_scores_test,
    'Tuned Train': cv_scores_train_best,
    'Tuned Test': cv_scores_test_best
}, index=[f'Fold {i + 1}' for i in range(len(cv_scores_train))])

# Compute both summary rows from the per-fold rows BEFORE appending either of
# them; otherwise the 'Std' row would be taken over the folds plus the 'Mean'
# row. DataFrame.std() is the sample standard deviation (ddof=1), so these
# values differ slightly from the np.std figures printed by the CV cells.
fold_mean = fold_scores.mean()
fold_std = fold_scores.std()

cv_results_comparison = fold_scores.copy()
cv_results_comparison.loc['Mean'] = fold_mean
cv_results_comparison.loc['Std'] = fold_std

cv_results_comparison


In [None]:
# Refit on every labelled row using the hyperparameters of the best trial.
cox_final = CoxnetSurvivalAnalysis(**study.best_params)
cox_final.fit(X, y)

# Score the evaluation set on its feature columns (everything except the ID).
eval_features = eval.drop(columns=["ID"])
prediction = cox_final.predict(eval_features)

# One risk score per evaluation ID, written to the submissions folder.
submission = pd.Series(data=prediction, index=eval['ID'], name='risk_score')
submission.to_csv('../../submissions/coxph_enhanced_tuned.csv')

In [None]:
# Confirmation message; a plain string suffices because nothing is interpolated.
print("Tuned model predictions saved to: coxph_enhanced_tuned.csv")
