In [8]:
import pandas as pd
from sksurv.util import Surv
import numpy as np
from sksurv.metrics import concordance_index_ipcw
from sksurv.linear_model import CoxPHSurvivalAnalysis

pd.options.plotting.backend = "plotly"

df = pd.read_csv('../../data/train_enhanced.csv', sep =',')
eval = pd.read_csv('../../data/eval_enhanced.csv', sep =',')

In [9]:
target = ["OS_STATUS", "OS_YEARS"]
X = df.drop(columns = target + ["ID"])
y = Surv.from_dataframe(*target, df[target])

In [10]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [11]:
# Initialize and train the Cox Proportional Hazards model
cox = CoxPHSurvivalAnalysis()
cox.fit(X_train, y_train)

# Evaluate the model using Concordance Index IPCW
cox_cindex_train = concordance_index_ipcw(y_train, y_train, cox.predict(X_train), tau=7)[0]
cox_cindex_test = concordance_index_ipcw(y_train, y_test, cox.predict(X_test), tau=7)[0]
print(f"Cox Proportional Hazard Model Concordance Index IPCW on train: {cox_cindex_train:.4f}")
print(f"Cox Proportional Hazard Model Concordance Index IPCW on test: {cox_cindex_test:.4f}")

Cox Proportional Hazard Model Concordance Index IPCW on train: 0.7170
Cox Proportional Hazard Model Concordance Index IPCW on test: 0.7072


In [12]:

from sklearn.impute import SimpleImputer
from sklearn.model_selection import KFold

# Initialize K-Fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Store cross-validation scores
cv_scores_train = []
cv_scores_test = []

# Perform K-Fold cross-validation
for fold, (train_idx, test_idx) in enumerate(kf.split(X), 1):
    # Split data
    X_train_fold = X.iloc[train_idx]
    X_test_fold = X.iloc[test_idx]
    y_train_fold = y[train_idx]
    y_test_fold = y[test_idx]

    # Impute missing values
    X_train_fold = pd.DataFrame(
        X_train_fold,
        columns=X_train_fold.columns,
        index=X_train_fold.index
    )
    X_test_fold = pd.DataFrame(
        X_test_fold,
        columns=X_test_fold.columns,
        index=X_test_fold.index
    )

    # Train Cox model
    cox_fold = CoxPHSurvivalAnalysis()
    cox_fold.fit(X_train_fold, y_train_fold)

    # Evaluate
    train_score = concordance_index_ipcw(y, y_train_fold, cox_fold.predict(X_train_fold), tau=7)[0]
    test_score = concordance_index_ipcw(y, y_test_fold, cox_fold.predict(X_test_fold), tau=7)[0]

    cv_scores_train.append(train_score)
    cv_scores_test.append(test_score)

    print(f"Fold {fold} - Train C-Index IPCW: {train_score:.4f}, Test C-Index IPCW: {test_score:.4f}")

# Print average scores
print(f"\nAverage Train C-Index IPCW: {np.mean(cv_scores_train):.4f} (+/- {np.std(cv_scores_train):.4f})")
print(f"Average Test C-Index IPCW: {np.mean(cv_scores_test):.4f} (+/- {np.std(cv_scores_test):.4f})")


Fold 1 - Train C-Index IPCW: 0.7152, Test C-Index IPCW: 0.7142
Fold 2 - Train C-Index IPCW: 0.7227, Test C-Index IPCW: 0.6856
Fold 3 - Train C-Index IPCW: 0.7120, Test C-Index IPCW: 0.7283
Fold 4 - Train C-Index IPCW: 0.7152, Test C-Index IPCW: 0.7214
Fold 5 - Train C-Index IPCW: 0.7214, Test C-Index IPCW: 0.6867

Average Train C-Index IPCW: 0.7173 (+/- 0.0041)
Average Test C-Index IPCW: 0.7072 (+/- 0.0178)


In [13]:
cox_final = CoxPHSurvivalAnalysis()
cox_final.fit(X, y)
prediction = cox_final.predict(eval.drop(columns = ["ID"]))
submission = pd.Series(prediction, index=eval['ID'], name='risk_score')
submission.to_csv('../../submissions/coxph_enhanced_2.csv')