In [2]:
import pandas as pd
from sksurv.ensemble import GradientBoostingSurvivalAnalysis
from sksurv.util import Surv
import numpy as np
from sksurv.metrics import concordance_index_ipcw
from sksurv.linear_model import CoxnetSurvivalAnalysis, CoxPHSurvivalAnalysis
from qrtdc import MTLRWrapper
from torchmtlr.utils import encode_survival, make_time_bins
import torch
import torch.nn as nn


# Route DataFrame/Series `.plot()` calls through plotly.
pd.set_option("plotting.backend", "plotly")

# Labelled training rows (contain the OS_STATUS / OS_YEARS targets used below)
# and the rows scored for the submission file.
# NOTE(review): `eval` shadows the Python builtin of the same name; the name is
# kept because later cells reference it.
df = pd.read_csv('../../data/train_enhanced.csv', sep=',')
eval = pd.read_csv('../../data/eval_enhanced.csv', sep=',')

In [3]:
# Target columns, in the order (event indicator, observed time).
target = ["OS_STATUS", "OS_YEARS"]
# Feature frame: every column except the two targets and the row identifier.
X = df.drop(columns=[*target, "ID"])
# Structured (event, time) array as consumed by the scikit-survival estimators.
y = Surv.from_dataframe(event=target[0], time=target[1], data=df[target])

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the labelled rows; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [5]:
# Show the structured (event, time) label array of the training split.
print(y_train)

[( True, 0.83013699) ( True, 0.60273973) ( True, 1.03561644) ...
 (False, 0.84383562) ( True, 2.36164384) (False, 1.25753425)]


In [6]:
# Sanity check: y_train is expected to be a NumPy structured array with exactly
# two fields. The marker "cc" is printed only when that is NOT the case.
if not (
    isinstance(y_train, np.ndarray)
    and y_train.dtype.fields is not None
    and len(y_train.dtype.fields) == 2
):
    print("cc")

In [18]:
from sksurv.meta import EnsembleSelection

# Build the survival-model ensemble, fit it on the training split and evaluate it


def get_ensemble_model():
    """Build a fresh, unfitted ``EnsembleSelection`` over five survival models.

    Base learners: a Coxnet model, a Cox PH model, a gradient-boosted survival
    model and two ``MTLRWrapper`` networks that differ in hidden-layer sizes,
    number of epochs and activation.

    The function reads notebook globals, so the cells defining them must have
    been executed first:

    * ``X``  - feature frame; its column count sizes the MTLR networks.
    * ``df`` - training frame; ``OS_YEARS`` / ``OS_STATUS`` define the time bins.
    * ``y``  - full label array; used by the scorer (see note there).

    Hyperparameters are hard-coded; presumably they come from an earlier tuning
    run - TODO confirm and record their provenance.

    Returns:
        EnsembleSelection: unfitted ensemble; call ``.fit(features, labels)``.
    """
    n_features = X.shape[1]
    # Both MTLR networks use the same time discretisation, so build it once.
    time_bins = make_time_bins(df['OS_YEARS'], event=df['OS_STATUS'])

    coxNet = CoxnetSurvivalAnalysis(
        l1_ratio=0.8928717284248087,
        alpha_min_ratio=0.015539411990938067,
        n_alphas=50,
        max_iter=649436,
        tol=1.4827818294657812e-05,
    )

    coxPH = CoxPHSurvivalAnalysis(
        alpha=6.348312310430074,
        n_iter=629,
        tol=2.349756261901387e-06,
    )

    gbSurv = GradientBoostingSurvivalAnalysis(
        n_estimators=121,
        learning_rate=0.2886432467102598,
        max_depth=8,
        min_samples_split=13,
        min_samples_leaf=10,
        max_features='sqrt',
        subsample=0.5099935946094218,
        dropout_rate=0.12896575300869814,
        # Row subsampling, feature subsampling and dropout are all random;
        # without a seed every call would yield a different model.
        random_state=42,
    )

    # NOTE(review): MTLRWrapper training is presumably stochastic as well; seed
    # torch before fitting if exact reproducibility is required - TODO confirm.
    mtlr1 = MTLRWrapper(
        input_dim=n_features,
        time_bins=time_bins,
        n_hidden1=n_features,
        n_hidden2=n_features,
        n_epochs=300,
        activation='elu',
    )

    mtlr2 = MTLRWrapper(
        input_dim=n_features,
        time_bins=time_bins,
        n_hidden1=64,
        n_hidden2=32,
        n_epochs=150,
        activation='relu',
    )

    def scorer(estimator, X_test, y_test, **test_predict_params):
        """Return the IPCW concordance index (tau=7) of ``estimator`` on the given rows.

        ``test_predict_params`` is accepted for signature compatibility and ignored.
        """
        # NOTE(review): the first argument is the global `y` (all labelled rows),
        # not the labels the estimator was fitted on, so the censoring estimate
        # also sees held-out rows. Left as-is; confirm this is intended.
        risk = estimator.predict(X_test)
        return concordance_index_ipcw(y, y_test, risk, tau=7)[0]

    return EnsembleSelection(
        base_estimators=[
            ("coxNet", coxNet),
            ("coxPH", coxPH),
            ("gbSurv", gbSurv),
            ("mtlr1", mtlr1),
            ("mtlr2", mtlr2),
        ],
        scorer=scorer,
        n_jobs=4,
        verbose=100,
    )

# Fit a fresh ensemble on the training split.
model = get_ensemble_model()
model.fit(X_train, y_train)

# Evaluate with the IPCW concordance index (tau=7). In both calls the first
# argument is y_train, i.e. the censoring distribution comes from the training split.
# The `cox_` prefix of the variable names is kept for compatibility; the scored
# model is the ensemble, so the printed labels say so.
cox_cindex_train = concordance_index_ipcw(y_train, y_train, model.predict(X_train), tau=7)[0]
cox_cindex_test = concordance_index_ipcw(y_train, y_test, model.predict(X_test), tau=7)[0]
print(f"Ensemble Model Concordance Index IPCW on train: {cox_cindex_train:.4f}")
print(f"Ensemble Model Concordance Index IPCW on test: {cox_cindex_test:.4f}")

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   1 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Batch computation too fast (0.023032426834106445s.) Setting batch_size=2.
[Parallel(n_jobs=4)]: Done   2 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done   3 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done   4 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done   6 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done   7 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done   8 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done  12 tasks      | elapsed:    3.5s
[Parallel(n_jobs=4)]: Done  14 out of  25 | elapsed:    7.8s remaining:    6.1s
[Parallel(n_jobs=4)]: Done  16 out of  25 | elapsed:    8.9s remaining:    4.9s
[Parallel(n_jobs=4)]: Done  18 out of  25 | elapsed:   10.4

In [16]:

from sklearn.impute import SimpleImputer
from sklearn.model_selection import KFold

# 5-fold cross-validation of the ensemble, shuffled with a fixed seed.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Per-fold IPCW concordance on the fitting rows and on the held-out rows.
cv_scores_train = []
cv_scores_test = []

for fold, (train_idx, test_idx) in enumerate(kf.split(X), 1):
    # Positional split of features and labels. No imputation is performed here:
    # the frames are passed to the models exactly as selected.
    X_train_fold = X.iloc[train_idx]
    X_test_fold = X.iloc[test_idx]
    y_train_fold = y[train_idx]
    y_test_fold = y[test_idx]

    # Fit a fresh ensemble on this fold (despite the `cox_` prefix, this is the
    # full ensemble returned by get_ensemble_model, not a single Cox model).
    cox_fold = get_ensemble_model()
    cox_fold.fit(X_train_fold, y_train_fold)

    # NOTE(review): the censoring distribution is estimated from the full `y`,
    # which includes the held-out fold; confirm this is intended.
    train_score = concordance_index_ipcw(y, y_train_fold, cox_fold.predict(X_train_fold), tau=7)[0]
    test_score = concordance_index_ipcw(y, y_test_fold, cox_fold.predict(X_test_fold), tau=7)[0]

    cv_scores_train.append(train_score)
    cv_scores_test.append(test_score)

    print(f"Fold {fold} - Train C-Index IPCW: {train_score:.4f}, Test C-Index IPCW: {test_score:.4f}")

# Mean and standard deviation across folds.
print(f"\nAverage Train C-Index IPCW: {np.mean(cv_scores_train):.4f} (+/- {np.std(cv_scores_train):.4f})")
print(f"Average Test C-Index IPCW: {np.mean(cv_scores_test):.4f} (+/- {np.std(cv_scores_test):.4f})")


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   1 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Batch computation too fast (0.03202080726623535s.) Setting batch_size=2.
[Parallel(n_jobs=4)]: Done   2 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done   3 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done   4 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done   6 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done   7 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done   8 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:    0.4s
[Parallel(n_jobs=4)]: Done  12 tasks      | elapsed:    4.1s
[Parallel(n_jobs=4)]: Done  14 out of  25 | elapsed:    7.8s remaining:    6.1s
[Parallel(n_jobs=4)]: Done  16 out of  25 | elapsed:    9.5s remaining:    5.3s
[Parallel(n_jobs=4)]: Done  18 out of  25 | elapsed:   11.4s

In [19]:
# Refit a fresh ensemble on every labelled row.
cox_final = get_ensemble_model()
cox_final.fit(X, y)

# Score the evaluation rows (identifier column removed) and write one
# risk score per ID to the submission file.
prediction = cox_final.predict(eval.drop("ID", axis=1))
submission = pd.Series(data=prediction, index=eval["ID"], name="risk_score")
submission.to_csv('../../submissions/ensemble_mtlr.csv')

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   1 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Batch computation too fast (0.03959512710571289s.) Setting batch_size=2.
[Parallel(n_jobs=4)]: Done   2 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done   3 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done   4 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done   6 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done   7 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done   8 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:    0.4s
[Parallel(n_jobs=4)]: Done  12 tasks      | elapsed:    5.3s
[Parallel(n_jobs=4)]: Done  14 out of  25 | elapsed:   10.0s remaining:    7.8s
[Parallel(n_jobs=4)]: Done  16 out of  25 | elapsed:   11.4s remaining:    6.4s
[Parallel(n_jobs=4)]: Done  18 out of  25 | elapsed:   12.6s

In [20]:
# Confirmation message; the file name is hard-coded to match the to_csv call above.
print("Tuned model predictions saved to: ensemble_mtlr.csv")


Tuned model predictions saved to: ensemble_mtlr.csv
