In [1]:
import pandas as pd
from sksurv.ensemble import GradientBoostingSurvivalAnalysis
from sksurv.util import Surv
import numpy as np
from sksurv.metrics import concordance_index_ipcw
from sksurv.linear_model import CoxnetSurvivalAnalysis, CoxPHSurvivalAnalysis
from torchmtlr import MTLR, mtlr_neg_log_likelihood, mtlr_survival
from torchmtlr.utils import encode_survival, make_time_bins
import torch
import torch.nn as nn


# Route DataFrame/Series `.plot()` calls through plotly.
pd.set_option("plotting.backend", "plotly")

# Read the two comma-separated input tables.
# NOTE(review): `eval` shadows the Python builtin of the same name; the name is
# kept because later cells reference it.
df = pd.read_csv("../../data/train_enhanced.csv", sep=",")
eval = pd.read_csv("../../data/eval_enhanced.csv", sep=",")

In [2]:
# Outcome columns: event indicator first, observed time second.
target = ["OS_STATUS", "OS_YEARS"]

# Feature matrix: everything except the outcome columns and the row identifier.
X = df.drop(columns=[*target, "ID"])

# Structured survival target built from the two outcome columns.
y = Surv.from_dataframe(event=target[0], time=target[1], data=df[target])

In [3]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [4]:
# Show the training part of the survival target returned by train_test_split.
print(y_train)

[( True, 0.83013699) ( True, 0.60273973) ( True, 1.03561644) ...
 (False, 0.84383562) ( True, 2.36164384) (False, 1.25753425)]


In [5]:
# Sanity check: emit a marker when `y_train` is NOT a NumPy structured array
# with exactly two fields.
if not (
    isinstance(y_train, np.ndarray)
    and y_train.dtype.fields is not None
    and len(y_train.dtype.fields) == 2
):
    print("cc")

In [6]:
from sklearn.base import BaseEstimator

class MTLRWrapper(BaseEstimator):
    """Scikit-learn style estimator wrapping a small feed-forward MTLR network.

    The network is two ``nn.Linear`` layers, each followed by an activation and
    dropout, topped by a ``torchmtlr.MTLR`` output layer. ``fit`` trains it
    full-batch with Adam on ``mtlr_neg_log_likelihood``.

    Parameters
    ----------
    input_dim : int
        Number of input features (width of the first linear layer).
    time_bins : sequence or tensor
        Bin edges handed to ``encode_survival``; ``len(time_bins)`` is passed to
        the ``MTLR`` layer as its number of time bins.
    n_hidden1, n_hidden2 : int
        Widths of the first and second hidden layers.
    dropout1, dropout2 : float
        Dropout probabilities applied after the first and second hidden layers.
    activation : str
        ``'relu'`` selects ``nn.ReLU``; any other value selects ``nn.ELU``.
    n_epochs : int
        Number of full-batch optimisation steps.
    lr : float
        Adam learning rate.
    C1 : float
        Forwarded as ``C1`` to ``mtlr_neg_log_likelihood``.
    """

    def __init__(self, input_dim, time_bins, n_hidden1=64, n_hidden2=32, dropout1=0.2, dropout2=0.2, activation='relu', n_epochs=100, lr=0.001, C1=1.0):
        self.input_dim = input_dim
        self.time_bins = time_bins
        self.n_hidden1 = n_hidden1
        self.n_hidden2 = n_hidden2
        self.dropout1 = dropout1
        self.dropout2 = dropout2
        self.activation = activation
        self.n_epochs = n_epochs
        self.lr = lr
        self.C1 = C1
        # Populated by fit(); stays None until the estimator has been trained.
        self.model = None

    @staticmethod
    def _to_feature_tensor(X):
        """Copy a DataFrame or array-like feature matrix into a float32 tensor."""
        # A fresh C-ordered copy avoids stride problems with views/slices and
        # also converts mixed numeric/bool frames that `.values` would turn
        # into an object array.
        return torch.tensor(np.array(X, dtype=np.float32, order="C"))

    @staticmethod
    def _split_target(y):
        """Return ``(time, event)`` float32 tensors from a structured survival array.

        Fields named ``OS_STATUS``/``event`` and ``OS_YEARS``/``time`` are used
        when present. Otherwise, for a two-field array, the missing role is
        taken from the remaining field (event first, time second when neither
        name is recognised).

        Raises
        ------
        ValueError
            If ``y`` is not a structured array or its fields cannot be resolved.
        """
        names = getattr(getattr(y, "dtype", None), "names", None)
        if not names:
            raise ValueError(
                "y must be a NumPy structured array with an event field and a time field"
            )

        event_field = next((n for n in ("OS_STATUS", "event") if n in names), None)
        time_field = next((n for n in ("OS_YEARS", "time") if n in names), None)

        if event_field is None or time_field is None:
            if len(names) != 2:
                raise ValueError(
                    "cannot identify the event and time fields of y; got fields %r" % (names,)
                )
            if event_field is None and time_field is None:
                event_field, time_field = names
            elif event_field is None:
                event_field = names[0] if names[1] == time_field else names[1]
            else:
                time_field = names[0] if names[1] == event_field else names[1]

        # Field views of a structured array are strided; copy them before
        # handing them to torch.
        event = torch.tensor(np.array(y[event_field], dtype=np.float32))
        time = torch.tensor(np.array(y[time_field], dtype=np.float32))
        return time, event

    def fit(self, X, y):
        """Build the network and train it on ``X`` / ``y``.

        Parameters
        ----------
        X : pandas.DataFrame or array-like of shape (n_samples, input_dim)
            Feature matrix.
        y : numpy structured array
            Survival target holding an event indicator and an observed time.

        Returns
        -------
        self
        """
        y_time, y_event = self._split_target(y)
        X_tensor = self._to_feature_tensor(X)
        target = encode_survival(y_time, y_event, self.time_bins)

        act_cls = nn.ReLU if self.activation == 'relu' else nn.ELU
        self.model = nn.Sequential(
            nn.Linear(self.input_dim, self.n_hidden1),
            act_cls(),
            nn.Dropout(self.dropout1),
            nn.Linear(self.n_hidden1, self.n_hidden2),
            act_cls(),
            nn.Dropout(self.dropout2),
            MTLR(self.n_hidden2, len(self.time_bins))
        )

        optimizer = torch.optim.Adam(self.model.parameters(), lr=self.lr)
        self.model.train()
        for _ in range(self.n_epochs):
            optimizer.zero_grad()
            logits = self.model(X_tensor)
            # The MTLR layer itself is passed so the loss can regularise it via C1.
            loss = mtlr_neg_log_likelihood(logits, target, self.model[-1], C1=self.C1, average=True)
            loss.backward()
            optimizer.step()
        return self

    def predict(self, X):
        """Return one score per row of ``X``.

        The score is the log-sum-exp of the network's logits across time bins.
        NOTE(review): this is used as a risk score (higher = higher risk) by the
        callers in this file; confirm against torchmtlr's logit layout.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        if self.model is None:
            raise RuntimeError("MTLRWrapper must be fitted before calling predict().")

        X_tensor = self._to_feature_tensor(X)
        self.model.eval()  # disable dropout for inference
        with torch.no_grad():
            logits = self.model(X_tensor)
            risk_scores = torch.logsumexp(logits, dim=1).numpy()
        return risk_scores

In [8]:
from sksurv.meta import EnsembleSelection

# Build, fit and evaluate an ensemble of tuned survival models


def get_ensemble_model(include_mtlr=False, random_state=42):
    """Build an unfitted ``EnsembleSelection`` over the tuned survival models.

    The ensemble always contains a Coxnet model, a Cox proportional hazards
    model and a gradient-boosting survival model, each with fixed, pre-tuned
    hyperparameters.

    Parameters
    ----------
    include_mtlr : bool, default False
        When True, an ``MTLRWrapper`` is added as a fourth base estimator. Its
        input width and time bins are derived from the module-level ``X`` and
        ``df``. NOTE(review): this path was disabled (commented out) in the
        original code; verify it works with ``EnsembleSelection`` before use.
    random_state : int or None, default 42
        Seed for the gradient-boosting model, which is stochastic because it
        uses row subsampling and ``max_features='sqrt'``.

    Returns
    -------
    EnsembleSelection
        Unfitted ensemble. Base estimators are scored with the C-index IPCW
        at ``tau=7``; the censoring distribution is estimated from the
        module-level ``y``.
    """
    coxNet = CoxnetSurvivalAnalysis(
        l1_ratio=0.7773238493438969,
        alpha_min_ratio=0.007495756210143564,
        n_alphas=52,
        max_iter=717302,
        tol=7.904569805860203e-06
    )

    coxPH = CoxPHSurvivalAnalysis(
        alpha=9.560874711411728,
        n_iter=570,
        tol=3.921651247591744e-08
    )

    gbSurv = GradientBoostingSurvivalAnalysis(
        n_estimators=238,
        learning_rate=0.22161644549500695,
        max_depth=3,
        min_samples_split=14,
        min_samples_leaf=9,
        max_features='sqrt',
        subsample=0.8525763914058241,
        dropout_rate=0.0002948696684484872,
        random_state=random_state,
    )

    base_estimators = [
        ("coxNet", coxNet),
        ("coxPH", coxPH),
        ("gbSurv", gbSurv),
    ]

    if include_mtlr:
        # Built only on request so the time bins are not recomputed on every call.
        mtlr = MTLRWrapper(
            input_dim=X.shape[1],
            time_bins=make_time_bins(df['OS_YEARS'], event=df['OS_STATUS']),
            n_hidden1=64, n_hidden2=32, n_epochs=100
        )
        base_estimators.append(("mtlr", mtlr))

    def scorer(estimator, X_test, y_test, **test_predict_params):
        # IPCW weights come from the full module-level target `y`.
        return concordance_index_ipcw(y, y_test, estimator.predict(X_test), tau=7)[0]

    return EnsembleSelection(
        base_estimators=base_estimators,
        scorer=scorer,
        n_jobs=4,
        verbose=100
    )

# Fit the ensemble on the training split.
model = get_ensemble_model()
model.fit(X_train, y_train)

# Evaluate with the Concordance Index IPCW (truncated at tau=7); the censoring
# distribution is estimated from the training split in both cases.
cox_cindex_train = concordance_index_ipcw(y_train, y_train, model.predict(X_train), tau=7)[0]
cox_cindex_test = concordance_index_ipcw(y_train, y_test, model.predict(X_test), tau=7)[0]
# The fitted model is the ensemble, not a single Cox PH model, so label it as such.
print(f"Ensemble model Concordance Index IPCW on train: {cox_cindex_train:.4f}")
print(f"Ensemble model Concordance Index IPCW on test: {cox_cindex_test:.4f}")

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   1 tasks      | elapsed:    1.1s
[Parallel(n_jobs=4)]: Done   2 tasks      | elapsed:    1.1s
[Parallel(n_jobs=4)]: Done   3 tasks      | elapsed:    1.2s
[Parallel(n_jobs=4)]: Done   4 tasks      | elapsed:    1.2s
[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:    1.2s
[Parallel(n_jobs=4)]: Done   6 tasks      | elapsed:    1.4s
[Parallel(n_jobs=4)]: Done   7 tasks      | elapsed:    1.4s
[Parallel(n_jobs=4)]: Done   8 tasks      | elapsed:    1.4s
[Parallel(n_jobs=4)]: Done   9 out of  15 | elapsed:    1.4s remaining:    0.9s
[Parallel(n_jobs=4)]: Done  10 out of  15 | elapsed:    1.5s remaining:    0.7s
[Parallel(n_jobs=4)]: Done  11 out of  15 | elapsed:   11.7s remaining:    4.2s
[Parallel(n_jobs=4)]: Done  12 out of  15 | elapsed:   11.7s remaining:    2.9s
[Parallel(n_jobs=4)]: Done  13 out of  15 | elapsed:   11.8s remaining:    1.7s
[Parallel(n_jobs=4)]: Done  15 out of

In [9]:

from sklearn.impute import SimpleImputer
from sklearn.model_selection import KFold

# 5-fold shuffled cross-validation with a fixed seed.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Per-fold C-index IPCW scores.
cv_scores_train = []
cv_scores_test = []

for fold, (train_idx, test_idx) in enumerate(kf.split(X), 1):
    # Positional row split. `.iloc` already returns DataFrames carrying the
    # original columns and index, so no re-wrapping is needed. No imputation
    # is performed: the features are passed to the models as loaded.
    X_train_fold = X.iloc[train_idx]
    X_test_fold = X.iloc[test_idx]
    y_train_fold = y[train_idx]
    y_test_fold = y[test_idx]

    # Fit a fresh ensemble on this fold's training rows.
    cox_fold = get_ensemble_model()
    cox_fold.fit(X_train_fold, y_train_fold)

    # NOTE(review): the IPCW censoring distribution is estimated from the full
    # target `y` (train and test rows of the fold together), unlike the
    # hold-out evaluation, which uses the training split only. Confirm this is
    # intended.
    train_score = concordance_index_ipcw(y, y_train_fold, cox_fold.predict(X_train_fold), tau=7)[0]
    test_score = concordance_index_ipcw(y, y_test_fold, cox_fold.predict(X_test_fold), tau=7)[0]

    cv_scores_train.append(train_score)
    cv_scores_test.append(test_score)

    print(f"Fold {fold} - Train C-Index IPCW: {train_score:.4f}, Test C-Index IPCW: {test_score:.4f}")

# Mean and standard deviation across folds.
print(f"\nAverage Train C-Index IPCW: {np.mean(cv_scores_train):.4f} (+/- {np.std(cv_scores_train):.4f})")
print(f"Average Test C-Index IPCW: {np.mean(cv_scores_test):.4f} (+/- {np.std(cv_scores_test):.4f})")


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   1 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Batch computation too fast (0.028192520141601562s.) Setting batch_size=2.
[Parallel(n_jobs=4)]: Done   2 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done   3 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done   4 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done   6 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done   7 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done   8 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done   9 out of  15 | elapsed:    0.2s remaining:    0.1s
[Parallel(n_jobs=4)]: Done  10 out of  15 | elapsed:    0.3s remaining:    0.1s
[Parallel(n_jobs=4)]: Done  11 out of  15 | elapsed:   11.8s remaining:    4.2s
[Parallel(n_jobs=4)]: Done  12 out of  15 | elapsed:   11.8s remaining:    2.9s
[Parallel(n_jobs=4)]:

In [10]:
# Refit the ensemble on every training row.
cox_final = get_ensemble_model()
cox_final.fit(X, y)

# Score the evaluation table (identifier column removed) and write one
# risk score per ID to the submission file.
prediction = cox_final.predict(eval.drop("ID", axis=1))
submission = pd.Series(data=prediction, index=eval["ID"], name="risk_score")
submission.to_csv("../../submissions/ensemble_one.csv")

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   1 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Batch computation too fast (0.043206214904785156s.) Setting batch_size=2.
[Parallel(n_jobs=4)]: Done   2 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done   3 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done   4 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done   6 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done   7 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done   8 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done   9 out of  15 | elapsed:    0.2s remaining:    0.1s
[Parallel(n_jobs=4)]: Done  10 out of  15 | elapsed:    0.4s remaining:    0.2s
[Parallel(n_jobs=4)]: Done  11 out of  15 | elapsed:   16.7s remaining:    6.0s
[Parallel(n_jobs=4)]: Done  12 out of  15 | elapsed:   16.9s remaining:    4.2s
[Parallel(n_jobs=4)]:

In [11]:
# Confirmation message; a plain literal suffices because nothing is interpolated.
print("Tuned model predictions saved to: ensemble_one.csv")


Tuned model predictions saved to: ensemble_one.csv
