In [1]:
!pip install /kaggle/input/pip-install-lifelines/autograd-1.7.0-py3-none-any.whl
!pip install /kaggle/input/pip-install-lifelines/autograd-gamma-0.5.0.tar.gz
!pip install /kaggle/input/pip-install-lifelines/interface_meta-1.3.0-py3-none-any.whl
!pip install /kaggle/input/pip-install-lifelines/formulaic-1.0.2-py3-none-any.whl
!pip install /kaggle/input/pip-install-lifelines/lifelines-0.30.0-py3-none-any.whl

!pip install -q /kaggle/input/download-lightning-and-pytorch-tabular/pytorch_lightning-2.4.0-py3-none-any.whl
!pip install -q /kaggle/input/download-lightning-and-pytorch-tabular/scikit_learn-1.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
!pip install -q /kaggle/input/download-lightning-and-pytorch-tabular/torchmetrics-1.5.2-py3-none-any.whl
!pip install -q /kaggle/input/download-lightning-and-pytorch-tabular/pytorch_tabnet-4.1.0-py3-none-any.whl
!pip install -q /kaggle/input/download-lightning-and-pytorch-tabular/einops-0.7.0-py3-none-any.whl
!pip install -q /kaggle/input/download-lightning-and-pytorch-tabular/pytorch_tabular-1.1.1-py2.py3-none-any.whl

Processing /kaggle/input/pip-install-lifelines/autograd-1.7.0-py3-none-any.whl
autograd is already installed with the same version as the provided wheel. Use --force-reinstall to force an installation of the wheel.
Processing /kaggle/input/pip-install-lifelines/autograd-gamma-0.5.0.tar.gz
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: autograd-gamma
  Building wheel for autograd-gamma (setup.py) ... [?25l[?25hdone
  Created wheel for autograd-gamma: filename=autograd_gamma-0.5.0-py3-none-any.whl size=4031 sha256=2ef32d17944a95d8a3a930e770bf99279411add813c47b318302e9df5c42bcf0
  Stored in directory: /root/.cache/pip/wheels/6b/b5/e0/4c79e15c0b5f2c15ecf613c720bb20daab20a666eb67135155
Successfully built autograd-gamma
Installing collected packages: autograd-gamma
Successfully installed autograd-gamma-0.5.0
Processing /kaggle/input/pip-install-lifelines/interface_meta-1.3.0-py3-none-any.whl
Installing collected packages: interface-

## Prepare data

Below are a few utility functions to load and prepare the data for training with pytorch.

In [2]:
import numpy as np
import pandas as pd
import torch
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from torch.utils.data import TensorDataset
device = 'cuda' if torch.cuda.is_available() else 'cpu'

def get_X_cat(df, cat_cols, transformers=None):
    """
    Apply a specific categorical data transformer or a LabelEncoder if None.
    """
    if transformers is None:
        transformers = [LabelEncoder().fit(df[col]) for col in cat_cols]
    return transformers, np.array(
        [transformer.transform(df[col]) for col, transformer in zip(cat_cols, transformers)]
    ).T


def preprocess_data(train, val):
    """
    Standardize numerical variables and transform (Label-encode) categoricals.
    Fill NA values with mean for numerical.
    Create torch dataloaders to prepare data for training and evaluation.
    """
    X_cat_train, X_cat_val, numerical, transformers = get_categoricals(train, val)
    scaler = StandardScaler()
    imp = SimpleImputer(missing_values=np.nan, strategy='mean', add_indicator=True)
    X_num_train = imp.fit_transform(train[numerical])
    X_num_train = scaler.fit_transform(X_num_train)
    X_num_val = imp.transform(val[numerical])
    X_num_val = scaler.transform(X_num_val)
    dl_train = init_dl(X_cat_train, X_num_train, train, training=True)
    dl_val = init_dl(X_cat_val, X_num_val, val)
    return X_cat_val, X_num_train, X_num_val, dl_train, dl_val, transformers


def get_categoricals(train, val):
    """
    Remove constant categorical columns and transform them using LabelEncoder.
    Return the label-transformers for each categorical column, categorical dataframes and numerical columns.
    """
    categorical_cols, numerical = get_feature_types(train)
    remove = []
    for col in categorical_cols:
        if train[col].nunique() == 1:
            remove.append(col)
        ind = ~val[col].isin(train[col])
        if ind.any():
            val.loc[ind, col] = train[col].mode()[0] #p.nan
    categorical_cols = [col for col in categorical_cols if col not in remove]
    transformers, X_cat_train = get_X_cat(train, categorical_cols)
    _, X_cat_val = get_X_cat(val, categorical_cols, transformers)
    return X_cat_train, X_cat_val, numerical, transformers


def init_dl(X_cat, X_num, df, training=False):
    """
    Initialize data loaders with 4 dimensions : categorical dataframe, numerical dataframe and target values (efs and efs_time).
    Notice that efs_time is log-transformed.
    Fix batch size to 2048 and return dataloader for training or validation depending on training value.
    """
    ds_train = TensorDataset(
        torch.tensor(X_cat, dtype=torch.long),
        torch.tensor(X_num, dtype=torch.float32),
        torch.tensor(df.efs_time.values, dtype=torch.float32).log(),
        torch.tensor(df.efs.values, dtype=torch.long)
    )
    bs = 2048
    dl_train = torch.utils.data.DataLoader(ds_train, batch_size=bs, pin_memory=True, shuffle=training)
    return dl_train


def get_feature_types(train):
    """
    Utility function to return categorical and numerical column names.
    """
    categorical_cols = [col for i, col in enumerate(train.columns) if ((train[col].dtype == "object") | (2 < train[col].nunique() < 25))]
    RMV = ["ID", "efs", "efs_time", "y"]
    FEATURES = [c for c in train.columns if not c in RMV]
    print(f"There are {len(FEATURES)} FEATURES: {FEATURES}")
    numerical = [i for i in FEATURES if i not in categorical_cols]
    return categorical_cols, numerical


def add_features(df):
    """
    Create some new features to help the model focus on specific patterns.
    """
    df['is_cyto_score_same'] = (df['cyto_score'] == df['cyto_score_detail']).astype(int)
    df['year_hct'] -= 2000

    return df


def load_data():
    """
    Load data and add features.
    """
    test = pd.read_csv("/kaggle/input/equity-post-HCT-survival-predictions/test.csv")
    test = add_features(test)
    print("Test shape:", test.shape)
    train = pd.read_csv("/kaggle/input/equity-post-HCT-survival-predictions/train.csv")
    train = add_features(train)
    print("Train shape:", train.shape)
    return test, train


## Define models with pairwise ranking loss

The model is defined in 3 steps :
* Embedding class for categorical data
* MLP for numerical and categorical data
* Final model trained with pairwise ranking loss with selection of valid pairs

In [3]:
weights = []

In [4]:
import functools
from typing import List

import pytorch_lightning as pl
import numpy as np
import torch
from lifelines.utils import concordance_index
from pytorch_lightning.cli import ReduceLROnPlateau
from pytorch_tabular.models.common.layers import ODST
from torch import nn
from pytorch_lightning.utilities import grad_norm


class CatEmbeddings(nn.Module):
    """
    Embedding module for the categorical dataframe.
    """
    def __init__(
        self,
        projection_dim: int,
        categorical_cardinality: List[int],
        embedding_dim: int
    ):
        """
        projection_dim: The dimension of the final output after projecting the concatenated embeddings into a lower-dimensional space.
        categorical_cardinality: A list where each element represents the number of unique categories (cardinality) in each categorical feature.
        embedding_dim: The size of the embedding space for each categorical feature.
        self.embeddings: list of embedding layers for each categorical feature.
        self.projection: sequential neural network that goes from the embedding to the output projection dimension with GELU activation.
        """
        super(CatEmbeddings, self).__init__()
        self.embeddings = nn.ModuleList([
            nn.Embedding(cardinality, embedding_dim)
            for cardinality in categorical_cardinality
        ])
        self.projection = nn.Sequential(
            nn.Linear(embedding_dim * len(categorical_cardinality), projection_dim),
            nn.GELU(),
            nn.Linear(projection_dim, projection_dim)
        )

    def forward(self, x_cat):
        """
        Apply the projection on concatened embeddings that contains all categorical features.
        """
        x_cat = [embedding(x_cat[:, i]) for i, embedding in enumerate(self.embeddings)]
        x_cat = torch.cat(x_cat, dim=1)
        return self.projection(x_cat)


class NN(nn.Module):
    """
    Train a model on both categorical embeddings and numerical data.
    """
    def __init__(
            self,
            continuous_dim: int,
            categorical_cardinality: List[int],
            embedding_dim: int,
            projection_dim: int,
            hidden_dim: int,
            dropout: float = 0
    ):
        """
        continuous_dim: The number of continuous features.
        categorical_cardinality: A list of integers representing the number of unique categories in each categorical feature.
        embedding_dim: The dimensionality of the embedding space for each categorical feature.
        projection_dim: The size of the projected output space for the categorical embeddings.
        hidden_dim: The number of neurons in the hidden layer of the MLP.
        dropout: The dropout rate applied in the network.
        self.embeddings: previous embeddings for categorical data.
        self.mlp: defines an MLP model with an ODST layer followed by batch normalization and dropout.
        self.out: linear output layer that maps the output of the MLP to a single value
        self.dropout: defines dropout
        Weights initialization with xavier normal algorithm and biases with zeros.
        """
        super(NN, self).__init__()
        self.embeddings = CatEmbeddings(projection_dim, categorical_cardinality, embedding_dim)
        self.mlp = nn.Sequential(
            ODST(projection_dim + continuous_dim, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.Dropout(dropout)
        )
        self.out = nn.Linear(hidden_dim, 1)
        self.dropout = nn.Dropout(dropout)

        # initialize weights
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.xavier_normal_(m.weight)
                nn.init.zeros_(m.bias)

    def forward(self, x_cat, x_cont):
        """
        Create embedding layers for categorical data, concatenate with continous variables.
        Add dropout and goes through MLP and return raw output and 1-dimensional output as well.
        """
        x = self.embeddings(x_cat)
        x = torch.cat([x, x_cont], dim=1)
        x = self.dropout(x)
        x = self.mlp(x)
        return self.out(x), x


@functools.lru_cache
def combinations(N):
    """
    calculates all possible 2-combinations (pairs) of a tensor of indices from 0 to N-1, 
    and caches the result using functools.lru_cache for optimization
    """
    ind = torch.arange(N)
    comb = torch.combinations(ind, r=2)
    return comb.cuda()


class LitNN(pl.LightningModule):
    """
    Main Model creation and losses definition to fully train the model.
    """
    def __init__(
            self,
            continuous_dim: int,
            categorical_cardinality: List[int],
            embedding_dim: int,
            projection_dim: int,
            hidden_dim: int,
            lr: float = 1e-3,
            dropout: float = 0.2,
            weight_decay: float = 1e-3,
            aux_weight: float = 0.1,
            margin: float = 0.5,
            race_index: int = 0
    ):
        """
        continuous_dim: The number of continuous input features.
        categorical_cardinality: A list of integers, where each element corresponds to the number of unique categories for each categorical feature.
        embedding_dim: The dimension of the embeddings for the categorical features.
        projection_dim: The dimension of the projected space after embedding concatenation.
        hidden_dim: The size of the hidden layers in the feedforward network (MLP).
        lr: The learning rate for the optimizer.
        dropout: Dropout probability to avoid overfitting.
        weight_decay: The L2 regularization term for the optimizer.
        aux_weight: Weight used for auxiliary tasks.
        margin: Margin used in some loss functions.
        race_index: An index that refer to race_group in the input data.
        """
        super(LitNN, self).__init__()
        self.save_hyperparameters()

        # Creates an instance of the NN model defined above
        self.model = NN(
            continuous_dim=self.hparams.continuous_dim,
            categorical_cardinality=self.hparams.categorical_cardinality,
            embedding_dim=self.hparams.embedding_dim,
            projection_dim=self.hparams.projection_dim,
            hidden_dim=self.hparams.hidden_dim,
            dropout=self.hparams.dropout
        )
        self.targets = []

        # Defines a small feedforward neural network that performs an auxiliary task with 1-dimensional output
        self.aux_cls = nn.Sequential(
            nn.Linear(self.hparams.hidden_dim, self.hparams.hidden_dim // 3),
            nn.GELU(),
            nn.Linear(self.hparams.hidden_dim // 3, 1)
        )

    def on_before_optimizer_step(self, optimizer):
        """
        Compute the 2-norm for each layer
        If using mixed precision, the gradients are already unscaled here
        """
        norms = grad_norm(self.model, norm_type=2)
        self.log_dict(norms)

    def forward(self, x_cat, x_cont):
        """
        Forward pass that outputs the 1-dimensional prediction and the embeddings (raw output)
        """
        x, emb = self.model(x_cat, x_cont)
        return x.squeeze(1), emb

    def training_step(self, batch, batch_idx):
        """
        defines how the model processes each batch of data during training.
        A batch is a combination of : categorical data, continuous data, efs_time (y) and efs event.
        y_hat is the efs_time prediction on all data and aux_pred is auxiliary prediction on embeddings.
        Calculates loss and race_group loss on full data.
        Auxiliary loss is calculated with an event mask, ignoring efs=0 predictions and taking the average.
        Returns loss and aux_loss multiplied by weight defined above.
        """
        x_cat, x_cont, y, efs = batch
        y_hat, emb = self(x_cat, x_cont)
        aux_pred = self.aux_cls(emb).squeeze(1)
        loss, race_loss = self.get_full_loss(efs, x_cat, y, y_hat)
        aux_loss = nn.functional.mse_loss(aux_pred, y, reduction='none')
        aux_mask = efs == 1
        aux_loss = (aux_loss * aux_mask).sum() / aux_mask.sum()
        self.log("train_loss", loss, on_epoch=True, prog_bar=True, logger=True)
        self.log("race_loss", race_loss, on_epoch=True, prog_bar=True, logger=True, on_step=False)
        self.log("aux_loss", aux_loss, on_epoch=True, prog_bar=True, logger=True, on_step=False)
        return loss + aux_loss * self.hparams.aux_weight

    def get_full_loss(self, efs, x_cat, y, y_hat):
        """
        Output loss and race_group loss.
        """
        loss = self.calc_loss(y, y_hat, efs)
        race_loss = self.get_race_losses(efs, x_cat, y, y_hat)
        loss += 0.1 * race_loss
        return loss, race_loss

    def get_race_losses(self, efs, x_cat, y, y_hat):
        """
        Calculate loss for each race_group based on deviation/variance.
        """
        races = torch.unique(x_cat[:, self.hparams.race_index])
        race_losses = []
        for race in races:
            ind = x_cat[:, self.hparams.race_index] == race
            race_losses.append(self.calc_loss(y[ind], y_hat[ind], efs[ind]))
        race_loss = sum(race_losses) / len(race_losses)
        races_loss_std = sum((r - race_loss)**2 for r in race_losses) / len(race_losses)
        return torch.sqrt(races_loss_std)

    def calc_loss(self, y, y_hat, efs):
        """
        Most important part of the model : loss function used for training.
        We face survival data with event indicators along with time-to-event.

        This function computes the main loss by the following the steps :
        * create all data pairs with "combinations" function (= all "two subjects" combinations)
        * make sure that we have at least 1 event in each pair
        * convert y to +1 or -1 depending on the correct ranking
        * loss is computed using a margin-based hinge loss
        * mask is applied to ensure only valid pairs are being used (censored data can't be ranked with event in some cases)
        * average loss on all pairs is returned
        """
        N = y.shape[0]
        comb = combinations(N)
        comb = comb[(efs[comb[:, 0]] == 1) | (efs[comb[:, 1]] == 1)]
        pred_left = y_hat[comb[:, 0]]
        pred_right = y_hat[comb[:, 1]]
        y_left = y[comb[:, 0]]
        y_right = y[comb[:, 1]]
        y = 2 * (y_left > y_right).int() - 1
        loss = nn.functional.relu(-y * (pred_left - pred_right) + self.hparams.margin)
        mask = self.get_mask(comb, efs, y_left, y_right)
        loss = (loss.double() * (mask.double())).sum() / mask.sum()
        return loss

    def get_mask(self, comb, efs, y_left, y_right):
        """
        Defines all invalid comparisons :
        * Case 1: "Left outlived Right" but Right is censored
        * Case 2: "Right outlived Left" but Left is censored
        Masks for case 1 and case 2 are combined using |= operator and inverted using ~ to create a "valid pair mask"
        """
        left_outlived = y_left >= y_right
        left_1_right_0 = (efs[comb[:, 0]] == 1) & (efs[comb[:, 1]] == 0)
        mask2 = (left_outlived & left_1_right_0)
        right_outlived = y_right >= y_left
        right_1_left_0 = (efs[comb[:, 1]] == 1) & (efs[comb[:, 0]] == 0)
        mask2 |= (right_outlived & right_1_left_0)
        mask2 = ~mask2
        mask = mask2
        return mask

    def validation_step(self, batch, batch_idx):
        """
        This method defines how the model processes each batch during validation
        """
        x_cat, x_cont, y, efs = batch
        y_hat, emb = self(x_cat, x_cont)
        loss, race_loss = self.get_full_loss(efs, x_cat, y, y_hat)
        self.targets.append([y, y_hat.detach(), efs, x_cat[:, self.hparams.race_index]])
        self.log("val_loss", loss, on_epoch=True, prog_bar=True, logger=True)
        return loss

    def on_validation_epoch_end(self):
        """
        At the end of the validation epoch, it computes and logs the concordance index
        """
        cindex, metric = self._calc_cindex()

        
        self.log("cindex", metric, on_epoch=True, prog_bar=True, logger=True)
        self.log("cindex_simple", cindex, on_epoch=True, prog_bar=True, logger=True)
        self.targets.clear()

    def _calc_cindex(self):
        """
        Calculate c-index accounting for each race_group or global.
        """
        y = torch.cat([t[0] for t in self.targets]).cpu().numpy()
        y_hat = torch.cat([t[1] for t in self.targets]).cpu().numpy()
        efs = torch.cat([t[2] for t in self.targets]).cpu().numpy()
        races = torch.cat([t[3] for t in self.targets]).cpu().numpy()
        metric = self._metric(efs, races, y, y_hat)
        cindex = concordance_index(y, y_hat, efs)
        return cindex, metric

    def _metric(self, efs, races, y, y_hat):
        """
        Calculate c-index accounting for each race_group
        """
        metric_list = []
        for race in np.unique(races):
            y_ = y[races == race]
            y_hat_ = y_hat[races == race]
            efs_ = efs[races == race]
            metric_list.append(concordance_index(y_, y_hat_, efs_))
        metric = float(np.mean(metric_list) - np.sqrt(np.var(metric_list)))
        return metric

    def test_step(self, batch, batch_idx):
        """
        Same as training step but to log test data
        """
        x_cat, x_cont, y, efs = batch
        y_hat, emb = self(x_cat, x_cont)
        loss, race_loss = self.get_full_loss(efs, x_cat, y, y_hat)
        self.targets.append([y, y_hat.detach(), efs, x_cat[:, self.hparams.race_index]])
        self.log("test_loss", loss)
        return loss

    def on_test_epoch_end(self) -> None:
        """
        At the end of the test epoch, calculates and logs the concordance index for the test set
        """
        cindex, metric = self._calc_cindex()
        global weights
        weights.append(metric)
        self.log("test_cindex", metric, on_epoch=True, prog_bar=True, logger=True)
        self.log("test_cindex_simple", cindex, on_epoch=True, prog_bar=True, logger=True)
        self.targets.clear()


    def configure_optimizers(self):
        """
        configures the optimizer and learning rate scheduler:
        * Optimizer: Adam optimizer with weight decay (L2 regularization).
        * Scheduler: Cosine Annealing scheduler, which adjusts the learning rate according to a cosine curve.
        """
        optimizer = torch.optim.Adam(self.parameters(), lr=self.hparams.lr, weight_decay=self.hparams.weight_decay)
        scheduler_config = {
            "scheduler": torch.optim.lr_scheduler.CosineAnnealingLR(
                optimizer,
                T_max=45,
                eta_min=6e-3
            ),
            "interval": "epoch",
            "frequency": 1,
            "strict": False,
        }

        return {"optimizer": optimizer, "lr_scheduler": scheduler_config}

In [5]:
import json
import pytorch_lightning as pl
import numpy as np, pandas as pd
import matplotlib.pyplot as plt
import torch
from pytorch_lightning.callbacks import LearningRateMonitor, TQDMProgressBar
from pytorch_lightning.callbacks import StochasticWeightAveraging
from sklearn.model_selection import StratifiedKFold

pl.seed_everything(42)

def main(hparams):
    """
    Main function to train the model.
    The steps are as following :
    * load data and fill efs and efs time for test data with 1
    * initialize pred array with 0
    * get categorical and numerical columns
    * split the train data on the stratified criterion : race_group * newborns yes/no
    * preprocess the fold data (create dataloaders)
    * train the model and create final submission output
    """
    test, train_original = load_data()
    test['efs_time'] = 1
    test['efs'] = 1
    test_pred = np.zeros(test.shape[0])
    oof_odst = np.zeros(train_original.shape[0])
    categorical_cols, numerical = get_feature_types(train_original)
    kf = StratifiedKFold(n_splits=5, shuffle=True, )
    test_pred_list = []
    for i, (train_index, test_index) in enumerate(
        kf.split(
            train_original, train_original.race_group.astype(str) + (train_original.age_at_hct == 0.044).astype(str)
            )
        ):
        tt = train_original.copy()
        train = tt.iloc[train_index]
        val = tt.iloc[test_index]
        X_cat_val, X_num_train, X_num_val, dl_train, dl_val, transformers =preprocess_data (train, val)
        model = train_final(X_num_train, dl_train, dl_val, transformers, categorical_cols=categorical_cols)
        # oof
        pred_oof, _ = model.cuda().eval()(
            torch.tensor(X_cat_val, dtype=torch.long).cuda(),
            torch.tensor(X_num_val, dtype=torch.float32).cuda()
        )
        oof_odst[test_index] = -pred_oof.detach().cpu().numpy()
        
        # Create submission
        train = tt.iloc[train_index]
        X_cat_val, X_num_train, X_num_val, dl_train, dl_val, transformers = preprocess_data(train, test)
        pred, _ = model.cuda().eval()(
            torch.tensor(X_cat_val, dtype=torch.long).cuda(),
            torch.tensor(X_num_val, dtype=torch.float32).cuda()
        )
        test_pred_list.append(pred.detach().cpu().numpy())
        torch.cuda.empty_cache()
    global weights
    test_pred = np.dot(weights,test_pred_list)
    subm_data = pd.read_csv("/kaggle/input/equity-post-HCT-survival-predictions/sample_submission.csv")
    subm_data['prediction'] = -test_pred
    subm_data.to_csv('submission.csv', index=False)
    
    display(subm_data.head())
    return oof_odst,subm_data



def train_final(X_num_train, dl_train, dl_val, transformers, hparams=None, categorical_cols=None):
    """
    Defines model hyperparameters and fit the model.
    """
    if hparams is None:
        hparams = {
            "embedding_dim": 16,
            "projection_dim": 112,
            "hidden_dim": 56,
            "lr": 0.06464861983337984,
            "dropout": 0.05463240181423116,
            "aux_weight": 0.26545778308743806,
            "margin": 0.2588153271003354,
            "weight_decay": 0.0002773544957610778
        }
    model = LitNN(
        continuous_dim=X_num_train.shape[1],
        categorical_cardinality=[len(t.classes_) for t in transformers],
        race_index=categorical_cols.index("race_group"),
        **hparams
    )
    checkpoint_callback = pl.callbacks.ModelCheckpoint(monitor="val_loss", save_top_k=1)
    trainer = pl.Trainer(
        accelerator='cuda',
        max_epochs=60,
        callbacks=[
            checkpoint_callback,
            LearningRateMonitor(logging_interval='epoch'),
            TQDMProgressBar(),
            StochasticWeightAveraging(swa_lrs=1e-5, swa_epoch_start=45, annealing_epochs=15)
        ],
    )
    trainer.fit(model, dl_train)
    trainer.test(model, dl_val)
    return model.eval()


hparams = None
oof_odst,res = main(hparams)
print("done fold mean cindex",np.mean(weights))

Test shape: (3, 59)
Train shape: (28800, 61)
There are 58 FEATURES: ['dri_score', 'psych_disturb', 'cyto_score', 'diabetes', 'hla_match_c_high', 'hla_high_res_8', 'tbi_status', 'arrhythmia', 'hla_low_res_6', 'graft_type', 'vent_hist', 'renal_issue', 'pulm_severe', 'prim_disease_hct', 'hla_high_res_6', 'cmv_status', 'hla_high_res_10', 'hla_match_dqb1_high', 'tce_imm_match', 'hla_nmdp_6', 'hla_match_c_low', 'rituximab', 'hla_match_drb1_low', 'hla_match_dqb1_low', 'prod_type', 'cyto_score_detail', 'conditioning_intensity', 'ethnicity', 'year_hct', 'obesity', 'mrd_hct', 'in_vivo_tcd', 'tce_match', 'hla_match_a_high', 'hepatic_severe', 'donor_age', 'prior_tumor', 'hla_match_b_low', 'peptic_ulcer', 'age_at_hct', 'hla_match_a_low', 'gvhd_proph', 'rheum_issue', 'sex_match', 'hla_match_b_high', 'race_group', 'comorbidity_score', 'karnofsky_score', 'hepatic_mild', 'tce_div_match', 'donor_related', 'melphalan_dose', 'hla_low_res_8', 'cardiac', 'hla_match_drb1_high', 'pulm_moderate', 'hla_low_res_

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/configuration_validator.py:70: You defined a `validation_step` but have no `val_dataloader`. Skipping val loop.
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=3` in the `DataLoader` to improve performance.
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/loops/fit_loop.py:298: The number of training batches (12) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.


Training: |          | 0/? [00:00<?, ?it/s]

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/callbacks/model_checkpoint.py:384: `ModelCheckpoint(monitor='val_loss')` could not find the monitored key in the returned metrics: ['lr-Adam', 'train_loss', 'train_loss_step', 'grad_2.0_norm/embeddings.embeddings.0.weight', 'grad_2.0_norm/embeddings.embeddings.1.weight', 'grad_2.0_norm/embeddings.embeddings.2.weight', 'grad_2.0_norm/embeddings.embeddings.3.weight', 'grad_2.0_norm/embeddings.embeddings.4.weight', 'grad_2.0_norm/embeddings.embeddings.5.weight', 'grad_2.0_norm/embeddings.embeddings.6.weight', 'grad_2.0_norm/embeddings.embeddings.7.weight', 'grad_2.0_norm/embeddings.embeddings.8.weight', 'grad_2.0_norm/embeddings.embeddings.9.weight', 'grad_2.0_norm/embeddings.embeddings.10.weight', 'grad_2.0_norm/embeddings.embeddings.11.weight', 'grad_2.0_norm/embeddings.embeddings.12.weight', 'grad_2.0_norm/embeddings.embeddings.13.weight', 'grad_2.0_norm/embeddings.embeddings.14.weight', 'grad_2.0_norm/embeddings.embeddings.15.w

Testing: |          | 0/? [00:00<?, ?it/s]

There are 58 FEATURES: ['dri_score', 'psych_disturb', 'cyto_score', 'diabetes', 'hla_match_c_high', 'hla_high_res_8', 'tbi_status', 'arrhythmia', 'hla_low_res_6', 'graft_type', 'vent_hist', 'renal_issue', 'pulm_severe', 'prim_disease_hct', 'hla_high_res_6', 'cmv_status', 'hla_high_res_10', 'hla_match_dqb1_high', 'tce_imm_match', 'hla_nmdp_6', 'hla_match_c_low', 'rituximab', 'hla_match_drb1_low', 'hla_match_dqb1_low', 'prod_type', 'cyto_score_detail', 'conditioning_intensity', 'ethnicity', 'year_hct', 'obesity', 'mrd_hct', 'in_vivo_tcd', 'tce_match', 'hla_match_a_high', 'hepatic_severe', 'donor_age', 'prior_tumor', 'hla_match_b_low', 'peptic_ulcer', 'age_at_hct', 'hla_match_a_low', 'gvhd_proph', 'rheum_issue', 'sex_match', 'hla_match_b_high', 'race_group', 'comorbidity_score', 'karnofsky_score', 'hepatic_mild', 'tce_div_match', 'donor_related', 'melphalan_dose', 'hla_low_res_8', 'cardiac', 'hla_match_drb1_high', 'pulm_moderate', 'hla_low_res_10', 'is_cyto_score_same']
There are 58 FEATU

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/configuration_validator.py:70: You defined a `validation_step` but have no `val_dataloader`. Skipping val loop.
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=3` in the `DataLoader` to improve performance.
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/loops/fit_loop.py:298: The number of training batches (12) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.


Training: |          | 0/? [00:00<?, ?it/s]

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=3` in the `DataLoader` to improve performance.


Testing: |          | 0/? [00:00<?, ?it/s]

There are 58 FEATURES: ['dri_score', 'psych_disturb', 'cyto_score', 'diabetes', 'hla_match_c_high', 'hla_high_res_8', 'tbi_status', 'arrhythmia', 'hla_low_res_6', 'graft_type', 'vent_hist', 'renal_issue', 'pulm_severe', 'prim_disease_hct', 'hla_high_res_6', 'cmv_status', 'hla_high_res_10', 'hla_match_dqb1_high', 'tce_imm_match', 'hla_nmdp_6', 'hla_match_c_low', 'rituximab', 'hla_match_drb1_low', 'hla_match_dqb1_low', 'prod_type', 'cyto_score_detail', 'conditioning_intensity', 'ethnicity', 'year_hct', 'obesity', 'mrd_hct', 'in_vivo_tcd', 'tce_match', 'hla_match_a_high', 'hepatic_severe', 'donor_age', 'prior_tumor', 'hla_match_b_low', 'peptic_ulcer', 'age_at_hct', 'hla_match_a_low', 'gvhd_proph', 'rheum_issue', 'sex_match', 'hla_match_b_high', 'race_group', 'comorbidity_score', 'karnofsky_score', 'hepatic_mild', 'tce_div_match', 'donor_related', 'melphalan_dose', 'hla_low_res_8', 'cardiac', 'hla_match_drb1_high', 'pulm_moderate', 'hla_low_res_10', 'is_cyto_score_same']
There are 58 FEATU

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/configuration_validator.py:70: You defined a `validation_step` but have no `val_dataloader`. Skipping val loop.
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=3` in the `DataLoader` to improve performance.
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/loops/fit_loop.py:298: The number of training batches (12) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.


Training: |          | 0/? [00:00<?, ?it/s]

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=3` in the `DataLoader` to improve performance.


Testing: |          | 0/? [00:00<?, ?it/s]

There are 58 FEATURES: ['dri_score', 'psych_disturb', 'cyto_score', 'diabetes', 'hla_match_c_high', 'hla_high_res_8', 'tbi_status', 'arrhythmia', 'hla_low_res_6', 'graft_type', 'vent_hist', 'renal_issue', 'pulm_severe', 'prim_disease_hct', 'hla_high_res_6', 'cmv_status', 'hla_high_res_10', 'hla_match_dqb1_high', 'tce_imm_match', 'hla_nmdp_6', 'hla_match_c_low', 'rituximab', 'hla_match_drb1_low', 'hla_match_dqb1_low', 'prod_type', 'cyto_score_detail', 'conditioning_intensity', 'ethnicity', 'year_hct', 'obesity', 'mrd_hct', 'in_vivo_tcd', 'tce_match', 'hla_match_a_high', 'hepatic_severe', 'donor_age', 'prior_tumor', 'hla_match_b_low', 'peptic_ulcer', 'age_at_hct', 'hla_match_a_low', 'gvhd_proph', 'rheum_issue', 'sex_match', 'hla_match_b_high', 'race_group', 'comorbidity_score', 'karnofsky_score', 'hepatic_mild', 'tce_div_match', 'donor_related', 'melphalan_dose', 'hla_low_res_8', 'cardiac', 'hla_match_drb1_high', 'pulm_moderate', 'hla_low_res_10', 'is_cyto_score_same']
There are 58 FEATU

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/configuration_validator.py:70: You defined a `validation_step` but have no `val_dataloader`. Skipping val loop.
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=3` in the `DataLoader` to improve performance.
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/loops/fit_loop.py:298: The number of training batches (12) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.


Training: |          | 0/? [00:00<?, ?it/s]

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=3` in the `DataLoader` to improve performance.


Testing: |          | 0/? [00:00<?, ?it/s]

There are 58 FEATURES: ['dri_score', 'psych_disturb', 'cyto_score', 'diabetes', 'hla_match_c_high', 'hla_high_res_8', 'tbi_status', 'arrhythmia', 'hla_low_res_6', 'graft_type', 'vent_hist', 'renal_issue', 'pulm_severe', 'prim_disease_hct', 'hla_high_res_6', 'cmv_status', 'hla_high_res_10', 'hla_match_dqb1_high', 'tce_imm_match', 'hla_nmdp_6', 'hla_match_c_low', 'rituximab', 'hla_match_drb1_low', 'hla_match_dqb1_low', 'prod_type', 'cyto_score_detail', 'conditioning_intensity', 'ethnicity', 'year_hct', 'obesity', 'mrd_hct', 'in_vivo_tcd', 'tce_match', 'hla_match_a_high', 'hepatic_severe', 'donor_age', 'prior_tumor', 'hla_match_b_low', 'peptic_ulcer', 'age_at_hct', 'hla_match_a_low', 'gvhd_proph', 'rheum_issue', 'sex_match', 'hla_match_b_high', 'race_group', 'comorbidity_score', 'karnofsky_score', 'hepatic_mild', 'tce_div_match', 'donor_related', 'melphalan_dose', 'hla_low_res_8', 'cardiac', 'hla_match_drb1_high', 'pulm_moderate', 'hla_low_res_10', 'is_cyto_score_same']
There are 58 FEATU

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/configuration_validator.py:70: You defined a `validation_step` but have no `val_dataloader`. Skipping val loop.
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=3` in the `DataLoader` to improve performance.
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/loops/fit_loop.py:298: The number of training batches (12) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.


Training: |          | 0/? [00:00<?, ?it/s]

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=3` in the `DataLoader` to improve performance.


Testing: |          | 0/? [00:00<?, ?it/s]

There are 58 FEATURES: ['dri_score', 'psych_disturb', 'cyto_score', 'diabetes', 'hla_match_c_high', 'hla_high_res_8', 'tbi_status', 'arrhythmia', 'hla_low_res_6', 'graft_type', 'vent_hist', 'renal_issue', 'pulm_severe', 'prim_disease_hct', 'hla_high_res_6', 'cmv_status', 'hla_high_res_10', 'hla_match_dqb1_high', 'tce_imm_match', 'hla_nmdp_6', 'hla_match_c_low', 'rituximab', 'hla_match_drb1_low', 'hla_match_dqb1_low', 'prod_type', 'cyto_score_detail', 'conditioning_intensity', 'ethnicity', 'year_hct', 'obesity', 'mrd_hct', 'in_vivo_tcd', 'tce_match', 'hla_match_a_high', 'hepatic_severe', 'donor_age', 'prior_tumor', 'hla_match_b_low', 'peptic_ulcer', 'age_at_hct', 'hla_match_a_low', 'gvhd_proph', 'rheum_issue', 'sex_match', 'hla_match_b_high', 'race_group', 'comorbidity_score', 'karnofsky_score', 'hepatic_mild', 'tce_div_match', 'donor_related', 'melphalan_dose', 'hla_low_res_8', 'cardiac', 'hla_match_drb1_high', 'pulm_moderate', 'hla_low_res_10', 'is_cyto_score_same']


Unnamed: 0,ID,prediction
0,28800,-1.486931
1,28801,0.019216
2,28802,-2.025777


done fold mean cindex 0.677764379641147


In [6]:
torch.cuda.empty_cache()

In [7]:
from cib_metric import score

In [8]:
train = pd.read_csv("/kaggle/input/equity-post-HCT-survival-predictions/train.csv")
y_true = train[["ID","efs","efs_time","race_group"]].copy()
y_pred = train[["ID"]].copy()
y_pred["prediction"] = oof_odst
odst_oof_score = score(y_true.copy(), y_pred.copy(), "ID")
print('odst cv race_index',odst_oof_score )

odst cv race_index 0.6804612116657046


## Multi-target NN

In [9]:
import pandas as pd
import numpy as np
import random
import os
from lifelines import CoxPHFitter
from lifelines import KaplanMeierFitter
from lifelines import NelsonAalenFitter
from tqdm import tqdm
import torch
from cib_metric import score
from lifelines.utils import concordance_index
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader,TensorDataset
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import warnings

warnings.filterwarnings('ignore')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

def seed_torch(seed=42):
    random.seed(seed)   # Python的随机性
    os.environ['PYTHONHASHSEED'] = str(seed)    # 设置Python哈希种子，为了禁止hash随机化，使得实验可复现
    np.random.seed(seed)   # numpy的随机性
    torch.manual_seed(seed)   # torch的CPU随机性，为CPU设置随机种子
    torch.cuda.manual_seed(seed)   # torch的GPU随机性，为当前GPU设置随机种子
    torch.cuda.manual_seed_all(seed)  # if you are using multi-GPU.   torch的GPU随机性，为所有GPU设置随机种子
    torch.backends.cudnn.benchmark = False  # if benchmark=True, deterministic will be False
    torch.backends.cudnn.deterministic = True   # 选择确定性算法
seed_torch(42)

Using device: cuda


In [10]:
train = pd.read_csv('/kaggle/input/equity-post-HCT-survival-predictions/train.csv')
test = pd.read_csv('/kaggle/input/equity-post-HCT-survival-predictions/test.csv')
submission =  pd.read_csv('/kaggle/input/equity-post-HCT-survival-predictions/sample_submission.csv')

In [11]:
def get_num_cat_col(train):
    cat_cols = []
    num_cols = []
    for col in train.columns:
        if train[col].dtypes == 'object' and col not in ('efs','efs_time'):
            cat_cols.append(col)
        elif train[col].dtypes != 'object' and col not in ('efs','efs_time','ID'):
            num_cols.append(col)
    return cat_cols,num_cols

In [12]:
def add_features(df):

    df['hla_nmdp_6'] = df['hla_match_a_low'] * df['hla_match_b_low'] * df['hla_match_drb1_high']
    df['hla_low_res_6'] = df['hla_match_a_low'] * df['hla_match_b_low'] * df['hla_match_drb1_low']
    df['hla_high_res_6'] = df['hla_match_a_high'] * df['hla_match_b_high'] * df['hla_match_drb1_high']
    df['hla_low_res_8'] = df['hla_match_a_low'] * df['hla_match_b_low'] * df['hla_match_c_low'] * df['hla_match_drb1_low']
    df['hla_high_res_8'] = df['hla_match_a_high'] * df['hla_match_b_high'] * df['hla_match_c_high'] * df['hla_match_drb1_high']
    df['hla_low_res_10'] = df['hla_match_a_low'] * df['hla_match_b_low'] * df['hla_match_c_low'] * df['hla_match_drb1_low'] * df['hla_match_dqb1_low']
    df['hla_high_res_10'] = df['hla_match_a_high'] * df['hla_match_b_high'] * df['hla_match_c_high'] * df['hla_match_drb1_high'] * df['hla_match_dqb1_high']
    
    add_features_list = [
        'hla_nmdp_6',
        'hla_low_res_6',
        'hla_high_res_6',
        'hla_low_res_8',
        'hla_high_res_8',
        'hla_low_res_10',
        'hla_high_res_10'
    ]
    return df,add_features_list

In [13]:
def processData(df,FEATURES,model_type = 'cnn'):
    CAT_SIZE = []
    CAT_EMB = []
    for c in FEATURES:
        if c in cat_cols:
            # LABEL ENCODE
            df[c] = df[c].fillna('missing').astype("str")
            df[c],_ = df[c].factorize()
            df[c] -= df[c].min()
            
            df[c] = df[c].astype("int32")
            n = df[c].nunique()
            mn = df[c].min()
            mx = df[c].max()
            print(f'{c} has ({n}) unique values')
    
            CAT_SIZE.append(mx+1) 
            CAT_EMB.append(int(np.ceil( np.sqrt(mx+1))) ) 
        else:
            # Num feat standardize    
            m = df[c].mean()
            s = df[c].std()
            df[c] = (df[c]-m)/s
            if model_type == 'cnn':
                df[c] = df[c].fillna(df[c].mean())
            if model_type == 'dnn':
                df[c] = df[c].fillna(0)
    return df,CAT_SIZE,CAT_EMB

In [14]:

combined = pd.concat([train,test],axis=0,ignore_index=True)
cat_cols,num_cols = get_num_cat_col(train)
RMV = ["ID","efs","efs_time","y"]
FEATURES = [c for c in train.columns if not c in RMV]
print('add features')
combined,add_features_list = add_features(combined)
num_cols = num_cols+add_features_list
print('process the data')
combined,CAT_SIZE,CAT_EMB = processData(combined,FEATURES)
print('split the train and the test')
train = combined.iloc[:len(train)].copy()
test = combined.iloc[len(train):].reset_index(drop=True).copy()

add features
process the data
dri_score has (12) unique values
psych_disturb has (4) unique values
cyto_score has (8) unique values
diabetes has (4) unique values
tbi_status has (8) unique values
arrhythmia has (4) unique values
graft_type has (2) unique values
vent_hist has (3) unique values
renal_issue has (4) unique values
pulm_severe has (4) unique values
prim_disease_hct has (18) unique values
cmv_status has (5) unique values
tce_imm_match has (9) unique values
rituximab has (3) unique values
prod_type has (2) unique values
cyto_score_detail has (6) unique values
conditioning_intensity has (7) unique values
ethnicity has (4) unique values
obesity has (4) unique values
mrd_hct has (3) unique values
in_vivo_tcd has (3) unique values
tce_match has (5) unique values
hepatic_severe has (4) unique values
prior_tumor has (4) unique values
peptic_ulcer has (4) unique values
gvhd_proph has (18) unique values
rheum_issue has (4) unique values
sex_match has (5) unique values
race_group has (

In [15]:
class Targets:

    def __init__(self, data, cat_cols, penalizer, n_splits):
        
        self.data = data.copy()
        self.cat_cols = cat_cols
        
        self._length = len(self.data)
        self._penalizer = penalizer
        self._n_splits = n_splits

    def _prepare_cv(self):
        
        oof_preds = np.zeros(self._length)
            
        cv = KFold(n_splits=self._n_splits, shuffle=True, random_state=42)

        return cv, oof_preds

    def validate_model(self, preds, title):
            
        y_true = self.data[['ID', 'efs', 'efs_time', 'race_group']].copy()
        y_pred = self.data[['ID']].copy()
        
        y_pred['prediction'] = preds
            
        c_index_score = score(y_true.copy(), y_pred.copy(), 'ID')
        print(f'Overall Stratified C-Index Score for {title}: {c_index_score:.4f}')

    def create_target1(self):  


        cv, oof_preds = self._prepare_cv()

        # Apply one hot encoding to categorical columns
        data = pd.get_dummies(self.data, columns=self.cat_cols, drop_first=True).drop('ID', axis=1) 
        # data = self.data
        for train_index, valid_index in tqdm(cv.split(data)):

            train_data = data.iloc[train_index]
            valid_data = data.iloc[valid_index]

            # Drop constant columns if they exist
            train_data = train_data.loc[:, train_data.nunique() > 1]
            valid_data = valid_data[train_data.columns]

            cph = CoxPHFitter(penalizer=self._penalizer)
            cph.fit(train_data, duration_col='efs_time', event_col='efs')
            
            oof_preds[valid_index] = cph.predict_partial_hazard(valid_data)              

        data_cox = self.data.copy()
        
        self.data['target1'] = oof_preds 
        self.validate_model(oof_preds, 'Cox') 
        return self.data
        
    def custom_target(self):
        cus_target =  self.data[['efs','efs_time']]
        cus_target['y'] = self.data.efs_time.values
        mx = cus_target.loc[cus_target.efs==1,"efs_time"].max()
        mn = cus_target.loc[cus_target.efs==0,"efs_time"].min()
        cus_target.loc[cus_target.efs==0,"y"] = cus_target.loc[cus_target.efs==0,"y"] + mx - mn
        cus_target.y = cus_target.y.rank()
        cus_target.loc[cus_target.efs==0,"y"] += 2*len(cus_target)
        cus_target.y = cus_target.y / cus_target.y.max()
        cus_target.y = np.log( cus_target.y )
        cus_target.y -= cus_target.y.mean()
        cus_target.y *= -1.0    
        return cus_target

In [16]:
cnn_target = Targets(train, cat_cols, penalizer = 0.00, n_splits = 5)
cox_target = cnn_target.create_target1()['target1']
cus_target = cnn_target.custom_target()

5it [02:26, 29.31s/it]


Overall Stratified C-Index Score for Cox: 0.6570


In [17]:
# 定义 PyTorch 数据集
class TabularDataset_multi(Dataset):
    def __init__(self, cat_features, num_features, targets1, targets2):
        self.cat_features = cat_features
        self.num_features = num_features
        self.targets1 = targets1
        self.targets2 = targets2

    def __len__(self):
        return len(self.targets1)

    def __getitem__(self, index):
        return self.cat_features[index], self.num_features[index], (self.targets1[index],self.targets2[index])

In [18]:
import torch
import torch.nn as nn

class CNN(nn.Module):
    def __init__(self, cat_nunique, cat_emb_dims, num_input_dim):
        super(CNN, self).__init__()
        
        self.embeddings = nn.ModuleList([
            nn.Embedding(cat_nunique[i], cat_emb_dims[i]) for i in range(len(cat_nunique))
        ])
        
        self.num_layer = nn.Sequential(
            nn.Linear(num_input_dim, 512),
            nn.ReLU(),
            nn.Linear(512, 64),
            nn.ReLU()
        )
        self.conv_layer = nn.Sequential(
            nn.Conv1d(in_channels=1, out_channels=3, kernel_size=3, padding=1),
            nn.MaxPool1d(kernel_size = 3),
            nn.Conv1d(in_channels=3, out_channels=9, kernel_size=5, padding=1),
            nn.MaxPool1d(kernel_size = 3),
            
        )
        self.conv_project = nn.Sequential(
            nn.Linear(296,512),
            nn.ReLU(),
            nn.Linear(512,256),
            nn.ReLU(),
        )
        
        self.output1 = nn.Linear(256, 1)
        self.output2 = nn.Linear(256, 1)

    def forward(self, cat_inputs, num_inputs):
        # embeding
        cat_embeds = [emb(cat_inputs[:, i]) for i, emb in enumerate(self.embeddings)]
        cat_embeds = torch.cat(cat_embeds, dim=1)  # 拼接嵌入层输出
        
        # num feat layer
        num_out = self.num_layer(num_inputs)

        # cancat cat+num
        combined = torch.cat([cat_embeds, num_out], dim=1)  # 形状: [batch_size, total_emb_dim]

        # conv
        conv_layer = combined.unsqueeze(1) 
        conv_layer = self.conv_layer(conv_layer)
        # concat 
        conbine_conv = torch.cat([conv_layer.view(conv_layer.shape[0],-1),combined],dim = 1)
        conv_layer = self.conv_project(conbine_conv)

        # ouput
        output1 = self.output1(conv_layer)
        output2 = self.output2(conv_layer)
        
        return output1, output2

In [19]:
def test_predict(test_loader,model):
    test_pred1_list = []
    test_pred2_list = []
    
    model.eval()
    with torch.no_grad():
        for X_test_cat_tensor,X_test_num_tensor in test_loader:
            test_pred1,test_pred2 = model(X_test_cat_tensor.to(device), X_test_num_tensor.to(device))
            test_pred1_list.extend(test_pred1.detach().cpu().numpy().tolist())
            test_pred2_list.extend(test_pred2.detach().cpu().numpy().tolist())
            
    return np.array(test_pred1_list),np.array(test_pred2_list)
            

In [20]:
def train_nn_multi(train,test,cat_cols,num_cols,y_train,hparams):
    
    kf = KFold(n_splits = hparams['n_splits'], shuffle = True, random_state = hparams['random_state'])
    fold_results = []
    oof_nn = np.zeros((len(train),2))
    test_nn = np.zeros((len(test),2))

    X_train_cat = train[cat_cols].values
    X_train_num = train[num_cols].values
    X_test_cat = test[cat_cols].values
    X_test_num = test[num_cols].values
    
    X_train_cat_tensor = torch.tensor(X_train_cat, dtype=torch.long)
    X_train_num_tensor = torch.tensor(X_train_num, dtype=torch.float32)
    
    X_test_cat_tensor = torch.tensor(X_test_cat, dtype=torch.long)
    X_test_num_tensor = torch.tensor(X_test_num, dtype=torch.float32) 

    y_train1 = y_train[0].values
    y_train2 = y_train[1].values
    
    y_train1_tensor = torch.tensor(y_train1, dtype=torch.float32).view(-1, 1)
    y_train2_tensor = torch.tensor(y_train2, dtype=torch.float32).view(-1, 1)
    
    for fold, (train_idx, val_idx) in enumerate(kf.split(X_train_num)):
        print(f"Fold {fold+1}")
    
        # dataset preparing
        train_dataset = TabularDataset_multi(X_train_cat_tensor[train_idx], X_train_num_tensor[train_idx],
                                             y_train1_tensor[train_idx],y_train2_tensor[train_idx])
        val_dataset = TabularDataset_multi(X_train_cat_tensor[val_idx], X_train_num_tensor[val_idx],
                                           y_train1_tensor[val_idx],y_train2_tensor[val_idx])
        
        train_loader = DataLoader(train_dataset, batch_size=hparams['BATCH_SIZE'], shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=hparams['BATCH_SIZE'], shuffle=False)
        test_loader = DataLoader(TensorDataset(X_test_cat_tensor,X_test_num_tensor),batch_size=hparams['BATCH_SIZE'], shuffle=False)
    
        # init the model
        seed_torch(hparams['random_state'])
        if hparams['model_type'] == 'cnn':
            model = CNN(hparams['CAT_SIZE'],hparams['CAT_EMB'],len(num_cols)).to(device)
        if hparams['model_type'] == 'dnn':
            model = DNN(hparams['CAT_SIZE'],hparams['CAT_EMB'],len(num_cols)).to(device)
            
        optimizer = optim.Adam(model.parameters(), lr=hparams['lr'],weight_decay = hparams['weight_decay'])
        
        criterion1 = nn.MSELoss()
        criterion2 = nn.MSELoss()
        # linear decay
        scheduler = optim.lr_scheduler.LinearLR(optimizer, 
                                                start_factor=1.0,   # 初始学习率倍率（1.0 表示正常起点）
                                                end_factor=0.001,     # 最终学习率倍率（0.1 表示最终学习率为初始的10%）
                                                total_iters=hparams['EPOCH']) 
        # train
        best_val_loss = float('inf')
        for epoch in range(hparams['EPOCH']):  # 训练50个epoch
            
            model.train()
            train_index_pred = []
            train_loss = []
            for cat_features, num_features, targets in train_loader:
                cat_features, num_features, targets = cat_features.to(device), num_features.to(device), (targets[0].to(device),targets[1].to(device))
                optimizer.zero_grad()
                outputs = model(cat_features, num_features)
                loss = criterion1( outputs[0], targets[0])*0.5 +  criterion2(outputs[1], targets[1])*0.5
                loss.backward()
                optimizer.step()
                train_loss.append(loss.item())
            print('lr',optimizer.param_groups[0]['lr'])
            scheduler.step()    
            # 验证阶段
            val_index_pred1 = []
            val_index_pred2 = []
            
            val_loss = []
            val_loss1 = []
            val_loss2 = []
            model.eval()
            
            val_now_loss = 0
            with torch.no_grad():
                for cat_features, num_features, targets in val_loader:
                    cat_features, num_features, targets = cat_features.to(device), num_features.to(device), (targets[0].to(device),targets[1].to(device))
                    outputs = model(cat_features, num_features)
                    loss1 = criterion1( outputs[0], targets[0])
                    loss2 = criterion2(outputs[1], targets[1])
                    loss = loss1*0.5 +  loss2*0.5
                    val_index_pred1.extend(outputs[0].cpu().numpy().tolist())
                    val_index_pred2.extend(outputs[1].cpu().numpy().tolist())
                    
                    val_loss.append(loss.item())
                    val_loss1.append(loss1.item())
                    val_loss2.append(loss2.item())
                    
            # val_c_index = concordance_index(val_event_times, val_index_pred, val_event_observed)
            oof_nn[val_idx,0:1] = np.array(val_index_pred1)
            oof_nn[val_idx,1:2] = np.array(val_index_pred2)
            val_now_loss = np.mean(val_loss)
            print(f"Epoch {epoch+1}: train loss ={np.mean(train_loss):.4f}  Val loss={np.mean(val_loss):.4f} custom_loss = {np.mean(val_loss1)} cox_loss = {np.mean(val_loss2)}") # Train index ={loss.item():.4f},
            
        # test predict
        test_pred1,test_pred2 = test_predict(test_loader,model)
        test_nn[:,0:1]+=test_pred1/hparams['n_splits']
        test_nn[:,1:2]+=test_pred2/hparams['n_splits']
        torch.cuda.empty_cache()
            
    target_score = 0
    for i in range(oof_nn.shape[1]):
        y_true = train[["ID","efs","efs_time","race_group"]].copy()
        y_pred = train[["ID"]].copy()
        y_pred["prediction"] = oof_nn[:,i]
        m = score(y_true.copy(), y_pred.copy(), "ID")
        target_score+=m/oof_nn.shape[1]
        print(f"\n target {i+1} Overall CV for torch NN =",m)
    print('multi-target_mean_CV',target_score)
    return oof_nn,test_nn

In [21]:
hparams = {
    'CAT_SIZE':CAT_SIZE, 
    'CAT_EMB': CAT_EMB,
    'EPOCH': 19,
    'BATCH_SIZE':512,
    'lr':0.006,
    'n_splits':5,
    'weight_decay':0.001,
    'random_state':42,
    'model_type':'cnn'
}

oof_nn_multi,test_nn_multi = train_nn_multi(train,test,cat_cols,
                        num_cols, [cus_target['y'],cox_target],
                         hparams)

Fold 1
lr 0.006
Epoch 1: train loss =1.6508  Val loss=1.0605 custom_loss = 1.9464145700136821 cox_loss = 0.1745468204220136
lr 0.005684526315789474
Epoch 2: train loss =1.0201  Val loss=1.0124 custom_loss = 1.9211062391599019 cox_loss = 0.1036053616553545
lr 0.005369052631578948
Epoch 3: train loss =0.9845  Val loss=0.9862 custom_loss = 1.9075401922067006 cox_loss = 0.06480640793840091
lr 0.005053578947368422
Epoch 4: train loss =0.9627  Val loss=0.9847 custom_loss = 1.9041798909505208 cox_loss = 0.06517650187015533
lr 0.004738105263157896
Epoch 5: train loss =0.9539  Val loss=0.9861 custom_loss = 1.8975799878438313 cox_loss = 0.07466549426317215
lr 0.00442263157894737
Epoch 6: train loss =0.9485  Val loss=0.9771 custom_loss = 1.8958364725112915 cox_loss = 0.05834635781745116
lr 0.004107157894736844
Epoch 7: train loss =0.9446  Val loss=0.9683 custom_loss = 1.890821526447932 cox_loss = 0.045854080778857075
lr 0.0037916842105263173
Epoch 8: train loss =0.9415  Val loss=0.9757 custom_los

In [22]:
torch.cuda.empty_cache()

In [23]:
test_nn_multi

array([[-0.99902437,  0.24344752],
       [ 0.16381276,  1.07535186],
       [-1.22240691,  0.18337077]])

## DNN

In [24]:
train = pd.read_csv('/kaggle/input/equity-post-HCT-survival-predictions/train.csv')
test = pd.read_csv('/kaggle/input/equity-post-HCT-survival-predictions/test.csv')
submission =  pd.read_csv('/kaggle/input/equity-post-HCT-survival-predictions/sample_submission.csv')

combined = pd.concat([train,test],axis=0,ignore_index=True)
cat_cols,num_cols = get_num_cat_col(train)
RMV = ["ID","efs","efs_time","y"]
FEATURES = [c for c in train.columns if not c in RMV]
print('process the data')
combined,CAT_SIZE,CAT_EMB = processData(combined,FEATURES,model_type = 'dnn')
print('split the train and the test')
train = combined.iloc[:len(train)].copy()
test = combined.iloc[len(train):].reset_index(drop=True).copy()



process the data
dri_score has (12) unique values
psych_disturb has (4) unique values
cyto_score has (8) unique values
diabetes has (4) unique values
tbi_status has (8) unique values
arrhythmia has (4) unique values
graft_type has (2) unique values
vent_hist has (3) unique values
renal_issue has (4) unique values
pulm_severe has (4) unique values
prim_disease_hct has (18) unique values
cmv_status has (5) unique values
tce_imm_match has (9) unique values
rituximab has (3) unique values
prod_type has (2) unique values
cyto_score_detail has (6) unique values
conditioning_intensity has (7) unique values
ethnicity has (4) unique values
obesity has (4) unique values
mrd_hct has (3) unique values
in_vivo_tcd has (3) unique values
tce_match has (5) unique values
hepatic_severe has (4) unique values
prior_tumor has (4) unique values
peptic_ulcer has (4) unique values
gvhd_proph has (18) unique values
rheum_issue has (4) unique values
sex_match has (5) unique values
race_group has (6) unique val

In [25]:
dnn_target = Targets(train, cat_cols, penalizer = 0.00, n_splits = 5)
cox_target = dnn_target.create_target1()['target1']
cus_target = dnn_target.custom_target()

5it [02:25, 29.00s/it]


Overall Stratified C-Index Score for Cox: 0.6573


In [26]:
class DNN(nn.Module):
    def __init__(self, cat_nunique, cat_emb_dims, num_input_dim):
        super(DNN, self).__init__()
        
        # 创建分类特征的嵌入层
        self.embeddings = nn.ModuleList([
            nn.Embedding(cat_nunique[i], cat_emb_dims[i]) for i in range(len(cat_nunique))
        ])
        # num_layer
        self.num_layer = nn.Sequential(
             nn.Linear(num_input_dim,512),
             nn.ReLU(),
             nn.Linear(512,64),
             nn.ReLU()
        )
        
        # 计算拼接后的输入大小
        total_emb_dim = sum(cat_emb_dims) + 64
        # MLP 结构
        self.fc_layers = nn.Sequential(
            nn.Linear(total_emb_dim, 1024),
            nn.ReLU(),
            nn.Linear(1024, 512),
            nn.ReLU(),
        )
        self.output1 = nn.Linear(512+64,1)
        self.output2 = nn.Linear(512+64,1)
        
        
    def forward(self, cat_inputs, num_inputs):
        # 处理分类特征嵌入
        cat_embeds = [emb(cat_inputs[:, i]) for i, emb in enumerate(self.embeddings)]
        cat_embeds = torch.cat(cat_embeds, dim=1)

        num_layer = self.num_layer(num_inputs)

        # 拼接分类和数值特征
        combined = torch.cat([cat_embeds, num_layer], dim=1)
        
        # 通过 MLP 进行预测
        share = torch.cat([self.fc_layers(combined),num_layer],dim = 1)

        output1 = self.output1(share)
        output2 = self.output2(share)
        
        return output1,output2

In [27]:
hparams = {
    'CAT_SIZE':CAT_SIZE, 
    'CAT_EMB': CAT_EMB,
    'EPOCH': 14,
    'BATCH_SIZE':256,
    'lr':0.006,
    'n_splits':5,
    'weight_decay':0.001,
    'random_state':42,
    'model_type':'dnn'
}

oof_nn_multi_dnn,test_nn_multi_dnn = train_nn_multi(train,test,cat_cols,
                        num_cols, [cus_target['y'],cox_target],
                         hparams)

Fold 1
lr 0.006
Epoch 1: train loss =1.8547  Val loss=1.0146 custom_loss = 1.904895051665928 cox_loss = 0.12434296277554138
lr 0.005571857142857143
Epoch 2: train loss =0.9960  Val loss=0.9922 custom_loss = 1.8947028854618901 cox_loss = 0.0897106887853664
lr 0.005143714285714286
Epoch 3: train loss =0.9661  Val loss=0.9570 custom_loss = 1.8621558572935022 cox_loss = 0.05186691021789675
lr 0.004715571428571429
Epoch 4: train loss =0.9523  Val loss=0.9642 custom_loss = 1.8805536964665288 cox_loss = 0.04779293145174566
lr 0.004287428571428572
Epoch 5: train loss =0.9430  Val loss=0.9458 custom_loss = 1.8520610902620398 cox_loss = 0.03947203063770481
lr 0.0038592857142857153
Epoch 6: train loss =0.9357  Val loss=0.9486 custom_loss = 1.8602907709453418 cox_loss = 0.0368654132699189
lr 0.003431142857142858
Epoch 7: train loss =0.9312  Val loss=0.9501 custom_loss = 1.8556699389996736 cox_loss = 0.04450376048360182
lr 0.0030030000000000005
Epoch 8: train loss =0.9232  Val loss=0.9532 custom_lo

In [28]:
torch.cuda.empty_cache()

In [29]:
test_nn_multi_dnn

array([[-1.0630398 ,  0.20460018],
       [ 0.21203658,  1.09996986],
       [-1.24975927,  0.15364993]])

In [30]:
# import optuna
# from scipy.stats import rankdata 

# def objective(trial):
    
#     weights = [trial.suggest_float(f'w{i}', 0, 10) for i in range(1,5)]   # weight suggestions
#     # weights = np.array(weights) / np.sum(weights)                        # normalizing
#     # print(weights)
    
#     ensemble_pred = ensemble_valid.dot(weights)

#     y_pred["prediction"] = ensemble_pred
#     m = score(y_true.copy(), y_pred.copy(), "ID")
#     return m

# # Creating the predictions dataset for final prediction
# y_true = train[["ID","efs","efs_time","race_group"]].copy()
# y_pred = train[["ID"]].copy()
# ensemble_valid = np.c_[ rankdata(oof_nn_multi[:,0]),
#                            rankdata(oof_nn_multi[:,1]),
#                         rankdata(oof_nn_multi_dnn[:,0]),
#                            rankdata(oof_nn_multi_dnn[:,1]),
#                        ]

# TPESampler = optuna.samplers.TPESampler(multivariate=True, group=True)
# optimize_weights = optuna.create_study(direction='maximize', study_name='Ensemble Weights')
# optimize_weights.optimize(objective, n_trials=200, show_progress_bar=True)
# weights = optimize_weights.best_trial.params
# print('best_params:',weights)
# print('best_value:',optimize_weights.best_value)

# # Normalizing the weights
# weights = np.array(list(weights.values()))
# # weights /= np.sum(weights)
# weights

In [31]:
import functools
from typing import List
from scipy.stats import rankdata
import pytorch_lightning as pl
import numpy as np
import torch
from lifelines.utils import concordance_index
from pytorch_lightning.cli import ReduceLROnPlateau
from pytorch_tabular.models.common.layers import ODST
from torch import nn
from pytorch_lightning.utilities import grad_norm
import json
import pytorch_lightning as pl
import numpy as np, pandas as pd
from sklearn.impute import KNNImputer
import matplotlib.pyplot as plt
import torch
from pytorch_lightning.callbacks import LearningRateMonitor, TQDMProgressBar
from pytorch_lightning.callbacks import StochasticWeightAveraging
from sklearn.model_selection import StratifiedKFold
import numpy as np
import pandas as pd
import torch
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from torch.utils.data import TensorDataset

from typing import List
import pandas as pd
import functools
import numpy as np
import random
import os
from lifelines import CoxPHFitter
from lifelines import KaplanMeierFitter
from lifelines import NelsonAalenFitter
from torch.utils.data import TensorDataset
from tqdm import tqdm
import torch
# from metric import score
from lifelines.utils import concordance_index
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.metrics import mean_squared_error
import warnings
import pandas as pd
import numpy as np
warnings.filterwarnings('ignore')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

Using device: cuda


In [32]:
def add_features(df):
    """
    Create some new features to help the model focus on specific patterns.
    """
    df['is_cyto_score_same'] = (df['cyto_score'] == df['cyto_score_detail']).astype(int)
    df['year_hct'] -= 2000
    df['year_hct']=df['year_hct'].replace(2020,2019)
    df['age_group']=df['age_at_hct']//10
    #karnofsky_score 40 only 10 rows.
    df['karnofsky_score']=df['karnofsky_score'].replace(40,50)
    #hla_high_res_8=2 only 2 rows.
    df['hla_high_res_8']=df['hla_high_res_8'].replace(2,3)
    #hla_high_res_6=0 only 1 row.
    df['hla_high_res_6']=df['hla_high_res_6'].replace(0,2)
    #hla_high_res_10=3 only 1 row.
    df['hla_high_res_10']=df['hla_high_res_10'].replace(3,4)
    #hla_low_res_8=2 only 1 row.
    df['hla_low_res_8']=df['hla_low_res_8'].replace(2,3)

    df['donor_age-age_at_hct']=df['donor_age']-df['age_at_hct']
    df['comorbidity_score+karnofsky_score']=df['comorbidity_score']+df['karnofsky_score']
    df['comorbidity_score-karnofsky_score']=df['comorbidity_score']-df['karnofsky_score']
    df['comorbidity_score*karnofsky_score']=df['comorbidity_score']*df['karnofsky_score']
    df['comorbidity_score/karnofsky_score']=df['comorbidity_score']/df['karnofsky_score']    
    return df

In [33]:
train = pd.read_csv('/kaggle/input/equity-post-HCT-survival-predictions/train.csv')
test = pd.read_csv('/kaggle/input/equity-post-HCT-survival-predictions/test.csv')
submission =  pd.read_csv('/kaggle/input/equity-post-HCT-survival-predictions/sample_submission.csv')

train = add_features(train)
test = add_features(test)



In [34]:
@functools.lru_cache
def combinations(N):
    """
    calculates all possible 2-combinations (pairs) of a tensor of indices from 0 to N-1, 
    and caches the result using functools.lru_cache for optimization
    """
    ind = torch.arange(N)
    comb = torch.combinations(ind, r=2)
    return comb.cuda()
def get_full_loss(efs, x_cat, y, y_hat):
        """
        Output loss and race_group loss.
        """
        loss = calc_loss(y, y_hat, efs)
        race_loss = get_race_losses(efs, x_cat, y, y_hat)
        loss += 0.1 * race_loss
        return loss, race_loss

def get_race_losses( efs, x_cat, y, y_hat):
        global  hparams
        """
        Calculate loss for each race_group based on deviation/variance.
        """
        races = torch.unique(x_cat[:, hparams['race_index']])
        race_losses = []
        for race in races:
            ind = x_cat[:, hparams['race_index']] == race
            race_losses.append(calc_loss(y[ind], y_hat[ind], efs[ind]))
        race_loss = sum(race_losses) / len(race_losses)
        races_loss_std = sum((r - race_loss)**2 for r in race_losses) / len(race_losses)
        return torch.sqrt(races_loss_std)

def calc_loss( y, y_hat, efs):
        global  hparams
    
        """
        Most important part of the model : loss function used for training.
        We face survival data with event indicators along with time-to-event.

        This function computes the main loss by the following the steps :
        * create all data pairs with "combinations" function (= all "two subjects" combinations)
        * make sure that we have at least 1 event in each pair
        * convert y to +1 or -1 depending on the correct ranking
        * loss is computed using a margin-based hinge loss
        * mask is applied to ensure only valid pairs are being used (censored data can't be ranked with event in some cases)
        * average loss on all pairs is returned
        """
        N = y.shape[0]
        comb = combinations(N)
        comb = comb[(efs[comb[:, 0]] == 1) | (efs[comb[:, 1]] == 1)]
        pred_left = y_hat[comb[:, 0]]
        pred_right = y_hat[comb[:, 1]]
        y_left = y[comb[:, 0]]
        y_right = y[comb[:, 1]]
        y = 2 * (y_left > y_right).int() - 1
        loss = nn.functional.relu(-y * (pred_left - pred_right) + hparams['margin'])
        mask = get_mask(comb, efs, y_left, y_right)
        loss = (loss.double() * (mask.double())).sum() / mask.sum()
        return loss

def get_mask(comb, efs, y_left, y_right):
        """
        Defines all invalid comparisons :
        * Case 1: "Left outlived Right" but Right is censored
        * Case 2: "Right outlived Left" but Left is censored
        Masks for case 1 and case 2 are combined using |= operator and inverted using ~ to create a "valid pair mask"
        """
        left_outlived = y_left >= y_right
        left_1_right_0 = (efs[comb[:, 0]] == 1) & (efs[comb[:, 1]] == 0)
        mask2 = (left_outlived & left_1_right_0)
        right_outlived = y_right >= y_left
        right_1_left_0 = (efs[comb[:, 1]] == 1) & (efs[comb[:, 0]] == 0)
        mask2 |= (right_outlived & right_1_left_0)
        mask2 = ~mask2
        mask = mask2
        return mask

In [35]:
class CatEmbeddings(nn.Module):
    """
    Embedding module for the categorical dataframe.
    """
    def __init__(
        self,
        projection_dim: int,
        categorical_cardinality: List[int],
        embedding_dim: int
    ):
        """
        projection_dim: The dimension of the final output after projecting the concatenated embeddings into a lower-dimensional space.
        categorical_cardinality: A list where each element represents the number of unique categories (cardinality) in each categorical feature.
        embedding_dim: The size of the embedding space for each categorical feature.
        self.embeddings: list of embedding layers for each categorical feature.
        self.projection: sequential neural network that goes from the embedding to the output projection dimension with GELU activation.
        """
        super(CatEmbeddings, self).__init__()
        self.embeddings = nn.ModuleList([
            nn.Embedding(cardinality, embedding_dim)
            for cardinality in categorical_cardinality
        ])
        self.projection = nn.Sequential(
            nn.Linear(embedding_dim * len(categorical_cardinality), projection_dim),
            nn.GELU(),
            nn.Linear(projection_dim, projection_dim)
        )

    def forward(self, x_cat):
        """
        Apply the projection on concatened embeddings that contains all categorical features.
        """
        x_cat = [embedding(x_cat[:, i]) for i, embedding in enumerate(self.embeddings)]
        x_cat = torch.cat(x_cat, dim=1)
        return self.projection(x_cat)

In [36]:
class NN(nn.Module):
    """
    Train a model on both categorical embeddings and numerical data.
    """
    def __init__(
            self,
            continuous_dim: int,
            categorical_cardinality: List[int],
            embedding_dim: int,
            projection_dim: int,
            hidden_dim: int,
            dropout: float = 0
    ):
        """
        continuous_dim: The number of continuous features.
        categorical_cardinality: A list of integers representing the number of unique categories in each categorical feature.
        embedding_dim: The dimensionality of the embedding space for each categorical feature.
        projection_dim: The size of the projected output space for the categorical embeddings.
        hidden_dim: The number of neurons in the hidden layer of the MLP.
        dropout: The dropout rate applied in the network.
        self.embeddings: previous embeddings for categorical data.
        self.mlp: defines an MLP model with an ODST layer followed by batch normalization and dropout.
        self.out: linear output layer that maps the output of the MLP to a single value
        self.dropout: defines dropout
        Weights initialization with xavier normal algorithm and biases with zeros.
        """
        super(NN, self).__init__()
        self.embeddings = CatEmbeddings(projection_dim, categorical_cardinality, embedding_dim)
        self.mlp = nn.Sequential(
            ODST(projection_dim + continuous_dim, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.Dropout(dropout)
        )
        self.out = nn.Linear(hidden_dim, 1)
        self.dropout = nn.Dropout(dropout)

        # initialize weights
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.xavier_normal_(m.weight)
                nn.init.zeros_(m.bias)

    def forward(self, x_cat, x_cont):
        """
        Create embedding layers for categorical data, concatenate with continous variables.
        Add dropout and goes through MLP and return raw output and 1-dimensional output as well.
        """
        x = self.embeddings(x_cat)
        x = torch.cat([x, x_cont], dim=1)
        x = self.dropout(x)
        x = self.mlp(x)
        return self.out(x), x

In [37]:
def get_feature_types(train):
    """
    Utility function to return categorical and numerical column names.
    """
    categorical_cols = [col for i, col in enumerate(train.columns) if ((train[col].dtype == "object") | (2 < train[col].nunique() < 25))]
    RMV = ["ID", "efs", "efs_time", "y"]
    FEATURES = [c for c in train.columns if not c in RMV]
    # print(f"There are {len(FEATURES)} FEATURES: {FEATURES}")
    numerical = [i for i in FEATURES if i not in categorical_cols]
    return categorical_cols, numerical


In [38]:
def get_X_cat(df, cat_cols, transformers=None):
    """
    Apply a specific categorical data transformer or a LabelEncoder if None.
    """
    if transformers is None:
        transformers = [LabelEncoder().fit(df[col]) for col in cat_cols]
    return transformers, np.array(
        [transformer.transform(df[col]) for col, transformer in zip(cat_cols, transformers)]
    ).T
    
def get_categoricals(train, val):
    """
    Remove constant categorical columns and transform them using LabelEncoder.
    Return the label-transformers for each categorical column, categorical dataframes and numerical columns.
    """
    categorical_cols, numerical = get_feature_types(train)
    remove = []
    for col in categorical_cols:
        if train[col].nunique() == 1:
            remove.append(col)
        ind = ~val[col].isin(train[col])
        if ind.any():
            val.loc[ind, col] = train[col].mode()[0] # np.nan
    categorical_cols = [col for col in categorical_cols if col not in remove]
    transformers, X_cat_train = get_X_cat(train, categorical_cols)
    _, X_cat_val = get_X_cat(val, categorical_cols, transformers)
    return X_cat_train, X_cat_val, numerical, transformers


In [39]:
def get_dldata(train,val,cat_cols,num_cols,bs):
    
    X_cat_train, X_cat_val, num_cols, transformers = get_categoricals(train, val)
    
    scaler = StandardScaler()
    imp = SimpleImputer(missing_values=np.nan, strategy='mean', add_indicator=True)
    X_num_train = imp.fit_transform(train[num_cols])
    X_num_train = scaler.fit_transform(X_num_train)
    X_num_val = imp.transform(val[num_cols])
    X_num_val = scaler.transform(X_num_val)
    
    dl_train = init_dl(train,X_cat_train,X_num_train,bs)
    dl_val = init_dl(val,X_cat_val,X_num_val,bs)

    return X_cat_train,X_num_train,X_cat_val,X_num_val,dl_train,dl_val,transformers
    
def init_dl(df,X_cat,X_num,bs,training=False):
    """
    Initialize data loaders with 4 dimensions : categorical dataframe, numerical dataframe and target values (efs and efs_time).
    Notice that efs_time is log-transformed.
    Fix batch size to 2048 and return dataloader for training or validation depending on training value.
    """

    ds_train = TensorDataset(
        torch.tensor(X_cat, dtype=torch.long),
        torch.tensor(X_num, dtype=torch.float32),
        torch.tensor(df.efs_time.values, dtype=torch.float32).log(),
        torch.tensor(df.efs.values, dtype=torch.long)
    )
    dl_train = torch.utils.data.DataLoader(ds_train, batch_size=bs, pin_memory=True, shuffle=training)
    return dl_train
    

In [40]:
class LitNN(nn.Module):
    """
    Main Model creation and losses definition to fully train the model.
    """
    def __init__(
            self,
            continuous_dim: int,
            categorical_cardinality: list[int],
            embedding_dim: int,
            projection_dim: int,
            hidden_dim: int,
            dropout: float = 0.2,
    ):

        super(LitNN, self).__init__()

        # Creates an instance of the NN model defined above
        self.model = NN(
            continuous_dim = continuous_dim,
            categorical_cardinality = categorical_cardinality,
            embedding_dim = embedding_dim,
            projection_dim = projection_dim,
            hidden_dim = hidden_dim,
            dropout = dropout
        )

        # Defines a small feedforward neural network that performs an auxiliary task with 1-dimensional output
        self.aux_cls = nn.Sequential(
            nn.Linear(hidden_dim,hidden_dim // 3),
            nn.GELU(),
            nn.Linear(hidden_dim //3, 1)
        )

    def forward(self, x_cat, x_cont):
        """
        Forward pass that outputs the 1-dimensional prediction and the embeddings (raw output)
        """
        x, emb = self.model(x_cat, x_cont)
        aux_pred = self.aux_cls(emb).squeeze(1)
        return x.squeeze(1),aux_pred,emb

In [41]:
def _calc_cindex(targets):
        """
        Calculate c-index accounting for each race_group or global.
        """
        y = torch.cat([t[0] for t in targets]).cpu().numpy()
        y_hat = torch.cat([t[1] for t in targets]).cpu().numpy()
        efs = torch.cat([t[2] for t in targets]).cpu().numpy()
        races = torch.cat([t[3] for t in targets]).cpu().numpy()
        metric = _metric(efs, races, y, y_hat)
        cindex = concordance_index(y, y_hat, efs)
        return cindex, metric

def _metric( efs, races, y, y_hat):
        """
        Calculate c-index accounting for each race_group
        """
        metric_list = []
        for race in np.unique(races):
            y_ = y[races == race]
            y_hat_ = y_hat[races == race]
            efs_ = efs[races == race]
            metric_list.append(concordance_index(y_, y_hat_, efs_))
        metric = float(np.mean(metric_list) - np.sqrt(np.var(metric_list)))
        return metric

In [42]:
def nn_predict(dl_test,model,model_path):
    test_pred_list = []
    test_cat_embed = []
    model.load_state_dict(torch.load(f'{model_path}'))   
    model.eval()
    with torch.no_grad():  
        for x_cat, x_num, y,efs in dl_test:
            x_cat, x_num, y, efs = x_cat.to(device), x_num.to(device),y.to(device),efs.to(device)
            y_hat,_,emb  = model(x_cat, x_num)
            test_pred_list.extend(y_hat.detach().cpu().numpy().tolist())
            test_cat_embed.extend(emb.detach().cpu().numpy().tolist())
            
    return test_pred_list,test_cat_embed

In [43]:
def train_nn(train,test,cat_cols, num_cols,hparams):
    
    test['efs_time'] = 1
    test['efs'] = 1
    oof_odst = np.zeros(len(train))
    oof_embed = np.zeros((len(train),hparams["hidden_dim"]))
    test_embed = np.zeros((len(test),hparams['hidden_dim']))
    
    kf = StratifiedKFold(n_splits=hparams['n_splits'], shuffle=hparams['shuffle'], random_state = 42)
    
    val_cindex_score1 = []
    val_cindex_score2 = []
    
    test_pred_list1 = []
    test_pred_list2 = []
    swa_pred_list= []
    
    for fold, (train_index, test_index) in enumerate(
            kf.split(
                train, train.race_group.astype(str) + (train.age_at_hct == 0.044).astype(str)
            )
        ):
            tt = train.copy()
            train_data = tt.iloc[train_index]
            val_data = tt.iloc[test_index]
        
            X_cat_train, X_num_train, X_cat_val,X_num_val, dl_train, dl_val,transformers = get_dldata(train_data, val_data, cat_cols,num_cols,bs = hparams['batch_size'])
            X_cat_train, X_num_train, X_cat_test,X_num_test, dl_train, dl_test,transformers = get_dldata(train_data, test,cat_cols,num_cols,bs = hparams['batch_size'])
            
            seed_torch(42+fold)
            model = LitNN(
                        continuous_dim = X_num_train.shape[1],
                        categorical_cardinality = [ len(t.classes_) for t in transformers],
                        projection_dim = hparams['projection_dim'],
                        embedding_dim =  hparams['embedding_dim'],
                        hidden_dim = hparams['hidden_dim'],
                        dropout = hparams['dropout']
                        ).to(device)
            optimizer = optim.Adam(model.parameters(), lr=hparams['lr'],weight_decay = hparams['weight_decay'])
            scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
                        optimizer,
                        T_max=45,
                        eta_min=6e-3
                    )
            # 训练过程
            best_val_score1 = float('-inf')
            best_val_score2 = float('-inf')
        
            for epoch in range(hparams['EPOCH']):  # 训练50个epoch
                model.train()
                train_loss = []
                val_loss = []
                val_target = []
                val_pred = []
                
                for x_cat, x_num, y,efs in dl_train:
                    optimizer.zero_grad()
                    x_cat, x_num, y, efs = x_cat.to(device), x_num.to(device),y.to(device),efs.to(device)
                    y_hat,aux_pred,embed  = model(x_cat, x_num)
                    # multi_loss
                    hinge_loss, race_loss = get_full_loss(efs, x_cat, y, y_hat)
                    aux_loss = nn.functional.mse_loss(aux_pred, y, reduction='none')
                    aux_mask = efs == 1
                    aux_loss = (aux_loss * aux_mask).sum() / aux_mask.sum()
                    loss = hinge_loss + aux_loss * hparams['aux_weight']
                    loss.backward()
                    optimizer.step()
                    train_loss.append(loss.item())
                    
                scheduler.step()   
                
                model.eval()
                with torch.no_grad():
                    for x_cat, x_num, y,efs in dl_val:
                        x_cat, x_num, y, efs = x_cat.to(device), x_num.to(device),y.to(device),efs.to(device)
                        y_hat,aux_pred,embed  = model(x_cat, x_num)
                        # multi_loss
                        hinge_loss, race_loss = get_full_loss(efs, x_cat, y, y_hat)
                        aux_loss = nn.functional.mse_loss(aux_pred, y, reduction='none')
                        aux_mask = efs == 1
                        aux_loss = (aux_loss * aux_mask).sum() / aux_mask.sum()
                        loss = hinge_loss + aux_loss * hparams['aux_weight']
                        val_target.append([y, y_hat.detach(), efs, x_cat[:, hparams['race_index']]]) 
                        val_loss.append(loss)
                        val_pred.extend(y_hat.detach().cpu().numpy().tolist())
                    
                cindex, metric = _calc_cindex(val_target)
                # two ways of saving  one public best,the cv best
                if (metric*0.8+cindex*0.2)>=best_val_score1:
                    hparams['way1_save_epoch'][f'{fold+1}'] = epoch+1
                    best_val_score1 = metric
                    oof_odst[test_index] = -np.array(val_pred)
                    torch.save(model.state_dict(),f'hingeloss_nn_fold_{fold+1}.pth')
                    
                if metric>=best_val_score2:
                    best_val_score2 = metric
                    hparams['way2_save_epoch'][f'{fold+1}'] = epoch+1
                    torch.save(model.state_dict(),f'hingeloss_nn_fold_cvbest_{fold+1}.pth')
                
                if (epoch+1)%5 == 0:
                    print(f"fold {fold+1}/{hparams['n_splits']} epoch {epoch+1} val cindex {cindex:.5f} race_cindex {metric:.5f} saving log Public Best { hparams['way1_save_epoch'][f'{fold+1}']} epoch CV Best {hparams['way2_save_epoch'][f'{fold+1}']} epoch")
        
            print('cv best_val_race_cindex',best_val_score2)
            # test_pred pb best cv best
            val_cindex_score1.append(best_val_score1)
            val_cindex_score2.append(best_val_score2)
                
            test_pred1,test_embed1 = nn_predict(dl_test,model,model_path = f'/kaggle/working/hingeloss_nn_fold_{fold+1}.pth')
            test_pred2,test_embed2 = nn_predict(dl_test,model,model_path = f'/kaggle/working/hingeloss_nn_fold_cvbest_{fold+1}.pth')
            test_pred_list1.append(test_pred1)
            test_pred_list2.append(test_pred2)
        
            test_embed += np.array(test_embed1)/hparams['n_splits']
            val_pred,val_embed = nn_predict(dl_val,model,model_path = f'/kaggle/working/hingeloss_nn_fold_{fold+1}.pth')
                     
            oof_embed[test_index] = np.array(val_embed)
        
            # 清理显存
            torch.cuda.empty_cache()

    test_pred_pb = np.dot(val_cindex_score1,test_pred_list1)
    test_pred_cv = np.dot(val_cindex_score2,test_pred_list2)
    
    test_pred =  test_pred_pb #*0.9+test_pred_cv*0.1
    subm_data = pd.read_csv("/kaggle/input/equity-post-HCT-survival-predictions/sample_submission.csv")
    subm_data['prediction'] = -test_pred
    display(subm_data.head())
    print(f"{hparams['n_splits'] } fold cv mean val race cindex {np.mean(val_cindex_score2)}")
    return oof_odst,subm_data,oof_embed,test_embed
    
cat_cols, num_cols = get_feature_types(train)
hparams = {
        "embedding_dim": 16,
        "projection_dim": 112,
        "hidden_dim": 56,
        "lr": 0.06464861983337984,
        "dropout": 0.05463240181423116,
        "aux_weight": 0.5,
        "batch_size":4096,
        "margin": 0.2588153271003354,
        "weight_decay": 0.0002773544957610778,
        'seed':42,
        'race_index':cat_cols.index("race_group"),
        'EPOCH':60,
        'n_splits':5,
        'shuffle':True,
        'way1_save_epoch':{},
        'way2_save_epoch':{},
    
    }
oof_odst_my,hinge_nn_odst,oof_embed,test_embed = train_nn(train,test,cat_cols, num_cols,hparams)

fold 1/5 epoch 5 val cindex 0.51459 race_cindex 0.50365 saving log Public Best 5 epoch CV Best 4 epoch
fold 1/5 epoch 10 val cindex 0.53599 race_cindex 0.51997 saving log Public Best 10 epoch CV Best 7 epoch
fold 1/5 epoch 15 val cindex 0.64991 race_cindex 0.63468 saving log Public Best 15 epoch CV Best 15 epoch
fold 1/5 epoch 20 val cindex 0.67549 race_cindex 0.65937 saving log Public Best 19 epoch CV Best 19 epoch
fold 1/5 epoch 25 val cindex 0.68889 race_cindex 0.67449 saving log Public Best 25 epoch CV Best 25 epoch
fold 1/5 epoch 30 val cindex 0.69078 race_cindex 0.67592 saving log Public Best 30 epoch CV Best 30 epoch
fold 1/5 epoch 35 val cindex 0.69146 race_cindex 0.67823 saving log Public Best 35 epoch CV Best 35 epoch
fold 1/5 epoch 40 val cindex 0.69178 race_cindex 0.67800 saving log Public Best 40 epoch CV Best 35 epoch
fold 1/5 epoch 45 val cindex 0.69043 race_cindex 0.67516 saving log Public Best 45 epoch CV Best 35 epoch
fold 1/5 epoch 50 val cindex 0.69207 race_cindex 0

Unnamed: 0,ID,prediction
0,28800,-1.723185
1,28801,0.034232
2,28802,-2.130954


5 fold cv mean val race cindex 0.6806970821895346


In [44]:
embed_cols = [f'feat_{i+1}' for i in range(oof_embed.shape[1])]
test_embed_df = pd.DataFrame(columns = embed_cols,data = test_embed)
oof_embed_df = pd.DataFrame(columns = embed_cols,data = oof_embed)

## XGB+LGB+CAT+XGB_COX With Feature1

In [45]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np, pandas as pd
import matplotlib.pyplot as plt
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)


train = pd.read_csv("/kaggle/input/equity-post-HCT-survival-predictions/train.csv")
print("Train shape:",train.shape)

test = pd.read_csv("/kaggle/input/equity-post-HCT-survival-predictions/test.csv")
print("Test shape:", test.shape )

# train.head()

Train shape: (28800, 60)
Test shape: (3, 58)


In [46]:
from lifelines import *
from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.preprocessing import LabelEncoder
# from cib_metric  import score

import pandas as pd
import pandas.api.types
import numpy as np
from lifelines.utils import concordance_index

class ParticipantVisibleError(Exception):
    pass

def score(solution: pd.DataFrame, submission: pd.DataFrame, row_id_column_name: str) -> float:
    """
    >>> import pandas as pd
    >>> row_id_column_name = "id"
    >>> y_pred = {'prediction': {0: 1.0, 1: 0.0, 2: 1.0}}
    >>> y_pred = pd.DataFrame(y_pred)
    >>> y_pred.insert(0, row_id_column_name, range(len(y_pred)))
    >>> y_true = { 'efs': {0: 1.0, 1: 0.0, 2: 0.0}, 'efs_time': {0: 25.1234,1: 250.1234,2: 2500.1234}, 'race_group': {0: 'race_group_1', 1: 'race_group_1', 2: 'race_group_1'}}
    >>> y_true = pd.DataFrame(y_true)
    >>> y_true.insert(0, row_id_column_name, range(len(y_true)))
    >>> score(y_true.copy(), y_pred.copy(), row_id_column_name)
    0.75
    """
    
    del solution[row_id_column_name]
    del submission[row_id_column_name]
    
    event_label = 'efs'
    interval_label = 'efs_time'
    prediction_label = 'prediction'
    for col in submission.columns:
        if not pandas.api.types.is_numeric_dtype(submission[col]):
            raise ParticipantVisibleError(f'Submission column {col} must be a number')
    # Merging solution and submission dfs on ID
    merged_df = pd.concat([solution, submission], axis=1)
    merged_df.reset_index(inplace=True)
    merged_df_race_dict = dict(merged_df.groupby(['race_group']).groups)
    metric_list = []
    for race in merged_df_race_dict.keys():
        # Retrieving values from y_test based on index
        indices = sorted(merged_df_race_dict[race])
        merged_df_race = merged_df.iloc[indices]
        # Calculate the concordance index
        c_index_race = concordance_index(
                        merged_df_race[interval_label],
                        -merged_df_race[prediction_label],
                        merged_df_race[event_label])
        metric_list.append(c_index_race)
    return float(np.mean(metric_list)-np.sqrt(np.var(metric_list)))

def c_score(train,oof):
    y_true = train[["ID","efs","efs_time","race_group"]].copy()
    y_pred = train[["ID"]].copy()
    y_pred["prediction"] = oof
    m = score(y_true.copy(), y_pred.copy(), "ID")
    
    return m

# target1
def create_target1(train,time_col='efs_time', event_col='efs'):
    data = train.copy()
    oof_preds = np.zeros(len(data))
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    for train_index, valid_index in cv.split(data,data['race_group']):
        train_data = data.iloc[train_index]
        valid_data = data.iloc[valid_index]
    
        kmf = KaplanMeierFitter()
        kmf.fit(durations=train_data[time_col], event_observed=train_data[event_col])
        
        oof_preds[valid_index] = kmf.survival_function_at_times(valid_data[time_col]).values
        
    return oof_preds

# target2
def create_target2(train,time_col='efs_time', event_col='efs'):
    data = train.copy()
    oof_preds = np.zeros(len(data))
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    for train_index, valid_index in cv.split(data,data['race_group']):
    
        train_data = data.iloc[train_index]
        valid_data = data.iloc[valid_index]
    
        naf = NelsonAalenFitter()
        naf.fit(durations=train_data[time_col], event_observed=train_data[event_col])
                
        oof_preds[valid_index] = -naf.cumulative_hazard_at_times(valid_data[time_col]).values
        
    return oof_preds


# target3
def create_target3(train,cat_cols,time_col='efs_time', event_col='efs'):
    data = train.copy()
    for col in cat_cols:
        data[col] = LabelEncoder().fit_transform(data[col])
    data = data.fillna(0)
    oof_preds = np.zeros(len(data))
    cv = KFold(n_splits=5, shuffle=True, random_state=42)
    for train_index, valid_index in cv.split(data):
    
        train_data = data.loc[train_index]
        valid_data = data.loc[valid_index]
    
        cph = CoxPHFitter(penalizer=1.5)
        cph.fit(train_data, duration_col='efs_time', event_col='efs')
                
        oof_preds[valid_index] = cph.predict_partial_hazard(valid_data) 
        
    return oof_preds
# target4
def create_target4(train,time_col='efs_time', event_col='efs'):
        data = train.copy()
        data['target4'] = data.efs_time.copy()
        data.loc[data.efs == 0, 'target4'] *= -1

        return data['target4']

# target5
def create_target5(train,time_col='efs_time', event_col='efs'):
    data = train.copy()
    oof_preds = np.zeros(len(data))
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    for train_index, valid_index in cv.split(data,data['race_group']):
    
        train_data = data.iloc[train_index]
        valid_data = data.iloc[valid_index]
    
        bfh = BreslowFlemingHarringtonFitter()
        bfh.fit(durations=train_data[time_col], event_observed=train_data[event_col])
                
        oof_preds[valid_index] = bfh.survival_function_at_times(valid_data[time_col]).values
        
    return oof_preds

def Encoding_Processing(df, feats):
    cat_cols = []
    num_cols = []
    for c in feats:
        if df[c].dtype=="object" :
            cat_cols.append(c)
            df[c] = df[c].fillna("NAN")
        else:
            num_cols.append(c)
            # df[c] = df[c].fillna(-1)
    for c in feats:
        # LABEL ENCODE CATEGORICAL AND CONVERT TO INT32 CATEGORY
        if c in cat_cols:
            df[c],_ = df[c].factorize()
            df[c] -= df[c].min()
            df[c] = df[c].astype("int32")
            df[c] = df[c].astype("category")
        # REDUCE PRECISION OF NUMERICAL TO 32BIT TO SAVE MEMORY
        else:
            if df[c].dtype=="float64":
                df[c] = df[c].astype("float32")
            if df[c].dtype=="int64":
                df[c] = df[c].astype("int32")

    return df,cat_cols,num_cols

In [47]:
# 目标构造
train["target1"] = create_target1(train,time_col='efs_time', event_col='efs')
train["target2"] = create_target2(train,time_col='efs_time', event_col='efs')
cat_cols = ['dri_score','psych_disturb','cyto_score','diabetes','tbi_status','arrhythmia','graft_type','vent_hist',
            'renal_issue','pulm_severe','prim_disease_hct','cmv_status','tce_imm_match','rituximab','prod_type','cyto_score_detail',
            'conditioning_intensity','ethnicity','obesity','mrd_hct','in_vivo_tcd','tce_match','hepatic_severe','prior_tumor',
            'peptic_ulcer','gvhd_proph','rheum_issue','sex_match','race_group','hepatic_mild','tce_div_match','donor_related',
            'melphalan_dose','cardiac','pulm_moderate']
train["target3"] = create_target3(train,cat_cols,time_col='efs_time', event_col='efs')
train["target4"] = create_target4(train,time_col='efs_time', event_col='efs')

# 合并
df_all = pd.concat([train,test],axis=0,ignore_index=True)
print("All data shape:",df_all.shape)

drop_feats = ["ID","efs","efs_time","target1",'target2','target3','target4','target5']
feats = [c for c in df_all.columns if not c in drop_feats]

# 交叉统计特征
for col_i in ['dri_score','cyto_score','race_group','prim_disease_hct']:
    for col_j in ['donor_age','age_at_hct']:
        df_all[f'{col_i}_{col_j}_mean'] = df_all[col_i].map(df_all.groupby(col_i)[col_j].mean()).astype("float32")
# 组合特征
df_all['donor_age/age_at_hct'] = df_all['donor_age'] / df_all['age_at_hct']

# 编码
df_all,cat_cols,num_cols = Encoding_Processing(df_all,feats)

All data shape: (28803, 64)


In [48]:
# 分离
df_all = df_all.replace([-np.inf,np.inf],0)
train = df_all.iloc[:len(train)].copy()
test = df_all.iloc[len(train):].reset_index(drop=True).copy()

feats = [col for col in df_all if col not in drop_feats]                                                                                                                                                                                      
print('feats_num:',len(feats))

feats_num: 66


In [49]:
train = pd.concat([train,oof_embed_df],axis = 1)
test = pd.concat([test,test_embed_df],axis = 1)

In [50]:
# 模型定义
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBRegressor, XGBClassifier
from lightgbm import LGBMRegressor
import lightgbm as lgb
from catboost import CatBoostRegressor, CatBoostClassifier
import catboost as cb

def xgb_classifier(train,test,target,feats,seed):
    FOLDS = 5
    kf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=seed)

    oof_xgb1 = np.zeros(len(train))
    pred_efs = np.zeros(len(test))

    for i, (train_index, test_index) in enumerate(kf.split(train, train[target])):

        print(f"### Fold {i+1}")
        
        x_train = train.loc[train_index, feats].copy()
        y_train = train.loc[train_index, target]
        x_valid = train.loc[test_index, feats].copy()
        y_valid = train.loc[test_index, target]
        x_test = test[feats].copy()

        model_xgb = XGBClassifier(
            device="cuda",
            max_depth=3,  
            colsample_bytree=0.7129400756425178, 
            subsample=0.8185881823156917, 
            n_estimators=20_000, 
            learning_rate=0.04425768131771064,  
            eval_metric="auc", 
            early_stopping_rounds=100, 
            objective='binary:logistic',
            scale_pos_weight=1.5379160847615545,  
            min_child_weight=4,
            enable_categorical=True,
            gamma=3.1330719334577584,
            random_seed=seed
        )
        model_xgb.fit(
            x_train, y_train,
            eval_set=[(x_valid, y_valid)],  
            verbose=False
        )

        # INFER OOF (Probabilities -> Binary)
        oof_xgb1[test_index] = (model_xgb.predict_proba(x_valid)[:, 1] > 0.53).astype(int)
        # INFER TEST (Probabilities -> Average Probs)
        pred_efs += model_xgb.predict_proba(x_test)[:, 1]

    # COMPUTE AVERAGE TEST PREDS
    pred_efs = (pred_efs / FOLDS > 0.53).astype(int)
    # bin_pred = oof_xgb1
    # EVALUATE PERFORMANCE
    accuracy = accuracy_score(train["efs"], oof_xgb1)
    f1 = f1_score(train["efs"], oof_xgb1)
    roc_auc = roc_auc_score(train["efs"], oof_xgb1)

    print(f"Accuracy: {accuracy:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"ROC AUC Score: {roc_auc:.4f}")

    return oof_xgb1,pred_efs

def xgb_model(train,test,target,feats,seed):
    FOLDS = 10
    kf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=seed)
        
    oof_xgb = np.zeros(len(train))
    pred_xgb = np.zeros(len(test))
    
    for i, (train_index, test_index) in enumerate(kf.split(train,train["race_group"])):
    
        print(f"### Fold {i+1}")
        
        x_train = train.loc[train_index,feats].copy()
        y_train = train.loc[train_index,target]
        x_valid = train.loc[test_index,feats].copy()
        y_valid = train.loc[test_index,target]
        x_test = test[feats].copy()
    
        model_xgb = XGBRegressor(
            objective='reg:squarederror',
            device="cuda",
            max_depth=6,  
            colsample_bytree=0.5,  
            subsample=0.8,  
            n_estimators=2500,  
            learning_rate=0.03,  
            enable_categorical=True,
            min_child_weight=80,
            # early_stopping_rounds=25,
            random_seed=seed
        )
        model_xgb.fit(
            x_train, y_train,
            eval_set=[(x_valid, y_valid)],  
            verbose=False, 
            eval_metric='mae'
        )
    
        # INFER OOF
        pred_val = model_xgb.predict(x_valid)
        oof_xgb[test_index] = model_xgb.predict(x_valid)
        # INFER TEST
        pred_xgb += model_xgb.predict(x_test)
        
    
    # COMPUTE AVERAGE TEST PREDS
    pred_xgb /= FOLDS
    
    print(f"\nOverall CV for XGBoost with {target} =",c_score(train,oof_xgb))
    
    return oof_xgb,pred_xgb

def lgb_model(train,test,target,feats,seed):
    FOLDS = 5
    kf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=seed)
        
    oof_lgb = np.zeros(len(train))
    pred_lgb = np.zeros(len(test))
    
    for i, (train_index, test_index) in enumerate(kf.split(train,train["race_group"])):
    
        print(f"### Fold {i+1}")
        
        x_train = train.loc[train_index,feats].copy()
        y_train = train.loc[train_index,target]    
        x_valid = train.loc[test_index,feats].copy()
        y_valid = train.loc[test_index,target]
        x_test = test[feats].copy()
    
        model_lgb = LGBMRegressor(
            device="gpu", 
            max_depth=3, 
            colsample_bytree=0.4,  
            subsample=0.9, 
            n_estimators=2500, 
            learning_rate=0.03, 
            objective="regression", 
            verbose=-1, 
            # early_stopping_rounds=100,
            random_seed=seed,
        )
        model_lgb.fit(
            x_train, y_train,
            eval_set=[(x_valid, y_valid)],
        )
        
        # INFER OOF
        oof_lgb[test_index] = model_lgb.predict(x_valid)
        # INFER TEST
        pred_lgb += model_lgb.predict(x_test)
    
    # COMPUTE AVERAGE TEST PREDS
    pred_lgb /= FOLDS
    
    print(f"\nOverall CV for LightGBM with {target} =",c_score(train,oof_lgb))
    
    lgb_importance_df = pd.DataFrame({
        "Feature": feats,
        "Importance": model_lgb.feature_importances_ 
    }).sort_values(by="Importance", ascending=False)
    lgb_importance_df.head(10)
    
    return oof_lgb,pred_lgb,lgb_importance_df

def cat_model(train,test,target,feats,seed):
    FOLDS = 5
    kf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=seed)
        
    oof_cat = np.zeros(len(train))
    pred_cat = np.zeros(len(test))
    
    for i, (train_index, test_index) in enumerate(kf.split(train,train["race_group"])):
    
        print(f"### Fold {i+1}")
        
        x_train = train.loc[train_index,feats].copy()
        y_train = train.loc[train_index,target]
        x_valid = train.loc[test_index,feats].copy()
        y_valid = train.loc[test_index,target]
        x_test = test[feats].copy()
    
        model_cat = CatBoostRegressor(
            depth=6, 
            learning_rate=0.04699005545173896, 
            l2_leaf_reg=6.853082507365295, 
            colsample_bylevel = 0.9312642681213008, 
            min_data_in_leaf = 14, 
            grow_policy='Depthwise', 
            bootstrap_type='Bernoulli', 
            iterations=1727,
            # task_type="GPU",  
            eval_metric='MAE',
            early_stopping_rounds=100,
            random_seed=seed,
        )
        model_cat.fit(x_train,y_train,
                  eval_set=(x_valid, y_valid),
                  cat_features=cat_cols,
                  verbose=False)
    
        # INFER OOF
        oof_cat[test_index] = model_cat.predict(x_valid)
        # INFER TEST
        pred_cat += model_cat.predict(x_test)
    
    # COMPUTE AVERAGE TEST PREDS
    pred_cat /= FOLDS
    
    print(f"\nOverall CV for CatBoost with {target} =",c_score(train,oof_cat))
    
    return oof_cat,pred_cat

def xgb_cox_model(train,test,target,feats,seed):
    FOLDS = 5
    kf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=seed)
        
    oof_xgb_cox = np.zeros(len(train))
    pred_xgb_cox = np.zeros(len(test))
    
    for i, (train_index, test_index) in enumerate(kf.split(train,train["race_group"])):
    
        print(f"### Fold {i+1}")
        
        x_train = train.loc[train_index,feats].copy()
        y_train = train.loc[train_index,target]    
        x_valid = train.loc[test_index,feats].copy()
        y_valid = train.loc[test_index,target]
        x_test = test[feats].copy()
    
        model_xgb_cox = XGBRegressor(
            device="cuda",
            max_depth=6,  
            colsample_bytree=0.5,  
            subsample=0.8,  
            n_estimators=1000,  
            learning_rate=0.03,  
            enable_categorical=True,
            min_child_weight=80,
            objective='survival:cox',
            eval_metric='cox-nloglik',
            random_seed=seed,
        )
        model_xgb_cox.fit(
            x_train, y_train,
            eval_set=[(x_valid, y_valid)],  
            verbose=False  
        )
        
        # INFER OOF
        oof_xgb_cox[test_index] = model_xgb_cox.predict(x_valid)
        # INFER TEST
        pred_xgb_cox += model_xgb_cox.predict(x_test)
    
    # COMPUTE AVERAGE TEST PREDS
    pred_xgb_cox /= FOLDS
    
    print(f"\nOverall CV for XGBoost Survival:Cox with {target} =",c_score(train,oof_xgb_cox))
    
    return oof_xgb_cox,pred_xgb_cox

In [51]:
# 后处理分类器一
xgb_bin_feats = [col for col in feats+embed_cols if col not in ['donor_age/age_at_hct']+['psych_disturb']]
train_copy = train.copy()
bin_pred,pred_efs = xgb_classifier(train_copy,test,'efs',xgb_bin_feats,seed=42)

### Fold 1
### Fold 2
### Fold 3
### Fold 4
### Fold 5
Accuracy: 0.6834
F1 Score: 0.7429
ROC AUC Score: 0.6693


In [52]:
%%time
# Target with Target1
# XGBoost
xgb_feats1 = [col for col in feats if col not in []]
oof_km_xgb,pred_km_xgb = xgb_model(train,test,'target1',xgb_feats1,seed=42)
# oof_na_xgb,pred_na_xgb = xgb_mdoel(train,test,'target2',feats)
# oof_cp_xgb,pred_cp_xgb = xgb_mdoel(train,test,'target3',feats)
 
# LightGBM
lgb_feats1 = [col for col in feats if col not in ['donor_age/age_at_hct','hla_match_dqb1_low']]
oof_km_lgb,pred_km_lgb,imp_lgb = lgb_model(train,test,'target1',lgb_feats1,seed=88)
# oof_na_lgb,pred_na_lgb = lgb_mdoel(train,test,'target2',feats)
# oof_cp_lgb,pred_cp_lgb = lgb_mdoel(train,test,'target3',feats)

# CatBoost
cat_feats1 = [col for col in feats if col not in ['donor_age/age_at_hct']]
# cat_cols = [col for col in cat_cols if col not in []]
oof_km_cat,pred_km_cat = cat_model(train,test,'target1',cat_feats1,seed=99)
# oof_na_cat,pred_na_cat = cat_mdoel(train,test,'target2',feats)
# oof_cp_cat,pred_cp_cat = cat_mdoel(train,test,'target3',feats)

# Target with Target3
# XGBoost with Survival:Cox
xgb_feats2 = [col for col in feats if col not in ['donor_age/age_at_hct']]
# oof_km_xgb_cox,pred_km_xgb_cox = xgb_cox_mdoel(train,test,'target1',feats)
# oof_na_xgb_cox,pred_na_xgb_cox = xgb_cox_mdoel(train,test,'target2',feats)
oof_cp_xgb_cox,pred_cp_xgb_cox = xgb_cox_model(train,test,'target3',xgb_feats2,seed=38)

# Target with Target4
# XGBoost with Survival:Cox
xgb_feats3 = [col for col in feats if col not in ['donor_age/age_at_hct']]
oof_cc_xgb_cox,pred_cc_xgb_cox = xgb_cox_model(train,test,'target4',xgb_feats3,seed=42)

### Fold 1
### Fold 2
### Fold 3
### Fold 4
### Fold 5
### Fold 6
### Fold 7
### Fold 8
### Fold 9
### Fold 10

Overall CV for XGBoost with target1 = 0.6671858423023872
### Fold 1
### Fold 2
### Fold 3
### Fold 4
### Fold 5

Overall CV for LightGBM with target1 = 0.672117865098533
### Fold 1
### Fold 2
### Fold 3
### Fold 4
### Fold 5

Overall CV for CatBoost with target1 = 0.6727972812019748
### Fold 1
### Fold 2
### Fold 3
### Fold 4
### Fold 5

Overall CV for XGBoost Survival:Cox with target3 = 0.31263431252234086
### Fold 1
### Fold 2
### Fold 3
### Fold 4
### Fold 5

Overall CV for XGBoost Survival:Cox with target4 = 0.6752528423193033
CPU times: user 22min 38s, sys: 53.4 s, total: 23min 31s
Wall time: 8min 11s


In [53]:
bin_pred_np = np.array(bin_pred)

# Target with Target1
# xgb
oof_km_xgb_np = np.array(oof_km_xgb)
oof_km_xgb_np = np.column_stack((bin_pred_np, oof_km_xgb_np))
oof_km_xgb_np[oof_km_xgb_np[:, 0] == 1, 1] += 0.15
print('Classifier+XGB with Target1:',c_score(train, oof_km_xgb_np[:, 1]))

# lgb
oof_km_lgb_np = np.array(oof_km_lgb)
oof_km_lgb_np = np.column_stack((bin_pred_np, oof_km_lgb_np))
oof_km_lgb_np[oof_km_lgb_np[:, 0] == 1, 1] += 0.15
print('Classifier+LGB with Target1:',c_score(train, oof_km_lgb_np[:, 1]))

# cat
oof_km_cat_np = np.array(oof_km_cat)
oof_km_cat_np = np.column_stack((bin_pred_np, oof_km_cat_np))
oof_km_cat_np[oof_km_cat_np[:, 0] == 1, 1] += 0.10
print('Classifier+CAT with Target1:',c_score(train, oof_km_cat_np[:, 1]))

# Target with Target3
# xgb_cox
oof_cp_xgb_cox_np = np.array(oof_cp_xgb_cox)
oof_cp_xgb_cox_np = np.column_stack((bin_pred_np, oof_cp_xgb_cox_np))
oof_cp_xgb_cox_np[oof_cp_xgb_cox_np[:, 0] == 1, 1] += -5
print('Classifier+XGB_COX with Target3:',c_score(train, -oof_cp_xgb_cox_np[:, 1]))

# Target with Target4
# xgb_cox
oof_cc_xgb_cox_np = np.array(oof_cc_xgb_cox)
oof_cc_xgb_cox_np = np.column_stack((bin_pred_np, oof_cc_xgb_cox_np))
oof_cc_xgb_cox_np[oof_cc_xgb_cox_np[:, 0] == 1, 1] += 0.10
print('Classifier+XGB_COX with Target4:',c_score(train, oof_cc_xgb_cox_np[:, 1]))

# Ensemble
from scipy.stats import rankdata 
weights_tree1 = [8.6, 0.25 , 8.9, 3.1, 4.1]
oof_preds_np1 = np.array([rankdata(oof_km_xgb_np[:, 1]),rankdata(oof_km_lgb_np[:, 1]), 
                          rankdata(oof_km_cat_np[:, 1]),rankdata(oof_cc_xgb_cox_np[:, 1]),
                          rankdata(-oof_cp_xgb_cox_np[:, 1])])
oof_tree_ensemble1 = np.dot(weights_tree1,oof_preds_np1)
print('Ensemble :',c_score(train,oof_tree_ensemble1))

Classifier+XGB with Target1: 0.6821489070633318
Classifier+LGB with Target1: 0.6812527150949776
Classifier+CAT with Target1: 0.6836237377827695
Classifier+XGB_COX with Target3: 0.6748935714172661
Classifier+XGB_COX with Target4: 0.6755431037596067
Ensemble : 0.6863671516597791


In [54]:
############################ submit 1 ############################

# prediction = 9.5*pred_xgb + 4.5*pred_cat + 0.2*pred_lgb

pred_classifier_np = np.array(pred_efs)
# Target with Target1
# xgb
prediction_km_xgb = np.array(pred_km_xgb)
prediction_km_xgb = np.column_stack((pred_classifier_np, prediction_km_xgb))
prediction_km_xgb[prediction_km_xgb[:, 0] == 1, 1] += 0.15

# lgb
prediction_km_lgb = np.array(pred_km_lgb)
prediction_km_lgb = np.column_stack((pred_classifier_np, prediction_km_lgb))
prediction_km_lgb[prediction_km_lgb[:, 0] == 1, 1] += 0.15

# cat
prediction_km_cat = np.array(pred_km_cat)
prediction_km_cat = np.column_stack((pred_classifier_np, prediction_km_cat))
prediction_km_cat[prediction_km_cat[:, 0] == 1, 1] += 0.1
# Target with Target4
# xgb_cox
prediction_cc_xgb_cox = np.array(pred_cc_xgb_cox)
prediction_cc_xgb_cox = np.column_stack((pred_classifier_np, prediction_cc_xgb_cox))
prediction_cc_xgb_cox[prediction_cc_xgb_cox[:, 0] == 1, 1] += 0.1

# Target with Target3
# xgb_cox
prediction_cp_xgb_cox = np.array(pred_cp_xgb_cox)
prediction_cp_xgb_cox = np.column_stack((pred_classifier_np, prediction_cp_xgb_cox))
prediction_cp_xgb_cox[prediction_cp_xgb_cox[:, 0] == 1, 1] += -5

prediction_np1 = np.array([rankdata(prediction_km_xgb[:, 1]),rankdata(prediction_km_lgb[:, 1]),
                           rankdata(prediction_km_cat[:, 1]),rankdata(prediction_cc_xgb_cox[:, 1]),
                           rankdata(-prediction_cp_xgb_cox[:, 1])])
tree_ensemble1 = np.dot(weights_tree1,prediction_np1)

## XGB+LGB+CAT With Feature2

In [55]:
import numpy as np, pandas as pd
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

train = pd.read_csv("/kaggle/input/equity-post-HCT-survival-predictions/train.csv")
print("Train shape:",train.shape)

test = pd.read_csv("/kaggle/input/equity-post-HCT-survival-predictions/test.csv")
print("Test shape:", test.shape )
# 目标构造
train["target1"] = create_target1(train,time_col='efs_time', event_col='efs')
train["target4"] = create_target4(train,time_col='efs_time', event_col='efs')

# 合并
df_all = pd.concat([train,test],axis=0,ignore_index=True)

def add_features(df):
 
    sex_match = df.sex_match.astype(str)
    df['sex_match_bool'] = sex_match.str.split("-").str[0] == sex_match.str.split("-").str[1]
    df['sex_match_bool'] = df['sex_match_bool'].astype(int)
    df.loc[df.sex_match.isna(), 'sex_match_bool'] = np.nan
    df['big_age'] = df.age_at_hct > 16
    df['big_age'] = df['big_age'].astype(int)
    df['is_cyto_score_same'] = (df['cyto_score'] == df['cyto_score_detail']).astype(int)
    df['strange_age'] = df.age_at_hct == 0.044
    df['age_bin'] = pd.cut(df.age_at_hct, [0, 0.0441, 16, 30, 50, 100]).factorize()[0]
    df['age_ts'] = df.age_at_hct / df.donor_age+1e-10
    df['is_cyto_score_same'] = (df['cyto_score'] == df['cyto_score_detail']).astype(int)
    df['year_hct'] -= 2000
    # df['age_difference'] = abs(df['donor_age'] - df['age_at_hct'])

    print("< deal with outlier >")
    return df
df_all = add_features(df_all)

Train shape: (28800, 60)
Test shape: (3, 58)
< deal with outlier >


In [56]:
drop_feats = ["ID","efs","efs_time","target1",'target2','target3','target4','target5']
feats = [c for c in df_all.columns if not c in drop_feats]

# 编码
df_all,cat_cols,num_cols = Encoding_Processing(df_all,feats)
        
#分离
df_all = df_all.replace([-np.inf,np.inf],0)
train = df_all.iloc[:len(train)].copy()
test = df_all.iloc[len(train):].reset_index(drop=True).copy()

feats = [col for col in df_all if col not in drop_feats]                                                                                                                                                                                      
print('feats_num:',len(feats))

feats_num: 63


In [57]:
%%time
# 后处理分类器二
xgb_bin_feats2 = [col for col in feats if col not in []]
bin_pred2,pred_efs2 = xgb_classifier(train,test,'efs',xgb_bin_feats2,seed=3047)

### Fold 1
### Fold 2
### Fold 3
### Fold 4
### Fold 5
Accuracy: 0.6856
F1 Score: 0.7455
ROC AUC Score: 0.6713
CPU times: user 13.9 s, sys: 172 ms, total: 14 s
Wall time: 13.2 s


In [58]:
%%time
# XGBoost 
xgb_feats3 = [col for col in feats if col not in []]
oof_km_xgb2,pred_km_xgb2 = xgb_model(train,test,'target1',xgb_feats3,seed=42)

# LightGBM 
lgb_feats2 = [col for col in feats if col not in []]
oof_km_lgb2,pred_km_lgb2,imp_lgb2 = lgb_model(train,test,'target1',lgb_feats2,seed=42)

# CatBoost 
cat_feats2 = [col for col in feats if col not in []]
# cat_cols = [col for col in cat_cols if col not in []]
oof_km_cat2,pred_km_cat2 = cat_model(train,test,'target1',cat_feats2,seed=42)

# XGBoost with Survival:Cox
# Target4
xgb_feats5 = [col for col in feats if col not in []]
oof_cc_xgb_cox3,pred_cc_xgb_cox3 = xgb_cox_model(train,test,'target4',xgb_feats5,seed=42)
# # Target3
# oof_cp_xgb_cox3,pred_cp_xgb_cox3 = xgb_cox_mdoel(train,test,'target3',feats,seed=42)

### Fold 1
### Fold 2
### Fold 3
### Fold 4
### Fold 5
### Fold 6
### Fold 7
### Fold 8
### Fold 9
### Fold 10

Overall CV for XGBoost with target1 = 0.6688326897635961
### Fold 1
### Fold 2
### Fold 3
### Fold 4
### Fold 5

Overall CV for LightGBM with target1 = 0.6713649500486902
### Fold 1
### Fold 2
### Fold 3
### Fold 4
### Fold 5

Overall CV for CatBoost with target1 = 0.6716839606407026
### Fold 1
### Fold 2
### Fold 3
### Fold 4
### Fold 5

Overall CV for XGBoost Survival:Cox with target4 = 0.675923790009344
CPU times: user 21min 3s, sys: 53.5 s, total: 21min 57s
Wall time: 7min 32s


## XGB_COX With Feature3

In [59]:
train = pd.read_csv("/kaggle/input/equity-post-HCT-survival-predictions/train.csv")
test = pd.read_csv("/kaggle/input/equity-post-HCT-survival-predictions/test.csv")

# 目标构造
train["target1"] = create_target1(train,time_col='efs_time', event_col='efs')
train["target4"] = create_target4(train,time_col='efs_time', event_col='efs')

# 合并
df_all = pd.concat([train,test],axis=0,ignore_index=True)

def add_features(df):
 
    sex_match = df.sex_match.astype(str)
    df['sex_match_bool'] = sex_match.str.split("-").str[0] == sex_match.str.split("-").str[1]
    df['sex_match_bool'] = df['sex_match_bool'].astype(int)
    df.loc[df.sex_match.isna(), 'sex_match_bool'] = np.nan
    df['big_age'] = df.age_at_hct > 16
    df['big_age'] = df['big_age'].astype(int)
    df['is_cyto_score_same'] = (df['cyto_score'] == df['cyto_score_detail']).astype(int)
    df['strange_age'] = df.age_at_hct == 0.044
    df['age_bin'] = pd.cut(df.age_at_hct, [0, 0.0441, 16, 30, 50, 100]).factorize()[0]
    df['age_ts'] = df.age_at_hct / df.donor_age+1e-10
    df['is_cyto_score_same'] = (df['cyto_score'] == df['cyto_score_detail']).astype(int)
    df['year_hct'] -= 2000
    # df['age_difference'] = abs(df['donor_age'] - df['age_at_hct'])
    print("< deal with outlier >")
    df['nan_value_each_row'] = df.isnull().sum(axis=1)
    #year_hct=2020 only 4 rows.
    df['year_hct']=df['year_hct'].replace(2020,2019)
    df['age_group']=df['age_at_hct']//10
    #karnofsky_score 40 only 10 rows.
    df['karnofsky_score']=df['karnofsky_score'].replace(40,50)
    #hla_high_res_8=2 only 2 rows.
    df['hla_high_res_8']=df['hla_high_res_8'].replace(2,3)
    #hla_high_res_6=0 only 1 row.
    df['hla_high_res_6']=df['hla_high_res_6'].replace(0,2)
    #hla_high_res_10=3 only 1 row.
    df['hla_high_res_10']=df['hla_high_res_10'].replace(3,4)
    #hla_low_res_8=2 only 1 row.
    df['hla_low_res_8']=df['hla_low_res_8'].replace(2,3)
    df['dri_score']=df['dri_score'].replace('Missing disease status','N/A - disease not classifiable')
    df['dri_score_NA']=df['dri_score'].apply(lambda x:int('N/A' in str(x)))
    for col in ['diabetes','pulm_moderate','cardiac']:
        df.loc[df[col].isna(),col]='Not done'

    print("< cross feature >")
    df['donor_age-age_at_hct']=df['donor_age']-df['age_at_hct']
    df['comorbidity_score+karnofsky_score']=df['comorbidity_score']+df['karnofsky_score']
    df['comorbidity_score-karnofsky_score']=df['comorbidity_score']-df['karnofsky_score']
    df['comorbidity_score*karnofsky_score']=df['comorbidity_score']*df['karnofsky_score']
    df['comorbidity_score/karnofsky_score']=df['comorbidity_score']/df['karnofsky_score']
    
    # print("< fillna >")
       
    # print("< drop useless columns >")
    return df

df_all = add_features(df_all)

< deal with outlier >
< cross feature >


In [60]:
drop_feats = ["ID","efs","efs_time","target1",'target2','target3','target4','target5']
feats = [c for c in df_all.columns if not c in drop_feats]

# 编码
df_all,cat_cols,num_cols = Encoding_Processing(df_all,feats)

# 分离
df_all = df_all.replace([-np.inf,np.inf],0)
train = df_all.iloc[:len(train)].copy()
test = df_all.iloc[len(train):].reset_index(drop=True).copy()

feats = [col for col in df_all if col not in drop_feats]                                                                                                                                                                                      
print('feats_num:',len(feats))

feats_num: 71


In [61]:
%%time
# XGBoost with Survival:Cox
xgb_feats4 = [col for col in feats if col not in []]
oof_cc_xgb_cox2,pred_cc_xgb_cox2 = xgb_cox_model(train,test,'target4',xgb_feats4,seed=42)

### Fold 1
### Fold 2
### Fold 3
### Fold 4
### Fold 5

Overall CV for XGBoost Survival:Cox with target4 = 0.6746962320110944
CPU times: user 55.5 s, sys: 150 ms, total: 55.7 s
Wall time: 28.5 s


In [62]:
bin_pred_np2 = np.array(bin_pred2)

# Target with Target1
# xgb
oof_km_xgb2_np = np.array(oof_km_xgb2)
oof_km_xgb2_np = np.column_stack((bin_pred_np2, oof_km_xgb2_np))
oof_km_xgb2_np[oof_km_xgb2_np[:, 0] == 1, 1] += 0.15
print('Classifier+XGB with Target1:',c_score(train, oof_km_xgb2_np[:, 1]))

# lgb
oof_km_lgb2_np = np.array(oof_km_lgb2)
oof_km_lgb2_np = np.column_stack((bin_pred_np2, oof_km_lgb2_np))
oof_km_lgb2_np[oof_km_lgb2_np[:, 0] == 1, 1] += 0.15
print('Classifier+LGB with Target1:',c_score(train, oof_km_lgb2_np[:, 1]))

# cat
oof_km_cat2_np = np.array(oof_km_cat2)
oof_km_cat2_np = np.column_stack((bin_pred_np2, oof_km_cat2_np))
oof_km_cat2_np[oof_km_cat2_np[:, 0] == 1, 1] += 0.15
print('Classifier+CAT with Target1:',c_score(train, oof_km_cat2_np[:, 1]))

# Target with Target4
# xgb_cox2
oof_cc_xgb_cox2_np = np.array(oof_cc_xgb_cox2)
oof_cc_xgb_cox2_np = np.column_stack((bin_pred_np2, oof_cc_xgb_cox2_np))
oof_cc_xgb_cox2_np[oof_cc_xgb_cox2_np[:, 0] == 1, 1] += 0.15
print('Classifier+XGB_COX2 with Target4:',c_score(train, oof_cc_xgb_cox2_np[:, 1]))

# xgb_cox3
oof_cc_xgb_cox3_np = np.array(oof_cc_xgb_cox3)
oof_cc_xgb_cox3_np = np.column_stack((bin_pred_np2, oof_cc_xgb_cox3_np))
oof_cc_xgb_cox3_np[oof_cc_xgb_cox3_np[:, 0] == 1, 1] += 0.15
print('Classifier+XGB_COX3 with Target4:',c_score(train, oof_cc_xgb_cox3_np[:, 1]))

# # Target with Target3
# # xgb_cox3
# oof_cp_xgb_cox3_np = np.array(oof_cp_xgb_cox3)
# oof_cp_xgb_cox3_np = np.column_stack((bin_pred_np2, oof_cp_xgb_cox3_np))
# oof_cp_xgb_cox3_np[oof_cp_xgb_cox3_np[:, 0] == 1, 1] += -5
# print('Classifier+XGB_COX3 with Target3:',c_score(train, -oof_cp_xgb_cox3_np[:, 1]))

# Ensemble
from scipy.stats import rankdata 
weights_tree2 = [9.98, 0.01, 2.47, 0.25, 1.54]
oof_preds_np2 = np.array([rankdata(oof_km_xgb2_np[:, 1]),rankdata(oof_km_lgb2_np[:, 1]),
                          rankdata(oof_km_cat2_np[:, 1]),rankdata(oof_cc_xgb_cox2_np[:, 1]),
                          rankdata(oof_cc_xgb_cox3_np[:, 1])])
oof_tree_ensemble2 = np.dot(weights_tree2,oof_preds_np2)
print('Ensemble :',c_score(train,oof_tree_ensemble2))

Classifier+XGB with Target1: 0.6826835922779488
Classifier+LGB with Target1: 0.6822911975208037
Classifier+CAT with Target1: 0.6818758792272545
Classifier+XGB_COX2 with Target4: 0.6750770914232703
Classifier+XGB_COX3 with Target4: 0.6762702846074273
Ensemble : 0.6847136721248132


In [63]:
c_score(train,(oof_tree_ensemble1*0.47+oof_tree_ensemble2*0.53))

0.6874868806396811

In [64]:
############################ submit 2 ############################

pred_classifier_np2 = np.array(pred_efs2)
# Target with Target1
# xgb
prediction_km_xgb2 = np.array(pred_km_xgb2)
prediction_km_xgb2 = np.column_stack((pred_classifier_np2, prediction_km_xgb2))
prediction_km_xgb2[prediction_km_xgb2[:, 0] == 1, 1] += 0.15

# lgb
prediction_km_lgb2 = np.array(pred_km_lgb2)
prediction_km_lgb2 = np.column_stack((pred_classifier_np2, prediction_km_lgb2))
prediction_km_lgb2[prediction_km_lgb2[:, 0] == 1, 1] += 0.15

# cat
prediction_km_cat2 = np.array(pred_km_cat2)
prediction_km_cat2 = np.column_stack((pred_classifier_np2, prediction_km_cat2))
prediction_km_cat2[prediction_km_cat2[:, 0] == 1, 1] += 0.15

# Target with Target4
# xgb_cox
prediction_cc_xgb_cox2 = np.array(pred_cc_xgb_cox2)
prediction_cc_xgb_cox2 = np.column_stack((pred_classifier_np2, prediction_cc_xgb_cox2))
prediction_cc_xgb_cox2[prediction_cc_xgb_cox2[:, 0] == 1, 1] += 0.15

prediction_cc_xgb_cox3 = np.array(pred_cc_xgb_cox3)
prediction_cc_xgb_cox3 = np.column_stack((pred_classifier_np2, prediction_cc_xgb_cox3))
prediction_cc_xgb_cox3[prediction_cc_xgb_cox3[:, 0] == 1, 1] += 0.15

# Target with Target3
# # xgb_cox3
# prediction_cp_xgb_cox3 = np.array(pred_cp_xgb_cox3)
# prediction_cp_xgb_cox3 = np.column_stack((pred_classifier_np2, prediction_cp_xgb_cox3))
# prediction_cp_xgb_cox3[prediction_cp_xgb_cox3[:, 0] == 1, 1] += -5

prediction_np2 = np.array([rankdata(prediction_km_xgb2[:, 1]),rankdata(prediction_km_lgb2[:, 1]),
                           rankdata(prediction_km_cat2[:, 1]),rankdata(prediction_cc_xgb_cox2[:, 1]),
                           rankdata(prediction_cc_xgb_cox3[:, 1])])

tree_ensemble2 = np.dot(weights_tree2,prediction_np2)

## NN Post Process ,cross fusion:(low1+high1)*w1+(low1+high2)*w2

In [65]:
combined_pred_odst = np.column_stack((pred_classifier_np2,hinge_nn_odst.prediction.values))
combined_pred_odst[combined_pred_odst[:, 0] == 1, 1] += 0.2
my_odst = rankdata(combined_pred_odst[:,1])

combined_pred_open = np.column_stack((pred_classifier_np2,res.prediction.values))
combined_pred_open[combined_pred_open[:, 0] == 1, 1] += 0.2
nn_0690 = rankdata(combined_pred_open[:,1])


preds_test = [
    test_nn_multi[:,0],
    test_nn_multi[:,1],
    test_nn_multi_dnn[:,0],
    test_nn_multi_dnn[:,1],
]
combined_pred_multi0  = np.column_stack((pred_classifier_np2,preds_test[0]))
combined_pred_multi1  = np.column_stack((pred_classifier_np2,preds_test[1]))
combined_pred_multi2  = np.column_stack((pred_classifier_np2,preds_test[2]))
combined_pred_multi3  = np.column_stack((pred_classifier_np2,preds_test[3]))

combined_pred_multi0[combined_pred_multi0[:, 0] == 1, 1] += 0.4
combined_pred_multi1[combined_pred_multi1[:, 0] == 1, 1] += 0.25
combined_pred_multi2[combined_pred_multi2[:, 0] == 1, 1] += 0.45
combined_pred_multi3[combined_pred_multi3[:, 0] == 1, 1] += 0.25

preds_test[0] =combined_pred_multi0[:,1]
preds_test[1] =combined_pred_multi1[:,1]
preds_test[2] =combined_pred_multi2[:,1]
preds_test[3] =combined_pred_multi3[:,1]


weights = [0.55987332, 0.01202344, 0.2947752 , 0.13332805]
nn_ranked_preds = np.array([rankdata(p) for p in preds_test])
test_nn_ensemble_preds = np.dot(weights, nn_ranked_preds)

# cross fussion
final_ensemble_nn = (test_nn_ensemble_preds*0.15 + nn_0690 *0.85)*0.41+(test_nn_ensemble_preds*0.20 + my_odst*0.80)*0.59

In [66]:
#final ensembel
whole_ensemble = rankdata(final_ensemble_nn)*0.545+(rankdata(tree_ensemble1)*0.47+rankdata(tree_ensemble2)*0.53)*0.455

In [67]:
sub = pd.read_csv("/kaggle/input/equity-post-HCT-survival-predictions/sample_submission.csv")
sub.prediction = whole_ensemble
sub.to_csv("submission.csv",index=False)
print("Sub shape:",sub.shape)
sub.head()

Sub shape: (3, 2)


Unnamed: 0,ID,prediction
0,28800,2.0
1,28801,3.0
2,28802,1.0
