In [109]:
import numpy as np
import pandas as pd
import os
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import random
from sksurv.metrics import concordance_index_censored

## Model hyperparameters

In [110]:
# Number of discrete time nodes: the duration axis is split into this many
# points and the network emits one logit per point.
num_pred_nodes = 1_000

# Widths of the hidden fully-connected layers, in order from input to output.
hidden = [400, 500, 600, 800]

# Step size handed to the Adam optimizer.
learning_rate = 1e-5

# Presumably the number of training epochs -- the training loop is not in this section.
epoch = 100

## Data preprocessing

In [111]:
data_path = "./data/hvgs"
# os.listdir returns entries in arbitrary order, so sort them (and skip hidden
# entries such as .DS_Store / .ipynb_checkpoints) to always load the same file.
f_list = sorted(name for name in os.listdir(data_path) if not name.startswith("."))
if not f_list:
    raise FileNotFoundError(f"no data files found in {data_path!r}")

file = os.path.join(data_path, f_list[0])

df = pd.read_csv(file)
# Switch to torch.device('cuda:0') to run on a GPU.
device = torch.device('cpu')

# Columns 1-2 hold the labels (duration, event); everything from column 3 on is a feature.
x = df.values[:, 3:]
y = df.values[:, 1:3]

## One seed for the data splits and for every RNG used downstream
## (weight initialisation, DataLoader shuffling).
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

duration_max = df.duration.max()

## Split the time axis [0, duration_max) into num_pred_nodes evenly spaced points
duration_reference = [
    duration_max / num_pred_nodes * i for i in range(num_pred_nodes)
]

# 80/20 train/test split, then 80/20 train/valid split of the remaining train
# part; both are stratified on the event column to keep the positive ratio similar.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, random_state=seed, train_size=0.8, stratify=y[:, 1]
)
x_train, x_valid, y_train, y_valid = train_test_split(
    x_train, y_train, random_state=seed, train_size=0.8, stratify=y_train[:, 1]
)

for split_name, x_split, y_split in (
    ("train", x_train, y_train),
    ("valid", x_valid, y_valid),
    ("test", x_test, y_test),
):
    print(
        f"x {split_name} {x_split.shape}, "
        f"positive ratio {y_split[:,1].sum()/len(y_split):.3f}"
    )


x train (156, 246), positive ratio 0.058
x valid (40, 246), positive ratio 0.050
x test (50, 246), positive ratio 0.060


In [112]:
def label_modifier(y, reference=None):
    """
    Convert (duration, event) labels into per-time-node label and mask vectors.

    For every sample the label vector is 1 at each reference time point that is
    greater than or equal to the sample's duration and 0 elsewhere. The mask is
    all ones for samples whose event was observed (event != 0); for censored
    samples (event == 0) it is 1 only at the time points strictly before the
    duration, i.e. where the label is 0.

    eg1. Uncensored case
        [3, 1] (duration, event)
        reference [0,2,4,6,8,10]
        --> y_extend_lab: [0,0,1,1,1,1]
        --> y_mask: [1,1,1,1,1,1]
    eg2. Censored case
        [2, 0] (duration, event)
        --> y_extend_lab: [0,1,1,1,1,1]
        --> y_mask: [1,0,0,0,0,0]

    Args:
        y (list, np.array): Rows of label information; element 0 of each row is
            the duration and element 1 is the event indicator.
        reference (sequence of float, optional): Time points to evaluate the
            labels at. Defaults to the module-level ``duration_reference``.

    Returns:
        (y, y_extend_lab, y_mask): ``y`` is returned unchanged, ``y_extend_lab``
        is a float tensor and ``y_mask`` an integer tensor, both of shape
        (len(y), len(reference)).
    """
    if reference is None:
        reference = duration_reference
    # Build the time grid once instead of re-creating it for every sample.
    grid = np.asarray(reference, dtype=float)

    durations = np.array([row[0] for row in y], dtype=float)
    censored = np.array([row[1] == 0 for row in y], dtype=bool)

    # reached[i, t] is True when time point t is at or after sample i's duration.
    reached = grid[None, :] >= durations[:, None]
    # Observed events keep every time point; censored samples keep only the
    # points before the censoring time.
    keep = ~censored[:, None] | ~reached

    y_extend_lab = torch.tensor(reached.astype(float))
    y_mask = torch.tensor(keep.astype(int))
    return y, y_extend_lab, y_mask

In [113]:
# Expand each split's (duration, event) labels into the
# (y, y_extend_lab, y_mask) triple returned by label_modifier.
y_train_mask, y_valid_mask, y_test_mask = (
    label_modifier(split_labels) for split_labels in (y_train, y_valid, y_test)
)

## Data loader setting

In [114]:
class data_process(Dataset):
    """Dataset serving standardised features together with their labels.

    Each item is the tuple ``(x, y, y_mask, y_orig)``.

    Args:
        x: NumPy array of features; cast to float32.
        y: Triple ``(y_orig, y, y_mask)`` -- a NumPy array (cast to int) and
            two tensors.
        device: Device every stored tensor is moved to.
        x_mean, x_std: Standardisation statistics to apply. When ``x_mean`` is
            None both statistics are computed from ``x`` along dimension 0.
    """
    def __init__(self, x, y, device="cpu", x_mean=None, x_std=None) -> None:
        super().__init__()
        features = torch.tensor(x.astype(np.float32))

        raw_labels, label_vec, label_mask = y[0], y[1], y[2]
        self.y_orig = torch.tensor(raw_labels.astype(int)).to(device)
        self.y = label_vec.to(device)
        self.y_mask = label_mask.to(device)

        # Fit the statistics on this data only when none were supplied.
        fit_here = x_mean is None
        self.mean = features.mean(0) if fit_here else x_mean
        self.std = features.std(0) if fit_here else x_std

        # The small epsilon avoids division by zero for constant columns.
        scaled = (features - self.mean) / (self.std + 1e-6)
        self.x = scaled.to(device)

    def return_mean_std(self):
        """Return the (mean, std) pair used to standardise the features."""
        return self.mean, self.std

    def __len__(self):
        """Number of samples."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return ``(x, y, y_mask, y_orig)`` for the sample(s) at ``idx``."""
        return self.x[idx], self.y[idx], self.y_mask[idx], self.y_orig[idx]

In [115]:
data_tr = data_process(x_train, y_train_mask)
mean, std = data_tr.return_mean_std()
data_val = data_process(x_valid, y_valid_mask, x_mean=mean, x_std=std)
data_test = data_process(x_test, y_test_mask, x_mean=mean, x_std=std)
dl_train = DataLoader(data_tr, batch_size=64, shuffle=True)
dl_valid = DataLoader(data_val, batch_size=64)
dl_test = DataLoader(data_test, batch_size=64)

## Model

In [116]:
class model(nn.Module):
    """Fully-connected network: Linear layers with ReLU between them.

    Args:
        inp_size: Number of input features.
        num_pred_tgt: Number of outputs (one raw logit each; no final activation).
        hidden: List of hidden layer widths, in order.
    """
    def __init__(self, inp_size, num_pred_tgt, hidden=[400,500,600,800]):
        super().__init__()
        widths = [inp_size] + hidden + [num_pred_tgt]
        # One Linear per consecutive pair of widths.
        layers = [
            nn.Linear(n_in, n_out) for n_in, n_out in zip(widths[:-1], widths[1:])
        ]
        self.linears = nn.ModuleList(layers)
        self.n_linears = len(self.linears)

    def forward(self, x):
        """Apply every layer in order; ReLU follows all but the last one."""
        *body, head = self.linears
        for layer in body:
            x = F.relu(layer(x))
        return head(x)

In [117]:
# Unreduced (element-wise) BCE so the loss of every time node can be
# multiplied by the label mask before averaging.
loss_fn = nn.BCEWithLogitsLoss(reduction="none")

# One input per feature column, one output logit per time node.
m = model(inp_size=x.shape[1], num_pred_tgt=num_pred_nodes, hidden=hidden)
m = m.to(device)

optimizer = torch.optim.Adam(m.parameters(), lr=learning_rate)

## Performance metric: concordance index (C-index)

In [118]:
def c_index_calc(logit, true_y):
    """Compute the concordance index of the predicted risk against the labels.

    For every sample the first time node whose logit is positive (sigmoid
    above 0.5) is located, and the risk score is
    ``1 / (node_index + sigmoid(logit[node_index]))`` so that an earlier
    crossing yields a higher risk. A sample without any positive logit gets
    the score ``1 / (num_nodes + 1)``, which is lower than every score a
    sample with a crossing can get.

    Args:
        logit: Tensor of shape (n_samples, n_nodes) holding raw model outputs.
        true_y: Tensor or array of shape (n_samples, 2) whose column 0 is the
            duration and column 1 the event indicator (1 = event observed).

    Returns:
        Whatever ``concordance_index_censored`` returns for these inputs.
    """
    logit = logit.detach()
    n_nodes = logit.shape[1]

    pred = []
    for row in logit:
        crossed = torch.nonzero(row > 0)
        if crossed.numel() == 0:
            # No node predicts the event: previously .min() on the empty index
            # tensor raised here. Rank this sample as the lowest risk instead.
            pred.append(1 / (n_nodes + 1))
            continue
        min_idx = crossed.min().item()
        pred.append(1 / (min_idx + torch.sigmoid(row[min_idx]).item()))

    # Hand plain NumPy arrays to the metric so tensors living on another
    # device are handled as well.
    if isinstance(true_y, torch.Tensor):
        true_y = true_y.detach().cpu().numpy()
    else:
        true_y = np.asarray(true_y)
    event_indicator = true_y[:, 1] == 1
    event_time = true_y[:, 0]
    return concordance_index_censored(event_indicator, event_time, pred)

## Model training

In [119]:
def eval(model, data_loader, device):
    """Evaluate ``model`` on every batch of ``data_loader``.

    Note: this function shadows Python's built-in ``eval``; the name is kept
    so existing callers continue to work.

    Each batch is indexed as ``(x, y, y_mask, y_orig)``. The loss of a batch is
    the element-wise ``loss_fn`` output multiplied by the mask and averaged
    over all elements (masked-out positions count as zeros in that mean).

    Args:
        model: Network to evaluate; it is switched to evaluation mode.
        data_loader: Iterable of batches.
        device: Device the batch tensors are moved to before use.

    Returns:
        (mean_loss, c_idx): the batch-averaged loss as a tensor and the result
        of ``c_index_calc`` on the logits and original labels of all batches.

    Raises:
        ValueError: If ``data_loader`` yields no batches.
    """
    model.eval()
    loss_tot = 0
    logits, labels = [], []
    # Evaluation needs no gradients; without no_grad every concatenated logit
    # would keep its autograd graph alive.
    with torch.no_grad():
        for batch in data_loader:
            logit = model(batch[0].to(device))
            masked = loss_fn(logit, batch[1].to(device)) * batch[2].to(device)
            loss_tot += masked.mean()
            logits.append(logit)
            labels.append(batch[3])

    n_batches = len(logits)
    if n_batches == 0:
        raise ValueError("data_loader yielded no batches")

    c_idx = c_index_calc(torch.cat(logits, 0), torch.cat(labels, 0))
    return loss_tot / n_batches, c_idx