# TensorCox
- Example script showing how to apply TensorCox to a CSV file.

## Load modules:

- Core dependency: 
    numpy, torch

In [1]:
# import the necessary modules
import sys
import os
import torch
import tqdm

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 

from torch.utils.data import DataLoader
from sklearn.model_selection import KFold #

# Root folder of the project checkout. It becomes the working directory, so
# relative paths used later (e.g. 'data/heart.csv') are resolved against it.
# NOTE(review): this is a hard-coded, machine-specific absolute path - adjust it locally.
dir_root = '/nfs/nobackup/gerstung/awj/projects/TensorCox_/'
os.chdir(dir_root)

# Put the 'TensorCox/' sub-folder on the module search path so the project
# imports below (TensorCox, metrics, dataloader) can be found.
sys.path.append(dir_root + 'TensorCox/')

# import COX model
from TensorCox import loglikelihood
from TensorCox import Fisher
from metrics import concordance
from metrics import RMSE
from dataloader import CSV_Dataset
from dataloader import ToTensor
from dataloader import custom_collate

# Fix both random number generators: torch's is used for the theta
# initialisation (torch.normal), NumPy's for np.random.choice.
torch.manual_seed(7)
np.random.seed(7)

## Data:

- Usually this is not how you would process your data (but since this is just a small CSV file, it is loaded into memory in one go).
- The standard approach is to use a dataloader object (it handles preprocessing and distributes the data loading over processes).
- See https://pytorch.org/docs/stable/data.html
- The data need to be standardized (otherwise this can lead to numerical instability - NaN or Inf).

- usually we have a surv file (numpy array Nx3 - start, stop, event) - this is to handle time-dependent covariates. 

In [2]:
# Read the semicolon-separated heart data set.
heart = pd.read_csv('data/heart.csv', sep=';')

# Columns holding the (start, stop, event) triplet and the covariates.
surv_columns = ['start', 'stop', 'event']
covariate_columns = ['age', 'year', 'surgery', 'transplant']

# Extract both parts as NumPy arrays and wrap them as torch tensors.
# NOTE(review): covariates are used on their original scale; the
# standardization step below is left disabled - confirm this is intended.
#X = (X - np.mean(X, axis=0)) / np.std(X, axis=0)
surv = torch.from_numpy(np.asarray(heart[surv_columns]))
X = torch.from_numpy(np.asarray(heart[covariate_columns]))

# Display the data frame as the cell output.
heart

Unnamed: 0,start,stop,event,age,year,surgery,transplant
0,0.0,50.0,1,-17.155373,0.123203,0,0
1,0.0,6.0,1,3.835729,0.254620,0,0
2,0.0,1.0,0,6.297057,0.265572,0,0
3,1.0,16.0,1,6.297057,0.265572,0,1
4,0.0,36.0,0,-7.737166,0.490075,0,0
...,...,...,...,...,...,...,...
167,0.0,38.0,0,-12.939083,6.395619,1,0
168,38.0,39.0,0,-12.939083,6.395619,1,1
169,0.0,31.0,0,1.516769,6.417522,0,0
170,0.0,11.0,0,-7.608487,6.472279,0,0


## Optimizer:

In [3]:
# Optimizer setup: one coefficient per covariate column of X.
parameters = X.size(1)

# Coefficient vector (parameters x 1), drawn from N(0, 0.01) in double
# precision and tracked by autograd.
theta = torch.normal(
    mean=0.0,
    std=0.01,
    size=(parameters, 1),
    dtype=torch.float64,
    requires_grad=True,
)

eta = 0.00
lr = 0.01

# Adam updates theta only.
optimizer = torch.optim.Adam([theta], lr=lr)

## Model fit:

In [6]:
# Fit theta with 1000 Adam steps.
# NOTE: no minibatch index is drawn - every step uses all rows of X and surv.
for _ in tqdm.tqdm(range(1000)):
    optimizer.zero_grad()
    # Linear predictor for every row: X @ theta.
    linpred = torch.mm(X, theta)
    # `logL` holds the *negated* value returned by loglikelihood(); this is
    # the quantity the optimizer minimises.
    logL = -loglikelihood(surv, linpred)
    logL.backward()
    optimizer.step()


100%|██████████| 1000/1000 [00:01<00:00, 516.26it/s]


In [7]:
# Print the fitted coefficients, one per line.
for row in theta.detach():
    print(row[0].numpy())

0.027152080764527135
-0.14611575000260033
-0.6358434755992775
-0.011895850963813822


## Concordance

In [8]:
# Evaluate concordance() on the survival array and the fitted linear predictor.
linpred_np = torch.mm(X, theta).detach().numpy()
concordance(surv.detach().numpy(), linpred_np)

0.6355388360256694

## Variance estimation

- Variance estimation is usually done as a separate pass through the data after the model has been fit. 
- It is important to pass over the data only once - it is useful to increase the batch size here. 

In [9]:
# Fisher information
# Split the rows into disjoint random folds of roughly 50 rows each to emulate
# batch sampling; the KFold test folds together cover every row exactly once.
# KFold needs at least two splits, hence the lower bound.
kf = KFold(n_splits=max(2, int(np.round(heart.shape[0]/50))), shuffle=True)

# Accumulator for the Fisher information, one row/column per covariate.
F = torch.zeros((X.shape[1], X.shape[1]), dtype=torch.float64)
with torch.no_grad():
    # theta is fixed during this pass, so the linear predictor is computed
    # once for all rows instead of once per fold.
    linpred = torch.mm(X, theta)
    for _, idx in kf.split(X):
        F += Fisher(surv[idx], X[idx], linpred[idx])

# Square root of the diagonal of the inverse Fisher information. The diagonal
# is extracted first so that negative off-diagonal entries of the inverse are
# never passed to sqrt (which would only produce NaNs).
print(torch.sqrt(torch.diagonal(torch.inverse(F))))

tensor([0.0142, 0.0729, 0.3811, 0.3260], dtype=torch.float64)


# R - Code
- code to fit the same model in R
- to compare model estimates 

# Start from a clean workspace and load the required packages.
rm(list=ls())
library(survival)
library(stargazer)

# Heart data set (made available by the survival package loaded above).
data("heart")
heart

# Keep only rows whose numeric columns are all finite. The filter must be
# computed on `heart` itself so that the row mask matches its rows.
num_cols <- sapply(heart, is.numeric)
heart <- heart[is.finite(rowSums(heart[, num_cols, drop=FALSE])),]

names(heart)
# Cox model on (start, stop, event) with the same four covariates as the
# Python fit above.
m1 = coxph(Surv(start, stop, event) ~age  + year + surgery + transplant, data=heart)
summary(m1)