In [16]:
#import necessary libraries
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn_pandas import DataFrameMapper
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
from pycox.models import CoxPH, DeepHit

from pycox.evaluation import EvalSurv
from pycox.simulations import SimStudyLinearPH

import torch
import torchtuples as tt

In [17]:
import shap
import xgboost as xgb
from sklearn.model_selection import train_test_split
import matplotlib.pylab as pl

## data

In [18]:
url = 'https://raw.githubusercontent.com/camicallierotti/imperial-summer-project/main/pbc.csv'
df = pd.read_csv(url, sep=";", encoding='latin1',engine='python', header=0, decimal=',')
df = pd.DataFrame(df)

In [19]:
df.head()

Unnamed: 0,id,time,status,trt,age,sex,ascites,hepato,spiders,edema,bili,chol,albumin,copper,alk.phos,ast,trig,platelet,protime,stage
0,1,400,2,D-penicillmain,58.765229,female,yes,yes,yes,edema,14.5,261.0,2.6,156.0,1718.0,137.95,172.0,190.0,12.2,IV
1,2,4500,0,D-penicillmain,56.44627,female,no,yes,yes,no,1.1,302.0,4.14,54.0,7394.8,113.52,88.0,221.0,10.6,III
2,3,1012,2,D-penicillmain,70.072553,male,no,no,no,untreated,1.4,176.0,3.48,210.0,516.0,96.1,55.0,151.0,12.0,IV
3,4,1925,2,D-penicillmain,54.740589,female,no,yes,yes,untreated,1.8,244.0,2.54,64.0,6121.8,60.63,92.0,183.0,10.3,IV
4,5,1504,1,Placebo,38.105407,female,no,yes,yes,no,3.4,279.0,3.53,143.0,671.0,113.15,72.0,136.0,10.9,III


In [30]:
# to label encoding categorical variables
le = LabelEncoder() #label encoder
cat_list = ['trt', 'age', 'sex', 'ascites', 'hepato', 'spiders', 'edema','stage'] #list of categorical variables
for cat in cat_list:
    df[cat] = le.fit_transform(df[cat])
df['sex'] = le.fit_transform(df['sex'])

In [31]:
df.head()

Unnamed: 0,id,time,status,trt,age,sex,ascites,hepato,spiders,edema,bili,chol,albumin,copper,alk.phos,ast,trig,platelet,protime,stage
0,1,400,2,0,219,0,1,1,1,0,14.5,261.0,2.6,156.0,1718.0,137.95,172.0,190.0,12.2,3
1,2,4500,0,0,205,0,0,1,1,1,1.1,302.0,4.14,54.0,7394.8,113.52,88.0,221.0,10.6,2
2,3,1012,2,0,266,1,0,0,0,2,1.4,176.0,3.48,210.0,516.0,96.1,55.0,151.0,12.0,3
3,4,1925,2,0,184,0,0,1,1,2,1.8,244.0,2.54,64.0,6121.8,60.63,92.0,183.0,10.3,3
4,5,1504,1,1,42,0,0,1,1,1,3.4,279.0,3.53,143.0,671.0,113.15,72.0,136.0,10.9,2


In [32]:
# summary of the data set
df.isnull().sum() # number of missing values

id          0
time        0
status      0
trt         0
age         0
sex         0
ascites     0
hepato      0
spiders     0
edema       0
bili        0
chol        0
albumin     0
copper      0
alk.phos    0
ast         0
trig        0
platelet    0
protime     0
stage       0
dtype: int64

In [33]:
# to check if there are missing values in the variables 
print(f'total numbers of missing values :{df.isnull().sum().max()} and total rows in dataframe {len(df)}')

total numbers of missing values :0 and total rows in dataframe 276


In [69]:
# to drop missing value from the dataframe
df_new = df
df_new.dropna(how='any', inplace= True)

In [70]:
len(df_new)

276

In [71]:
# descriptive of the data i
df_new.describe()

Unnamed: 0,id,time,status,trt,age,sex,ascites,hepato,spiders,edema,bili,chol,albumin,copper,alk.phos,ast,trig,platelet,protime,stage
count,276.0,276.0,276.0,276.0,276.0,276.0,276.0,276.0,276.0,276.0,276.0,276.0,276.0,276.0,276.0,276.0,276.0,276.0,276.0,276.0
mean,158.615942,1979.166667,0.869565,0.507246,137.394928,0.123188,0.068841,0.514493,0.289855,1.028986,3.333696,371.26087,3.516812,100.768116,1996.611594,124.119239,124.978261,261.771739,10.735507,2.039855
std,91.662926,1112.380295,0.959743,0.500856,79.655804,0.32925,0.253643,0.500698,0.454519,0.389723,4.601074,234.788363,0.404789,88.268746,2115.477894,56.719952,65.280761,93.128857,1.008315,0.853999
min,1.0,41.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.3,120.0,1.96,4.0,289.0,28.38,33.0,62.0,9.0,0.0
25%,79.75,1185.75,0.0,0.0,68.75,0.0,0.0,0.0,0.0,1.0,0.8,249.5,3.31,42.75,922.5,82.4575,85.0,200.0,10.0,1.0
50%,157.5,1788.0,0.0,1.0,137.5,0.0,0.0,1.0,0.0,1.0,1.4,310.0,3.545,74.0,1277.5,116.625,108.0,257.0,10.6,2.0
75%,240.25,2689.75,2.0,1.0,206.25,0.0,0.0,1.0,1.0,1.0,3.525,401.0,3.7725,129.25,2068.25,153.45,151.25,318.25,11.2,3.0
max,312.0,4556.0,2.0,1.0,274.0,1.0,1.0,1.0,1.0,2.0,28.0,1775.0,4.4,588.0,13862.4,457.25,598.0,563.0,17.1,3.0


In [72]:
data_train = df_new #train data 
data_test = df_new.sample(frac= 0.2) #test data
data_val = df_new.sample (frac= 0.2) #val data

In [73]:
# function to get targets variable
Y = lambda df_new: (df_new['time'].values, df_new['status'].values)

# to standardize and get dependent variables 
x_num = ['age','bili', 'chol', 'albumin', 'copper', 'alk.phos',
       'ast', 'trig', 'platelet', 'protime'] # list of variables for standardization
x_cat = ['trt','sex', 'ascites', 'hepato',
       'spiders', 'edema', 'stage'] #categorical variables 
standardize = [([num], StandardScaler()) for num in x_num]
label = [([cat], None) for cat in x_cat]

X_map = DataFrameMapper(standardize + label)

In [74]:
x_train = X_map.fit_transform(data_train).astype('float32') # x train
x_test= X_map.fit_transform(data_test).astype('float32') # x test
x_val = X_map.fit_transform(data_val).astype('float32') # x va


In [75]:
y_train = Y(data_train) # y train
time_test , status_test = Y(data_test) # test dependent variables Time & Status
y_val = Y(data_val) # y val
val = x_val, y_val # validation data for x and y 

In [80]:
y_train

(array([ 400, 4500, 1012, 1925, 1504, 1832, 2466, 2400,   51, 3762,  304,
        3577, 3584, 3672,  769,  131, 4232, 1356, 3445,  673,  264, 4079,
        4127, 1444,   77,  549, 4509,  321, 3839, 4523, 3170, 3933, 2847,
        3611,  223, 3244, 2297, 4556, 3428, 2256, 2576, 4427, 2598, 3853,
        2386, 1434, 1360, 1847, 3282, 2224, 4365, 4256, 3090,  859, 1487,
        3992, 4191, 2769, 4039, 1170, 4196, 4184, 4190, 1827, 1191,   71,
         326, 1690, 3707,  890, 2540, 3574, 4050, 4032, 3358, 1657,  198,
        2452, 1741, 2689,  460,  388, 3913,  750,  611, 3823, 3820,  552,
        3581, 3099,  110, 3086, 3092, 3388, 2583, 2504, 2105, 2350, 3445,
         980, 3395, 3422, 3336, 1083, 2288,  515, 2033,  191, 3297, 3069,
        2468, 3255, 1413,  850, 2944, 2796, 3149, 3150, 3098, 2990, 1297,
        2106, 3059, 3050, 2419,  786,  943, 2976, 2995, 1427,  762, 2870,
        1152, 2863,  140, 2666,  853, 2835, 2475, 1536, 2772, 2797,  186,
        2055, 1077, 2721, 1682, 1212, 

In [81]:
y_train_structured = y_train.to_records(index=False)

AttributeError: 'tuple' object has no attribute 'to_records'

## xgboost

In [76]:
# create xgboost dmatrix
xgb_train = xgb.DMatrix(x_train, label=y_train)
xgb_test = xgb.DMatrix(x_test, label=y_test)

In [77]:
xgb_train

<xgboost.core.DMatrix at 0x7fa21adc77c0>

In [78]:
xgb_test

<xgboost.core.DMatrix at 0x7fa21a823b50>

In [79]:
# use validation set to choose # of trees
params = {
    "eta": 0.002,
    "max_depth": 3,
    "objective": "survival:cox",
    "subsample": 0.5
}
model_train = xgb.train(params, xgb_train, 10000, evals = [(xgb_test, "test")], verbose_eval=1000)

XGBoostError: [12:43:29] /opt/concourse/worker/volumes/live/7a2b9f41-3287-451b-6691-43e9a6c0910f/volume/xgboost-split_1619728204606/work/src/data/data.cc:556: Check failed: labels_.Size() == num_row_ (2 vs. 276) : Size of labels must equal to number of rows.
Stack trace:
  [bt] (0) 1   libxgboost.dylib                    0x000000015af5a0fe dmlc::LogMessageFatal::~LogMessageFatal() + 110
  [bt] (1) 2   libxgboost.dylib                    0x000000015afad55c xgboost::MetaInfo::Validate(int) const + 1164
  [bt] (2) 3   libxgboost.dylib                    0x000000015b0140ca xgboost::LearnerImpl::ValidateDMatrix(xgboost::DMatrix*, bool) const + 58
  [bt] (3) 4   libxgboost.dylib                    0x000000015b003c01 xgboost::LearnerImpl::UpdateOneIter(int, std::__1::shared_ptr<xgboost::DMatrix>) + 273
  [bt] (4) 5   libxgboost.dylib                    0x000000015af5009c XGBoosterUpdateOneIter + 156
  [bt] (5) 6   libffi.7.dylib                      0x00000001101f8ead ffi_call_unix64 + 85
  [bt] (6) 7   ???                                 0x00007ffee05e69f0 0x0 + 140732662704624



In [None]:
def c_statistic_harrell(pred, labels):
    total = 0
    matches = 0
    for i in range(len(labels)):
        for j in range(len(labels)):
            if labels[j] > 0 and abs(labels[i]) > labels[j]:
                total += 1
                if pred[j] > pred[i]:
                    matches += 1
    return matches/total

# see how well we can order people by survival
c_statistic_harrell(model_train.predict(xgb_test, ntree_limit=5000), y_test)