# DeepSurv

In [86]:
# import libraries

import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder

In [90]:
# import dataset

url = 'https://raw.githubusercontent.com/camicallierotti/imperial-summer-project/main/pbc.csv'
df = pd.read_csv(url, sep=";", encoding='latin1',engine='python', header=0)
df.head(5)

Unnamed: 0,id,time,status,trt,age,sex,ascites,hepato,spiders,edema,bili,chol,albumin,copper,alk.phos,ast,trig,platelet,protime,stage
0,1,400,2,D-penicillmain,587652293,female,yes,yes,yes,edema,145,261.0,26,156.0,1718,13795,172.0,190.0,122,IV
1,2,4500,0,D-penicillmain,5644626968,female,no,yes,yes,no,11,302.0,414,54.0,73948,11352,88.0,221.0,106,III
2,3,1012,2,D-penicillmain,7007255305,male,no,no,no,untreated,14,176.0,348,210.0,516,961,55.0,151.0,12,IV
3,4,1925,2,D-penicillmain,5474058864,female,no,yes,yes,untreated,18,244.0,254,64.0,61218,6063,92.0,183.0,103,IV
4,5,1504,1,Placebo,3810540726,female,no,yes,yes,no,34,279.0,353,143.0,671,11315,72.0,136.0,109,III


In [91]:
# label encoding

labelencoder = LabelEncoder()
df = df.apply(lambda col: labelencoder.fit_transform(col.astype(str)), axis=0, result_type='expand')
df['trt'] = labelencoder.fit_transform(df['trt'])
df

Unnamed: 0,id,time,status,trt,age,sex,ascites,hepato,spiders,edema,bili,chol,albumin,copper,alk.phos,ast,trig,platelet,protime,stage
0,0,312,2,0,267,0,2,2,2,0,29,68,13,38,111,44,56,55,22,3
1,111,334,0,0,249,0,1,2,2,1,8,96,138,120,246,15,134,80,6,2
2,222,1,2,0,333,1,1,1,1,2,11,18,78,62,192,172,103,31,20,3
3,332,138,2,0,223,0,1,2,2,2,15,54,10,129,209,132,138,51,3,3
4,362,86,1,2,52,0,1,2,2,1,58,81,83,30,227,14,118,19,9,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
412,349,354,2,1,317,0,0,0,0,1,9,201,30,158,295,179,146,44,9,2
413,350,17,0,1,61,0,0,0,0,1,6,201,112,158,295,179,146,48,12,3
414,351,5,0,1,256,0,0,0,0,1,13,201,72,158,295,179,146,25,47,2
415,352,355,0,1,260,0,0,0,0,1,5,201,104,158,295,179,146,115,4,2


In [92]:
# split dataset

def train_validate_test_split(df, train_percent=.6, validate_percent=.2, seed=None):
    np.random.seed(seed)
    perm = np.random.permutation(df.index)
    m = len(df.index)
    train_end = int(train_percent * m)
    validate_end = int(validate_percent * m) + train_end
    train = df.iloc[perm[:train_end]]
    validate = df.iloc[perm[train_end:validate_end]]
    test = df.iloc[perm[validate_end:]]
    return train, validate, test

train, validate, test = train_validate_test_split(df)

In [93]:
# make deepsurv datasets

def dataframe_to_deepsurv_ds(df, event_col = 'status', time_col = 'time'):
    # Extract the event and time columns as numpy arrays
    e = df[event_col].values.astype(np.int32)
    t = df[time_col].values.astype(np.float32)

    # Extract the patient's covariates as a numpy array
    x_df = df.drop([event_col, time_col], axis = 1)
    x = x_df.values.astype(np.float32)
    
    # Return the deep surv dataframe
    return {
        'x' : x,
        'e' : e,
        't' : t
    }

train_data = dataframe_to_deepsurv_ds(train, event_col = 'status', time_col = 'time')
test_data = dataframe_to_deepsurv_ds(test, event_col = 'status', time_col = 'time')
validate_data = dataframe_to_deepsurv_ds(validate, event_col = 'status', time_col = 'time')


In [95]:
# use standard hyperparameters (for now)

hyperparams = {
    'L2_reg': 10.0,
    'batch_norm': True,
    'dropout': 0.4,
    'hidden_layers_sizes': [25, 25],
    'learning_rate': 1e-05,
    'lr_decay': 0.001,
    'momentum': 0.9,
    'n_in': train_data['x'].shape[1],
    'standardize': True
}

In [None]:
# train deepsurv

