In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from Data_Loader import sort_data
from sklearn.model_selection import train_test_split
from sksurv.metrics import concordance_index_censored
from Data_Loader import load_data
from Train import trainCox_nnet

import torch
import numpy as np
import pandas as pd
from torch.utils.data.dataloader import DataLoader
# Module-level alias for torch's FloatTensor type. None of the cells visible
# in this notebook reference it.
dtype = torch.FloatTensor


# Data preprocessing

### Import the data

In [None]:
from Data_extraction_lung import data_norm_df_lung, output_df_lung

# Give both frames a fresh 0..n-1 index (the old index is discarded) so that
# they can be aligned row-by-row when they are combined.
data_norm_df_lung, output_df_lung = (
    frame.reset_index(drop=True)
    for frame in (data_norm_df_lung, output_df_lung)
)

In [None]:
# Side-by-side join: feature columns first, outcome columns after them.
data = pd.concat(
    [data_norm_df_lung, output_df_lung],
    axis="columns",
)

### Standardisation + PCA

In [None]:
from sklearn.preprocessing import StandardScaler

# Tunable settings for the dimensionality reduction.
N_COMPONENTS = 100    # number of principal components kept as covariates
PCA_RANDOM_STATE = 0  # seed used if sklearn selects a randomized SVD solver

# Split the combined frame into covariates, follow-up time, event flag and age.
x, ytime, yevent, age = sort_data(data)

# Standardise every covariate to zero mean / unit variance before PCA.
x = StandardScaler().fit_transform(x)

# With the default svd_solver='auto', sklearn may pick the randomized solver
# depending on the data shape; fixing random_state keeps the projection
# reproducible across "Restart & Run All".
pca = PCA(n_components=N_COMPONENTS, random_state=PCA_RANDOM_STATE)
x = pca.fit(x).transform(x)

# NOTE(review): the scaler and the PCA are fitted on all samples, before the
# K-fold split performed further down, so held-out folds influence the
# features. Consider fitting both inside each fold.

# Model 

In [None]:
from lifelines import CoxPHFitter

# Principal components become integer-labelled columns 0..n_components-1.
n_components = x.shape[1]
x_df = pd.DataFrame(x, columns=np.arange(n_components))

# Age is divided by 365 (presumably days -> years; confirm against sort_data),
# and the event flag and duration are appended as the outcome columns.
outcome_df = pd.DataFrame({
    'age': age[:, 0] / 365,
    'yevent': yevent[:, 0],
    'ytime': ytime[:, 0],
})
data_cph = pd.concat([x_df, outcome_df], axis=1)

# Elastic-net penalised Cox model fitted on the complete data set.
cph = CoxPHFitter(penalizer=0.001, l1_ratio=0.5)
cph.fit(data_cph, duration_col='ytime', event_col='yevent')

### Cross validation

In [None]:
from sklearn.model_selection import KFold

k_folds = 5
# Fixed seed so the shuffled fold assignment, and therefore the reported
# concordance indices, are reproducible across "Restart & Run All".
CV_RANDOM_STATE = 0
kfold = KFold(n_splits=k_folds, shuffle=True, random_state=CV_RANDOM_STATE)

c_index_cph = []  # one concordance_index_censored result tuple per fold
cph_AIC = []      # kept from the original cell; never filled by the visible code

for train_idx, test_idx in kfold.split(data_cph):
    # KFold yields positional row indices, so select with .iloc rather than .loc.
    data_train = data_cph.iloc[train_idx].reset_index(drop=True)
    data_test = data_cph.iloc[test_idx].reset_index(drop=True)

    # Use a fold-local name: rebinding `cph` here would replace the model
    # fitted on the full `data_cph`, which the later summary,
    # check_assumptions(data_cph) and plotting cells rely on.
    cph_fold = CoxPHFitter(penalizer=0.1, l1_ratio=0.1)
    cph_fold.fit(data_train, duration_col='ytime', event_col='yevent')

    # Held-out covariates (same column order as in training), durations and
    # event flags; the metric requires a boolean event indicator.
    X_test = data_test.drop(['ytime', 'yevent'], axis=1).values
    T_test = data_test['ytime'].values
    E_test = data_test['yevent'].values.astype(bool)

    # A higher partial hazard means higher predicted risk, which is the
    # orientation concordance_index_censored expects for its estimate.
    Estimate = cph_fold.predict_partial_hazard(X_test)
    c_index_cph.append(concordance_index_censored(E_test, T_test, Estimate))

In [None]:
# The first element of each concordance_index_censored result is the C-index.
# Iterate over the results themselves instead of a hard-coded fold count, and
# avoid np.sum on a generator, which NumPy deprecates.
fold_c_indices = [fold_result[0] for fold_result in c_index_cph]

print(fold_c_indices)                    # per-fold C-index
print(np.mean(fold_c_indices))           # mean over folds
print(np.std(np.array(fold_c_indices)))  # population standard deviation over folds

### Summary of the results

In [None]:
# print_summary() only renders the table and returns None, so display it first
# and keep the underlying coefficient table from the `summary` property.
cph.print_summary()
summary = cph.summary


In [None]:
# Significance level below which a covariate is reported as violating the
# proportional hazards assumption.
P_VALUE_THRESHOLD = 0.05
cph.check_assumptions(data_cph, p_value_threshold=P_VALUE_THRESHOLD)

# Visualisation

### Survival function for 4 different ages

In [None]:
import matplotlib.pyplot as plt

# Create the axes explicitly and hand them to lifelines. When no `ax` is
# passed, lifelines draws on a figure of its own, so a preceding plt.figure()
# call would leave an empty figure and its figsize would not be applied.
fig, ax = plt.subplots(figsize=(5, 10))
cph.plot_partial_effects_on_outcome(
    covariates='age',
    values=[41, 45, 70, 84],  # ages compared, all other covariates held fixed
    cmap='coolwarm',
    ax=ax,
)
ax.set_xlim(0, 5000)
ax.set_ylabel("Survival probability")
ax.set_xlabel("Time in days")
ax.legend()
ax.grid(True)