# Baseline Model Implementation Using the `lifelines` Package

In [1]:
import pandas as pd
import numpy as np
import os
from lifelines import KaplanMeierFitter,CoxPHFitter

In [2]:
# Root directories of the training and test splits (relative to this notebook).
# Keep the trailing '/': later cells build file names by plain string
# concatenation, e.g. train_path + 'train_labels.csv'.
train_path = '../files_provided/data/train/'
test_path = '../files_provided/data/test/'

In [61]:
# Covariates fed to the baseline model, in the column order used for fitting
# and prediction.
features_to_select = [
    'original_shape_Sphericity',
    'original_shape_SurfaceVolumeRatio',
    'original_shape_Maximum3DDiameter',
    'SourceDataset',
    'Nstage',
    'original_glcm_JointEntropy',
    'original_glcm_Idn',
    'original_glcm_Idmn',
]

In [62]:
def get_features_from_csvs(path, source_categories=None):
    """Load the radiomics and clinical features of one data split into one DataFrame.

    Reads ``features/radiomics.csv`` and ``features/clinical_data.csv`` below
    ``path`` and left-joins the radiomics columns onto the clinical rows by
    ``PatientID``. The result keeps the clinical rows in their file order.

    Parameters
    ----------
    path : str
        Directory of the split. A trailing path separator is optional.
    source_categories : sequence, optional
        Ordered list of ``SourceDataset`` values to encode against. A value's
        integer code is its position in this list; values not in the list are
        coded ``-1``. When omitted, the categories are inferred from the file
        being read, so two splits containing different sets of values get
        incompatible codes. Pass the same list for every split to keep the
        encoding consistent.

    Returns
    -------
    pandas.DataFrame
        Clinical columns (with ``SourceDataset`` replaced by integer codes)
        followed by the radiomics columns.
    """
    # File rows 0 and 2 are skipped so that row 1 supplies the column names.
    dfr = pd.read_csv(os.path.join(path, 'features', 'radiomics.csv'), skiprows=[0, 2])
    dfc = pd.read_csv(os.path.join(path, 'features', 'clinical_data.csv'))
    # Label the unlabelled first column. `rename` is used instead of writing into
    # `dfr.columns.values`, which mutates an Index that pandas treats as immutable.
    dfr = dfr.rename(columns={dfr.columns[0]: 'PatientID'})
    # Replace the 'SourceDataset' labels by their integer category codes.
    dfc['SourceDataset'] = pd.Categorical(
        dfc['SourceDataset'], categories=source_categories
    ).codes
    # Join the dataframes on 'PatientID' (left join: one row per clinical row).
    return dfc.join(dfr.set_index('PatientID'), on='PatientID')

# To inspect the joined features, run: get_features_from_csvs(train_path).head()

In [63]:
# Build the training frame: per-patient features joined with their labels
dfl = pd.read_csv(train_path + 'train_labels.csv')
train_columns = features_to_select + ['SurvivalTime', 'Event']
df_train = (
    get_features_from_csvs(train_path)
    .join(dfl.set_index('PatientID'), on='PatientID')
)
# Keep only the 8 baseline features plus the duration and event columns
df_train = df_train[train_columns]

In [64]:
# Creating the Testing DataFrame
df_test = get_features_from_csvs(test_path)
# Selecting the 8 baseline features
df_test = df_test[features_to_select]
# get_features_from_csvs derives the 'SourceDataset' codes from the values found
# in the file it reads, so the test codes match the training codes only when both
# files contain the same set of values. Re-encode the test column against the
# training categories so a code always means the same dataset the model was
# fitted on. Values never seen in training get the code -1.
train_sources = pd.read_csv(train_path + 'features/clinical_data.csv')['SourceDataset']
test_sources = pd.read_csv(test_path + 'features/clinical_data.csv')['SourceDataset']
test_source_codes = pd.Series(
    pd.Categorical(test_sources, categories=pd.Categorical(train_sources).categories).codes,
    index=test_sources.index,
)
# `assign` aligns on the row index, which df_test inherits from the clinical file.
df_test = df_test.assign(SourceDataset=test_source_codes)

In [65]:
# Fit a Cox proportional hazards model (lifelines implementation) on the
# training frame: 'SurvivalTime' is the duration, 'Event' the event indicator,
# and every remaining column is a covariate.
cph = CoxPHFitter()
cph.fit(df_train, duration_col='SurvivalTime', event_col='Event')
# Print the fitted coefficients and summary statistics
cph.print_summary()

0,1
model,lifelines.CoxPHFitter
duration col,'SurvivalTime'
event col,'Event'
number of observations,300
number of events observed,162
partial log-likelihood,-801.67
time fit was run,2020-02-13 13:29:09 UTC

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,z,p,-log2(p)
original_shape_Sphericity,2.27,9.69,1.37,-0.41,4.95,0.66,141.7,1.66,0.10,3.37
original_shape_SurfaceVolumeRatio,1.36,3.91,1.15,-0.89,3.62,0.41,37.38,1.18,0.24,2.08
original_shape_Maximum3DDiameter,0.01,1.01,0.0,0.0,0.02,1.0,1.02,2.13,0.03,4.93
SourceDataset,-0.94,0.39,0.29,-1.51,-0.38,0.22,0.69,-3.27,<0.005,9.86
Nstage,0.12,1.13,0.07,-0.02,0.27,0.98,1.31,1.66,0.10,3.36
original_glcm_JointEntropy,0.22,1.25,0.13,-0.03,0.47,0.97,1.6,1.75,0.08,3.65
original_glcm_Idn,46.72,1.95e+20,18.45,10.56,82.88,38466.17,9.899999999999999e+35,2.53,0.01,6.46
original_glcm_Idmn,-89.09,0.0,38.69,-164.93,-13.25,0.0,0.0,-2.3,0.02,5.55

0,1
Concordance,0.70
Log-likelihood ratio test,"74.54 on 8 df, -log2(p)=40.58"


In [66]:
# Collect the test PatientIDs in the row order of the clinical data file
df = pd.read_csv(test_path + 'features/clinical_data.csv')
test_patient_list = df.loc[:, 'PatientID'].tolist()
# Show how many test patients there are
len(test_patient_list)

125

In [67]:
# Predict the expected survival time of every test patient
predicted = cph.predict_expectation(df_test)
# Flatten to a plain list so it can sit next to the patient ids
survival_times = list(predicted.values.flatten())
# 'Event' is not predicted by this model, so the column is filled with NaN
df_predicted = pd.DataFrame({
    'PatientID': test_patient_list,
    'SurvivalTime': survival_times,
    'Event': np.nan,
})
print(df_predicted)
# Write the predictions to disk without the row index
df_predicted.to_csv('out.csv', index=False)

     PatientID  SurvivalTime  Event
0           13    652.803074    NaN
1          155   1444.081720    NaN
2          404    828.677308    NaN
3          407    965.317241    NaN
4            9   2764.311832    NaN
..         ...           ...    ...
120         66    365.062549    NaN
121        132    671.351587    NaN
122        169   2455.078042    NaN
123        199    421.190052    NaN
124        274    853.650339    NaN

[125 rows x 3 columns]
