In [265]:
import pandas as pd
import numpy as np
import time

In [211]:
from sksurv.datasets import load_veterans_lung_cancer
from sksurv.linear_model import CoxPHSurvivalAnalysis
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest
from sklearn.pipeline import Pipeline

In [204]:
# Load the dataset from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code, so 'ALLDATA.pkl' must
# come from a trusted source.
data = pd.read_pickle('ALLDATA.pkl')

In [205]:
# Drop rows whose wait time holds an error code instead of a measurement.
# `~` is the boolean-mask negation operator (unary `-` on booleans is not).
data = data.loc[~data['Wait Time (s)'].isin(['Err1', 'Err2', 'Err3', 'Err4', 'Err5'])]
# Keep only rows where the 'Age_9-12' indicator is a valid 0/1 value.
data = data.loc[data['Age_9-12'].isin([0, 1])]

# Outcome table: one row per remaining observation, on a fresh 0..n-1 index.
# All pedestrians cross, so there is no right-censored data: every Status is True.
data_y = pd.DataFrame(
    {
        'Status': np.ones(data.shape[0], dtype=bool),
        'Survival': data['Wait Time (s)'].values,
    },
    columns=['Status', 'Survival'],
)

# Covariates used by the models below (column order is preserved as listed).
# TODO: numwalk and VRexpnum were removed because of some false inputs in the
# data; this should be fixed later.
data_x = data.loc[:, [
    'Speed Limit', 'Lane Width', 'Minimum Gap', 'Mean Arrival Rate', 'AV',
    'Full Braking Before Impact_-1.0', 'Full Braking Before Impact_1',
    'Full Braking Before Impact_2', 'Full Braking Before Impact_3', 'Clear', 'Snowy',
    'One way', 'two way', 'Two way with median', 'Day', 'Night', 'numcars',
    'Age_9-12', 'Age_15-18', 'Age_12-15', 'Age_18 - 24', 'Age_25 - 29',
    'Age_30 - 39', 'Age_40 - 49', 'Age_50 - 59', 'Age_60+', 'Gender_Female',
    'Occupation_Employed', 'Occupation_Student', 'Occupation_Unemployed',
    'Occupation_kid', 'Education_Bachelors degree', 'Education_College/University student',
    'Education_Doctorate degree', 'Education_High school diploma', 'Education_Masters degree',
    'Education_Professional degree', 'Education_kid', 'driving license_Yes', 'mode_Bike', 'mode_Car',
    'mode_Public Transit', 'mode_Walking', 'workwalk_No', 'workwalk_Sometimes', 'workwalk_Yes',
    'shopwalk_No', 'shopwalk_Sometimes', 'shopwalk_Yes', 'shopwalk_kid', 'Vrexp_Yes', 'Heart_Currently',
    'Heart_Over the years', 'vision_Currently', 'vision_Currently;Over the years',
    'vision_Over the years', 'anxiety_Currently', 'anxiety_Over the years', 'Headaches_Currently',
    'Headaches_Over the years', 'dizziness_Over the years',
]]

In [206]:
# Grab both outcome columns first: `data_y` is rebound to a new object below.
status = data_y["Status"]
survival = data_y['Survival']

# Rebuild the outcome as a NumPy structured array with a boolean 'Status'
# field and a float64 'Survival' field, one record per row of `data`.
data_y = np.zeros(
    len(data),
    dtype=[('Status', 'bool'), ('Survival', 'f8')],
)
data_y['Status'], data_y['Survival'] = status, survival

In [219]:
# Cox proportional-hazards model fitted on every covariate in `data_x`.
estimator = CoxPHSurvivalAnalysis(
    alpha=1e-06,
    n_iter=100,
    tol=1e-09,
    verbose=0,
)
estimator.fit(data_x, data_y)

CoxPHSurvivalAnalysis(alpha=1e-06, n_iter=100, tol=1e-09, verbose=0)

In [221]:
# In-sample score: evaluated on the same rows the estimator was fitted on.
estimator.score(data_x, data_y)

0.6153736986757725

In [None]:
def fit_and_score_features(X, y):
    """Score every feature by fitting a single-covariate Cox model on it.

    Parameters
    ----------
    X : 2-D array
        Feature matrix; it is sliced as ``X[:, j:j+1]``, so it must support
        NumPy-style column slicing (a DataFrame must be passed as ``.values``).
    y : structured array
        Outcome passed unchanged to ``CoxPHSurvivalAnalysis.fit`` / ``score``.

    Returns
    -------
    numpy.ndarray
        Float array with one entry per column of ``X``: the value returned by
        the model's ``score`` on the same data it was fitted on (scikit-survival
        documents this as the concordance index).
    """
    # A single estimator instance is refitted for each column in turn.
    cox = CoxPHSurvivalAnalysis(alpha=0.000001, n_iter=100, tol=1e-09, verbose=0)

    def score_column(col):
        # Slice (rather than index) so the column stays two-dimensional.
        single_feature = X[:, col:col + 1]
        cox.fit(single_feature, y)
        return cox.score(single_feature, y)

    return np.array([score_column(col) for col in range(X.shape[1])], dtype=float)

# Univariate score of each covariate, shown best-first.
scores = fit_and_score_features(X=data_x.values, y=data_y)
(
    pd.Series(data=scores, index=data_x.columns)
    .sort_values(ascending=False)
)

In [238]:
# Keep only covariates whose single-feature score is at least 0.5; the rest
# are treated as having no effect and are removed.
# NOTE: `scores` is aligned with the columns `data_x` had when it was scored,
# so this cell must not be re-run after `data_x` has been narrowed.
data_x = data_x.loc[:, data_x.columns[scores >= 0.5]]

# Refit on the reduced covariate set and report the in-sample score.
estimator.fit(data_x, data_y).score(data_x, data_y)

0.6140138252685933

In [None]:
# Fitted coefficients labelled with the covariate names of the current `data_x`.
pd.Series(estimator.coef_, index=data_x.columns)

In [272]:
# Two-step pipeline: keep the k best covariates by univariate Cox score, then
# fit a Cox model on them. `k=3` is only a placeholder; the grid search
# overrides it through the `select__k` parameter.
# verbose=0 matches the other Cox models in this notebook and keeps the grid
# search from printing optimiser logs for every candidate and fold.
pipe = Pipeline([
    ('select', SelectKBest(fit_and_score_features, k=3)),
    ('model', CoxPHSurvivalAnalysis(alpha=0.000001, n_iter=100, tol=1e-09, verbose=0)),
])

In [269]:
starttime = time.time()

# One candidate per possible number of retained covariates: 1 .. all columns.
param_grid = {'select__k': np.arange(1, data_x.shape[1] + 1)}
gcv = GridSearchCV(pipe, param_grid, return_train_score=True)
gcv.fit(data_x, data_y)

endtime = time.time()
timeelapsed = endtime - starttime  # wall-clock seconds spent in the search

# Jupyter only renders the value of a cell's final expression, so the results
# table has to come last (it was previously followed by an assignment and
# therefore never displayed).
pd.DataFrame(gcv.cv_results_).sort_values(by='mean_test_score', ascending=False)



In [243]:
# Refit the pipeline on all data with the best `k` found by the grid search.
pipe.set_params(**gcv.best_params_)
pipe.fit(data_x, data_y)

# The pipeline has exactly two steps ('select', 'model'); look them up by name
# instead of unpacking a non-existent encoder step.
transformer = pipe.named_steps['select']
final_estimator = pipe.named_steps['model']

# Label each coefficient with the covariate that survived feature selection.
pd.Series(final_estimator.coef_, index=data_x.columns[transformer.get_support()])



Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_select__k,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score
0,7.819082,0.159009,0.014175,0.000523,1,{'select__k': 1},0.521824,0.511175,0.529566,0.520855,0.007539,1,0.548833,0.539363,0.548,0.545399,0.004282


In [253]:
# Apply the best grid-search parameters and refit; both calls return the
# pipeline itself, so the fitted pipeline is what the cell displays.
pipe.set_params(**gcv.best_params_).fit(data_x, data_y)

Pipeline(memory=None,
     steps=[('select', SelectKBest(k=1,
      score_func=<function fit_and_score_features at 0x7f1ff329f7b8>)), ('model', CoxPHSurvivalAnalysis(alpha=1e-06, n_iter=100, tol=1e-09, verbose=0))])