In [124]:
import pandas as pd
import numpy as np
import time
from lifelines import CoxPHFitter
import matplotlib.pyplot as plt
from statsmodels.stats.outliers_influence import variance_inflation_factor    
from lifelines import WeibullAFTFitter
from lifelines.utils import k_fold_cross_validation


In [79]:
# Load the pooled experiment records.
# NOTE(review): unpickling can execute arbitrary code; only load 'ALLDATA.pkl'
# from a trusted source.
data = pd.read_pickle('ALLDATA.pkl')

# Discard trials whose wait time holds an error code instead of a duration.
# `~` is the boolean-mask negation operator; NumPy rejects unary `-` on
# boolean arrays, so `-mask` is fragile.
data = data.loc[~data['Wait Time (s)'].isin(['Err1', 'Err2', 'Err3', 'Err4', 'Err5'])]

# Keep only rows whose 'Age_9-12' indicator is a valid 0/1 value.
data = data.loc[data['Age_9-12'].isin([0, 1])]

# Keep the duration column plus the candidate covariates.
# TODO: 'numwalk' and 'VRexpnum' are left out because some of their entries
# are invalid; re-add them once the raw data has been fixed.
data = data.loc[:, [
    'Wait Time (s)', 'Speed Limit', 'Lane Width', 'Minimum Gap', 'Mean Arrival Rate', 'AV',
    'Full Braking Before Impact_-1.0', 'Full Braking Before Impact_1',
    'Full Braking Before Impact_2', 'Full Braking Before Impact_3', 'Snowy',
    'One way', 'two way', 'Two way with median', 'Night', 'numcars',
    'Age_9-12', 'Age_15-18', 'Age_12-15', 'Age_18 - 24', 'Age_25 - 29',
    'Age_30 - 39', 'Age_40 - 49', 'Age_50 - 59', 'Age_60+',
    'Gender_Female',
    'Occupation_Employed', 'Occupation_Student', 'Occupation_Unemployed', 'Occupation_kid',
    'Education_Bachelors degree', 'Education_College/University student',
    'Education_Doctorate degree', 'Education_High school diploma', 'Education_Masters degree',
    'Education_Professional degree',
    'driving license_Yes',
    'mode_Bike', 'mode_Car', 'mode_Public Transit', 'mode_Walking',
    'workwalk_No', 'workwalk_Sometimes', 'workwalk_Yes',
    'shopwalk_No', 'shopwalk_Sometimes', 'shopwalk_Yes', 'shopwalk_kid',
    'Vrexp_Yes',
    'Heart_Currently', 'Heart_Over the years',
    'vision_Currently', 'vision_Over the years',
    'anxiety_Currently', 'anxiety_Over the years',
    'Headaches_Currently', 'Headaches_Over the years',
    'dizziness_Over the years',
]]

# Force every column to a numeric dtype; values that cannot be parsed become NaN.
data = data.apply(pd.to_numeric, errors='coerce')

In [59]:
def plot_corr(df,size=10):
    """Draw the pairwise column-correlation matrix of a DataFrame as an image.

    Input:
        df: pandas DataFrame whose columns are correlated with each other
        size: width and height of the (square) figure, passed as ``figsize``
    """
    corr_matrix = df.corr()
    labels = corr_matrix.columns
    positions = range(len(labels))

    fig, ax = plt.subplots(figsize=(size, size))
    ax.matshow(corr_matrix)

    # One tick per column on each axis, labelled with the column name.
    plt.xticks(positions, labels)
    plt.yticks(positions, labels)

In [60]:
def calculate_vif_(X, thresh=20.0):
    """Iteratively drop the column with the largest variance inflation factor.

    On each pass the VIF of every remaining column is computed with
    statsmodels' ``variance_inflation_factor``. If the largest VIF exceeds
    ``thresh`` that column is removed and the pass is repeated; otherwise the
    loop ends.

    Parameters:
        X: pandas DataFrame of candidate columns.
        thresh: VIF value above which the worst column is dropped.

    Returns:
        ``X`` restricted to the surviving columns (original order preserved).
        An input with no columns is returned unchanged.

    Side effects:
        Prints each dropped column and finally the remaining column index.
        The printed "index" is the position among the columns still present
        at that pass, not the position in the original frame.
    """
    variables = list(range(X.shape[1]))
    dropped = True
    # Stop once nothing is left to screen: max() on an empty VIF list would
    # otherwise raise ValueError.
    while dropped and variables:
        dropped = False

        # Slice the remaining columns once per pass instead of once per VIF.
        subset = X.iloc[:, variables]
        values = subset.values
        vif = [variance_inflation_factor(values, ix)
               for ix in range(values.shape[1])]

        worst = max(vif)
        maxloc = vif.index(worst)
        if worst > thresh:
            # str() keeps the message working for non-string column labels.
            print('dropping \'' + str(subset.columns[maxloc]) +
                  '\' at index: ' + str(maxloc))
            del variables[maxloc]
            dropped = True

    print('Remaining variables:')
    print(X.columns[variables])
    return X.iloc[:, variables]

# Screen only the covariates for multicollinearity. 'Wait Time (s)' is the
# duration column of the survival models fitted below, so it must neither
# contribute to the VIFs nor be a candidate for removal. It is split off
# before screening and re-attached as the first column afterwards.
data = pd.concat(
    [
        data[['Wait Time (s)']],
        calculate_vif_(data.drop(columns=['Wait Time (s)']), thresh=180.0),
    ],
    axis=1,
)

In [81]:
# Event indicator: all pedestrians cross, so there is no right-censored data.
# assign() returns a new frame, which avoids the SettingWithCopyWarning that
# item assignment can raise when `data` is a column slice of another frame,
# and keeps the cell safe to re-run.
data = data.assign(E=1)

In [82]:
# Baseline Cox proportional-hazards fit on every covariate in `data`, with
# 'Wait Time (s)' as the duration and 'E' as the event flag.
cph = CoxPHFitter().fit(data, duration_col='Wait Time (s)', event_col='E')
cph.print_summary()

<lifelines.CoxPHFitter: fitted with 2463 observations, 0 censored>
      duration col = 'Wait Time (s)'
         event col = 'E'
number of subjects = 2463
  number of events = 2463
    log-likelihood = -16560.60
  time fit was run = 2019-03-25 17:36:12 UTC

---
                                      coef  exp(coef)  se(coef)     z      p  -log2(p)  lower 0.95  upper 0.95
Speed Limit                           0.02       1.02      0.00  6.19 <0.005     30.59        0.01        0.02
Lane Width                           -0.59       0.56      0.11 -5.56 <0.005     25.14       -0.79       -0.38
Minimum Gap                           0.00       1.00      0.05  0.01   0.99      0.01       -0.10        0.10
Mean Arrival Rate                    -0.00       1.00      0.00 -8.96 <0.005     61.36       -0.00       -0.00
AV                                   -0.00       1.00      0.00 -0.88   0.38      1.40       -0.01        0.00
Full Braking Before Impact_1          0.14       1.15      0.21  0.66   

In [83]:
# Remove three age-band indicators and 'Minimum Gap' from the covariate set.
data = data.drop(columns=[
    'Age_15-18',
    'Age_12-15',
    'Age_60+',
    'Minimum Gap',
])

In [84]:
# Refit the Cox model on the reduced covariate set and show the summary table.
cph = CoxPHFitter().fit(data, duration_col='Wait Time (s)', event_col='E')
cph.print_summary()

<lifelines.CoxPHFitter: fitted with 2463 observations, 0 censored>
      duration col = 'Wait Time (s)'
         event col = 'E'
number of subjects = 2463
  number of events = 2463
    log-likelihood = -16574.07
  time fit was run = 2019-03-25 17:38:01 UTC

---
                                      coef  exp(coef)  se(coef)     z      p  -log2(p)  lower 0.95  upper 0.95
Speed Limit                           0.02       1.02      0.00  5.90 <0.005     28.03        0.01        0.02
Lane Width                           -0.57       0.56      0.11 -5.43 <0.005     24.11       -0.78       -0.37
Mean Arrival Rate                    -0.00       1.00      0.00 -9.01 <0.005     62.14       -0.00       -0.00
AV                                   -0.00       1.00      0.00 -0.93   0.35      1.51       -0.01        0.00
Full Braking Before Impact_1          0.15       1.17      0.21  0.74   0.46      1.12       -0.25        0.56
Full Braking Before Impact_2          0.09       1.09      0.21  0.44   

In [85]:
# Second elimination round: braking, traffic-count, lighting and several
# demographic / health indicators are removed from the covariate set.
data = data.drop(columns=[
    'Full Braking Before Impact_1',
    'numcars',
    'Full Braking Before Impact_3',
    'Full Braking Before Impact_2',
    'Night',
    'Education_Professional degree',
    'Occupation_Student',
    'shopwalk_kid',
    'Heart_Currently',
    'anxiety_Over the years',
])

In [86]:
# Refit the Cox model after the second elimination round and print the summary.
cph = CoxPHFitter().fit(data, duration_col='Wait Time (s)', event_col='E')
cph.print_summary()

<lifelines.CoxPHFitter: fitted with 2463 observations, 0 censored>
      duration col = 'Wait Time (s)'
         event col = 'E'
number of subjects = 2463
  number of events = 2463
    log-likelihood = -16586.50
  time fit was run = 2019-03-25 17:38:58 UTC

---
                                      coef  exp(coef)  se(coef)     z      p  -log2(p)  lower 0.95  upper 0.95
Speed Limit                           0.02       1.02      0.00  5.92 <0.005     28.19        0.01        0.02
Lane Width                           -0.58       0.56      0.10 -5.58 <0.005     25.28       -0.78       -0.38
Mean Arrival Rate                    -0.00       1.00      0.00 -9.00 <0.005     61.94       -0.00       -0.00
AV                                   -0.00       1.00      0.00 -0.92   0.36      1.48       -0.00        0.00
Snowy                                -0.05       0.95      0.04 -1.05   0.29      1.77       -0.13        0.04
two way                               0.09       1.10      0.05  1.85   

In [87]:
# Third elimination round. 'shopwalk_Sometimes' was listed twice in the
# original call; each label is now listed once.
# NOTE(review): if the duplicate was a typo for another column (e.g. a
# 'workwalk_*' indicator), add that column here.
data = data.drop(columns=[
    'shopwalk_Sometimes',
    'mode_Car',
    'Education_College/University student',
    'mode_Walking',
])

In [88]:
# Refit the Cox model after the third elimination round and print the summary.
cph = CoxPHFitter().fit(data, duration_col='Wait Time (s)', event_col='E')
cph.print_summary()

<lifelines.CoxPHFitter: fitted with 2463 observations, 0 censored>
      duration col = 'Wait Time (s)'
         event col = 'E'
number of subjects = 2463
  number of events = 2463
    log-likelihood = -16587.34
  time fit was run = 2019-03-25 17:40:20 UTC

---
                               coef  exp(coef)  se(coef)     z      p  -log2(p)  lower 0.95  upper 0.95
Speed Limit                    0.02       1.02      0.00  5.91 <0.005     28.13        0.01        0.02
Lane Width                    -0.58       0.56      0.10 -5.57 <0.005     25.22       -0.78       -0.37
Mean Arrival Rate             -0.00       1.00      0.00 -8.93 <0.005     61.05       -0.00       -0.00
AV                            -0.00       1.00      0.00 -0.91   0.36      1.47       -0.00        0.00
Snowy                         -0.05       0.95      0.04 -1.09   0.28      1.86       -0.13        0.04
two way                        0.09       1.10      0.05  1.84   0.07      3.92       -0.01        0.19
Two way wi

In [89]:
# Fourth elimination round: drop one age band, one occupation and one health indicator.
data = data.drop(columns=[
    'Age_25 - 29',
    'Occupation_Unemployed',
    'Headaches_Currently',
])

In [90]:
# Refit the Cox model after the fourth elimination round and print the summary.
cph = CoxPHFitter().fit(data, duration_col='Wait Time (s)', event_col='E')
cph.print_summary()

<lifelines.CoxPHFitter: fitted with 2463 observations, 0 censored>
      duration col = 'Wait Time (s)'
         event col = 'E'
number of subjects = 2463
  number of events = 2463
    log-likelihood = -16587.79
  time fit was run = 2019-03-25 17:41:20 UTC

---
                               coef  exp(coef)  se(coef)     z      p  -log2(p)  lower 0.95  upper 0.95
Speed Limit                    0.02       1.02      0.00  5.92 <0.005     28.18        0.01        0.02
Lane Width                    -0.58       0.56      0.10 -5.64 <0.005     25.81       -0.78       -0.38
Mean Arrival Rate             -0.00       1.00      0.00 -8.91 <0.005     60.79       -0.00       -0.00
AV                            -0.00       1.00      0.00 -0.92   0.36      1.49       -0.00        0.00
Snowy                         -0.05       0.96      0.04 -1.05   0.30      1.76       -0.13        0.04
two way                        0.09       1.09      0.05  1.80   0.07      3.80       -0.01        0.19
Two way wi

In [91]:
# Fifth elimination round: drop gender, doctorate and dizziness indicators.
data = data.drop(columns=[
    'Gender_Female',
    'Education_Doctorate degree',
    'dizziness_Over the years',
])

In [92]:
# Refit the Cox model after the fifth elimination round and print the summary.
cph = CoxPHFitter().fit(data, duration_col='Wait Time (s)', event_col='E')
cph.print_summary()

<lifelines.CoxPHFitter: fitted with 2463 observations, 0 censored>
      duration col = 'Wait Time (s)'
         event col = 'E'
number of subjects = 2463
  number of events = 2463
    log-likelihood = -16589.65
  time fit was run = 2019-03-25 17:41:59 UTC

---
                               coef  exp(coef)  se(coef)     z      p  -log2(p)  lower 0.95  upper 0.95
Speed Limit                    0.02       1.02      0.00  6.05 <0.005     29.33        0.01        0.02
Lane Width                    -0.58       0.56      0.10 -5.66 <0.005     25.94       -0.78       -0.38
Mean Arrival Rate             -0.00       1.00      0.00 -8.91 <0.005     60.79       -0.00       -0.00
AV                            -0.00       1.00      0.00 -0.91   0.36      1.46       -0.00        0.00
Snowy                         -0.04       0.96      0.04 -1.02   0.31      1.71       -0.13        0.04
two way                        0.09       1.09      0.05  1.72   0.09      3.55       -0.01        0.18
Two way wi

In [93]:
# Sixth elimination round: drop the 'workwalk_Sometimes' indicator.
data = data.drop(columns=[
    'workwalk_Sometimes',
])

In [94]:
# Refit the Cox model without 'workwalk_Sometimes' and print the summary.
cph = CoxPHFitter().fit(data, duration_col='Wait Time (s)', event_col='E')
cph.print_summary()

<lifelines.CoxPHFitter: fitted with 2463 observations, 0 censored>
      duration col = 'Wait Time (s)'
         event col = 'E'
number of subjects = 2463
  number of events = 2463
    log-likelihood = -16590.55
  time fit was run = 2019-03-25 17:53:33 UTC

---
                               coef  exp(coef)  se(coef)     z      p  -log2(p)  lower 0.95  upper 0.95
Speed Limit                    0.02       1.02      0.00  5.97 <0.005     28.66        0.01        0.02
Lane Width                    -0.59       0.56      0.10 -5.73 <0.005     26.58       -0.79       -0.39
Mean Arrival Rate             -0.00       1.00      0.00 -8.97 <0.005     61.54       -0.00       -0.00
AV                            -0.00       1.00      0.00 -0.90   0.37      1.44       -0.00        0.00
Snowy                         -0.04       0.96      0.04 -1.00   0.32      1.65       -0.13        0.04
two way                        0.09       1.09      0.05  1.74   0.08      3.61       -0.01        0.18
Two way wi

In [95]:
# Seventh elimination round: drop the 'vision_Currently' indicator.
data = data.drop(columns=[
    'vision_Currently',
])

In [96]:
# Refit the Cox model without 'vision_Currently' and print the summary.
cph = CoxPHFitter().fit(data, duration_col='Wait Time (s)', event_col='E')
cph.print_summary()

<lifelines.CoxPHFitter: fitted with 2463 observations, 0 censored>
      duration col = 'Wait Time (s)'
         event col = 'E'
number of subjects = 2463
  number of events = 2463
    log-likelihood = -16590.61
  time fit was run = 2019-03-25 17:54:04 UTC

---
                               coef  exp(coef)  se(coef)     z      p  -log2(p)  lower 0.95  upper 0.95
Speed Limit                    0.02       1.02      0.00  5.99 <0.005     28.79        0.01        0.02
Lane Width                    -0.59       0.56      0.10 -5.73 <0.005     26.55       -0.79       -0.39
Mean Arrival Rate             -0.00       1.00      0.00 -8.98 <0.005     61.69       -0.00       -0.00
AV                            -0.00       1.00      0.00 -0.88   0.38      1.41       -0.00        0.00
Snowy                         -0.04       0.96      0.04 -1.02   0.31      1.70       -0.13        0.04
two way                        0.09       1.09      0.05  1.73   0.08      3.57       -0.01        0.18
Two way wi

In [101]:
# Eighth elimination round: drop the 'AV' and 'Snowy' covariates.
data = data.drop(columns=[
    'AV',
    'Snowy',
])

In [103]:
# Refit the Cox model without 'AV' and 'Snowy' and print the summary.
cph = CoxPHFitter().fit(data, duration_col='Wait Time (s)', event_col='E')
cph.print_summary()

<lifelines.CoxPHFitter: fitted with 2463 observations, 0 censored>
      duration col = 'Wait Time (s)'
         event col = 'E'
number of subjects = 2463
  number of events = 2463
    log-likelihood = -16591.50
  time fit was run = 2019-03-25 17:55:36 UTC

---
                               coef  exp(coef)  se(coef)     z      p  -log2(p)  lower 0.95  upper 0.95
Speed Limit                    0.02       1.02      0.00  6.08 <0.005     29.64        0.01        0.02
Lane Width                    -0.56       0.57      0.10 -5.73 <0.005     26.57       -0.76       -0.37
Mean Arrival Rate             -0.00       1.00      0.00 -9.11 <0.005     63.40       -0.00       -0.00
two way                        0.08       1.09      0.05  1.66   0.10      3.37       -0.01        0.18
Two way with median            0.22       1.24      0.05  4.30 <0.005     15.83        0.12        0.31
Age_30 - 39                    0.33       1.39      0.05  6.57 <0.005     34.26        0.23        0.43
Age_40 - 4

TypeError: 'CoxPHFitter' object is not callable

In [112]:
# Build a smaller covariate set in a separate frame (`data` itself is left
# untouched) by removing two age bands, two education levels, current
# anxiety and the 'two way' road indicator.
data2 = data.drop(columns=[
    'Age_40 - 49',
    'Age_30 - 39',
    'Education_High school diploma',
    'Education_Masters degree',
    'anxiety_Currently',
    'two way',
])

In [125]:
# Fit the Cox model on the reduced frame `data2` and print the summary.
cph = CoxPHFitter().fit(data2, duration_col='Wait Time (s)', event_col='E')
cph.print_summary()

<lifelines.CoxPHFitter: fitted with 2463 observations, 0 censored>
      duration col = 'Wait Time (s)'
         event col = 'E'
number of subjects = 2463
  number of events = 2463
    log-likelihood = -16639.57
  time fit was run = 2019-03-25 21:33:43 UTC

---
                          coef  exp(coef)  se(coef)     z      p  -log2(p)  lower 0.95  upper 0.95
Speed Limit               0.01       1.01      0.00  5.54 <0.005     24.97        0.01        0.02
Lane Width               -0.59       0.55      0.10 -6.02 <0.005     29.09       -0.78       -0.40
Mean Arrival Rate        -0.00       1.00      0.00 -9.20 <0.005     64.55       -0.00       -0.00
Two way with median       0.19       1.21      0.04  4.43 <0.005     16.71        0.11        0.28
Age_50 - 59              -0.45       0.64      0.08 -5.36 <0.005     23.54       -0.61       -0.28
driving license_Yes       0.10       1.11      0.08  1.37   0.17      2.54       -0.05        0.25
mode_Public Transit       0.21       1.24    

In [126]:
# 10-fold cross-validation of the Cox model on `data2`.
# lifelines shuffles the rows with NumPy's global random state before
# assigning folds, so seed it first; otherwise the scores change on every run.
np.random.seed(0)
scores = k_fold_cross_validation(cph, data2, 'Wait Time (s)', event_col='E', k=10)

In [127]:
# Display the per-fold scores returned by k_fold_cross_validation (k=10 folds).
# NOTE(review): presumably concordance indices (the lifelines default metric);
# confirm against the installed lifelines version.
scores

[0.5901149349362573,
 0.5850833333333333,
 0.5728764478764479,
 0.5549645390070922,
 0.5938746104614148,
 0.6179718537472224,
 0.572569526179507,
 0.5586955794264298,
 0.6271796943378442,
 0.5953938584779707]

In [128]:
# Average of the per-fold cross-validation scores.
print(np.asarray(scores).mean())

0.5868724377783521
