## Linear Regression Final Project: Logistic Regression with Traffic Collision Data
### Zachary Barnes and Bing Wang

##### Housekeeping

In [143]:
# Load Python libraries
import os
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import patsy
import matplotlib.pyplot as plt
import itertools
from sklearn import linear_model
from sklearn.metrics import mean_squared_error

# Run R code adjacent to Python code
%load_ext rpy2.ipython

# Load ggplot R library
%R library(ggplot2)
%R library(scales)

# Avoid kernal death
os.environ['KMP_DUPLICATE_LIB_OK']='True'

The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython


# Read in and organize data from TIMS (Collisions and Victims)

#### Collisions data

In [144]:
c = pd.read_csv("Collisions.csv")

Response variable: Collision severity. We will code this as a binary variable, with 1 = fatality and 0 = not a fatality

Keep a subset of predictors as full model: Based on previous knowledge, these are likely to be predictors of collision severity

In [145]:
c = c[['CASE_ID','COLLISION_DATE','COLLISION_TIME','INTERSECTION','COLLISION_SEVERITY',
       'LIGHTING','LOCATION_TYPE', 'ROAD_SURFACE', 'ROAD_COND_1', 'PEDESTRIAN_ACCIDENT',
       'BICYCLE_ACCIDENT', 'MOTORCYCLE_ACCIDENT', 'ALCOHOL_INVOLVED','PCF_VIOL_CATEGORY']]

Make datetime variables

In [146]:
# Convert COLLISION_TIME to hour of day (use tlater to match speeds to collisions)
c['COLLISION_TIME'] = [int(i[:-2]) if len(i) > 2 else 0 for i in c['COLLISION_TIME'].astype(str).values]

# Make Collision_Date a DateTime object
c["COLLISION_DATE"] = pd.to_datetime(c.COLLISION_DATE)

# Add quarter column (use later in match speeds to collisions)
c['QUARTER'] = c['COLLISION_DATE'].map(pd.to_datetime).dt.quarter

c.sample(5)

Unnamed: 0,CASE_ID,COLLISION_DATE,COLLISION_TIME,INTERSECTION,COLLISION_SEVERITY,LIGHTING,LOCATION_TYPE,ROAD_SURFACE,ROAD_COND_1,PEDESTRIAN_ACCIDENT,BICYCLE_ACCIDENT,MOTORCYCLE_ACCIDENT,ALCOHOL_INVOLVED,PCF_VIOL_CATEGORY,QUARTER
3437,8465554,2018-12-24,23,N,1,C,,B,H,,,,,8,4
1779,8683740,2018-07-12,12,Y,4,A,,A,H,,,,,12,3
2910,8593774,2018-03-02,17,Y,4,A,,B,H,,,,,8,1
3718,90837809,2018-10-07,13,N,4,A,,A,H,,,,,3,4
3304,8552925,2018-01-29,2,Y,3,C,,A,H,,,,Y,1,1


In [147]:
# recode variables in Collision (as 1s and 0s, reduce categories down, make dummies)

# NOTE: ROAD_SURFACE and ROAD_COND_1 had some observations unstated
# If not stated, assumed no issues with road surface or con'd

# Intersection: Make dummy, intersection/not
c.loc[c.INTERSECTION == "Y", "INTERSECTION"] = 1
c.loc[c.INTERSECTION == "N", "INTERSECTION"] = 0

# Lighting: Make 1, 2, 3 for ordinal values for light, dusk/dawn, dark
c.loc[c.LIGHTING == "A", "LIGHTING"] = 2  # Light
c.loc[c.LIGHTING == "B", "LIGHTING"] = 1  # Dusk/Dawn
c.loc[c.LIGHTING.isin(["C", "D", "E"]), "LIGHTING"] = 0  # Dark (w and w/o streetlight)

# Road_Surface: Convert to dummy, wet/not 
c.loc[c.ROAD_SURFACE.isin(["B", "C", "D"]), "WET_ROAD_SURFACE"] = 1
c.loc[~(c.ROAD_SURFACE.isin(["B", "C", "D"])), "WET_ROAD_SURFACE"] = 0

# Road_Cond_1: Convert to dummy, issue/not
c.loc[c.ROAD_COND_1.isin(["H", ""]), "ROAD_COND_ISSUE"] = 0
c.loc[~(c.ROAD_COND_1.isin(["H", ""])), "ROAD_COND_ISSUE"] = 1

# Location_Type: Convert to dummies for if Highway, if Ramp, or if

Recode some variables

In [148]:
# for dummies: recode Y as 1, blank as 0
def Yfor1(s):
    s = s.replace("Y", 1)
    s = s.fillna(0)
    return s

In [149]:
c["PEDESTRIAN_ACCIDENT"] = Yfor1(c.PEDESTRIAN_ACCIDENT)
c["BICYCLE_ACCIDENT"] = Yfor1(c.BICYCLE_ACCIDENT)
c["MOTORCYCLE_ACCIDENT"] = Yfor1(c.MOTORCYCLE_ACCIDENT)

In [150]:
c.sample(5)

Unnamed: 0,CASE_ID,COLLISION_DATE,COLLISION_TIME,INTERSECTION,COLLISION_SEVERITY,LIGHTING,LOCATION_TYPE,ROAD_SURFACE,ROAD_COND_1,PEDESTRIAN_ACCIDENT,BICYCLE_ACCIDENT,MOTORCYCLE_ACCIDENT,ALCOHOL_INVOLVED,PCF_VIOL_CATEGORY,QUARTER,WET_ROAD_SURFACE,ROAD_COND_ISSUE
3817,90885426,2018-12-09,18,0,3,0,,A,H,0.0,0.0,0.0,,3,4,0.0,0.0
2744,8597236,2018-03-14,8,1,2,2,,B,H,0.0,1.0,0.0,,3,1,1.0,0.0
2994,8573885,2018-02-21,9,1,2,2,,A,H,1.0,0.0,1.0,,11,1,0.0,0.0
1676,8685881,2018-07-30,16,0,4,2,,A,H,1.0,0.0,0.0,,6,3,0.0,0.0
593,8758348,2018-11-30,18,1,4,1,,-,-,0.0,1.0,0.0,,9,4,0.0,1.0


Make response binary variable from Collision_Severity: 1 for fatality (COLLISION_SEVERITY = 1), 0 for not fatality (COLLISION_SEVERITY != 1)

In [151]:
c.loc[c.COLLISION_SEVERITY == 1, "Fatality"] = 1
c.loc[c.COLLISION_SEVERITY != 1, "Fatality"] = 0

Drop vars we won't use

In [152]:
c = c[['COLLISION_TIME','INTERSECTION', 'PEDESTRIAN_ACCIDENT','BICYCLE_ACCIDENT','MOTORCYCLE_ACCIDENT','WET_ROAD_SURFACE','ROAD_COND_ISSUE','Fatality','PCF_VIOL_CATEGORY']]

In [153]:
m =  pd.get_dummies(c, columns = ['PCF_VIOL_CATEGORY'])


# Model Selection 

In [154]:
import statsmodels.formula.api as smf

def forward_selected(data, response):
    """Linear model designed by forward selection.

    Parameters:
    -----------
    data : pandas DataFrame with all possible predictors and response

    response: string, name of response column in data

    Returns:
    --------
    model: an "optimal" fitted statsmodels linear model
           with an intercept
           selected by forward selection
           evaluated by adjusted R-squared
    """
    remaining = set(data.columns)
    remaining.remove(response)
    selected = []
    current_score, best_new_score = 0.0, 0.0
    while remaining and current_score == best_new_score:
        scores_with_candidates = []
        for candidate in remaining:
            formula = "{} ~ {} + 1".format(response,
                                           ' + '.join(selected + [candidate]))
            score = smf.ols(formula, data).fit().aic # evaluate by AIC or BIC
            scores_with_candidates.append((score, candidate))
        scores_with_candidates.sort()
        best_new_score, best_candidate = scores_with_candidates.pop()
        if current_score < best_new_score:
            remaining.remove(best_candidate)
            selected.append(best_candidate)
            current_score = best_new_score
    formula = "{} ~ {} + 1".format(response,
                                   ' + '.join(selected))
    model = smf.ols(formula, data).fit()
    return model

In [155]:
model = forward_selected(c,'Fatality')

In [156]:
model.summary()

  return self.ess/self.df_model


0,1,2,3
Dep. Variable:,Fatality,R-squared:,-0.0
Model:,OLS,Adj. R-squared:,-0.0
Method:,Least Squares,F-statistic:,-inf
Date:,"Fri, 11 Oct 2019",Prob (F-statistic):,
Time:,21:13:44,Log-Likelihood:,4356.3
No. Observations:,3870,AIC:,-8711.0
Df Residuals:,3869,BIC:,-8704.0
Df Model:,0,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.0062,0.001,4.914,0.000,0.004,0.009

0,1,2,3
Omnibus:,6765.193,Durbin-Watson:,0.671
Prob(Omnibus):,0.0,Jarque-Bera (JB):,4039157.309
Skew:,12.58,Prob(JB):,0.0
Kurtosis:,159.256,Cond. No.,1.0


Unfortunatly, using AIC and BIC does not produce any predictors.

In [157]:
c.head()

Unnamed: 0,COLLISION_TIME,INTERSECTION,PEDESTRIAN_ACCIDENT,BICYCLE_ACCIDENT,MOTORCYCLE_ACCIDENT,WET_ROAD_SURFACE,ROAD_COND_ISSUE,Fatality,PCF_VIOL_CATEGORY
0,7,0,0.0,0.0,0.0,0.0,0.0,0.0,3
1,12,0,0.0,0.0,0.0,0.0,0.0,0.0,3
2,10,0,0.0,0.0,0.0,0.0,0.0,0.0,7
3,10,0,0.0,0.0,0.0,0.0,0.0,0.0,3
4,17,0,0.0,0.0,0.0,0.0,0.0,0.0,3


In [158]:
import statsmodels.api as sm
import patsy
formula = "Fatality ~ COLLISION_TIME + INTERSECTION + PEDESTRIAN_ACCIDENT + BICYCLE_ACCIDENT + MOTORCYCLE_ACCIDENT + WET_ROAD_SURFACE + ROAD_COND_ISSUE"
y,X = patsy.dmatrices(formula,data=c)
model = sm.Logit(y,X).fit()
model.summary()

         Current function value: 0.033757
         Iterations: 35




0,1,2,3
Dep. Variable:,Fatality,No. Observations:,3870.0
Model:,Logit,Df Residuals:,3861.0
Method:,MLE,Df Model:,8.0
Date:,"Fri, 11 Oct 2019",Pseudo R-squ.:,0.1047
Time:,21:13:44,Log-Likelihood:,-130.64
converged:,False,LL-Null:,-145.92
Covariance Type:,nonrobust,LLR p-value:,0.0001689

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-5.3503,0.617,-8.667,0.000,-6.560,-4.140
INTERSECTION[T.1],-1.6054,0.510,-3.148,0.002,-2.605,-0.606
INTERSECTION[T.-],-25.0463,3.43e+05,-7.3e-05,1.000,-6.72e+05,6.72e+05
COLLISION_TIME,-0.0142,0.037,-0.388,0.698,-0.086,0.057
PEDESTRIAN_ACCIDENT,1.8776,0.458,4.103,0.000,0.981,2.774
BICYCLE_ACCIDENT,0.4103,0.662,0.619,0.536,-0.888,1.708
MOTORCYCLE_ACCIDENT,0.8851,0.594,1.491,0.136,-0.278,2.049
WET_ROAD_SURFACE,0.1707,0.630,0.271,0.787,-1.065,1.406
ROAD_COND_ISSUE,1.2134,0.565,2.149,0.032,0.107,2.320


In [162]:
m.columns

Index(['COLLISION_TIME', 'INTERSECTION', 'PEDESTRIAN_ACCIDENT',
       'BICYCLE_ACCIDENT', 'MOTORCYCLE_ACCIDENT', 'WET_ROAD_SURFACE',
       'ROAD_COND_ISSUE', 'Fatality', 'PCF_VIOL_CATEGORY_-',
       'PCF_VIOL_CATEGORY_00', 'PCF_VIOL_CATEGORY_01', 'PCF_VIOL_CATEGORY_02',
       'PCF_VIOL_CATEGORY_03', 'PCF_VIOL_CATEGORY_04', 'PCF_VIOL_CATEGORY_05',
       'PCF_VIOL_CATEGORY_06', 'PCF_VIOL_CATEGORY_07', 'PCF_VIOL_CATEGORY_08',
       'PCF_VIOL_CATEGORY_09', 'PCF_VIOL_CATEGORY_10', 'PCF_VIOL_CATEGORY_11',
       'PCF_VIOL_CATEGORY_12', 'PCF_VIOL_CATEGORY_13', 'PCF_VIOL_CATEGORY_14',
       'PCF_VIOL_CATEGORY_15', 'PCF_VIOL_CATEGORY_16', 'PCF_VIOL_CATEGORY_17',
       'PCF_VIOL_CATEGORY_18', 'PCF_VIOL_CATEGORY_21', 'PCF_VIOL_CATEGORY_22'],
      dtype='object')

In [166]:
import statsmodels.api as sm
import patsy
formula = "Fatality ~ INTERSECTION + PCF_VIOL_CATEGORY_03"
y,X = patsy.dmatrices(formula,data=m)
model = sm.Logit(y,X).fit()
model.summary()

         Current function value: 0.035203
         Iterations: 35




0,1,2,3
Dep. Variable:,Fatality,No. Observations:,3870.0
Model:,Logit,Df Residuals:,3866.0
Method:,MLE,Df Model:,3.0
Date:,"Fri, 11 Oct 2019",Pseudo R-squ.:,0.06635
Time:,21:15:10,Log-Likelihood:,-136.23
converged:,False,LL-Null:,-145.92
Covariance Type:,nonrobust,LLR p-value:,0.0002298

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-4.2484,0.236,-17.966,0.000,-4.712,-3.785
INTERSECTION[T.1],-1.5690,0.505,-3.106,0.002,-2.559,-0.579
INTERSECTION[T.-],-13.4849,1217.770,-0.011,0.991,-2400.270,2373.300
PCF_VIOL_CATEGORY_03,-2.2916,1.025,-2.236,0.025,-4.301,-0.283


In [None]:
PCF_VIOL_CATEGORY_11

In [None]:
c = pd.read_csv("Collisions.csv")
c = c[['CASE_ID','COLLISION_DATE','COLLISION_TIME','INTERSECTION','COLLISION_SEVERITY',
       'LIGHTING','LOCATION_TYPE', 'ROAD_SURFACE', 'ROAD_COND_1', 'PEDESTRIAN_ACCIDENT',
       'BICYCLE_ACCIDENT', 'MOTORCYCLE_ACCIDENT', 'ALCOHOL_INVOLVED','PCF_VIOL_CATEGORY']]

In [None]:
%matplotlib inline

In [None]:
c['PCF_VIOL_CATEGORY'].value_counts()