In [1]:
%%latex
\tableofcontents

<IPython.core.display.Latex object>

# Data Cleanup

## Import Libraries

### General Libraries

In [2]:
import sys
print ('Python version: {}'.format(sys.version))
import pandas as pd
print ('pandas version: {}'.format(pd.__version__))
import matplotlib
print ('matplotlib version: {}'.format(matplotlib.__version__))
import numpy as np
print ('NumPy version: {}'.format(np.__version__))
import scipy as sp
print ('SciPy version: {}'.format(sp.__version__))
import IPython
print ('IPython version: {}'.format(IPython.__version__))
import sklearn
print ('scikit-learn version: {}'.format(sklearn.__version__))
import imblearn

import random
import time
import warnings
import copy
warnings.filterwarnings('ignore')
print ('-*'*20)
from subprocess import check_output
print (check_output(['ls', './data']).decode('utf8'))

Python version: 3.8.8 (default, Apr 13 2021, 12:59:45) 
[Clang 10.0.0 ]
pandas version: 1.2.4
matplotlib version: 3.3.4
NumPy version: 1.20.1
SciPy version: 1.6.2
IPython version: 7.22.0
scikit-learn version: 0.24.1
-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*
2019_Crash_1_Database.csv
CODE_TB.xlsx



### Visualization Libraries

In [3]:
# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

### Machine Learning Libraries

In [4]:
# machine learning
from sklearn.model_selection import train_test_split

from sklearn.ensemble import (
    RandomForestClassifier, 
    GradientBoostingClassifier, 
    ExtraTreesClassifier, 
    BaggingClassifier,
    AdaBoostClassifier,
    VotingClassifier,
    StackingClassifier
)

from sklearn.linear_model import (
    LogisticRegression,
    Perceptron, 
    SGDClassifier, 
    RidgeClassifier, 
    RidgeClassifierCV
)

from sklearn.naive_bayes import GaussianNB

from sklearn.neighbors import (
    KNeighborsClassifier, 
    RadiusNeighborsClassifier
)

from sklearn.neural_network import MLPClassifier

from sklearn.svm import (
    SVC, 
    LinearSVC,
    NuSVC
)

from sklearn.tree import (
    DecisionTreeClassifier,
    ExtraTreeClassifier
)

from sklearn.metrics import (
    precision_score, 
    recall_score, 
    accuracy_score, 
    balanced_accuracy_score, 
    f1_score, 
    confusion_matrix
)

from sklearn.utils import shuffle

## Import Data

We will bring in the data as data_originalk, and make a deep copy, data_raw.
Then we will process each column, copy it to data, and delete it from data_raw

In [5]:
data_original = pd.read_csv('./data/2019_Crash_1_Database.csv', parse_dates = ['crash_date', 'crash_hour', 'crash_time'])
data_raw = data_original.copy(deep=True)
data = pd.DataFrame()
data_dummy_fields = []
data_raw_fields_to_drop = []

## Fields

### Fields with the Dependent Variable

The dependent variable y=1 if somebody died, y=0 otherwise.  
We have two fields that give us this information.  
'num_tot_kil' [0, 1, 2, 3, 4], giving the number killed.

'severity_cd' ['E' 'D' 'C' 'B' 'A'], with 'A' being 'Fatal.'

Do these columns agree?  Yes (below).

In [6]:
for x in ['num_tot_kil', 'severity_cd']:
    print (data_raw[x].isnull().sum())
A = np.where( (
    (data_raw['num_tot_kil'] == 0) & (data_raw['severity_cd'] == 'A') |
    (data_raw['num_tot_kil'] > 0) & (data_raw['severity_cd'] != 'A') 
))
print (A)

0
0
(array([], dtype=int64),)


In [7]:
data['fatal'] = data_raw['num_tot_kil'].apply(lambda x: 1 if x>0 else x)
for x in ['num_tot_kil', 'severity_cd']:
    data_raw_fields_to_drop.append(x)

In [8]:
data['injury'] = data_raw['num_tot_inj'].apply(lambda x: 1 if x>0 else x)
for x in ['num_tot_inj']:
    data_raw_fields_to_drop.append(x)

### Fields where I have No Idea What This Means

In [9]:
for x in ['quadrant', 'spotted_by', 'bypass']:
    data_raw_fields_to_drop.append(x)

### Fields with Time

- crash_date
- crash_hour
- crash_year
- crash_time

The years are all the same.

What might be interesting is the month and the day of week.

In [10]:
for x in ['crash_date', 'crash_hour']:
    print (data_raw[x].isnull().sum())
#print (data_raw.crash_hour.value_counts())
data_raw.crash_hour.replace(['  '], 25, inplace=True)
#print (data_raw.crash_hour.unique())



data['crash_month'] = data_raw['crash_date'].dt.month
data['crash_dayofweek'] = data_raw['crash_date'].dt.dayofweek
data['crash_hour'] = data_raw['crash_hour'].astype(int)

for x in ['crash_month', 'crash_dayofweek', 'crash_hour']:
    data_dummy_fields.append(x)
for x in ['crash_date', 'crash_hour', 'crash_year', 'crash_time']:
    data_raw_fields_to_drop.append(x)
print (data.head())
print (data.info())

0
0
   fatal  injury  crash_month  crash_dayofweek  crash_hour
0      0       0            1                1          17
1      0       0            1                1          23
2      0       0            1                1          22
3      0       0            1                1          18
4      0       1            1                1          19
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 160186 entries, 0 to 160185
Data columns (total 5 columns):
 #   Column           Non-Null Count   Dtype
---  ------           --------------   -----
 0   fatal            160186 non-null  int64
 1   injury           160186 non-null  int64
 2   crash_month      160186 non-null  int64
 3   crash_dayofweek  160186 non-null  int64
 4   crash_hour       160186 non-null  int64
dtypes: int64(5)
memory usage: 6.1 MB
None


### Fields with Direction
I think I'll use just 'pri_road_dir,', which seems clean enough.  
Change the blanks to 'Z'.
Convert to type 'category'.
Convert category to codes.

In [11]:
for x in data_raw:
    if 'dir' in x:
        values = data_raw[x].unique()
        print (x, sorted(values, key=lambda x: (str(type(x)), x)))
data_raw['pri_road_dir'].replace([' '], 'Z', inplace=True)

data_raw['pri_road_dir'] = data_raw['pri_road_dir'].astype('object')
#data_raw['pri_road_dir'] = data_raw['pri_road_dir'].cat.codes

print (data_raw['pri_road_dir'].unique())
                                    
data['pri_road_dir'] = data_raw['pri_road_dir']

for x in ['pri_road_dir']:
    data_dummy_fields.append(x)
for x in ['travel_dirs', 'pri_dir', 'pri_road_dir']:
    data_raw_fields_to_drop.append(x)

print ()
print (data.head())
print (data.info())

travel_dirs ['    ', 'E   ', 'EE  ', 'EEE ', 'EEEE', 'EEEN', 'EEES', 'EEEW', 'EEN ', 'EENE', 'EENN', 'EENW', 'EES ', 'EESS', 'EESW', 'EEW ', 'EEWE', 'EEWW', 'EN  ', 'ENE ', 'ENEE', 'ENN ', 'ENNE', 'ENNN', 'ENNS', 'ENS ', 'ENSS', 'ENSW', 'ENW ', 'ENWN', 'ENWW', 'ES  ', 'ESE ', 'ESEE', 'ESN ', 'ESNE', 'ESNN', 'ESS ', 'ESSE', 'ESSS', 'ESSW', 'ESW ', 'ESWW', 'EW  ', 'EWE ', 'EWEE', 'EWN ', 'EWNN', 'EWS ', 'EWSS', 'EWW ', 'EWWE', 'EWWS', 'EWWW', 'N   ', 'NE  ', 'NEE ', 'NEEE', 'NEEW', 'NEN ', 'NES ', 'NESN', 'NESS', 'NEW ', 'NEWE', 'NN  ', 'NNE ', 'NNEE', 'NNES', 'NNN ', 'NNNE', 'NNNN', 'NNNS', 'NNNW', 'NNS ', 'NNSN', 'NNSS', 'NNW ', 'NNWN', 'NNWW', 'NS  ', 'NSE ', 'NSEE', 'NSEN', 'NSN ', 'NSNN', 'NSNW', 'NSS ', 'NSSE', 'NSSS', 'NSW ', 'NSWW', 'NW  ', 'NWE ', 'NWEE', 'NWEN', 'NWN ', 'NWNS', 'NWS ', 'NWSS', 'NWW ', 'NWWW', 'S   ', 'S S ', 'SE  ', 'SEE ', 'SEEE', 'SEN ', 'SENN', 'SES ', 'SESS', 'SEW ', 'SEWW', 'SN  ', 'SNE ', 'SNEN', 'SNN ', 'SNNN', 'SNNS', 'SNS ', 'SNSN', 'SNSS', 'SNW ', 'SN

### Fields of Binary Features

In [12]:
for x in ['intersection', 'alcohol', 'roadway_departure', 'lane_departure', 'dr_sex_1', 'dr_sex_2']:
    data_raw[x].fillna(' ', inplace=True)
    data_raw[x].replace([' '], data_raw[x].mode(), inplace=True)
    data_raw[x].replace({'M':1, 'F':0, 'No':0, 'Yes':1}, inplace=True)
    values = data_raw[x].unique()
    print (x, sorted(values, key=lambda x: (str(type(x)), x)))  
    data[x] = data_raw[x]
    data_raw_fields_to_drop.append(x)

print ()
print (data.head())
print (data.info()) 

intersection [0, 1]
alcohol [0, 1]
roadway_departure [0, 1]
lane_departure [0, 1]
dr_sex_1 [0, 1]
dr_sex_2 [0, 1]

   fatal  injury  crash_month  crash_dayofweek  crash_hour pri_road_dir  \
0      0       0            1                1          17            Z   
1      0       0            1                1          23            Z   
2      0       0            1                1          22            W   
3      0       0            1                1          18            Z   
4      0       1            1                1          19            Z   

   intersection  alcohol  roadway_departure  lane_departure  dr_sex_1  \
0             1        0                  0               0         1   
1             1        0                  0               0         1   
2             0        0                  0               0         1   
3             0        0                  1               1         0   
4             1        0                  0               0         0

### Fields with Integer Values
- 'num_veh'
    - Not dummy, because increasing values mean something.
    - Make into:
        - single_vehicle
        - two_vehicle
        - multi_vehicle
- 'parish_cd'
- 'parish_cd.1'
- 'city_cd'  
    - I don't know what cities the codes correlate to, but the '0' is probably 'Not in any of the 19 cities,' and may correlate to 'not urban.'

In [13]:
# If more than 10, just lump in with 10.
data['single_vehicle'] = data_raw['num_veh'].apply(lambda x: 1 if x==1 else 0)
data['two_vehicle'] = data_raw['num_veh'].apply(lambda x: 1 if x==2 else 0)
data['multi_vehicle'] = data_raw['num_veh'].apply(lambda x: 1 if x>2 else 0)

for x in ['num_veh']:
    data_raw_fields_to_drop.append(x)
    


for x in ['parish_cd', 'parish_cd.1', 'city_cd']:
    data_raw[x].fillna(data_raw[x].mode(), inplace=True)
    values = data_raw[x].unique()
    print (x, sorted(values, key=lambda x: (str(type(x)), x)))  
    data[x] = data_raw[x]
    data_dummy_fields.append(x)
    data_raw_fields_to_drop.append(x)


print ()
print (data.head())
print (data.info())

parish_cd [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64]
parish_cd.1 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64]
city_cd [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 44]

   fatal  injury  crash_month  crash_dayofweek  crash_hour pri_road_dir  \
0      0       0            1                1          17            Z   
1      0       0            1                1          23            Z   
2      0       0            1                1          22            W   
3      0       0            1                1          18            Z   
4      0       1        

### Fields to Put in Ranges
- 'dr_age_1'
- 'dr_age_2'

In [14]:
for x in ['dr_age_1','dr_age_2']:
    data_raw[x].fillna(0, inplace=True)
    data_raw[x] = data_raw[x].astype(int)
    data_raw.loc[(data_raw[x]>100), x] = 0
    xbin = x + '_bin'
    data_raw[xbin] = data_raw[x].replace(0,np.nan)    
    data_raw[xbin] = pd.qcut(data_raw[xbin], 5, duplicates='drop').cat.add_categories(0)
    data_raw[xbin].fillna(0, inplace=True)    
    
    data_raw[xbin] = data_raw[xbin].astype('category')
#    data_raw[xbin] = data_raw[xbin].cat.codes

    data[xbin] = data_raw[xbin]
    values = data_raw[xbin].unique()
    print (data[xbin].value_counts())
    data_dummy_fields.append(xbin)
    data_raw_fields_to_drop.append(x)
    data_raw_fields_to_drop.append(xbin)

print ()
print (data.head())
print (data.info())

(22.0, 30.0]     29092
(1.999, 22.0]    28982
(40.0, 56.0]     28270
(30.0, 40.0]     27890
(56.0, 99.0]     27462
0                18490
Name: dr_age_1_bin, dtype: int64
0                33099
(1.999, 26.0]    27664
(34.0, 45.0]     27379
(57.0, 97.0]     24573
(45.0, 57.0]     24222
(26.0, 34.0]     23249
Name: dr_age_2_bin, dtype: int64

   fatal  injury  crash_month  crash_dayofweek  crash_hour pri_road_dir  \
0      0       0            1                1          17            Z   
1      0       0            1                1          23            Z   
2      0       0            1                1          22            W   
3      0       0            1                1          18            Z   
4      0       1            1                1          19            Z   

   intersection  alcohol  roadway_departure  lane_departure  dr_sex_1  \
0             1        0                  0               0         1   
1             1        0                  0               0 

### Distance from the Road
This one is weird.  

- The units are either in feet or miles.
- For many values the units are missing, and for others the measure is extreme. 
- I'm going to 
-- make all of the entries with missing units or negative distance zero, 
-- change all of the lengths to feet,
-- take out the zeroes, 
-- put in ranges,
-- and put the zeroes back.

In [15]:
for x in ['pri_measure']:
    data_raw[x] = data_raw[x].str.strip()
    values = data_raw[x].unique()
    print (x, len(values), values)
data_raw.loc[data_raw['pri_measure'] == '', 'pri_dist'] = 0
data_raw.loc[data_raw['pri_dist'] <= 0, 'pri_dist'] = 0
data_raw.loc[data_raw['pri_measure'] == 'MI', 'pri_dist'] *= 5280

for x in ['pri_dist']:
    data_raw[x].fillna(0, inplace=True)
#    data_raw[x] = data_raw[x].astype(int)
#    data_raw.loc[(data_raw[x]>100), x] = 0
    xbin = x + '_bin'
    data_raw[xbin] = data_raw[x].replace(0,np.nan)    
    data_raw[xbin] = pd.qcut(data_raw[xbin], 5, duplicates='drop').cat.add_categories(0)
    data_raw[xbin].fillna(0, inplace=True)
    
    data_raw[xbin] = data_raw[xbin].astype('category')
#    data_raw[xbin] = data_raw[xbin].cat.codes
    
    data[xbin] = data_raw[xbin]
    print (data_raw[xbin].value_counts())
    data_dummy_fields.append(xbin)
    data_raw_fields_to_drop.append(x)
    data_raw_fields_to_drop.append(xbin)

for x in ['pri_measure']:
    data_raw_fields_to_drop.append(x)
    

print ()
print (data.head())
print (data.info())

pri_measure 3 ['' 'FT' 'MI']
0                        60888
(-0.0009, 50.0]          25224
(150.0, 528.0]           22264
(2901.02, 52794720.0]    19860
(50.0, 150.0]            15993
(528.0, 2901.02]         15957
Name: pri_dist_bin, dtype: int64

   fatal  injury  crash_month  crash_dayofweek  crash_hour pri_road_dir  \
0      0       0            1                1          17            Z   
1      0       0            1                1          23            Z   
2      0       0            1                1          22            W   
3      0       0            1                1          18            Z   
4      0       1            1                1          19            Z   

   intersection  alcohol  roadway_departure  lane_departure  ...  dr_sex_2  \
0             1        0                  0               0  ...         1   
1             1        0                  0               0  ...         1   
2             0        0                  0               0  ...  

### Alpha fields with 'Y' = 'Unknown' or 'Z' = 'Other'

We have lots of fields where 'Y' is 'Unknown' and 'Z' is 'Other.  
- Merge nan, blank, erroneous intergers, Y, and Z, into 'Z'. 

Other related Alpha fields:
- 'crash_type' does not have a Y or Z, and I can't figure out what it means.
- 'hwy_class' is mixed Alpha and integers, and I have no idea what it means.
- 'contributing_factor' has two values, 'R' and 'O', and I have no idea what it means.
- 'veh_severity' has five values, and I have no idea what it means.

These fields have trailing spaces I had to remove:
- 'f_harm_ev_cd1'
- 'm_harm_ev_cd1'

I lumped in some other fields here:
 - 'crash_type'
 - 'pri_contrib_fac_cd'
 - 'sec_pri_contrib_fac_cd'
 - 'hwy_type_cd'

In [16]:
for x in ['f_harm_ev_cd1', 'm_harm_ev_cd1']:
    data_raw[x] = data_raw[x].str.strip()

for x in data_raw:
    values = data_raw[x].unique()
    if (
        (('Y' in values or 'Z' in values) and len(values)<50)
        or x in ['crash_type', 'pri_contrib_fac_cd', 'sec_contrib_fac_cd', 'hwy_type_cd']
    ):
        print (x, sorted(values, key=lambda x: (str(type(x)), x)))
        data_raw[x].fillna('Z', inplace=True)
        data_raw[x].replace([' ', 'Y'], 'Z', inplace=True)
        data_raw[x] = data_raw[x].apply(lambda x: 'Z' if x.isnumeric() else x)
        values = data_raw[x].unique()
        print (x, sorted(values, key=lambda x: (str(type(x)), x)))
        print ()
        
        data_raw[x] = data_raw[x].astype('category')
#        data_raw[x] = data_raw[x].cat.codes
        
        data[x] = data_raw[x]
        data_dummy_fields.append(x)
        data_raw_fields_to_drop.append(x)
        
for x in ['hwy_class', 'contributing_factor']:
    data_raw_fields_to_drop.append(x)
    

print ()
print (data.head())
print (data.info())

f_harm_ev_cd1 ['', 'A', 'AA', 'B', 'BB', 'C', 'CC', 'D', 'DD', 'E', 'EE', 'F', 'FF', 'G', 'GG', 'H', 'HH', 'I', 'II', 'J', 'JJ', 'K', 'KK', 'L', 'LL', 'M', 'MM', 'N', 'NN', 'O', 'OO', 'P', 'PP', 'Q', 'QQ', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'YY', 'Z']
f_harm_ev_cd1 ['', 'A', 'AA', 'B', 'BB', 'C', 'CC', 'D', 'DD', 'E', 'EE', 'F', 'FF', 'G', 'GG', 'H', 'HH', 'I', 'II', 'J', 'JJ', 'K', 'KK', 'L', 'LL', 'M', 'MM', 'N', 'NN', 'O', 'OO', 'P', 'PP', 'Q', 'QQ', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'YY', 'Z']

m_harm_ev_cd1 ['', 'A', 'AA', 'B', 'BB', 'C', 'CC', 'D', 'DD', 'E', 'EE', 'F', 'FF', 'G', 'GG', 'H', 'HH', 'I', 'II', 'J', 'JJ', 'K', 'KK', 'L', 'LL', 'M', 'MM', 'N', 'NN', 'O', 'OO', 'P', 'PP', 'Q', 'QQ', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'YY', 'Z']
m_harm_ev_cd1 ['', 'A', 'AA', 'B', 'BB', 'C', 'CC', 'D', 'DD', 'E', 'EE', 'F', 'FF', 'G', 'GG', 'H', 'HH', 'I', 'II', 'J', 'JJ', 'K', 'KK', 'L', 'LL', 'M', 'MM', 'N', 'NN', 'O', 'OO', 'P', 'PP', 'Q', 'QQ', 'R', 'S', 'T', 'U', 'V', 'W

## Blank Field

In [17]:
for x in ['ped_actions_2']:
    data_raw_fields_to_drop.append(x)

## Review Data

### Drop Used Fields from 'data_raw'

In [18]:
print (data_raw.shape)
data_raw_dropped = data_raw.drop(data_raw_fields_to_drop, axis=1, inplace=False)
print (data_raw_dropped.shape)

(160186, 79)
(160186, 18)


### Remaining Fields
I don't know that any of these are likely to correlate.  

In [19]:
for x in data_raw_dropped:
    values = data_raw_dropped[x].unique()
    print (x, len(values))

route 1024
milepoint 53808
crash_num 160186
prior_movements 1322
csect 2141
logmile 18203
lrs_id 4788
lrs_logmile 18093
adt 749
intersection_id 15037
ORIG_LATITUDE 91540
ORIG_LONGITUDE 87921
DOTD_LATITUDE 127283
DOTD_LONGITUDE 130753
pri_hwy_num 1006
milepost 7172
pri_road_name 15724
inter_road 35371


### Fields in 'data' dataframe

In [20]:
for x in data:
    print (x)

fatal
injury
crash_month
crash_dayofweek
crash_hour
pri_road_dir
intersection
alcohol
roadway_departure
lane_departure
dr_sex_1
dr_sex_2
single_vehicle
two_vehicle
multi_vehicle
parish_cd
parish_cd.1
city_cd
dr_age_1_bin
dr_age_2_bin
pri_dist_bin
f_harm_ev_cd1
m_harm_ev_cd1
man_coll_cd
crash_type
surf_cond_cd
invest_agency_cd
veh_type_cd1
veh_type_cd2
road_rel_cd
location_type
veh_severity_cd
hwy_type_cd
bypass
pri_contrib_fac_cd
sec_contrib_fac_cd
vision_obscure_1
vision_obscure_2
movement_reason_1
movement_reason_2
ped_actions_1
veh_lighting_1
veh_lighting_2
traff_cntl_cond_1
traff_cntl_cond_2
lighting_cd
dr_cond_cd1
dr_cond_cd2
veh_cond_cd1
veh_cond_cd2


In [21]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 160186 entries, 0 to 160185
Data columns (total 50 columns):
 #   Column              Non-Null Count   Dtype   
---  ------              --------------   -----   
 0   fatal               160186 non-null  int64   
 1   injury              160186 non-null  int64   
 2   crash_month         160186 non-null  int64   
 3   crash_dayofweek     160186 non-null  int64   
 4   crash_hour          160186 non-null  int64   
 5   pri_road_dir        160186 non-null  category
 6   intersection        160186 non-null  int64   
 7   alcohol             160186 non-null  int64   
 8   roadway_departure   160186 non-null  int64   
 9   lane_departure      160186 non-null  int64   
 10  dr_sex_1            160186 non-null  int64   
 11  dr_sex_2            160186 non-null  int64   
 12  single_vehicle      160186 non-null  int64   
 13  two_vehicle         160186 non-null  int64   
 14  multi_vehicle       160186 non-null  int64   
 15  parish_cd        

In [22]:
data.head()

Unnamed: 0,fatal,injury,crash_month,crash_dayofweek,crash_hour,pri_road_dir,intersection,alcohol,roadway_departure,lane_departure,...,ped_actions_1,veh_lighting_1,veh_lighting_2,traff_cntl_cond_1,traff_cntl_cond_2,lighting_cd,dr_cond_cd1,dr_cond_cd2,veh_cond_cd1,veh_cond_cd2
0,0,0,1,1,17,Z,1,0,0,0,...,Z,Z,A,A,V,A,Z,A,Z,K
1,0,0,1,1,23,Z,1,0,0,0,...,Z,Z,A,A,U,C,A,A,K,K
2,0,0,1,1,22,W,0,0,0,0,...,Z,A,A,Q,C,C,A,A,K,K
3,0,0,1,1,18,Z,0,0,1,1,...,Z,A,Z,S,Z,B,A,Z,K,Z
4,0,1,1,1,19,Z,1,0,0,0,...,Z,A,A,R,R,C,A,A,K,K


## Get Dummies

### Fields to become Dummies

In [23]:
# Remove duplicates
data_dummy_fields = list(set(data_dummy_fields))
for field in data_dummy_fields:
    print (field)

crash_month
crash_type
surf_cond_cd
crash_dayofweek
city_cd
parish_cd.1
pri_contrib_fac_cd
traff_cntl_cond_2
movement_reason_2
dr_cond_cd2
traff_cntl_cond_1
veh_type_cd1
parish_cd
dr_cond_cd1
crash_hour
hwy_type_cd
movement_reason_1
road_rel_cd
veh_cond_cd2
veh_type_cd2
veh_cond_cd1
veh_severity_cd
veh_lighting_2
dr_age_1_bin
bypass
sec_contrib_fac_cd
vision_obscure_2
veh_lighting_1
dr_age_2_bin
lighting_cd
ped_actions_1
pri_road_dir
f_harm_ev_cd1
pri_dist_bin
location_type
vision_obscure_1
man_coll_cd
m_harm_ev_cd1
invest_agency_cd


In [24]:
for field in data_dummy_fields:
    print (field)
    data = pd.get_dummies(data,columns=[field],drop_first=False)
for x in data:
    print (x)
print (data.head())

crash_month
crash_type
surf_cond_cd
crash_dayofweek
city_cd
parish_cd.1
pri_contrib_fac_cd
traff_cntl_cond_2
movement_reason_2
dr_cond_cd2
traff_cntl_cond_1
veh_type_cd1
parish_cd
dr_cond_cd1
crash_hour
hwy_type_cd
movement_reason_1
road_rel_cd
veh_cond_cd2
veh_type_cd2
veh_cond_cd1
veh_severity_cd
veh_lighting_2
dr_age_1_bin
bypass
sec_contrib_fac_cd
vision_obscure_2
veh_lighting_1
dr_age_2_bin
lighting_cd
ped_actions_1
pri_road_dir
f_harm_ev_cd1
pri_dist_bin
location_type
vision_obscure_1
man_coll_cd
m_harm_ev_cd1
invest_agency_cd
fatal
injury
intersection
alcohol
roadway_departure
lane_departure
dr_sex_1
dr_sex_2
single_vehicle
two_vehicle
multi_vehicle
crash_month_1
crash_month_2
crash_month_3
crash_month_4
crash_month_5
crash_month_6
crash_month_7
crash_month_8
crash_month_9
crash_month_10
crash_month_11
crash_month_12
crash_type_A
crash_type_B
crash_type_C
crash_type_D
crash_type_E
crash_type_F
crash_type_G
crash_type_H
crash_type_J
crash_type_K
crash_type_M
crash_type_N
crash_ty

In [25]:
data = data.astype('bool')

In [26]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 160186 entries, 0 to 160185
Columns: 659 entries, fatal to invest_agency_cd_Z
dtypes: bool(659)
memory usage: 100.7 MB


In [27]:
data.select_dtypes(include=['int64']).columns 

Index([], dtype='object')

In [28]:
data.to_csv('../../../619_Big_Files/06_10_21_Data.csv', index=False)

## Split into 'train' and 'test' Randomly

In [29]:
#train, test = train_test_split(data, test_size=0.2)
#x_train = train.drop(['fatal'], axis=1)
#y_train = train['fatal']
#x_test = test.drop(['fatal'], axis=1)
#y_test = test['fatal']
#print (x_train.shape, y_train.shape, x_test.shape, y_test.shape, y_test[y_test==1].shape)

## Split into 'train' and 'test' Sets with Proportional number of Fatalities

In [30]:
data_positive = data[data['fatal'] == 1]
data_negative = data[data['fatal'] == 0]
train_positive, test_positive = train_test_split(data_positive, test_size=0.2)
train_negative, test_negative = train_test_split(data_negative, test_size=0.2)
train = pd.concat([train_positive, train_negative])
test = pd.concat([test_positive, test_negative])
# Randomly shuffle the rows of the train and test sets,
# because otherwise they have the positive on top and the negative on the bottom.
# "shuffle" is an sklearn function.
train = shuffle(train)
test = shuffle(test)
x_train = train.drop(['fatal'], axis=1)
y_train = train['fatal']
x_test = test.drop(['fatal'], axis=1)
y_test = test['fatal']

print (x_train.shape, y_train.shape, x_test.shape, y_test.shape)
print (train_positive.shape, test_positive.shape, y_test[y_test==1].shape)

(128148, 658) (128148,) (32038, 658) (32038,)
(544, 659) (137, 659) (137,)


## Oversample with RandomOverSampler (Naive Random Oversampling)
- Use this after we've split into train and test sets, because we only want to over/under sample the train set.  

import imblearn
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler(random_state=0)
x_train, y_train = ros.fit_resample(x_train, y_train)
print (x_train.shape, y_train.shape, y_train[y_train==0].shape, y_train[y_train==1].shape)

## Oversample with SMOTE
- Shouldn't work well because SMOTE only works with continuous data

from imblearn.over_sampling import SMOTE
x_train, y_train = SMOTE().fit_resample(x_train, y_train)
print (x_train.shape, y_train.shape, y_train[y_train==0].shape, y_train[y_train==1].shape)


## Oversample with ADASYN
-- Adaptive Synthetic sampling

from imblearn.over_sampling import ADASYN
x_train, y_train = ADASYN().fit_resample(x_train, y_train)
print (x_train.shape, y_train.shape, y_train[y_train==0].shape, y_train[y_train==1].shape)


## Undersample with Cluster Centroids

from imblearn.under_sampling import ClusterCentroids
cc = ClusterCentroids(random_state=0)
x_train, y_train = cc.fit_resample(x_train, y_train)
print (x_train.shape, y_train.shape, y_train[y_train==0].shape, y_train[y_train==1].shape)



## Undersample with Random UnderSampler

In [31]:
from imblearn.under_sampling import RandomUnderSampler
sampler = RandomUnderSampler(random_state=0)
x_train, y_train = sampler.fit_resample(x_train, y_train)
print (x_train.shape, y_train.shape, y_train[y_train==0].shape, y_train[y_train==1].shape)



(1088, 658) (1088,) (544,) (544,)


In [32]:
print (x_train.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1088 entries, 0 to 1087
Columns: 658 entries, injury to invest_agency_cd_Z
dtypes: bool(658)
memory usage: 699.2 KB
None


## Perform Exploratory Analysis with Statistics on the 'train' Set

### Logistic Regression

In [33]:
logreg = LogisticRegression()
logreg.fit(x_train, y_train)
y_pred = logreg.predict(x_test)
acc_log = round(logreg.score(x_train, y_train) * 100, 2)
acc_log

97.24

In [34]:
coeff_data = pd.DataFrame(train.columns.delete(0))
coeff_data.columns = ['Feature']
coeff_data["Correlation"] = pd.Series(logreg.coef_[0])

coeff_data.sort_values(by='Correlation', ascending=False)

Unnamed: 0,Feature,Correlation
426,veh_severity_cd_B,2.191444
22,crash_type_A,2.165418
324,dr_cond_cd1_Z,1.772338
25,crash_type_D,1.392194
439,"dr_age_1_bin_(56.0, 99.0]",1.330564
...,...,...
656,invest_agency_cd_C,-1.295002
329,crash_hour_4,-1.315540
440,dr_age_1_bin_0,-1.635329
515,ped_actions_1_Z,-1.643782


In [35]:
for x in x_train:
    print ('fatal correlation by: ', x)
    A = train[[x, 'fatal']].groupby(x, as_index=False).mean()
    A = A.sort_values(by='fatal', ascending=False)
    print (A)
    print ()

fatal correlation by:  injury
   injury     fatal
1    True  0.006451
0   False  0.003385

fatal correlation by:  intersection
   intersection     fatal
0         False  0.005178
1          True  0.002636

fatal correlation by:  alcohol
   alcohol     fatal
1     True  0.029916
0    False  0.003487

fatal correlation by:  roadway_departure
   roadway_departure     fatal
1               True  0.014248
0              False  0.002144

fatal correlation by:  lane_departure
   lane_departure     fatal
1            True  0.014641
0           False  0.001640

fatal correlation by:  dr_sex_1
   dr_sex_1     fatal
1      True  0.005301
0     False  0.002546

fatal correlation by:  dr_sex_2
   dr_sex_2     fatal
0     False  0.004840
1      True  0.003317

fatal correlation by:  single_vehicle
   single_vehicle     fatal
1            True  0.014067
0           False  0.002378

fatal correlation by:  two_vehicle
   two_vehicle     fatal
0        False  0.012029
1         True  0.002026

fatal cor

   parish_cd.1_24     fatal
1            True  0.013870
0           False  0.004191

fatal correlation by:  parish_cd.1_25
   parish_cd.1_25     fatal
1            True  0.021429
0           False  0.004226

fatal correlation by:  parish_cd.1_26
   parish_cd.1_26     fatal
0           False  0.004464
1            True  0.002341

fatal correlation by:  parish_cd.1_27
   parish_cd.1_27     fatal
1            True  0.009259
0           False  0.004220

fatal correlation by:  parish_cd.1_28
   parish_cd.1_28     fatal
0           False  0.004427
1            True  0.001803

fatal correlation by:  parish_cd.1_29
   parish_cd.1_29     fatal
1            True  0.006159
0           False  0.004218

fatal correlation by:  parish_cd.1_30
   parish_cd.1_30     fatal
1            True  0.015873
0           False  0.004234

fatal correlation by:  parish_cd.1_31
   parish_cd.1_31     fatal
1            True  0.004870
0           False  0.004239

fatal correlation by:  parish_cd.1_32
   parish_cd.1_3

   traff_cntl_cond_2_L     fatal
0                False  0.004246
1                 True  0.000000

fatal correlation by:  traff_cntl_cond_2_M
   traff_cntl_cond_2_M     fatal
0                False  0.004248
1                 True  0.000000

fatal correlation by:  traff_cntl_cond_2_N
   traff_cntl_cond_2_N     fatal
0                False  0.004246
1                 True  0.000000

fatal correlation by:  traff_cntl_cond_2_O
   traff_cntl_cond_2_O     fatal
0                False  0.004247
1                 True  0.000000

fatal correlation by:  traff_cntl_cond_2_P
   traff_cntl_cond_2_P     fatal
0                False  0.004246
1                 True  0.000000

fatal correlation by:  traff_cntl_cond_2_Q
   traff_cntl_cond_2_Q     fatal
1                 True  0.006032
0                False  0.004081

fatal correlation by:  traff_cntl_cond_2_R
   traff_cntl_cond_2_R     fatal
0                False  0.005076
1                 True  0.002098

fatal correlation by:  traff_cntl_cond_2_S

   parish_cd_10     fatal
0         False  0.004294
1          True  0.003435

fatal correlation by:  parish_cd_11
   parish_cd_11     fatal
0         False  0.004249
1          True  0.000000

fatal correlation by:  parish_cd_12
   parish_cd_12     fatal
1          True  0.014493
0         False  0.004234

fatal correlation by:  parish_cd_13
   parish_cd_13     fatal
1          True  0.015625
0         False  0.004239

fatal correlation by:  parish_cd_14
   parish_cd_14     fatal
1          True  0.006579
0         False  0.004242

fatal correlation by:  parish_cd_15
   parish_cd_15     fatal
1          True  0.008547
0         False  0.004237

fatal correlation by:  parish_cd_16
   parish_cd_16     fatal
0         False  0.004250
1          True  0.003311

fatal correlation by:  parish_cd_17
   parish_cd_17     fatal
0         False  0.004486
1          True  0.002731

fatal correlation by:  parish_cd_18
   parish_cd_18     fatal
0         False  0.004248
1          True  0.000000

f

   movement_reason_1_I     fatal
0                False  0.004257
1                 True  0.002418

fatal correlation by:  movement_reason_1_J
   movement_reason_1_J     fatal
0                False  0.004254
1                 True  0.000000

fatal correlation by:  movement_reason_1_K
   movement_reason_1_K     fatal
1                 True  0.010315
0                False  0.003864

fatal correlation by:  movement_reason_1_L
   movement_reason_1_L     fatal
0                False  0.005971
1                 True  0.002991

fatal correlation by:  movement_reason_1_M
   movement_reason_1_M     fatal
1                 True  0.005435
0                False  0.004233

fatal correlation by:  movement_reason_1_N
   movement_reason_1_N     fatal
0                False  0.004252
1                 True  0.002331

fatal correlation by:  movement_reason_1_O
   movement_reason_1_O     fatal
0                False  0.004246
1                 True  0.000000

fatal correlation by:  movement_reason_1_P

   vision_obscure_2_A     fatal
0               False  0.004267
1                True  0.002199

fatal correlation by:  vision_obscure_2_B
   vision_obscure_2_B     fatal
0               False  0.004246
1                True  0.000000

fatal correlation by:  vision_obscure_2_C
   vision_obscure_2_C     fatal
0               False  0.004246
1                True  0.000000

fatal correlation by:  vision_obscure_2_D
   vision_obscure_2_D     fatal
1                True  0.028571
0               False  0.004238

fatal correlation by:  vision_obscure_2_E
   vision_obscure_2_E     fatal
0               False  0.004245
1                True  0.000000

fatal correlation by:  vision_obscure_2_F
   vision_obscure_2_F     fatal
0               False  0.004245
1                True  0.000000

fatal correlation by:  vision_obscure_2_G
   vision_obscure_2_G     fatal
0               False  0.004245
1                True  0.000000

fatal correlation by:  vision_obscure_2_H
   vision_obscure_2_H     f

   f_harm_ev_cd1_EE     fatal
0             False  0.004246
1              True  0.000000

fatal correlation by:  f_harm_ev_cd1_F
   f_harm_ev_cd1_F     fatal
1             True  0.083333
0            False  0.004215

fatal correlation by:  f_harm_ev_cd1_FF
   f_harm_ev_cd1_FF     fatal
1              True  0.005848
0             False  0.004243

fatal correlation by:  f_harm_ev_cd1_G
   f_harm_ev_cd1_G     fatal
0            False  0.004248
1             True  0.000000

fatal correlation by:  f_harm_ev_cd1_GG
   f_harm_ev_cd1_GG     fatal
0             False  0.004247
1              True  0.000000

fatal correlation by:  f_harm_ev_cd1_H
   f_harm_ev_cd1_H     fatal
1             True  0.012077
0            False  0.004220

fatal correlation by:  f_harm_ev_cd1_HH
   f_harm_ev_cd1_HH     fatal
0             False  0.004254
1              True  0.000000

fatal correlation by:  f_harm_ev_cd1_I
   f_harm_ev_cd1_I     fatal
0            False  0.004246
1             True  0.000000

fatal co

   m_harm_ev_cd1_M     fatal
0            False  0.004245
1             True  0.000000

fatal correlation by:  m_harm_ev_cd1_MM
   m_harm_ev_cd1_MM     fatal
1              True  0.025641
0             False  0.004232

fatal correlation by:  m_harm_ev_cd1_N
   m_harm_ev_cd1_N     fatal
1             True  0.013672
0            False  0.004207

fatal correlation by:  m_harm_ev_cd1_NN
   m_harm_ev_cd1_NN     fatal
0             False  0.004252
1              True  0.002257

fatal correlation by:  m_harm_ev_cd1_O
   m_harm_ev_cd1_O     fatal
1             True  0.084423
0            False  0.003715

fatal correlation by:  m_harm_ev_cd1_OO
   m_harm_ev_cd1_OO     fatal
1              True  0.004474
0             False  0.004244

fatal correlation by:  m_harm_ev_cd1_P
   m_harm_ev_cd1_P     fatal
1             True  0.027888
0            False  0.004199

fatal correlation by:  m_harm_ev_cd1_PP
   m_harm_ev_cd1_PP     fatal
0             False  0.004259
1              True  0.000000

fatal c

## What are these highly correlated features?

### crash_type_0

In [36]:
data_raw['crash_type'].value_counts()

S    76102
U    28173
R     8657
Q     8629
P     6973
E     6317
G     5602
K     3020
N     2939
T     2842
X     2408
A     1711
F     1552
D     1523
H     1315
M     1222
B      969
C      153
J       79
Name: crash_type, dtype: int64

# Run ML Algorithms

In [37]:
print (x_train.shape, y_train.shape, y_train[y_train==0].shape, y_train[y_train==1].shape)



(1088, 658) (1088,) (544,) (544,)


In [38]:
Models = [
    # Ensemble
    AdaBoostClassifier(),
#    AdaBoostClassifier(class_weight="balanced"),
    BaggingClassifier(),
#    BaggingClassifier(class_weight="balanced"),
    ExtraTreesClassifier(), 
    ExtraTreesClassifier(class_weight="balanced"), 
    GradientBoostingClassifier(), 
#    GradientBoostingClassifier(class_weight="balanced"), 
    RandomForestClassifier(), 
    RandomForestClassifier(class_weight="balanced"), 
#    StackingClassifier(),# Needs extra argument, estimators, that I haven't looked up yet.
#    StackingClassifier(class_weight="balanced"),
#    VotingClassifier(),# Needs extra argument, estimators, that I haven't looked up yet.
#    VotingClassifier(class_weight="balanced"),
    
    # Linear Model
#    LogisticRegression(),
#    LogisticRegression(class_weight="balanced"),
#    Perceptron(), 
#    Perceptron(class_weight="balanced"), 
#    RidgeClassifier(), 
#    RidgeClassifier(class_weight="balanced"), 
#    RidgeClassifierCV(),
#    RidgeClassifierCV(class_weight="balanced"),
#    SGDClassifier(), 
#    SGDClassifier(class_weight="balanced"), 

    # Naive Bayes
    GaussianNB(),
    
    # Neighbors
#    KNeighborsClassifier(), # Commented out because it takes too long.
#    KNeighborsClassifier(class_weight = "balanced"), 
#    KNeighborsClassifier(n_neighbors = 3), 
#    RadiusNeighborsClassifier(),
#    RadiusNeighborsClassifier(class_weight="balanced"),

    # Neural Network
    MLPClassifier(),
#    MLPClassifier(class_weight = "balanced"),
    MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1),

    # SVM
    LinearSVC(),
    LinearSVC(class_weight="balanced"),
#    NuSVC(),
#    NuSVC(class_weight="balanced"),
    SVC(), 
    SVC(class_weight="balanced"), 

    # Tree
    DecisionTreeClassifier(),
    DecisionTreeClassifier(class_weight="balanced"),
    ExtraTreeClassifier(),
    ExtraTreeClassifier(class_weight="balanced"),
]


#By definition a confusion matrix C  is such that 
#C[i][j] is equal to the number of observations 
#known to be in group i and predicted to be in group j.

#Thus in binary classification, the count of 
#true negatives is C[0][0],
#false negatives is C[1][0], 
#true positives is C[1][1],
# and false positives is C[0][1].



for model in Models:
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    model_score = round(model.score(x_train,y_train) * 100, 2)
#    model_score = round(model.score(y_test, y_pred) * 100, 2)
    accuracy = round(accuracy_score(y_test, y_pred)*100,2)
    precision = round(precision_score(y_test, y_pred)*100,2)
    recall = round(recall_score(y_test, y_pred)*100,2)
    f1 = round(f1_score(y_test, y_pred)*100,2)
    C = confusion_matrix(y_test, y_pred)
    my_precision = C[1][1]/(C[1][1] + C[0][1])
    my_precision = round(my_precision*100,2)
    scale_factor = (C[1][0]+C[1][1])/(C[0][0] + C[0][1])
    balanced_precision = C[1][1]/(C[1][1] + C[0][1] * (C[1][0]+C[1][1])/(C[0][0] + C[0][1]))
    balanced_precision = round(balanced_precision*100,2)
    balanced_f1 = 2/(1/recall + 1/balanced_precision)
    balanced_f1 = round(balanced_f1,2)
    balanced_accuracy = (C[0][0]*scale_factor + C[1][1])/((C[0][0]+C[0][1])*scale_factor + C[1][0] + C[1][1])
    balanced_accuracy = round(balanced_accuracy*100,2)

    print (model)
    print ("Accuracy, Precision, Recall, f1")
    print (accuracy, precision, recall, f1)
    print (balanced_accuracy, balanced_precision, recall, balanced_f1)
    print (C)
    print ()

AdaBoostClassifier()
Accuracy, Precision, Recall, f1
89.82 3.79 93.43 7.28
91.62 90.16 93.43 91.77
[[28648  3253]
 [    9   128]]

BaggingClassifier()
Accuracy, Precision, Recall, f1
87.38 3.1 94.16 6.0
90.76 88.16 94.16 91.06
[[27866  4035]
 [    8   129]]

ExtraTreesClassifier()
Accuracy, Precision, Recall, f1
87.12 3.01 93.43 5.84
90.26 87.86 93.43 90.56
[[27782  4119]
 [    9   128]]

ExtraTreesClassifier(class_weight='balanced')
Accuracy, Precision, Recall, f1
87.49 3.06 91.97 5.92
89.72 88.01 91.97 89.95
[[27905  3996]
 [   11   126]]

GradientBoostingClassifier()
Accuracy, Precision, Recall, f1
89.75 3.76 93.43 7.23
91.58 90.1 93.43 91.73
[[28627  3274]
 [    9   128]]

RandomForestClassifier()
Accuracy, Precision, Recall, f1
86.17 2.71 89.78 5.26
87.97 86.64 89.78 88.18
[[27485  4416]
 [   14   123]]

RandomForestClassifier(class_weight='balanced')
Accuracy, Precision, Recall, f1
85.72 2.71 92.7 5.26
89.19 86.63 92.7 89.56
[[27335  4566]
 [   10   127]]

GaussianNB()
Accuracy, 