In [1]:
%%latex
\tableofcontents

<IPython.core.display.Latex object>

# Data Cleanup

## Import Libraries

### General Libraries

In [2]:
import sys
print ('Python version: {}'.format(sys.version))
import pandas as pd
print ('pandas version: {}'.format(pd.__version__))
import matplotlib
print ('matplotlib version: {}'.format(matplotlib.__version__))
import numpy as np
print ('NumPy version: {}'.format(np.__version__))
import scipy as sp
print ('SciPy version: {}'.format(sp.__version__))
import IPython
print ('IPython version: {}'.format(IPython.__version__))
import sklearn
print ('scikit-learn version: {}'.format(sklearn.__version__))

import random
import time
import warnings
import copy
warnings.filterwarnings('ignore')
print ('-*'*20)
from subprocess import check_output
print (check_output(['ls', './data']).decode('utf8'))

Python version: 3.9.1 (v3.9.1:1e5d33e9b9, Dec  7 2020, 12:10:52) 
[Clang 6.0 (clang-600.0.57)]
pandas version: 1.2.3
matplotlib version: 3.3.3
NumPy version: 1.19.5
SciPy version: 1.6.0
IPython version: 7.21.0
scikit-learn version: 0.24.1
-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*
2019_Crash_1_Database.csv
CODE_TB.xlsx



### Visualization Libraries

In [3]:
# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

### Machine Learning Libraries

In [4]:
# machine learning
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from sklearn.svm import SVC, LinearSVC

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier

from sklearn.tree import DecisionTreeClassifier

from sklearn.neural_network import MLPClassifier

from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score, confusion_matrix
from sklearn.utils import shuffle

## Import Data

We will bring in the data as data_originalk, and make a deep copy, data_raw.
Then we will process each column, copy it to data, and delete it from data_raw

In [5]:
data_original = pd.read_csv('./data/2019_Crash_1_Database.csv', parse_dates = ['crash_date', 'crash_hour', 'crash_time'])
data_raw = data_original.copy(deep=True)
data = pd.DataFrame()
data_dummy_fields = []
data_raw_fields_to_drop = []

## Fields

### Fields with the Dependent Variable

The dependent variable y=1 if somebody died, y=0 otherwise.  
We have two fields that give us this information.  
'num_tot_kil' [0, 1, 2, 3, 4], giving the number killed.

'severity_cd' ['E' 'D' 'C' 'B' 'A'], with 'A' being 'Fatal.'

Do these columns agree?  Yes (below).

In [6]:
for x in ['num_tot_kil', 'severity_cd']:
    print (data_raw[x].isnull().sum())
A = np.where( (
    (data_raw['num_tot_kil'] == 0) & (data_raw['severity_cd'] == 'A') |
    (data_raw['num_tot_kil'] > 0) & (data_raw['severity_cd'] != 'A') 
))
print (A)

0
0
(array([], dtype=int64),)


In [7]:
data['fatal'] = data_raw['num_tot_kil'].apply(lambda x: 1 if x>0 else x)
for x in ['num_tot_kil', 'severity_cd']:
    data_raw_fields_to_drop.append(x)

### Fields where I have No Idea What This Means

In [8]:
for x in ['quadrant', 'spotted_by', 'bypass']:
    data_raw_fields_to_drop.append(x)

### Fields with Time

- crash_date
- crash_hour
- crash_year
- crash_time

The years are all the same.

What might be interesting is the month and the day of week.

In [9]:
for x in ['crash_date', 'crash_hour']:
    print (data_raw[x].isnull().sum())
#print (data_raw.crash_hour.value_counts())
data_raw.crash_hour.replace(['  '], 25, inplace=True)
#print (data_raw.crash_hour.unique())



data['crash_month'] = data_raw['crash_date'].dt.month
data['crash_dayofweek'] = data_raw['crash_date'].dt.dayofweek
data['crash_hour'] = data_raw['crash_hour'].astype(int)

for x in ['crash_month', 'crash_dayofweek', 'crash_hour']:
    data_dummy_fields.append(x)
for x in ['crash_date', 'crash_hour', 'crash_year', 'crash_time']:
    data_raw_fields_to_drop.append(x)
print (data.head())
print (data.info())

0
0
   fatal  crash_month  crash_dayofweek  crash_hour
0      0            1                1          17
1      0            1                1          23
2      0            1                1          22
3      0            1                1          18
4      0            1                1          19
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 160186 entries, 0 to 160185
Data columns (total 4 columns):
 #   Column           Non-Null Count   Dtype
---  ------           --------------   -----
 0   fatal            160186 non-null  int64
 1   crash_month      160186 non-null  int64
 2   crash_dayofweek  160186 non-null  int64
 3   crash_hour       160186 non-null  int64
dtypes: int64(4)
memory usage: 4.9 MB
None


### Fields with Direction
I think I'll use just 'pri_road_dir,', which seems clean enough.  
Change the blanks to 'Z'.
Convert to type 'category'.
Convert category to codes.

In [10]:
for x in data_raw:
    if 'dir' in x:
        values = data_raw[x].unique()
        print (x, sorted(values, key=lambda x: (str(type(x)), x)))
data_raw['pri_road_dir'].replace([' '], 'Z', inplace=True)

data_raw['pri_road_dir'] = data_raw['pri_road_dir'].astype('object')
#data_raw['pri_road_dir'] = data_raw['pri_road_dir'].cat.codes

print (data_raw['pri_road_dir'].unique())
                                    
data['pri_road_dir'] = data_raw['pri_road_dir']

for x in ['pri_road_dir']:
    data_dummy_fields.append(x)
for x in ['travel_dirs', 'pri_dir', 'pri_road_dir']:
    data_raw_fields_to_drop.append(x)

print ()
print (data.head())
print (data.info())

travel_dirs ['    ', 'E   ', 'EE  ', 'EEE ', 'EEEE', 'EEEN', 'EEES', 'EEEW', 'EEN ', 'EENE', 'EENN', 'EENW', 'EES ', 'EESS', 'EESW', 'EEW ', 'EEWE', 'EEWW', 'EN  ', 'ENE ', 'ENEE', 'ENN ', 'ENNE', 'ENNN', 'ENNS', 'ENS ', 'ENSS', 'ENSW', 'ENW ', 'ENWN', 'ENWW', 'ES  ', 'ESE ', 'ESEE', 'ESN ', 'ESNE', 'ESNN', 'ESS ', 'ESSE', 'ESSS', 'ESSW', 'ESW ', 'ESWW', 'EW  ', 'EWE ', 'EWEE', 'EWN ', 'EWNN', 'EWS ', 'EWSS', 'EWW ', 'EWWE', 'EWWS', 'EWWW', 'N   ', 'NE  ', 'NEE ', 'NEEE', 'NEEW', 'NEN ', 'NES ', 'NESN', 'NESS', 'NEW ', 'NEWE', 'NN  ', 'NNE ', 'NNEE', 'NNES', 'NNN ', 'NNNE', 'NNNN', 'NNNS', 'NNNW', 'NNS ', 'NNSN', 'NNSS', 'NNW ', 'NNWN', 'NNWW', 'NS  ', 'NSE ', 'NSEE', 'NSEN', 'NSN ', 'NSNN', 'NSNW', 'NSS ', 'NSSE', 'NSSS', 'NSW ', 'NSWW', 'NW  ', 'NWE ', 'NWEE', 'NWEN', 'NWN ', 'NWNS', 'NWS ', 'NWSS', 'NWW ', 'NWWW', 'S   ', 'S S ', 'SE  ', 'SEE ', 'SEEE', 'SEN ', 'SENN', 'SES ', 'SESS', 'SEW ', 'SEWW', 'SN  ', 'SNE ', 'SNEN', 'SNN ', 'SNNN', 'SNNS', 'SNS ', 'SNSN', 'SNSS', 'SNW ', 'SN

### Fields of Binary Features

In [11]:
for x in ['intersection', 'alcohol', 'roadway_departure', 'lane_departure', 'dr_sex_1', 'dr_sex_2']:
    data_raw[x].fillna(' ', inplace=True)
    data_raw[x].replace([' '], data_raw[x].mode(), inplace=True)
    data_raw[x].replace({'M':1, 'F':0, 'No':0, 'Yes':1}, inplace=True)
    values = data_raw[x].unique()
    print (x, sorted(values, key=lambda x: (str(type(x)), x)))  
    data[x] = data_raw[x]
    data_raw_fields_to_drop.append(x)

print ()
print (data.head())
print (data.info()) 

intersection [0, 1]
alcohol [0, 1]
roadway_departure [0, 1]
lane_departure [0, 1]
dr_sex_1 [0, 1]
dr_sex_2 [0, 1]

   fatal  crash_month  crash_dayofweek  crash_hour pri_road_dir  intersection  \
0      0            1                1          17            Z             1   
1      0            1                1          23            Z             1   
2      0            1                1          22            W             0   
3      0            1                1          18            Z             0   
4      0            1                1          19            Z             1   

   alcohol  roadway_departure  lane_departure  dr_sex_1  dr_sex_2  
0        0                  0               0         1         1  
1        0                  0               0         1         1  
2        0                  0               0         1         0  
3        0                  1               1         0         0  
4        0                  0               0         0   

### Fields with Integer Values
- 'num_tot_inj'
    - Not dummy, because increasing values mean something.
- 'num_veh'
    - Not dummy, because increasing values mean something.
- 'parish_cd'
- 'parish_cd.1'
- 'city_cd'  
    - I don't know what cities the codes correlate to, but the '0' is probably 'Not in any of the 19 cities,' and may correlate to 'not urban.'

In [12]:
# If more than 10, just lump in with 10.
for x in ['num_tot_inj', 'num_veh']:
    data_raw[x].fillna(data_raw[x].mode(), inplace=True)
    values = data_raw[x].unique()
    print (x, sorted(values, key=lambda x: (str(type(x)), x)))  
    data_raw[x].where(data_raw[x] <10, 10, inplace=True)
    values = data_raw[x].unique()
    print (x, sorted(values, key=lambda x: (str(type(x)), x)))  
    data[x] = data_raw[x]
    data_raw_fields_to_drop.append(x)


for x in ['parish_cd', 'parish_cd.1', 'city_cd']:
    data_raw[x].fillna(data_raw[x].mode(), inplace=True)
    values = data_raw[x].unique()
    print (x, sorted(values, key=lambda x: (str(type(x)), x)))  
    data[x] = data_raw[x]
    data_dummy_fields.append(x)
    data_raw_fields_to_drop.append(x)


print ()
print (data.head())
print (data.info())

num_tot_inj [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 22, 23, 36, 37, 52, 64]
num_tot_inj [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
num_veh [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 13, 16]
num_veh [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
parish_cd [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64]
parish_cd.1 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64]
city_cd [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 44]

   fatal  crash_month  crash_dayofweek  crash_hour pri_road_dir  intersection  \
0      0            1                1          17            Z         

### Fields to Put in Ranges
- 'dr_age_1'
- 'dr_age_2'

In [13]:
for x in ['dr_age_1','dr_age_2']:
    data_raw[x].fillna(0, inplace=True)
    data_raw[x] = data_raw[x].astype(int)
    data_raw.loc[(data_raw[x]>100), x] = 0
    xbin = x + '_bin'
    data_raw[xbin] = data_raw[x].replace(0,np.nan)    
    data_raw[xbin] = pd.qcut(data_raw[xbin], 5, duplicates='drop').cat.add_categories(0)
    data_raw[xbin].fillna(0, inplace=True)    
    
    data_raw[xbin] = data_raw[xbin].astype('category')
#    data_raw[xbin] = data_raw[xbin].cat.codes

    data[xbin] = data_raw[xbin]
    values = data_raw[xbin].unique()
    print (data[xbin].value_counts())
    data_dummy_fields.append(xbin)
    data_raw_fields_to_drop.append(x)
    data_raw_fields_to_drop.append(xbin)

print ()
print (data.head())
print (data.info())

(22.0, 30.0]     29092
(1.999, 22.0]    28982
(40.0, 56.0]     28270
(30.0, 40.0]     27890
(56.0, 99.0]     27462
0                18490
Name: dr_age_1_bin, dtype: int64
0                33099
(1.999, 26.0]    27664
(34.0, 45.0]     27379
(57.0, 97.0]     24573
(45.0, 57.0]     24222
(26.0, 34.0]     23249
Name: dr_age_2_bin, dtype: int64

   fatal  crash_month  crash_dayofweek  crash_hour pri_road_dir  intersection  \
0      0            1                1          17            Z             1   
1      0            1                1          23            Z             1   
2      0            1                1          22            W             0   
3      0            1                1          18            Z             0   
4      0            1                1          19            Z             1   

   alcohol  roadway_departure  lane_departure  dr_sex_1  dr_sex_2  \
0        0                  0               0         1         1   
1        0                  0   

### Distance from the Road
This one is weird.  

- The units are either in feet or miles.
- For many values the units are missing, and for others the measure is extreme. 
- I'm going to 
-- make all of the entries with missing units or negative distance zero, 
-- change all of the lengths to feet,
-- take out the zeroes, 
-- put in ranges,
-- and put the zeroes back.

In [14]:
for x in ['pri_measure']:
    data_raw[x] = data_raw[x].str.strip()
    values = data_raw[x].unique()
    print (x, len(values), values)
data_raw.loc[data_raw['pri_measure'] == '', 'pri_dist'] = 0
data_raw.loc[data_raw['pri_dist'] <= 0, 'pri_dist'] = 0
data_raw.loc[data_raw['pri_measure'] == 'MI', 'pri_dist'] *= 5280

for x in ['pri_dist']:
    data_raw[x].fillna(0, inplace=True)
#    data_raw[x] = data_raw[x].astype(int)
#    data_raw.loc[(data_raw[x]>100), x] = 0
    xbin = x + '_bin'
    data_raw[xbin] = data_raw[x].replace(0,np.nan)    
    data_raw[xbin] = pd.qcut(data_raw[xbin], 5, duplicates='drop').cat.add_categories(0)
    data_raw[xbin].fillna(0, inplace=True)
    
    data_raw[xbin] = data_raw[xbin].astype('category')
#    data_raw[xbin] = data_raw[xbin].cat.codes
    
    data[xbin] = data_raw[xbin]
    print (data_raw[xbin].value_counts())
    data_dummy_fields.append(xbin)
    data_raw_fields_to_drop.append(x)
    data_raw_fields_to_drop.append(xbin)

for x in ['pri_measure']:
    data_raw_fields_to_drop.append(x)
    

print ()
print (data.head())
print (data.info())

pri_measure 3 ['' 'FT' 'MI']
0                        60888
(-0.0009, 50.0]          25224
(150.0, 528.0]           22264
(2901.02, 52794720.0]    19860
(50.0, 150.0]            15993
(528.0, 2901.02]         15957
Name: pri_dist_bin, dtype: int64

   fatal  crash_month  crash_dayofweek  crash_hour pri_road_dir  intersection  \
0      0            1                1          17            Z             1   
1      0            1                1          23            Z             1   
2      0            1                1          22            W             0   
3      0            1                1          18            Z             0   
4      0            1                1          19            Z             1   

   alcohol  roadway_departure  lane_departure  dr_sex_1  dr_sex_2  \
0        0                  0               0         1         1   
1        0                  0               0         1         1   
2        0                  0               0         1  

### Alpha fields with 'Y' = 'Unknown' or 'Z' = 'Other'

We have lots of fields where 'Y' is 'Unknown' and 'Z' is 'Other.  
- Merge nan, blank, erroneous intergers, Y, and Z, into 'Z'. 

Other related Alpha fields:
- 'crash_type' does not have a Y or Z, and I can't figure out what it means.
- 'hwy_class' is mixed Alpha and integers, and I have no idea what it means.
- 'contributing_factor' has two values, 'R' and 'O', and I have no idea what it means.
- 'veh_severity' has five values, and I have no idea what it means.

These fields have trailing spaces I had to remove:
- 'f_harm_ev_cd1'
- 'm_harm_ev_cd1'

I lumped in some other fields here:
 - 'crash_type'
 - 'pri_contrib_fac_cd'
 - 'sec_pri_contrib_fac_cd'
 - 'hwy_type_cd'

In [15]:
for x in ['f_harm_ev_cd1', 'm_harm_ev_cd1']:
    data_raw[x] = data_raw[x].str.strip()

for x in data_raw:
    values = data_raw[x].unique()
    if (
        (('Y' in values or 'Z' in values) and len(values)<50)
        or x in ['crash_type', 'pri_contrib_fac_cd', 'sec_contrib_fac_cd', 'hwy_type_cd']
    ):
        print (x, sorted(values, key=lambda x: (str(type(x)), x)))
        data_raw[x].fillna('Z', inplace=True)
        data_raw[x].replace([' ', 'Y'], 'Z', inplace=True)
        data_raw[x] = data_raw[x].apply(lambda x: 'Z' if x.isnumeric() else x)
        values = data_raw[x].unique()
        print (x, sorted(values, key=lambda x: (str(type(x)), x)))
        print ()
        
        data_raw[x] = data_raw[x].astype('category')
#        data_raw[x] = data_raw[x].cat.codes
        
        data[x] = data_raw[x]
        data_dummy_fields.append(x)
        data_raw_fields_to_drop.append(x)
        
for x in ['hwy_class', 'contributing_factor']:
    data_raw_fields_to_drop.append(x)
    

print ()
print (data.head())
print (data.info())

f_harm_ev_cd1 ['', 'A', 'AA', 'B', 'BB', 'C', 'CC', 'D', 'DD', 'E', 'EE', 'F', 'FF', 'G', 'GG', 'H', 'HH', 'I', 'II', 'J', 'JJ', 'K', 'KK', 'L', 'LL', 'M', 'MM', 'N', 'NN', 'O', 'OO', 'P', 'PP', 'Q', 'QQ', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'YY', 'Z']
f_harm_ev_cd1 ['', 'A', 'AA', 'B', 'BB', 'C', 'CC', 'D', 'DD', 'E', 'EE', 'F', 'FF', 'G', 'GG', 'H', 'HH', 'I', 'II', 'J', 'JJ', 'K', 'KK', 'L', 'LL', 'M', 'MM', 'N', 'NN', 'O', 'OO', 'P', 'PP', 'Q', 'QQ', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'YY', 'Z']

m_harm_ev_cd1 ['', 'A', 'AA', 'B', 'BB', 'C', 'CC', 'D', 'DD', 'E', 'EE', 'F', 'FF', 'G', 'GG', 'H', 'HH', 'I', 'II', 'J', 'JJ', 'K', 'KK', 'L', 'LL', 'M', 'MM', 'N', 'NN', 'O', 'OO', 'P', 'PP', 'Q', 'QQ', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'YY', 'Z']
m_harm_ev_cd1 ['', 'A', 'AA', 'B', 'BB', 'C', 'CC', 'D', 'DD', 'E', 'EE', 'F', 'FF', 'G', 'GG', 'H', 'HH', 'I', 'II', 'J', 'JJ', 'K', 'KK', 'L', 'LL', 'M', 'MM', 'N', 'NN', 'O', 'OO', 'P', 'PP', 'Q', 'QQ', 'R', 'S', 'T', 'U', 'V', 'W

## Blank Field

In [16]:
for x in ['ped_actions_2']:
    data_raw_fields_to_drop.append(x)

## Review Data

### Drop Used Fields from 'data_raw'

In [17]:
print (data_raw.shape)
data_raw_dropped = data_raw.drop(data_raw_fields_to_drop, axis=1, inplace=False)
print (data_raw_dropped.shape)

(160186, 79)
(160186, 18)


### Remaining Fields
I don't know that any of these are likely to correlate.  

In [18]:
for x in data_raw_dropped:
    values = data_raw_dropped[x].unique()
    print (x, len(values))

route 1024
milepoint 53808
crash_num 160186
prior_movements 1322
csect 2141
logmile 18203
lrs_id 4788
lrs_logmile 18093
adt 749
intersection_id 15037
ORIG_LATITUDE 91540
ORIG_LONGITUDE 87921
DOTD_LATITUDE 127283
DOTD_LONGITUDE 130753
pri_hwy_num 1006
milepost 7172
pri_road_name 15724
inter_road 35371


### Fields in 'data' dataframe

In [19]:
for x in data:
    print (x)

fatal
crash_month
crash_dayofweek
crash_hour
pri_road_dir
intersection
alcohol
roadway_departure
lane_departure
dr_sex_1
dr_sex_2
num_tot_inj
num_veh
parish_cd
parish_cd.1
city_cd
dr_age_1_bin
dr_age_2_bin
pri_dist_bin
f_harm_ev_cd1
m_harm_ev_cd1
man_coll_cd
crash_type
surf_cond_cd
invest_agency_cd
veh_type_cd1
veh_type_cd2
road_rel_cd
location_type
veh_severity_cd
hwy_type_cd
bypass
pri_contrib_fac_cd
sec_contrib_fac_cd
vision_obscure_1
vision_obscure_2
movement_reason_1
movement_reason_2
ped_actions_1
veh_lighting_1
veh_lighting_2
traff_cntl_cond_1
traff_cntl_cond_2
lighting_cd
dr_cond_cd1
dr_cond_cd2
veh_cond_cd1
veh_cond_cd2


In [20]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 160186 entries, 0 to 160185
Data columns (total 48 columns):
 #   Column              Non-Null Count   Dtype   
---  ------              --------------   -----   
 0   fatal               160186 non-null  int64   
 1   crash_month         160186 non-null  int64   
 2   crash_dayofweek     160186 non-null  int64   
 3   crash_hour          160186 non-null  int64   
 4   pri_road_dir        160186 non-null  category
 5   intersection        160186 non-null  int64   
 6   alcohol             160186 non-null  int64   
 7   roadway_departure   160186 non-null  int64   
 8   lane_departure      160186 non-null  int64   
 9   dr_sex_1            160186 non-null  int64   
 10  dr_sex_2            160186 non-null  int64   
 11  num_tot_inj         160186 non-null  int64   
 12  num_veh             160186 non-null  int64   
 13  parish_cd           160186 non-null  int64   
 14  parish_cd.1         160186 non-null  int64   
 15  city_cd          

In [21]:
data.head()

Unnamed: 0,fatal,crash_month,crash_dayofweek,crash_hour,pri_road_dir,intersection,alcohol,roadway_departure,lane_departure,dr_sex_1,...,ped_actions_1,veh_lighting_1,veh_lighting_2,traff_cntl_cond_1,traff_cntl_cond_2,lighting_cd,dr_cond_cd1,dr_cond_cd2,veh_cond_cd1,veh_cond_cd2
0,0,1,1,17,Z,1,0,0,0,1,...,Z,Z,A,A,V,A,Z,A,Z,K
1,0,1,1,23,Z,1,0,0,0,1,...,Z,Z,A,A,U,C,A,A,K,K
2,0,1,1,22,W,0,0,0,0,1,...,Z,A,A,Q,C,C,A,A,K,K
3,0,1,1,18,Z,0,0,1,1,0,...,Z,A,Z,S,Z,B,A,Z,K,Z
4,0,1,1,19,Z,1,0,0,0,0,...,Z,A,A,R,R,C,A,A,K,K


## Get Dummies

### Fields to become Dummies

In [22]:
for field in data_dummy_fields:
    print (field)

crash_month
crash_dayofweek
crash_hour
pri_road_dir
parish_cd
parish_cd.1
city_cd
dr_age_1_bin
dr_age_2_bin
pri_dist_bin
f_harm_ev_cd1
m_harm_ev_cd1
man_coll_cd
crash_type
surf_cond_cd
invest_agency_cd
veh_type_cd1
veh_type_cd2
road_rel_cd
location_type
veh_severity_cd
hwy_type_cd
bypass
pri_contrib_fac_cd
sec_contrib_fac_cd
vision_obscure_1
vision_obscure_2
movement_reason_1
movement_reason_2
ped_actions_1
veh_lighting_1
veh_lighting_2
traff_cntl_cond_1
traff_cntl_cond_2
pri_road_dir
lighting_cd
dr_cond_cd1
dr_cond_cd2
veh_cond_cd1
veh_cond_cd2


In [23]:
for field in data_dummy_fields:
    data = pd.get_dummies(data,columns=[field],drop_first=False)
for x in data:
    print (x)
print (data.head())

KeyError: "None of [Index(['pri_road_dir'], dtype='object')] are in the [columns]"

## Split into 'train' and 'test' Sets with Proportional number of Fatalities

In [None]:
data_positive = data[data['fatal'] == 1]
data_negative = data[data['fatal'] == 0]
train_positive, test_positive = train_test_split(data_positive, test_size=0.2)
train_negative, test_negative = train_test_split(data_negative, test_size=0.2)
train = pd.concat([train_positive, train_negative])
test = pd.concat([test_positive, test_negative])
# Randomly shuffle the rows of the train and test sets,
# because otherwise they have the positive on top and the negative on the bottom.
# "shuffle" is an sklearn function.
train = shuffle(train)
test = shuffle(test)
x_train = train.drop(['fatal'], axis=1)
y_train = train['fatal']
x_test = test.drop(['fatal'], axis=1)
y_test = test['fatal']

print (x_train.shape, y_train.shape, x_test.shape, y_test.shape)

In [None]:
train, test = train_test_split(data, test_size=0.2)
x_train = train.drop(['fatal'], axis=1)
y_train = train['fatal']
x_test = test.drop(['fatal'], axis=1)
y_test = test['fatal']
print (x_train.shape, y_train.shape, x_test.shape, y_test.shape)

In [None]:
print (x_train.info())

## Perform Exploratory Analysis with Statistics on the 'train' Set

### Logistic Regression

In [None]:
logreg = LogisticRegression()
logreg.fit(x_train, y_train)
y_pred = logreg.predict(x_test)
acc_log = round(logreg.score(x_train, y_train) * 100, 2)
acc_log

In [None]:
coeff_data = pd.DataFrame(train.columns.delete(0))
coeff_data.columns = ['Feature']
coeff_data["Correlation"] = pd.Series(logreg.coef_[0])

coeff_data.sort_values(by='Correlation', ascending=False)

In [None]:
for x in x_train:
    print ('fatal correlation by: ', x)
    A = train[[x, 'fatal']].groupby(x, as_index=False).mean()
    A = A.sort_values(by='fatal', ascending=False)
    print (A)
    print ()

## What are these highly correlated features?

### crash_type_0

In [None]:
data_raw['crash_type'].value_counts()

# Run ML Algorithms

In [None]:
Models = [
    LinearSVC(),
    SGDClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),        
    GaussianNB(), 
    Perceptron(),
#    KNeighborsClassifier(n_neighbors = 3), 
#    SVC(),
    GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0),
    MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1),
]

#By definition a confusion matrix C  is such that 
#C[i][j] is equal to the number of observations 
#known to be in group i and predicted to be in group j.

#Thus in binary classification, the count of 
#true negatives is C[0][0],
#false negatives is C[1][0], 
#true positives is C[1][1],
# and false positives is C[0][1].



for model in Models:
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    accuracy = round(model.score(x_train,y_train) * 100, 2)
    precision = round(precision_score(y_test, y_pred, average='macro')*100,2)
    recall = round(recall_score(y_test, y_pred)*100,2)
    f1 = round(f1_score(y_test, y_pred)*100,2)
    C = confusion_matrix(y_test, y_pred)

    print (model)
    print (accuracy, precision, recall, f1)
    print (C)
    print ()