# Data Cleanup

## Import Libraries

### General Libraries

In [1]:
import sys
print ('Python version: {}'.format(sys.version))
import pandas as pd
print ('pandas version: {}'.format(pd.__version__))
import matplotlib
print ('matplotlib version: {}'.format(matplotlib.__version__))
import numpy as np
print ('NumPy version: {}'.format(np.__version__))
import scipy as sp
print ('SciPy version: {}'.format(sp.__version__))
import IPython
print ('IPython version: {}'.format(IPython.__version__))
import sklearn
print ('scikit-learn version: {}'.format(sklearn.__version__))

import random
import time
import warnings
import copy
warnings.filterwarnings('ignore')
print ('-*'*20)
from subprocess import check_output
print (check_output(['ls', './data']).decode('utf8'))

Python version: 3.9.1 (v3.9.1:1e5d33e9b9, Dec  7 2020, 12:10:52) 
[Clang 6.0 (clang-600.0.57)]
pandas version: 1.2.3
matplotlib version: 3.3.3
NumPy version: 1.19.5
SciPy version: 1.6.0
IPython version: 7.21.0
scikit-learn version: 0.24.1
-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*
2019_Crash_1_Database.csv
CODE_TB.csv



### Visualization Libraries

In [2]:
# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

### Machine Learning Libraries

In [3]:
# machine learning
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier

## Import Data

We will bring in the data as data_originalk, and make a deep copy, data_raw.
Then we will process each column, copy it to data, and delete it from data_raw

In [4]:
data_original = pd.read_csv('./data/2019_Crash_1_Database.csv', parse_dates = ['crash_date', 'crash_hour', 'crash_time'])
data_raw = data_original.copy(deep=True)
data = pd.DataFrame()
data_dummies = pd.DataFrame()
data_raw_fields_to_drop = []

## Fields with the Dependent Variable
The dependent variable y=1 if somebody died, y=0 otherwise.  
We have two fields that give us this information.  
'num_tot_kil' [0, 1, 2, 3, 4], giving the number killed.

'severity_cd' ['E' 'D' 'C' 'B' 'A'], with 'A' being 'Fatal.'

Do these columns agree?  Yes (below).

In [5]:
for x in ['num_tot_kil', 'severity_cd']:
    print (data_raw[x].isnull().sum())
A = np.where( (
    (data_raw['num_tot_kil'] == 0) & (data_raw['severity_cd'] == 'A') |
    (data_raw['num_tot_kil'] > 0) & (data_raw['severity_cd'] != 'A') 
))
print (A)
#A = np.where( (data_raw['num_tot_kil']>0 & data_raw['severity_cd']=='A'))

0
0
(array([], dtype=int64),)


In [6]:
data['fatal'] = data_raw['num_tot_kil'].apply(lambda x: 1 if x>0 else x)
for x in ['num_tot_kil', 'severity_cd']:
    data_raw_fields_to_drop.append(x)

## Fields where I have No Idea What This Means

In [7]:
for x in ['quadrant', 'spotted_by', 'city_cd', 'bypass']:
    data_raw_fields_to_drop.append(x)

## Fields with Time

- crash_date
- crash_hour
- crash_year
- crash_time

The years are all the same.

What might be interesting is the month and the day of week.

In [8]:
for x in ['crash_date', 'crash_hour']:
    print (data_raw[x].isnull().sum())
#print (data_raw.crash_hour.value_counts())
data_raw.crash_hour.replace(['  '], 25, inplace=True)
#print (data_raw.crash_hour.unique())



data['crash_month'] = data_raw['crash_date'].dt.month
data['crash_dayofweek'] = data_raw['crash_date'].dt.dayofweek
data['crash_hour'] = data_raw['crash_hour'].astype(int)

for x in ['crash_date', 'crash_hour', 'crash_year', 'crash_time']:
    data_raw_fields_to_drop.append(x)
print (data.head())
print (data.info())

0
0
   fatal  crash_month  crash_dayofweek  crash_hour
0      0            1                1          17
1      0            1                1          23
2      0            1                1          22
3      0            1                1          18
4      0            1                1          19
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 160186 entries, 0 to 160185
Data columns (total 4 columns):
 #   Column           Non-Null Count   Dtype
---  ------           --------------   -----
 0   fatal            160186 non-null  int64
 1   crash_month      160186 non-null  int64
 2   crash_dayofweek  160186 non-null  int64
 3   crash_hour       160186 non-null  int64
dtypes: int64(4)
memory usage: 4.9 MB
None


## Fields with Direction
I think I'll use just 'pri_road_dir,', which seems clean enough.  
Change the blanks to 'Z'.
Convert to type 'category'.
Convert category to codes.

In [9]:
for x in data_raw:
    if 'dir' in x:
        values = data_raw[x].unique()
        print (x, sorted(values, key=lambda x: (str(type(x)), x)))
data_raw['pri_road_dir'].replace([' '], 'Z', inplace=True)
data_raw['pri_road_dir'] = data_raw['pri_road_dir'].astype('category')
data_raw['pri_road_dir'] = data_raw['pri_road_dir'].cat.codes
print (data_raw['pri_road_dir'].unique())
                                    
data['pri_road_dir'] = data_raw['pri_road_dir']
for x in ['travel_dirs', 'pri_dir', 'pri_road_dir']:
    data_raw_fields_to_drop.append(x)

print ()
print (data.head())
print (data.info())

travel_dirs ['    ', 'E   ', 'EE  ', 'EEE ', 'EEEE', 'EEEN', 'EEES', 'EEEW', 'EEN ', 'EENE', 'EENN', 'EENW', 'EES ', 'EESS', 'EESW', 'EEW ', 'EEWE', 'EEWW', 'EN  ', 'ENE ', 'ENEE', 'ENN ', 'ENNE', 'ENNN', 'ENNS', 'ENS ', 'ENSS', 'ENSW', 'ENW ', 'ENWN', 'ENWW', 'ES  ', 'ESE ', 'ESEE', 'ESN ', 'ESNE', 'ESNN', 'ESS ', 'ESSE', 'ESSS', 'ESSW', 'ESW ', 'ESWW', 'EW  ', 'EWE ', 'EWEE', 'EWN ', 'EWNN', 'EWS ', 'EWSS', 'EWW ', 'EWWE', 'EWWS', 'EWWW', 'N   ', 'NE  ', 'NEE ', 'NEEE', 'NEEW', 'NEN ', 'NES ', 'NESN', 'NESS', 'NEW ', 'NEWE', 'NN  ', 'NNE ', 'NNEE', 'NNES', 'NNN ', 'NNNE', 'NNNN', 'NNNS', 'NNNW', 'NNS ', 'NNSN', 'NNSS', 'NNW ', 'NNWN', 'NNWW', 'NS  ', 'NSE ', 'NSEE', 'NSEN', 'NSN ', 'NSNN', 'NSNW', 'NSS ', 'NSSE', 'NSSS', 'NSW ', 'NSWW', 'NW  ', 'NWE ', 'NWEE', 'NWEN', 'NWN ', 'NWNS', 'NWS ', 'NWSS', 'NWW ', 'NWWW', 'S   ', 'S S ', 'SE  ', 'SEE ', 'SEEE', 'SEN ', 'SENN', 'SES ', 'SESS', 'SEW ', 'SEWW', 'SN  ', 'SNE ', 'SNEN', 'SNN ', 'SNNN', 'SNNS', 'SNS ', 'SNSN', 'SNSS', 'SNW ', 'SN

## Fields of Binary Features

In [10]:
for x in ['intersection', 'alcohol', 'roadway_departure', 'lane_departure', 'dr_sex_1', 'dr_sex_2']:
    data_raw[x].fillna(' ', inplace=True)
    data_raw[x].replace([' '], data_raw[x].mode(), inplace=True)
    data_raw[x].replace({'M':1, 'F':0, 'No':0, 'Yes':1}, inplace=True)
    values = data_raw[x].unique()
    print (x, sorted(values, key=lambda x: (str(type(x)), x)))  
    data[x] = data_raw[x]
    data_raw_fields_to_drop.append(x)

print ()
print (data.head())
print (data.info()) 

intersection [0, 1]
alcohol [0, 1]
roadway_departure [0, 1]
lane_departure [0, 1]
dr_sex_1 [0, 1]
dr_sex_2 [0, 1]

   fatal  crash_month  crash_dayofweek  crash_hour  pri_road_dir  \
0      0            1                1          17             8   
1      0            1                1          23             8   
2      0            1                1          22             7   
3      0            1                1          18             8   
4      0            1                1          19             8   

   intersection  alcohol  roadway_departure  lane_departure  dr_sex_1  \
0             1        0                  0               0         1   
1             1        0                  0               0         1   
2             0        0                  0               0         1   
3             0        0                  1               1         0   
4             1        0                  0               0         0   

   dr_sex_2  
0         1  
1        

## Fields with Integer Values
- 'num_tot_inj'
- 'num_veh'
- 'parish_cd'
- 'parish_cd.1'

In [11]:
# If more than 10, just lump in with 10.
for x in ['num_tot_inj', 'num_veh']:
    data_raw[x].fillna(data_raw[x].mode(), inplace=True)
    values = data_raw[x].unique()
    print (x, sorted(values, key=lambda x: (str(type(x)), x)))  
    data_raw[x].where(data_raw[x] <10, 10, inplace=True)
    values = data_raw[x].unique()
    print (x, sorted(values, key=lambda x: (str(type(x)), x)))  
    data[x] = data_raw[x]
    data_raw_fields_to_drop.append(x)


for x in ['parish_cd', 'parish_cd.1']:
    data_raw[x].fillna(data_raw[x].mode(), inplace=True)
    values = data_raw[x].unique()
    print (x, sorted(values, key=lambda x: (str(type(x)), x)))  
    data[x] = data_raw[x]
    data_raw_fields_to_drop.append(x)


print ()
print (data.head())
print (data.info())

num_tot_inj [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 22, 23, 36, 37, 52, 64]
num_tot_inj [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
num_veh [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 13, 16]
num_veh [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
parish_cd [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64]
parish_cd.1 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64]

   fatal  crash_month  crash_dayofweek  crash_hour  pri_road_dir  \
0      0            1                1          17             8   
1      0            1                1          23             8   
2      0            1       

## Fields to Put in Ranges
- 'dr_age_1'
- 'dr_age_2'

In [12]:
for x in ['dr_age_1','dr_age_2']:
    data_raw[x].fillna(0, inplace=True)
    data_raw[x] = data_raw[x].astype(int)
    data_raw.loc[(data_raw[x]>100), x] = 0
    xbin = x + '_bin'
    data_raw[xbin] = data_raw[x].replace(0,np.nan)    
    data_raw[xbin] = pd.qcut(data_raw[xbin], 5, duplicates='drop').cat.add_categories(0)
    data_raw[xbin].fillna(0, inplace=True)    
    data_raw[xbin] = data_raw[xbin].astype('category')
    data_raw[xbin] = data_raw[xbin].cat.codes
    data[xbin] = data_raw[xbin]
    values = data_raw[xbin].unique()
    print (data[xbin].value_counts())
    data_raw_fields_to_drop.append(x)
    data_raw_fields_to_drop.append(xbin)

print ()
print (data.head())
print (data.info())

1    29092
0    28982
3    28270
2    27890
4    27462
5    18490
Name: dr_age_1_bin, dtype: int64
5    33099
0    27664
2    27379
4    24573
3    24222
1    23249
Name: dr_age_2_bin, dtype: int64

   fatal  crash_month  crash_dayofweek  crash_hour  pri_road_dir  \
0      0            1                1          17             8   
1      0            1                1          23             8   
2      0            1                1          22             7   
3      0            1                1          18             8   
4      0            1                1          19             8   

   intersection  alcohol  roadway_departure  lane_departure  dr_sex_1  \
0             1        0                  0               0         1   
1             1        0                  0               0         1   
2             0        0                  0               0         1   
3             0        0                  1               1         0   
4             1        0   

## Distance from the Road
This one is weird.  

- The units are either in feet or miles.
- For many values the units are missing, and for others the measure is extreme. 
- I'm going to 
-- make all of the entries with missing units or negative distance zero, 
-- change all of the lengths to feet,
-- take out the zeroes, 
-- put in ranges,
-- and put the zeroes back.

In [13]:
for x in ['pri_measure']:
    data_raw[x] = data_raw[x].str.strip()
    values = data_raw[x].unique()
    print (x, len(values), values)
data_raw.loc[data_raw['pri_measure'] == '', 'pri_dist'] = 0
data_raw.loc[data_raw['pri_dist'] <= 0, 'pri_dist'] = 0
data_raw.loc[data_raw['pri_measure'] == 'MI', 'pri_dist'] *= 5280

for x in ['pri_dist']:
    data_raw[x].fillna(0, inplace=True)
#    data_raw[x] = data_raw[x].astype(int)
#    data_raw.loc[(data_raw[x]>100), x] = 0
    xbin = x + '_bin'
    data_raw[xbin] = data_raw[x].replace(0,np.nan)    
    data_raw[xbin] = pd.qcut(data_raw[xbin], 5, duplicates='drop').cat.add_categories(0)
    data_raw[xbin].fillna(0, inplace=True)
    
    data_raw[xbin] = data_raw[xbin].astype('category')
    data_raw[xbin] = data_raw[xbin].cat.codes
    
    data[xbin] = data_raw[xbin]
    print (data_raw[xbin].value_counts())
    data_raw_fields_to_drop.append(x)
    data_raw_fields_to_drop.append(xbin)

for x in ['pri_measure']:
    data_raw_fields_to_drop.append(x)
    

print ()
print (data.head())
print (data.info())

pri_measure 3 ['' 'FT' 'MI']
5    60888
0    25224
2    22264
4    19860
1    15993
3    15957
Name: pri_dist_bin, dtype: int64

   fatal  crash_month  crash_dayofweek  crash_hour  pri_road_dir  \
0      0            1                1          17             8   
1      0            1                1          23             8   
2      0            1                1          22             7   
3      0            1                1          18             8   
4      0            1                1          19             8   

   intersection  alcohol  roadway_departure  lane_departure  dr_sex_1  \
0             1        0                  0               0         1   
1             1        0                  0               0         1   
2             0        0                  0               0         1   
3             0        0                  1               1         0   
4             1        0                  0               0         0   

   dr_sex_2  num_tot_in

## Alpha fields with 'Y' = 'Unknown' or 'Z' = 'Other'

We have lots of fields where 'Y' is 'Unknown' and 'Z' is 'Other.  
- Merge nan, blank, erroneous intergers, Y, and Z, into 'Z'. 

Other related Alpha fields:
- 'crash_type' does not have a Y or Z, and I can't figure out what it means.
- 'hwy_class' is mixed Alpha and integers, and I have no idea what it means.
- 'contributing_factor' has two values, 'R' and 'O', and I have no idea what it means.
- 'veh_severity' has five values, and I have no idea what it means.

These fields have trailing spaces I had to remove:
- 'f_harm_ev_cd1'
- 'm_harm_ev_cd1'

I lumped in some other fields here:
 - 'crash_type'
 - 'pri_contrib_fac_cd'
 - 'sec_pri_contrib_fac_cd'
 - 'hwy_type_cd'

In [14]:
for x in ['f_harm_ev_cd1', 'm_harm_ev_cd1']:
    data_raw[x] = data_raw[x].str.strip()

for x in data_raw:
    values = data_raw[x].unique()
    if (
        (('Y' in values or 'Z' in values) and len(values)<50)
        or x in ['crash_type', 'pri_contrib_fac_cd', 'sec_contrib_fac_cd', 'hwy_type_cd']
    ):
        print (x, sorted(values, key=lambda x: (str(type(x)), x)))
        data_raw[x].fillna('Z', inplace=True)
        data_raw[x].replace([' ', 'Y'], 'Z', inplace=True)
        data_raw[x] = data_raw[x].apply(lambda x: 'Z' if x.isnumeric() else x)
        values = data_raw[x].unique()
        print (x, sorted(values, key=lambda x: (str(type(x)), x)))
        print ()
        
        data_raw[x] = data_raw[x].astype('category')
        data_raw[x] = data_raw[x].cat.codes
        
        data[x] = data_raw[x]
        data_raw_fields_to_drop.append(x)
        
for x in ['hwy_class', 'contributing_factor']:
    data_raw_fields_to_drop.append(x)
    

print ()
print (data.head())
print (data.info())

f_harm_ev_cd1 ['', 'A', 'AA', 'B', 'BB', 'C', 'CC', 'D', 'DD', 'E', 'EE', 'F', 'FF', 'G', 'GG', 'H', 'HH', 'I', 'II', 'J', 'JJ', 'K', 'KK', 'L', 'LL', 'M', 'MM', 'N', 'NN', 'O', 'OO', 'P', 'PP', 'Q', 'QQ', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'YY', 'Z']
f_harm_ev_cd1 ['', 'A', 'AA', 'B', 'BB', 'C', 'CC', 'D', 'DD', 'E', 'EE', 'F', 'FF', 'G', 'GG', 'H', 'HH', 'I', 'II', 'J', 'JJ', 'K', 'KK', 'L', 'LL', 'M', 'MM', 'N', 'NN', 'O', 'OO', 'P', 'PP', 'Q', 'QQ', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'YY', 'Z']

m_harm_ev_cd1 ['', 'A', 'AA', 'B', 'BB', 'C', 'CC', 'D', 'DD', 'E', 'EE', 'F', 'FF', 'G', 'GG', 'H', 'HH', 'I', 'II', 'J', 'JJ', 'K', 'KK', 'L', 'LL', 'M', 'MM', 'N', 'NN', 'O', 'OO', 'P', 'PP', 'Q', 'QQ', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'YY', 'Z']
m_harm_ev_cd1 ['', 'A', 'AA', 'B', 'BB', 'C', 'CC', 'D', 'DD', 'E', 'EE', 'F', 'FF', 'G', 'GG', 'H', 'HH', 'I', 'II', 'J', 'JJ', 'K', 'KK', 'L', 'LL', 'M', 'MM', 'N', 'NN', 'O', 'OO', 'P', 'PP', 'Q', 'QQ', 'R', 'S', 'T', 'U', 'V', 'W

## Blank Field

In [15]:
for x in ['ped_actions_2']:
    data_raw_fields_to_drop.append(x)

# Review Data

### Drop Used Fields from 'data_raw'

In [16]:
print (data_raw.shape)
data_raw.drop(data_raw_fields_to_drop, axis=1, inplace=True)
print (data_raw.shape)

(160186, 79)
(160186, 18)


## Remaining Fields
I don't know that any of these are likely to correlate.  

In [17]:
for x in data_raw:
    values = data_raw[x].unique()
    print (x, len(values))

route 1024
milepoint 53808
crash_num 160186
prior_movements 1322
csect 2141
logmile 18203
lrs_id 4788
lrs_logmile 18093
adt 749
intersection_id 15037
ORIG_LATITUDE 91540
ORIG_LONGITUDE 87921
DOTD_LATITUDE 127283
DOTD_LONGITUDE 130753
pri_hwy_num 1006
milepost 7172
pri_road_name 15724
inter_road 35371


## Fields in 'data' dataframe

In [18]:
for x in data:
    print (x)

fatal
crash_month
crash_dayofweek
crash_hour
pri_road_dir
intersection
alcohol
roadway_departure
lane_departure
dr_sex_1
dr_sex_2
num_tot_inj
num_veh
parish_cd
parish_cd.1
dr_age_1_bin
dr_age_2_bin
pri_dist_bin
f_harm_ev_cd1
m_harm_ev_cd1
man_coll_cd
crash_type
surf_cond_cd
invest_agency_cd
veh_type_cd1
veh_type_cd2
road_rel_cd
location_type
veh_severity_cd
hwy_type_cd
bypass
pri_contrib_fac_cd
sec_contrib_fac_cd
vision_obscure_1
vision_obscure_2
movement_reason_1
movement_reason_2
ped_actions_1
veh_lighting_1
veh_lighting_2
traff_cntl_cond_1
traff_cntl_cond_2
lighting_cd
dr_cond_cd1
dr_cond_cd2
veh_cond_cd1
veh_cond_cd2


In [19]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 160186 entries, 0 to 160185
Data columns (total 47 columns):
 #   Column              Non-Null Count   Dtype
---  ------              --------------   -----
 0   fatal               160186 non-null  int64
 1   crash_month         160186 non-null  int64
 2   crash_dayofweek     160186 non-null  int64
 3   crash_hour          160186 non-null  int64
 4   pri_road_dir        160186 non-null  int8 
 5   intersection        160186 non-null  int64
 6   alcohol             160186 non-null  int64
 7   roadway_departure   160186 non-null  int64
 8   lane_departure      160186 non-null  int64
 9   dr_sex_1            160186 non-null  int64
 10  dr_sex_2            160186 non-null  int64
 11  num_tot_inj         160186 non-null  int64
 12  num_veh             160186 non-null  int64
 13  parish_cd           160186 non-null  int64
 14  parish_cd.1         160186 non-null  int64
 15  dr_age_1_bin        160186 non-null  int8 
 16  dr_age_2_bin        

In [20]:
data.head()

Unnamed: 0,fatal,crash_month,crash_dayofweek,crash_hour,pri_road_dir,intersection,alcohol,roadway_departure,lane_departure,dr_sex_1,...,ped_actions_1,veh_lighting_1,veh_lighting_2,traff_cntl_cond_1,traff_cntl_cond_2,lighting_cd,dr_cond_cd1,dr_cond_cd2,veh_cond_cd1,veh_cond_cd2
0,0,1,1,17,8,1,0,0,0,1,...,11,4,0,0,21,0,11,0,11,10
1,0,1,1,23,8,1,0,0,0,1,...,11,4,0,0,20,2,0,0,10,10
2,0,1,1,22,7,0,0,0,0,1,...,11,0,0,16,2,2,0,0,10,10
3,0,1,1,18,8,0,0,1,1,0,...,11,0,3,18,22,1,0,11,10,11
4,0,1,1,19,8,1,0,0,0,0,...,11,0,0,17,17,2,0,0,10,10


# Split into 'train' and 'test' Sets

In [21]:
train, test = train_test_split(data, test_size=0.2)
x_train = train.drop(['fatal'], axis=1)
y_train = train['fatal']
x_test = test.drop(['fatal'], axis=1)
y_test = test['fatal']
print (x_train.shape, y_train.shape, x_test.shape, y_test.shape)

(128148, 46) (128148,) (32038, 46) (32038,)


In [22]:
print (x_train.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 128148 entries, 104860 to 33469
Data columns (total 46 columns):
 #   Column              Non-Null Count   Dtype
---  ------              --------------   -----
 0   crash_month         128148 non-null  int64
 1   crash_dayofweek     128148 non-null  int64
 2   crash_hour          128148 non-null  int64
 3   pri_road_dir        128148 non-null  int8 
 4   intersection        128148 non-null  int64
 5   alcohol             128148 non-null  int64
 6   roadway_departure   128148 non-null  int64
 7   lane_departure      128148 non-null  int64
 8   dr_sex_1            128148 non-null  int64
 9   dr_sex_2            128148 non-null  int64
 10  num_tot_inj         128148 non-null  int64
 11  num_veh             128148 non-null  int64
 12  parish_cd           128148 non-null  int64
 13  parish_cd.1         128148 non-null  int64
 14  dr_age_1_bin        128148 non-null  int8 
 15  dr_age_2_bin        128148 non-null  int8 
 16  pri_dist_bin    

# Perform Exploratory Analysis with Statistics on the 'train' Set

### Logistic Regression

In [23]:
logreg = LogisticRegression()
logreg.fit(x_train, y_train)
y_pred = logreg.predict(x_test)
acc_log = round(logreg.score(x_train, y_train) * 100, 2)
acc_log

99.58

In [24]:
coeff_data = pd.DataFrame(train.columns.delete(0))
coeff_data.columns = ['Feature']
coeff_data["Correlation"] = pd.Series(logreg.coef_[0])

coeff_data.sort_values(by='Correlation', ascending=False)

Unnamed: 0,Feature,Correlation
5,alcohol,0.619928
7,lane_departure,0.559236
9,dr_sex_2,0.478242
11,num_veh,0.291829
29,bypass,0.238424
42,dr_cond_cd1,0.213912
26,location_type,0.113012
8,dr_sex_1,0.105006
16,pri_dist_bin,0.086601
43,dr_cond_cd2,0.073


In [25]:
for x in x_train:
    print ('fatal correlation by: ', x)
    A = train[[x, 'fatal']].groupby(x, as_index=False).mean()
    A = A.sort_values(by='fatal', ascending=False)
    print (A)

fatal correlation by:  crash_month
    crash_month     fatal
5             6  0.005151
6             7  0.005008
4             5  0.004978
11           12  0.004871
7             8  0.004624
8             9  0.004578
9            10  0.004287
3             4  0.004211
10           11  0.003975
1             2  0.003949
0             1  0.003727
2             3  0.002600
fatal correlation by:  crash_dayofweek
   crash_dayofweek     fatal
6                6  0.006378
5                5  0.005325
4                4  0.004503
0                0  0.004097
3                3  0.003990
1                1  0.003708
2                2  0.003078
fatal correlation by:  crash_hour
    crash_hour     fatal
1            1  0.014915
2            2  0.013323
4            4  0.012413
22          22  0.011216
21          21  0.010641
23          23  0.010138
5            5  0.009826
20          20  0.008764
3            3  0.008628
0            0  0.008589
6            6  0.007240
24          25  0.0063

# Run ML Algorithms

In [26]:
Models = [
    SVC(),
    KNeighborsClassifier(n_neighbors = 3), 
    GaussianNB(), 
    Perceptron(),
    LinearSVC(),
    SGDClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),        
]

Accuracy = []

for model in Models:
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    accuracy = round(model.score(x_train,y_train) * 100, 2)
    Accuracy.append(accuracy)
    print (accuracy, model)

99.57 SVC()
99.61 KNeighborsClassifier(n_neighbors=3)
85.08 GaussianNB()
99.56 Perceptron()
14.5 LinearSVC()
99.52 SGDClassifier()
100.0 DecisionTreeClassifier()
100.0 RandomForestClassifier()


In [27]:
models = pd.DataFrame({
    'Model': Models,
    'Score': Accuracy
})
models.sort_values(by='Score', ascending=False)

Unnamed: 0,Model,Score
6,DecisionTreeClassifier(),100.0
7,"(DecisionTreeClassifier(max_features='auto', r...",100.0
1,KNeighborsClassifier(n_neighbors=3),99.61
0,SVC(),99.57
3,Perceptron(),99.56
5,SGDClassifier(),99.52
2,GaussianNB(),85.08
4,LinearSVC(),14.5
