In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

## Binning Data

Several of the columns have a high number of unique entries. For categorical columns especially these values/descriptions will be binned inorder to increase future model efficiancy. 

In [23]:
bins = pd.read_csv('./zippedData/merged_data.csv')

In [24]:
bins.head()

Unnamed: 0.1,Unnamed: 0,CRASH_RECORD_ID,POSTED_SPEED_LIMIT,DEVICE_CONDITION,WEATHER_CONDITION,LIGHTING_CONDITION,FIRST_CRASH_TYPE,TRAFFICWAY_TYPE,ROADWAY_SURFACE_COND,INJURIES_TOTAL,...,VEHICLE_TYPE,VEHICLE_USE,MANEUVER,OCCUPANT_CNT,PERSON_TYPE,SEX,AGE,SAFETY_EQUIPMENT,AIRBAG_DEPLOYED,DRIVER_VISION
0,3,0115ade9a755e835255508463f7e9c4a9a0b47e9304238...,30,UNKNOWN,CLEAR,"DARKNESS, LIGHTED ROAD",ANGLE,NOT DIVIDED,DRY,0.0,...,UNKNOWN/NA,UNKNOWN/NA,STRAIGHT AHEAD,1.0,DRIVER,M,62.0,USAGE UNKNOWN,NOT APPLICABLE,UNKNOWN
1,4,017040c61958d2fa977c956b2bd2d6759ef7754496dc96...,30,FUNCTIONING PROPERLY,CLEAR,DAYLIGHT,REAR END,NOT DIVIDED,DRY,0.0,...,PASSENGER,PERSONAL,SLOW/STOP IN TRAFFIC,1.0,DRIVER,F,58.0,USAGE UNKNOWN,DID NOT DEPLOY,UNKNOWN
2,5,017040c61958d2fa977c956b2bd2d6759ef7754496dc96...,30,FUNCTIONING PROPERLY,CLEAR,DAYLIGHT,REAR END,NOT DIVIDED,DRY,0.0,...,PASSENGER,PERSONAL,SLOW/STOP IN TRAFFIC,1.0,DRIVER,M,49.0,USAGE UNKNOWN,DID NOT DEPLOY,UNKNOWN
3,6,017040c61958d2fa977c956b2bd2d6759ef7754496dc96...,30,FUNCTIONING PROPERLY,CLEAR,DAYLIGHT,REAR END,NOT DIVIDED,DRY,0.0,...,PASSENGER,PERSONAL,SLOW/STOP IN TRAFFIC,1.0,DRIVER,F,58.0,USAGE UNKNOWN,DID NOT DEPLOY,UNKNOWN
4,7,017040c61958d2fa977c956b2bd2d6759ef7754496dc96...,30,FUNCTIONING PROPERLY,CLEAR,DAYLIGHT,REAR END,NOT DIVIDED,DRY,0.0,...,PASSENGER,PERSONAL,SLOW/STOP IN TRAFFIC,1.0,DRIVER,M,49.0,USAGE UNKNOWN,DID NOT DEPLOY,UNKNOWN


In [25]:
bins.drop('Unnamed: 0', axis =1, inplace=True)

In [26]:
bins.head()

Unnamed: 0,CRASH_RECORD_ID,POSTED_SPEED_LIMIT,DEVICE_CONDITION,WEATHER_CONDITION,LIGHTING_CONDITION,FIRST_CRASH_TYPE,TRAFFICWAY_TYPE,ROADWAY_SURFACE_COND,INJURIES_TOTAL,INJURIES_FATAL,...,VEHICLE_TYPE,VEHICLE_USE,MANEUVER,OCCUPANT_CNT,PERSON_TYPE,SEX,AGE,SAFETY_EQUIPMENT,AIRBAG_DEPLOYED,DRIVER_VISION
0,0115ade9a755e835255508463f7e9c4a9a0b47e9304238...,30,UNKNOWN,CLEAR,"DARKNESS, LIGHTED ROAD",ANGLE,NOT DIVIDED,DRY,0.0,0.0,...,UNKNOWN/NA,UNKNOWN/NA,STRAIGHT AHEAD,1.0,DRIVER,M,62.0,USAGE UNKNOWN,NOT APPLICABLE,UNKNOWN
1,017040c61958d2fa977c956b2bd2d6759ef7754496dc96...,30,FUNCTIONING PROPERLY,CLEAR,DAYLIGHT,REAR END,NOT DIVIDED,DRY,0.0,0.0,...,PASSENGER,PERSONAL,SLOW/STOP IN TRAFFIC,1.0,DRIVER,F,58.0,USAGE UNKNOWN,DID NOT DEPLOY,UNKNOWN
2,017040c61958d2fa977c956b2bd2d6759ef7754496dc96...,30,FUNCTIONING PROPERLY,CLEAR,DAYLIGHT,REAR END,NOT DIVIDED,DRY,0.0,0.0,...,PASSENGER,PERSONAL,SLOW/STOP IN TRAFFIC,1.0,DRIVER,M,49.0,USAGE UNKNOWN,DID NOT DEPLOY,UNKNOWN
3,017040c61958d2fa977c956b2bd2d6759ef7754496dc96...,30,FUNCTIONING PROPERLY,CLEAR,DAYLIGHT,REAR END,NOT DIVIDED,DRY,0.0,0.0,...,PASSENGER,PERSONAL,SLOW/STOP IN TRAFFIC,1.0,DRIVER,F,58.0,USAGE UNKNOWN,DID NOT DEPLOY,UNKNOWN
4,017040c61958d2fa977c956b2bd2d6759ef7754496dc96...,30,FUNCTIONING PROPERLY,CLEAR,DAYLIGHT,REAR END,NOT DIVIDED,DRY,0.0,0.0,...,PASSENGER,PERSONAL,SLOW/STOP IN TRAFFIC,1.0,DRIVER,M,49.0,USAGE UNKNOWN,DID NOT DEPLOY,UNKNOWN


In [27]:
bins.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1494235 entries, 0 to 1494234
Data columns (total 28 columns):
 #   Column                Non-Null Count    Dtype  
---  ------                --------------    -----  
 0   CRASH_RECORD_ID       1494235 non-null  object 
 1   POSTED_SPEED_LIMIT    1494235 non-null  int64  
 2   DEVICE_CONDITION      1494235 non-null  object 
 3   WEATHER_CONDITION     1494235 non-null  object 
 4   LIGHTING_CONDITION    1494235 non-null  object 
 5   FIRST_CRASH_TYPE      1494235 non-null  object 
 6   TRAFFICWAY_TYPE       1494235 non-null  object 
 7   ROADWAY_SURFACE_COND  1494235 non-null  object 
 8   INJURIES_TOTAL        1494235 non-null  float64
 9   INJURIES_FATAL        1494235 non-null  float64
 10  CRASH_HOUR            1494235 non-null  int64  
 11  CRASH_DAY_OF_WEEK     1494235 non-null  int64  
 12  CRASH_MONTH           1494235 non-null  int64  
 13  LATITUDE              1494235 non-null  float64
 14  LONGITUDE             1494235 non-

**Device Condition**

In [28]:
bins.DEVICE_CONDITION.value_counts()
#Condition of traffic control device, as determined by reporting officer

NO CONTROLS                 743840
FUNCTIONING PROPERLY        647043
UNKNOWN                      75962
OTHER                        12100
FUNCTIONING IMPROPERLY        9211
NOT FUNCTIONING               5174
WORN REFLECTIVE MATERIAL       686
MISSING                        219
Name: DEVICE_CONDITION, dtype: int64

In [29]:
bins['DEVICE_CONDITION'] = bins.DEVICE_CONDITION.map({
    'NO CONTROLS':'Not Working',
    'FUNCTIONING PROPERLY':'Working', 
    'UNKNOWN': 'Unknown',
    'OTHER': 'Unknown',
    'FUNCTIONING IMPROPERLY': 'Not Working',
    'NOT FUNCTIONING': 'Not Working',
    'WORN REFLECTIVE MATERIAL': 'Working',
    'MISSING': 'Unknown' 
})

In [30]:
bins.DEVICE_CONDITION.value_counts()

Not Working    758225
Working        647729
Unknown         88281
Name: DEVICE_CONDITION, dtype: int64

In [31]:
bins.DEVICE_CONDITION.isna().sum()

0

**Posted Speed Limit**

In [32]:
bins.POSTED_SPEED_LIMIT.value_counts()

30    1148720
35     114914
25      79685
20      42373
15      34014
10      20993
40      19212
0       13793
45      11178
5        6262
55       1342
50        376
3         372
9         196
39        167
99        165
1          72
60         70
24         61
32         42
2          38
34         32
33         24
65         18
11         18
7          15
6          14
36         11
14          8
31          8
12          6
29          6
18          4
22          4
63          4
4           4
70          4
38          3
62          2
23          2
26          2
49          1
Name: POSTED_SPEED_LIMIT, dtype: int64

In [33]:
speed_bins = [-1, 30, 60, 100]
speed_labels = ['0-30', '30-60', '60-100']

In [34]:
bins.POSTED_SPEED_LIMIT = pd.cut(x=bins.POSTED_SPEED_LIMIT, 
                                    bins= speed_bins,
                                    labels = speed_labels
                                   ,)
                                   

In [35]:
bins.POSTED_SPEED_LIMIT.value_counts()

0-30      1346662
30-60      147380
60-100        193
Name: POSTED_SPEED_LIMIT, dtype: int64

In [36]:
bins.POSTED_SPEED_LIMIT.isna().sum()

0

**Weather Condition**

In [37]:
bins.WEATHER_CONDITION.value_counts()

CLEAR                       1199409
RAIN                         142735
SNOW                          53393
CLOUDY/OVERCAST               49859
UNKNOWN                       36373
OTHER                          4754
FREEZING RAIN/DRIZZLE          2465
SLEET/HAIL                     2180
FOG/SMOKE/HAZE                 2123
BLOWING SNOW                    667
SEVERE CROSS WIND GATE          261
BLOWING SAND, SOIL, DIRT         16
Name: WEATHER_CONDITION, dtype: int64

In [38]:
bins['WEATHER_CONDITION'] = bins.WEATHER_CONDITION.map({
    'CLEAR':'Clear',
    'RAIN':'Rain/Sleet/Snow', 
    'SNOW': 'Rain/Sleet/Snow',
    'CLOUDY/OVERCAST': 'Cloudy',
    'UNKNOWN': 'Other/Unknown',
    'OTHER': 'Other/Unknown',
    'FREEZING RAIN/DRIZZLE':'Rain/Sleet/Snow',
    'SLEET/HAIL': 'Rain/Sleet/Snow',
    'FOG/SMOKE/HAZE': 'Fog/Smoke/Haze',
    'BLOWING SNOW':'Rain/Sleet/Snow', 
    'SEVERE CROSS WIND GATE': 'Windy',
    'BLOWING SAND, SOIL, DIRT': 'Windy'
})

In [39]:
bins.WEATHER_CONDITION.value_counts()

Clear              1199409
Rain/Sleet/Snow     201440
Cloudy               49859
Other/Unknown        41127
Fog/Smoke/Haze        2123
Windy                  277
Name: WEATHER_CONDITION, dtype: int64

In [40]:
bins.WEATHER_CONDITION.isna().sum()

0

**Lighting Condition**

In [41]:
bins.LIGHTING_CONDITION.value_counts()

DAYLIGHT                  1034848
DARKNESS, LIGHTED ROAD     307958
DARKNESS                    58619
DUSK                        45534
DAWN                        23922
UNKNOWN                     23354
Name: LIGHTING_CONDITION, dtype: int64

In [42]:
bins['LIGHTING_CONDITION'] = bins.LIGHTING_CONDITION.map({
    'DAYLIGHT':'Day',
    'DARKNESS, LIGHTED ROAD':'Night', 
    'DARKNESS': 'Night',
    'DUSK': 'Night',
    'DAWN': 'Day',
    'UNKNOWN': 'Unknown',

})

In [43]:
bins.LIGHTING_CONDITION.value_counts()

Day        1058770
Night       412111
Unknown      23354
Name: LIGHTING_CONDITION, dtype: int64

In [44]:
bins.LIGHTING_CONDITION.isna().sum()

0

**Crash Type**

In [45]:
bins.FIRST_CRASH_TYPE.value_counts()

REAR END                        479779
TURNING                         288785
SIDESWIPE SAME DIRECTION        275420
ANGLE                           220738
PARKED MOTOR VEHICLE            113986
SIDESWIPE OPPOSITE DIRECTION     26012
FIXED OBJECT                     23179
HEAD ON                          17685
REAR TO FRONT                    14018
REAR TO SIDE                      9353
PEDESTRIAN                        8430
PEDALCYCLIST                      5816
OTHER OBJECT                      5264
REAR TO REAR                      3152
OTHER NONCOLLISION                1876
OVERTURNED                         384
ANIMAL                             315
TRAIN                               43
Name: FIRST_CRASH_TYPE, dtype: int64

In [46]:
bins['FIRST_CRASH_TYPE'] = bins.FIRST_CRASH_TYPE.map({
    'REAR END':'Moving',
    'TURNING':'Moving', 
    'SIDESWIPE SAME DIRECTION': 'Moving',
    'ANGLE': 'Moving',
    'PARKED MOTOR VEHICLE': 'Stationary',
    'SIDESWIPE OPPOSITE DIRECTION': 'Moving',
    'FIXED OBJECT':'Stationary', 
    'HEAD ON': 'Moving',
    'REAR TO FRONT': 'Moving',
    'REAR TO SIDE': 'Moving',
    'PEDESTRIAN': 'Object/Person',
    'PEDALCYCLIST': 'Object/Person',
    'OTHER OBJECT':'Object/Person', 
    'REAR TO REAR': 'Moving',
    'OTHER NONCOLLISION': 'Other',
    'OVERTURNED': 'Moving',
    'ANIMAL': 'Object/Person',
    'TRAIN': 'Train'
})

In [47]:
bins.FIRST_CRASH_TYPE.value_counts()

Moving           1335326
Stationary        137165
Object/Person      19825
Other               1876
Train                 43
Name: FIRST_CRASH_TYPE, dtype: int64

In [48]:
bins.FIRST_CRASH_TYPE.isna().sum()

0

**Trafficway Type**

In [49]:
bins.TRAFFICWAY_TYPE.value_counts()

NOT DIVIDED                        681256
DIVIDED - W/MEDIAN (NOT RAISED)    298098
ONE-WAY                            133395
DIVIDED - W/MEDIAN BARRIER         104396
FOUR WAY                            93176
PARKING LOT                         66097
OTHER                               36786
T-INTERSECTION                      18319
CENTER TURN LANE                    15240
ALLEY                               15054
UNKNOWN                             12364
RAMP                                 4586
UNKNOWN INTERSECTION TYPE            4420
DRIVEWAY                             3494
FIVE POINT, OR MORE                  2218
Y-INTERSECTION                       2114
TRAFFIC ROUTE                        1746
NOT REPORTED                          885
ROUNDABOUT                            310
L-INTERSECTION                        281
Name: TRAFFICWAY_TYPE, dtype: int64

In [50]:
bins['TRAFFICWAY_TYPE'] = bins.TRAFFICWAY_TYPE.map({
    'NOT DIVIDED': 'Road/Street',
    'DIVIDED - W/MEDIAN (NOT RAISED)': 'Road/Street',
    'ONE-WAY': 'Road/Street',
    'DIVIDED - W/MEDIAN BARRIER': 'Road/Street',
    'FOUR WAY': 'Intersection',
    'PARKING LOT': 'Parking lot',
    'OTHER': "Other/Unknown",
    'T-INTERSECTION': 'Intersection',
    'CENTER TURN LANE': 'Road/Street',
    'ALLEY': 'Road/Streen',
    'UNKNOWN': 'Other/Unknown',
    'RAMP': 'Highway',
    'UNKNOWN INTERSECTION TYPE': "Other/Unknown",
    'DRIVEWAY': 'Parking lot',
    'FIVE POINT, OR MORE': 'Intersection',
    'Y-INTERSECTION': 'Intersection',
    'TRAFFIC ROUTE': 'Other/Unknown',
    'NOT REPORTED': 'Other/Unknown',
    'ROUNDABOUT': 'Intersection',
    'L-INTERSECTION': 'Intersection'
})

In [51]:
bins.TRAFFICWAY_TYPE.value_counts()

Road/Street      1232385
Intersection      116418
Parking lot        69591
Other/Unknown      56201
Road/Streen        15054
Highway             4586
Name: TRAFFICWAY_TYPE, dtype: int64

In [52]:
bins.TRAFFICWAY_TYPE.isna().sum()

0

**Roadway Surface Condition**

In [53]:
bins.ROADWAY_SURFACE_COND.value_counts()

DRY                1131700
WET                 219163
UNKNOWN              75137
SNOW OR SLUSH        52510
ICE                  11965
OTHER                 3380
SAND, MUD, DIRT        380
Name: ROADWAY_SURFACE_COND, dtype: int64

In [54]:
bins['ROADWAY_SURFACE_COND'] = bins.ROADWAY_SURFACE_COND.map({
    'DRY': 'Dry',
    'WET': 'Wet',
    'UNKNOWN': 'Other/Unknown',
    'SNOW OR SLUSH': 'Ice',
    'ICE': 'Ice',
    'OTHER': "Other/Unknown",
    'SAND, MUD, DIRT': "Other/Unknown"
})

In [55]:
bins.ROADWAY_SURFACE_COND.value_counts()

Dry              1131700
Wet               219163
Other/Unknown      78897
Ice                64475
Name: ROADWAY_SURFACE_COND, dtype: int64

In [56]:
bins.CRASH_HOUR.isna().sum()

0

**Crash Hour**

In [57]:
bins.CRASH_HOUR.value_counts()

16    124120
15    122668
17    121806
14    106454
18     97148
13     95384
12     90387
8      83104
11     78378
9      69622
10     68646
19     67655
7      65913
20     51457
21     45511
22     41240
23     33240
6      30476
0      24581
1      19829
2      16665
5      15785
3      12672
4      11494
Name: CRASH_HOUR, dtype: int64

In [58]:
bins.CRASH_HOUR = pd.cut(x = bins.CRASH_HOUR,
                                   bins =[-1, 6, 12, 18, 23],
                                   labels = ['Early Morning', 'Morning', 'Afternoon', 'Evening/Night'])

In [59]:
bins.CRASH_HOUR.value_counts()

Afternoon        667580
Morning          456050
Evening/Night    239103
Early Morning    131502
Name: CRASH_HOUR, dtype: int64

In [60]:
bins.CRASH_HOUR.isna().sum()

0

**Unit Number**

A unique ID for each unit within a specific crash report.

In [61]:
bins.UNIT_NO.value_counts()

2     719598
1     691899
3      64788
4      12883
5       3203
6       1054
7        436
8        207
9         87
10        22
11        14
12         8
15         6
14         6
13         6
0          5
16         5
17         4
18         4
Name: UNIT_NO, dtype: int64

In [62]:
#after looking at value counts, 
#this column is not helpful and will be dropped

In [63]:
bins.drop('UNIT_NO', axis =1, inplace=True)

**Unit Type**

In [64]:
bins.UNIT_TYPE.value_counts()

DRIVER                 1418772
PARKED                   70216
DRIVERLESS                5047
NON-CONTACT VEHICLE        134
DISABLED VEHICLE            66
Name: UNIT_TYPE, dtype: int64

**Vehicle Year**

In [65]:
bins.VEHICLE_YEAR.value_counts()

2015.0    115288
2016.0    107971
2017.0    103143
2014.0    102241
2013.0     93867
           ...  
2515.0         1
2222.0         1
2116.0         1
2079.0         1
8324.0         1
Name: VEHICLE_YEAR, Length: 192, dtype: int64

In [66]:
bins.VEHICLE_YEAR.nunique()

192

In [67]:
#several issues with year entries, will drop column
bins.drop('VEHICLE_YEAR', axis =1, inplace=True)

In [68]:
bins.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1494235 entries, 0 to 1494234
Data columns (total 26 columns):
 #   Column                Non-Null Count    Dtype   
---  ------                --------------    -----   
 0   CRASH_RECORD_ID       1494235 non-null  object  
 1   POSTED_SPEED_LIMIT    1494235 non-null  category
 2   DEVICE_CONDITION      1494235 non-null  object  
 3   WEATHER_CONDITION     1494235 non-null  object  
 4   LIGHTING_CONDITION    1494235 non-null  object  
 5   FIRST_CRASH_TYPE      1494235 non-null  object  
 6   TRAFFICWAY_TYPE       1494235 non-null  object  
 7   ROADWAY_SURFACE_COND  1494235 non-null  object  
 8   INJURIES_TOTAL        1494235 non-null  float64 
 9   INJURIES_FATAL        1494235 non-null  float64 
 10  CRASH_HOUR            1494235 non-null  category
 11  CRASH_DAY_OF_WEEK     1494235 non-null  int64   
 12  CRASH_MONTH           1494235 non-null  int64   
 13  LATITUDE              1494235 non-null  float64 
 14  LONGITUDE         

**Vehicle Type**

In [69]:
bins.VEHICLE_TYPE.value_counts()

PASSENGER                                 1033602
SPORT UTILITY VEHICLE (SUV)                215552
VAN/MINI-VAN                                76333
PICKUP                                      47553
UNKNOWN/NA                                  35053
TRUCK - SINGLE UNIT                         26679
BUS OVER 15 PASS.                           16660
OTHER                                       14947
TRACTOR W/ SEMI-TRAILER                     14345
MOTORCYCLE (OVER 150CC)                      3586
BUS UP TO 15 PASS.                           3341
TRACTOR W/O SEMI-TRAILER                     2057
OTHER VEHICLE WITH TRAILER                   1894
SINGLE UNIT TRUCK WITH TRAILER               1679
MOTOR DRIVEN CYCLE                            323
MOPED OR MOTORIZED BICYCLE                    259
AUTOCYCLE                                     219
ALL-TERRAIN VEHICLE (ATV)                      78
3-WHEELED MOTORCYCLE (2 REAR WHEELS)           36
FARM EQUIPMENT                                 33


In [70]:
bins['VEHICLE_TYPE'] = bins.VEHICLE_TYPE.map({
    'PASSENGER': 'Car',
    'SPORT UTILITY VEHICLE (SUV)': 'SUV/Van',
    'VAN/MINI-VAN': 'SUV/Van',
    'PICKUP': 'Truck',
    'UNKNOWN/NA': 'Unknown',
    'TRUCK - SINGLE UNIT': "Truck",
    'BUS OVER 15 PASS.': "Bus",
    'OTHER': "Other",
    'TRACTOR W/ SEMI-TRAILER': 'Semi',
    'TRACTOR W/O SEMI-TRAILER': "Semi",
    'MOTORCYCLE (OVER 150CC)': 'Motorbike/cycle',
    'BUS UP TO 15 PASS.': 'Bus',
    'OTHER VEHICLE WITH TRAILER': "Trailer",
    'SINGLE UNIT TRUCK WITH TRAILER': "Trailer",
    'MOTOR DRIVEN CYCLE': "Motorbike/cycle",
    'MOPED OR MOTORIZED BICYCLE': "Motorbike/cycle",
    'AUTOCYCLE': "Motorbike/cycle",
    'ALL-TERRAIN VEHICLE (ATV)': "Other",
    '3-WHEELED MOTORCYCLE (2 REAR WHEELS)': "Motorbike/cycle",
    'FARM EQUIPMENT': "Other",
    'RECREATIONAL OFF-HIGHWAY VEHICLE (ROV)': "Other"
})

In [71]:
bins.VEHICLE_TYPE.value_counts()

Car                1033602
SUV/Van             291885
Truck                74232
Unknown              35053
Bus                  20001
Semi                 16402
Other                15064
Motorbike/cycle       4423
Trailer               3573
Name: VEHICLE_TYPE, dtype: int64

In [72]:
bins.VEHICLE_TYPE.isna().sum()

0

**Vehicle Use**

In [73]:
bins.VEHICLE_USE.value_counts()
#after further inspection, does not seem descriptive enough, 
#will be dropped

PERSONAL                        1159692
UNKNOWN/NA                       140339
OTHER                             45761
NOT IN USE                        32573
TAXI/FOR HIRE                     25835
COMMERCIAL - SINGLE UNIT          20214
RIDESHARE SERVICE                 15882
CTA                               11033
POLICE                             9629
CONSTRUCTION/MAINTENANCE           7433
COMMERCIAL - MULTI-UNIT            6567
OTHER TRANSIT                      4923
SCHOOL BUS                         4304
TOW TRUCK                          3026
AMBULANCE                          1752
DRIVER EDUCATION                   1289
STATE OWNED                        1200
MASS TRANSIT                        988
FIRE                                831
LAWN CARE/LANDSCAPING               646
AGRICULTURE                         157
CAMPER/RV - SINGLE UNIT              79
MILITARY                             54
HOUSE TRAILER                        20
CAMPER/RV - TOWED/MULTI-UNIT          8


In [74]:
bins.drop('VEHICLE_USE', axis =1, inplace=True)

**Maneuver**

In [75]:
bins.MANEUVER.value_counts()

STRAIGHT AHEAD                        818543
SLOW/STOP IN TRAFFIC                  159197
TURNING LEFT                          109157
PARKED                                 71684
TURNING RIGHT                          55076
BACKING                                53851
UNKNOWN/NA                             34963
PASSING/OVERTAKING                     34062
CHANGING LANES                         33805
OTHER                                  26005
ENTERING TRAFFIC LANE FROM PARKING     20883
MERGING                                12483
STARTING IN TRAFFIC                    11625
U-TURN                                  9706
AVOIDING VEHICLES/OBJECTS               7240
ENTER FROM DRIVE/ALLEY                  6893
SKIDDING/CONTROL LOSS                   6379
LEAVING TRAFFIC LANE TO PARK            5914
SLOW/STOP - LEFT TURN                   4729
SLOW/STOP - RIGHT TURN                  2773
SLOW/STOP - LOAD/UNLOAD                 2321
PARKED IN TRAFFIC LANE                  1970
NEGOTIATIN

In [76]:
#seems redundant of crash type, will drop 

In [77]:
bins.drop('MANEUVER', axis =1, inplace=True)

**Occupant Count**

In [78]:
bins.OCCUPANT_CNT.value_counts()

1.0     1146667
2.0      195826
0.0       67704
3.0       53024
4.0       20551
5.0        7033
6.0        1867
7.0         731
8.0         261
9.0         131
11.0         88
10.0         72
12.0         47
13.0         45
15.0         27
20.0         19
16.0         19
14.0         16
19.0         13
36.0         10
18.0          9
26.0          9
17.0          6
28.0          6
25.0          5
30.0          5
35.0          4
33.0          4
39.0          4
29.0          4
41.0          4
44.0          3
27.0          3
21.0          3
43.0          2
60.0          2
24.0          2
23.0          2
34.0          2
47.0          1
38.0          1
37.0          1
22.0          1
31.0          1
Name: OCCUPANT_CNT, dtype: int64

In [79]:
#since most vehicles are not buses or trains 
#and can hold 7 or less people this feature does not seem helpful
# and will be dropped
# number injured in each crash will be more valueable 

In [80]:
bins.drop('OCCUPANT_CNT', axis =1, inplace=True)

**Person Type**

In [81]:
bins.PERSON_TYPE.value_counts()

DRIVER                 1493849
NON-MOTOR VEHICLE          214
NON-CONTACT VEHICLE        172
Name: PERSON_TYPE, dtype: int64

**Sex**

In [82]:
bins.SEX.value_counts()

M    892580
F    599993
X      1662
Name: SEX, dtype: int64

**Age**

In [83]:
bins.AGE.value_counts()

 27.0     44368
 25.0     44331
 26.0     44063
 28.0     43367
 29.0     42186
          ...  
-1.0          3
 102.0        3
-177.0        2
-47.0         2
-49.0         1
Name: AGE, Length: 113, dtype: int64

In [84]:
#seems to be data entry errors, 
#will only keep rows with people between 0 and 100 years of age

In [85]:
bins = bins[bins.AGE > 0]

In [86]:
bins.head()

Unnamed: 0,CRASH_RECORD_ID,POSTED_SPEED_LIMIT,DEVICE_CONDITION,WEATHER_CONDITION,LIGHTING_CONDITION,FIRST_CRASH_TYPE,TRAFFICWAY_TYPE,ROADWAY_SURFACE_COND,INJURIES_TOTAL,INJURIES_FATAL,...,LATITUDE,LONGITUDE,UNIT_TYPE,VEHICLE_TYPE,PERSON_TYPE,SEX,AGE,SAFETY_EQUIPMENT,AIRBAG_DEPLOYED,DRIVER_VISION
0,0115ade9a755e835255508463f7e9c4a9a0b47e9304238...,0-30,Unknown,Clear,Night,Moving,Road/Street,Dry,0.0,0.0,...,41.886336,-87.716203,DRIVER,Unknown,DRIVER,M,62.0,USAGE UNKNOWN,NOT APPLICABLE,UNKNOWN
1,017040c61958d2fa977c956b2bd2d6759ef7754496dc96...,0-30,Working,Clear,Day,Moving,Road/Street,Dry,0.0,0.0,...,41.925111,-87.667997,DRIVER,Car,DRIVER,F,58.0,USAGE UNKNOWN,DID NOT DEPLOY,UNKNOWN
2,017040c61958d2fa977c956b2bd2d6759ef7754496dc96...,0-30,Working,Clear,Day,Moving,Road/Street,Dry,0.0,0.0,...,41.925111,-87.667997,DRIVER,Car,DRIVER,M,49.0,USAGE UNKNOWN,DID NOT DEPLOY,UNKNOWN
3,017040c61958d2fa977c956b2bd2d6759ef7754496dc96...,0-30,Working,Clear,Day,Moving,Road/Street,Dry,0.0,0.0,...,41.925111,-87.667997,DRIVER,Car,DRIVER,F,58.0,USAGE UNKNOWN,DID NOT DEPLOY,UNKNOWN
4,017040c61958d2fa977c956b2bd2d6759ef7754496dc96...,0-30,Working,Clear,Day,Moving,Road/Street,Dry,0.0,0.0,...,41.925111,-87.667997,DRIVER,Car,DRIVER,M,49.0,USAGE UNKNOWN,DID NOT DEPLOY,UNKNOWN


In [87]:
bins.AGE.value_counts()

27.0     44368
25.0     44331
26.0     44063
28.0     43367
29.0     42186
         ...  
108.0        5
109.0        4
107.0        4
104.0        4
102.0        3
Name: AGE, Length: 108, dtype: int64

In [88]:
bins = bins[bins.AGE < 100]

In [89]:
bins.AGE.value_counts()

27.0    44368
25.0    44331
26.0    44063
28.0    43367
29.0    42186
        ...  
97.0       39
9.0        36
6.0        27
7.0        24
5.0        20
Name: AGE, Length: 99, dtype: int64

In [90]:
bins.AGE = pd.cut(x = bins.AGE,
                         bins =[-1, 25, 50, 75, 100],
                         labels = ['Young', 'Adult', 'Senior', 'Elderly'])

In [91]:
bins.AGE.value_counts()

Adult      809521
Senior     361302
Young      281923
Elderly     27489
Name: AGE, dtype: int64

**Safety Equipment**

In [92]:
bins.SAFETY_EQUIPMENT.value_counts()

SAFETY BELT USED                       934383
USAGE UNKNOWN                          500761
NONE PRESENT                            33673
SAFETY BELT NOT USED                     7272
HELMET NOT USED                          1816
DOT COMPLIANT MOTORCYCLE HELMET          1237
HELMET USED                               718
NOT DOT COMPLIANT MOTORCYCLE HELMET       195
SHOULD/LAP BELT USED IMPROPERLY           170
STRETCHER                                   4
WHEELCHAIR                                  3
CHILD RESTRAINT USED                        3
Name: SAFETY_EQUIPMENT, dtype: int64

In [93]:
bins['SAFETY_EQUIPMENT'] = bins.SAFETY_EQUIPMENT.map({
    'SAFETY BELT USED': 'Used',
    'USAGE UNKNOWN': 'Unknown',
    'NONE PRESENT': 'Not Used',
    'SAFETY BELT NOT USED': 'Not Used',
    'HELMET NOT USED': 'Not Used',
    'DOT COMPLIANT MOTORCYCLE HELMET': "Used",
    'HELMET USED': "Used",
    'NOT DOT COMPLIANT MOTORCYCLE HELMET': "Used",
    'SHOULD/LAP BELT USED IMPROPERLY': 'Not Used',
    'STRETCHER': "Used",
    'WHEELCHAIR': 'Used',
    'CHILD RESTRAINT USED': 'Used'
})

In [94]:
bins.SAFETY_EQUIPMENT.value_counts()

Used        936543
Unknown     500761
Not Used     42931
Name: SAFETY_EQUIPMENT, dtype: int64

In [95]:
bins.SAFETY_EQUIPMENT.isna().sum()

0

**Driver Vision**

In [96]:
bins.DRIVER_VISION.value_counts()

NOT OBSCURED              993110
UNKNOWN                   443484
OTHER                      16025
MOVING VEHICLES            11206
PARKED VEHICLES             7082
WINDSHIELD (WATER/ICE)      5266
BLINDED - SUNLIGHT          2095
TREES, PLANTS                793
BUILDINGS                    578
HILLCREST                    160
BLINDED - HEADLIGHTS         150
EMBANKMENT                   122
BLOWING MATERIALS            116
SIGNBOARD                     48
Name: DRIVER_VISION, dtype: int64

In [97]:
bins['DRIVER_VISION'] = bins.DRIVER_VISION.map({
    'NOT OBSCURED': 'Not Obstructed',
    'UNKNOWN': 'Unknown/Other',
    'OTHER': 'Uknown/Other',
    'MOVING VEHICLES': 'Obstructed',
    'PARKED VEHICLES': 'Obstructed',
    'WINDSHIELD (WATER/ICE)': "Obstructed",
    'BLINDED - SUNLIGHT': "Obstructed",
    'TREES, PLANTS': "Obstructed",
    'BUILDINGS': 'Obstructed',
    'HILLCREST': "Obstructed",
    'BLINDED - HEADLIGHTS': 'Obstructed',
    'EMBANKMENT': 'Obstructed',
    'BLOWING MATERIALS': 'Obstructed',
    'SIGNBOARD': 'Obstructed'
})

In [98]:
bins.SAFETY_EQUIPMENT.value_counts()

Used        936543
Unknown     500761
Not Used     42931
Name: SAFETY_EQUIPMENT, dtype: int64

In [99]:
bins.SAFETY_EQUIPMENT.isna().sum()

0

**Airbag Deployed**

In [100]:
bins.AIRBAG_DEPLOYED.value_counts()

DID NOT DEPLOY                            999013
NOT APPLICABLE                            287010
DEPLOYMENT UNKNOWN                         73775
DEPLOYED, FRONT                            64015
DEPLOYED, COMBINATION                      40455
DEPLOYED, SIDE                             15171
DEPLOYED OTHER (KNEE, AIR, BELT, ETC.)       796
Name: AIRBAG_DEPLOYED, dtype: int64

In [101]:
bins['AIRBAG_DEPLOYED'] = bins.AIRBAG_DEPLOYED.map({
    'DID NOT DEPLOY': 'Not Deployed',
    'NOT APPLICABLE': 'Not Deployed',
    'DEPLOYMENT UNKNOWN': 'Uknown',
    'DEPLOYED, FRONT': 'Deployed',
    'DEPLOYED, COMBINATION': 'Deployed',
    'DEPLOYED, SIDE': 'Deployed',
    'DEPLOYED, OTHER (KNEE, AIR, BELT, ETC.)': 'Deployed'
})

In [102]:
bins.AIRBAG_DEPLOYED.value_counts()

Not Deployed    1286023
Deployed         119641
Uknown            73775
Name: AIRBAG_DEPLOYED, dtype: int64

In [103]:
bins.SAFETY_EQUIPMENT.isna().sum()

0

In [104]:
bins.head()

Unnamed: 0,CRASH_RECORD_ID,POSTED_SPEED_LIMIT,DEVICE_CONDITION,WEATHER_CONDITION,LIGHTING_CONDITION,FIRST_CRASH_TYPE,TRAFFICWAY_TYPE,ROADWAY_SURFACE_COND,INJURIES_TOTAL,INJURIES_FATAL,...,LATITUDE,LONGITUDE,UNIT_TYPE,VEHICLE_TYPE,PERSON_TYPE,SEX,AGE,SAFETY_EQUIPMENT,AIRBAG_DEPLOYED,DRIVER_VISION
0,0115ade9a755e835255508463f7e9c4a9a0b47e9304238...,0-30,Unknown,Clear,Night,Moving,Road/Street,Dry,0.0,0.0,...,41.886336,-87.716203,DRIVER,Unknown,DRIVER,M,Senior,Unknown,Not Deployed,Unknown/Other
1,017040c61958d2fa977c956b2bd2d6759ef7754496dc96...,0-30,Working,Clear,Day,Moving,Road/Street,Dry,0.0,0.0,...,41.925111,-87.667997,DRIVER,Car,DRIVER,F,Senior,Unknown,Not Deployed,Unknown/Other
2,017040c61958d2fa977c956b2bd2d6759ef7754496dc96...,0-30,Working,Clear,Day,Moving,Road/Street,Dry,0.0,0.0,...,41.925111,-87.667997,DRIVER,Car,DRIVER,M,Adult,Unknown,Not Deployed,Unknown/Other
3,017040c61958d2fa977c956b2bd2d6759ef7754496dc96...,0-30,Working,Clear,Day,Moving,Road/Street,Dry,0.0,0.0,...,41.925111,-87.667997,DRIVER,Car,DRIVER,F,Senior,Unknown,Not Deployed,Unknown/Other
4,017040c61958d2fa977c956b2bd2d6759ef7754496dc96...,0-30,Working,Clear,Day,Moving,Road/Street,Dry,0.0,0.0,...,41.925111,-87.667997,DRIVER,Car,DRIVER,M,Adult,Unknown,Not Deployed,Unknown/Other


In [105]:
bins.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1480235 entries, 0 to 1494234
Data columns (total 23 columns):
 #   Column                Non-Null Count    Dtype   
---  ------                --------------    -----   
 0   CRASH_RECORD_ID       1480235 non-null  object  
 1   POSTED_SPEED_LIMIT    1480235 non-null  category
 2   DEVICE_CONDITION      1480235 non-null  object  
 3   WEATHER_CONDITION     1480235 non-null  object  
 4   LIGHTING_CONDITION    1480235 non-null  object  
 5   FIRST_CRASH_TYPE      1480235 non-null  object  
 6   TRAFFICWAY_TYPE       1480235 non-null  object  
 7   ROADWAY_SURFACE_COND  1480235 non-null  object  
 8   INJURIES_TOTAL        1480235 non-null  float64 
 9   INJURIES_FATAL        1480235 non-null  float64 
 10  CRASH_HOUR            1480235 non-null  category
 11  CRASH_DAY_OF_WEEK     1480235 non-null  int64   
 12  CRASH_MONTH           1480235 non-null  int64   
 13  LATITUDE              1480235 non-null  float64 
 14  LONGITUDE         

In [106]:
df_clean = bins

In [107]:
df_clean.head()

Unnamed: 0,CRASH_RECORD_ID,POSTED_SPEED_LIMIT,DEVICE_CONDITION,WEATHER_CONDITION,LIGHTING_CONDITION,FIRST_CRASH_TYPE,TRAFFICWAY_TYPE,ROADWAY_SURFACE_COND,INJURIES_TOTAL,INJURIES_FATAL,...,LATITUDE,LONGITUDE,UNIT_TYPE,VEHICLE_TYPE,PERSON_TYPE,SEX,AGE,SAFETY_EQUIPMENT,AIRBAG_DEPLOYED,DRIVER_VISION
0,0115ade9a755e835255508463f7e9c4a9a0b47e9304238...,0-30,Unknown,Clear,Night,Moving,Road/Street,Dry,0.0,0.0,...,41.886336,-87.716203,DRIVER,Unknown,DRIVER,M,Senior,Unknown,Not Deployed,Unknown/Other
1,017040c61958d2fa977c956b2bd2d6759ef7754496dc96...,0-30,Working,Clear,Day,Moving,Road/Street,Dry,0.0,0.0,...,41.925111,-87.667997,DRIVER,Car,DRIVER,F,Senior,Unknown,Not Deployed,Unknown/Other
2,017040c61958d2fa977c956b2bd2d6759ef7754496dc96...,0-30,Working,Clear,Day,Moving,Road/Street,Dry,0.0,0.0,...,41.925111,-87.667997,DRIVER,Car,DRIVER,M,Adult,Unknown,Not Deployed,Unknown/Other
3,017040c61958d2fa977c956b2bd2d6759ef7754496dc96...,0-30,Working,Clear,Day,Moving,Road/Street,Dry,0.0,0.0,...,41.925111,-87.667997,DRIVER,Car,DRIVER,F,Senior,Unknown,Not Deployed,Unknown/Other
4,017040c61958d2fa977c956b2bd2d6759ef7754496dc96...,0-30,Working,Clear,Day,Moving,Road/Street,Dry,0.0,0.0,...,41.925111,-87.667997,DRIVER,Car,DRIVER,M,Adult,Unknown,Not Deployed,Unknown/Other


In [108]:
df_clean.to_csv('./zippedData/df_clean.csv')