# Chicago Car Crash 

- Read in dataset from https://data.cityofchicago.org/Transportation/Traffic-Crashes-Crashes/85ca-t3if
- Checking out the dataset
- Encode data to categories/classes
- Target = 'Rear End' type crashes
- Models DecisionTreeClassifier and RandomForestClassifier


In [1]:
# Importing libraries 
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.tree import DecisionTreeClassifier, export_graphviz
import graphviz
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score

from mlxtend.evaluate import feature_importance_permutation

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sn

In [2]:
# read in the csv
df = pd.read_csv('Traffic_Crashes_-_Crashes.csv')

In [3]:
df.shape

(398690, 49)

In [4]:
df.head()

Unnamed: 0,CRASH_RECORD_ID,RD_NO,CRASH_DATE_EST_I,CRASH_DATE,POSTED_SPEED_LIMIT,TRAFFIC_CONTROL_DEVICE,DEVICE_CONDITION,WEATHER_CONDITION,LIGHTING_CONDITION,FIRST_CRASH_TYPE,...,INJURIES_NON_INCAPACITATING,INJURIES_REPORTED_NOT_EVIDENT,INJURIES_NO_INDICATION,INJURIES_UNKNOWN,CRASH_HOUR,CRASH_DAY_OF_WEEK,CRASH_MONTH,LATITUDE,LONGITUDE,LOCATION
0,00027e2894dd2f3fe4ff320a6d332d18e465b5c8ba2e79...,JC201794,,03/27/2019 04:20:00 PM,30,NO CONTROLS,NO CONTROLS,CLEAR,DAYLIGHT,ANGLE,...,0.0,0.0,4.0,0.0,16,4,3,41.994704,-87.699395,POINT (-87.699395085278 41.994703544264)
1,0002fb938eb0feaea33820fa55b71e4ea0b332125e1510...,JB175900,,03/06/2018 03:00:00 PM,30,NO CONTROLS,NO CONTROLS,CLEAR,DAYLIGHT,SIDESWIPE SAME DIRECTION,...,0.0,0.0,3.0,0.0,15,3,3,41.904671,-87.716685,POINT (-87.716685434641 41.904671135876)
2,0003fc68d857da032b1beafa8f7893ce0b123ce6b2dee5...,HZ488533,,10/25/2016 02:10:00 PM,30,TRAFFIC SIGNAL,UNKNOWN,CLEAR,DAYLIGHT,REAR END,...,0.0,0.0,2.0,0.0,14,3,10,41.706808,-87.642771,POINT (-87.642771373276 41.706807861572)
3,00060f93d2ae8d4e4c7fe75ac17055dd69a081d0b928a5...,JD123586,,01/21/2020 04:16:00 PM,30,NO CONTROLS,NO CONTROLS,CLEAR,DAYLIGHT,PEDESTRIAN,...,1.0,0.0,1.0,0.0,16,3,1,41.79161,-87.703356,POINT (-87.703355598231 41.791609871969)
4,0006882952e53c291df267014a03b57684383e9ad66d9e...,JC459747,,10/04/2019 12:00:00 PM,30,TRAFFIC SIGNAL,FUNCTIONING PROPERLY,CLEAR,DAYLIGHT,REAR END,...,0.0,0.0,2.0,0.0,12,6,10,41.84849,-87.675599,POINT (-87.67559940405 41.848490427751)


In [5]:
# Checking out the dataframe columns
df.columns

Index(['CRASH_RECORD_ID', 'RD_NO', 'CRASH_DATE_EST_I', 'CRASH_DATE',
       'POSTED_SPEED_LIMIT', 'TRAFFIC_CONTROL_DEVICE', 'DEVICE_CONDITION',
       'WEATHER_CONDITION', 'LIGHTING_CONDITION', 'FIRST_CRASH_TYPE',
       'TRAFFICWAY_TYPE', 'LANE_CNT', 'ALIGNMENT', 'ROADWAY_SURFACE_COND',
       'ROAD_DEFECT', 'REPORT_TYPE', 'CRASH_TYPE', 'INTERSECTION_RELATED_I',
       'NOT_RIGHT_OF_WAY_I', 'HIT_AND_RUN_I', 'DAMAGE', 'DATE_POLICE_NOTIFIED',
       'PRIM_CONTRIBUTORY_CAUSE', 'SEC_CONTRIBUTORY_CAUSE', 'STREET_NO',
       'STREET_DIRECTION', 'STREET_NAME', 'BEAT_OF_OCCURRENCE',
       'PHOTOS_TAKEN_I', 'STATEMENTS_TAKEN_I', 'DOORING_I', 'WORK_ZONE_I',
       'WORK_ZONE_TYPE', 'WORKERS_PRESENT_I', 'NUM_UNITS',
       'MOST_SEVERE_INJURY', 'INJURIES_TOTAL', 'INJURIES_FATAL',
       'INJURIES_INCAPACITATING', 'INJURIES_NON_INCAPACITATING',
       'INJURIES_REPORTED_NOT_EVIDENT', 'INJURIES_NO_INDICATION',
       'INJURIES_UNKNOWN', 'CRASH_HOUR', 'CRASH_DAY_OF_WEEK', 'CRASH_MONTH',
       'LA

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398690 entries, 0 to 398689
Data columns (total 49 columns):
CRASH_RECORD_ID                  398690 non-null object
RD_NO                            396907 non-null object
CRASH_DATE_EST_I                 29500 non-null object
CRASH_DATE                       398690 non-null object
POSTED_SPEED_LIMIT               398690 non-null int64
TRAFFIC_CONTROL_DEVICE           398690 non-null object
DEVICE_CONDITION                 398690 non-null object
WEATHER_CONDITION                398690 non-null object
LIGHTING_CONDITION               398690 non-null object
FIRST_CRASH_TYPE                 398690 non-null object
TRAFFICWAY_TYPE                  398690 non-null object
LANE_CNT                         198551 non-null float64
ALIGNMENT                        398690 non-null object
ROADWAY_SURFACE_COND             398690 non-null object
ROAD_DEFECT                      398690 non-null object
REPORT_TYPE                      389281 non-null o

In [7]:
# Total Nans in the dataset (4 Million points)
df.isna().sum().sum()

3958429

In [8]:
# filling all na with UNKNOWN string, so we can encode later
# prevent from removing data
df.fillna('UNKOWN', inplace=True)

In [9]:
# dropping the columns that are unique values or does not provide insight 
df.drop(['CRASH_DATE', 'CRASH_RECORD_ID', 'CRASH_DATE_EST_I', 'RD_NO', 'REPORT_TYPE', 'STREET_NO', 'BEAT_OF_OCCURRENCE', 
         'PHOTOS_TAKEN_I', 'STATEMENTS_TAKEN_I', 'WORKERS_PRESENT_I', 'INJURIES_UNKNOWN', 'LONGITUDE', 'LATITUDE',
         'MOST_SEVERE_INJURY', 'INJURIES_TOTAL', 'INJURIES_FATAL','INJURIES_INCAPACITATING', 
         'INJURIES_NON_INCAPACITATING', 'INJURIES_REPORTED_NOT_EVIDENT', 'INJURIES_NO_INDICATION', 'DAMAGE',
         'DATE_POLICE_NOTIFIED', 'CRASH_TYPE','NUM_UNITS','STREET_DIRECTION','STREET_NAME', 'LANE_CNT', 'LOCATION'], axis=1, inplace=True)

In [10]:
df.columns

Index(['POSTED_SPEED_LIMIT', 'TRAFFIC_CONTROL_DEVICE', 'DEVICE_CONDITION',
       'WEATHER_CONDITION', 'LIGHTING_CONDITION', 'FIRST_CRASH_TYPE',
       'TRAFFICWAY_TYPE', 'ALIGNMENT', 'ROADWAY_SURFACE_COND', 'ROAD_DEFECT',
       'INTERSECTION_RELATED_I', 'NOT_RIGHT_OF_WAY_I', 'HIT_AND_RUN_I',
       'PRIM_CONTRIBUTORY_CAUSE', 'SEC_CONTRIBUTORY_CAUSE', 'DOORING_I',
       'WORK_ZONE_I', 'WORK_ZONE_TYPE', 'CRASH_HOUR', 'CRASH_DAY_OF_WEEK',
       'CRASH_MONTH'],
      dtype='object')

In [11]:
# We have speed limits that are not logged correctly, so we will drop them.
# There wasn't a lot so this will not effect our data
list_ = [3, 9, 99, 39, 1, 2, 32, 33, 6, 24, 11, 34, 18, 12, 36, 7, 14, 16, 38, 31, 22, 23, 63, 4, 26]
for n in list_:
    df.drop(index=df[df['POSTED_SPEED_LIMIT'] == n].index, inplace=True)

In [12]:
df.POSTED_SPEED_LIMIT

0         30
1         30
2         30
3         30
4         30
          ..
398685    30
398686    30
398687    15
398688    30
398689    30
Name: POSTED_SPEED_LIMIT, Length: 398286, dtype: int64

## OneHotEncoding: FIRST_CRASH_TYPE

In [13]:
ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)

In [14]:
df.head()

Unnamed: 0,POSTED_SPEED_LIMIT,TRAFFIC_CONTROL_DEVICE,DEVICE_CONDITION,WEATHER_CONDITION,LIGHTING_CONDITION,FIRST_CRASH_TYPE,TRAFFICWAY_TYPE,ALIGNMENT,ROADWAY_SURFACE_COND,ROAD_DEFECT,...,NOT_RIGHT_OF_WAY_I,HIT_AND_RUN_I,PRIM_CONTRIBUTORY_CAUSE,SEC_CONTRIBUTORY_CAUSE,DOORING_I,WORK_ZONE_I,WORK_ZONE_TYPE,CRASH_HOUR,CRASH_DAY_OF_WEEK,CRASH_MONTH
0,30,NO CONTROLS,NO CONTROLS,CLEAR,DAYLIGHT,ANGLE,NOT DIVIDED,STRAIGHT AND LEVEL,DRY,NO DEFECTS,...,UNKOWN,UNKOWN,FAILING TO YIELD RIGHT-OF-WAY,NOT APPLICABLE,UNKOWN,UNKOWN,UNKOWN,16,4,3
1,30,NO CONTROLS,NO CONTROLS,CLEAR,DAYLIGHT,SIDESWIPE SAME DIRECTION,NOT DIVIDED,STRAIGHT AND LEVEL,DRY,NO DEFECTS,...,UNKOWN,UNKOWN,UNABLE TO DETERMINE,UNABLE TO DETERMINE,UNKOWN,UNKOWN,UNKOWN,15,3,3
2,30,TRAFFIC SIGNAL,UNKNOWN,CLEAR,DAYLIGHT,REAR END,NOT DIVIDED,STRAIGHT AND LEVEL,DRY,NO DEFECTS,...,UNKOWN,Y,UNABLE TO DETERMINE,UNABLE TO DETERMINE,UNKOWN,UNKOWN,UNKOWN,14,3,10
3,30,NO CONTROLS,NO CONTROLS,CLEAR,DAYLIGHT,PEDESTRIAN,FOUR WAY,STRAIGHT AND LEVEL,DRY,NO DEFECTS,...,UNKOWN,UNKOWN,UNABLE TO DETERMINE,UNABLE TO DETERMINE,UNKOWN,UNKOWN,UNKOWN,16,3,1
4,30,TRAFFIC SIGNAL,FUNCTIONING PROPERLY,CLEAR,DAYLIGHT,REAR END,NOT DIVIDED,STRAIGHT AND LEVEL,DRY,NO DEFECTS,...,UNKOWN,UNKOWN,FOLLOWING TOO CLOSELY,IMPROPER TURNING/NO SIGNAL,UNKOWN,UNKOWN,UNKOWN,12,6,10


This example could help for future readers:

import pandas as pd
from sklearn.preprocessing import OneHotEncoder

train_X = pd.DataFrame({'Sex':['male', 'female']*3, 'AgeGroup':[0,15,30,45,60,75]})
>>>
     Sex     AgeGroup
0    male         0
1  female        15
2    male        30
3  female        45
4    male        60
5  female        75

encoder=OneHotEncoder(sparse=False)

train_X_encoded = pd.DataFrame (encoder.fit_transform(train_X[['Sex']]))

train_X_encoded.columns = encoder.get_feature_names(['Sex'])

train_X.drop(['Sex'] ,axis=1, inplace=True)

OH_X_train= pd.concat([train_X, train_X_encoded ], axis=1)

In [15]:
# Creating a new dataframe for FIRST_CRASH_TYPE
# Then we will OneHotEncode the data to categories
crash_type = df['FIRST_CRASH_TYPE']
crash_df = pd.DataFrame(crash_type, columns=['FIRST_CRASH_TYPE'])
crash_df = pd.DataFrame(ohe.fit_transform(crash_df[['FIRST_CRASH_TYPE']]))
crash_df.columns = ohe.get_feature_names(['FIRST_CRASH_TYPE'])

In [16]:
crash_df.head()

Unnamed: 0,FIRST_CRASH_TYPE_ANGLE,FIRST_CRASH_TYPE_ANIMAL,FIRST_CRASH_TYPE_FIXED OBJECT,FIRST_CRASH_TYPE_HEAD ON,FIRST_CRASH_TYPE_OTHER NONCOLLISION,FIRST_CRASH_TYPE_OTHER OBJECT,FIRST_CRASH_TYPE_OVERTURNED,FIRST_CRASH_TYPE_PARKED MOTOR VEHICLE,FIRST_CRASH_TYPE_PEDALCYCLIST,FIRST_CRASH_TYPE_PEDESTRIAN,FIRST_CRASH_TYPE_REAR END,FIRST_CRASH_TYPE_REAR TO FRONT,FIRST_CRASH_TYPE_REAR TO REAR,FIRST_CRASH_TYPE_REAR TO SIDE,FIRST_CRASH_TYPE_SIDESWIPE OPPOSITE DIRECTION,FIRST_CRASH_TYPE_SIDESWIPE SAME DIRECTION,FIRST_CRASH_TYPE_TRAIN,FIRST_CRASH_TYPE_TURNING
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
# Do not need 'FIRST_CRASH_TYPE' column anymore
df.drop(columns=['FIRST_CRASH_TYPE'], inplace=True)

## LabelEncoding: FEATURES

In [18]:
# Here we select the columns to use as features
# Creating a for loop to label endcode all columns
cols_ = df.columns.tolist()

for col in cols_:
    encoder = LabelEncoder()
    df[col] = encoder.fit_transform(df[col])

In [19]:
# All columns have been encoded
df.head()

Unnamed: 0,POSTED_SPEED_LIMIT,TRAFFIC_CONTROL_DEVICE,DEVICE_CONDITION,WEATHER_CONDITION,LIGHTING_CONDITION,TRAFFICWAY_TYPE,ALIGNMENT,ROADWAY_SURFACE_COND,ROAD_DEFECT,INTERSECTION_RELATED_I,NOT_RIGHT_OF_WAY_I,HIT_AND_RUN_I,PRIM_CONTRIBUTORY_CAUSE,SEC_CONTRIBUTORY_CAUSE,DOORING_I,WORK_ZONE_I,WORK_ZONE_TYPE,CRASH_HOUR,CRASH_DAY_OF_WEEK,CRASH_MONTH
0,6,4,3,2,3,8,3,0,1,1,1,1,18,26,1,1,3,16,3,2
1,6,4,3,2,3,8,3,0,1,1,1,1,36,36,1,1,3,15,2,2
2,6,16,6,2,3,8,3,0,1,1,1,2,36,36,1,1,3,14,2,9
3,6,4,3,2,3,6,3,0,1,2,1,1,36,36,1,1,3,16,2,0
4,6,16,1,2,3,8,3,0,1,2,1,1,19,24,1,1,3,12,5,9


Now to concatinate the two data frames together

In [37]:
df = crash_df.merge(right=df, how='left', left_index=True, right_index=True)
df

Unnamed: 0,FIRST_CRASH_TYPE_ANGLE,FIRST_CRASH_TYPE_ANIMAL,FIRST_CRASH_TYPE_FIXED OBJECT,FIRST_CRASH_TYPE_HEAD ON,FIRST_CRASH_TYPE_OTHER NONCOLLISION,FIRST_CRASH_TYPE_OTHER OBJECT,FIRST_CRASH_TYPE_OVERTURNED,FIRST_CRASH_TYPE_PARKED MOTOR VEHICLE,FIRST_CRASH_TYPE_PEDALCYCLIST,FIRST_CRASH_TYPE_PEDESTRIAN,...,NOT_RIGHT_OF_WAY_I,HIT_AND_RUN_I,PRIM_CONTRIBUTORY_CAUSE,SEC_CONTRIBUTORY_CAUSE,DOORING_I,WORK_ZONE_I,WORK_ZONE_TYPE,CRASH_HOUR,CRASH_DAY_OF_WEEK,CRASH_MONTH
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,18.0,26.0,1.0,1.0,3.0,16.0,3.0,2.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,36.0,36.0,1.0,1.0,3.0,15.0,2.0,2.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,2.0,36.0,36.0,1.0,1.0,3.0,14.0,2.0,9.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,36.0,36.0,1.0,1.0,3.0,16.0,2.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,19.0,24.0,1.0,1.0,3.0,12.0,5.0,9.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
398281,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,21.0,21.0,1.0,1.0,3.0,8.0,5.0,1.0
398282,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,18.0,26.0,1.0,1.0,3.0,19.0,3.0,2.0
398283,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,1.0,36.0,26.0,1.0,1.0,3.0,16.0,6.0,5.0
398284,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,1.0,36.0,36.0,1.0,1.0,3.0,9.0,6.0,8.0


## TARGET: Rear End

In [42]:
crash_list = crash_df.columns.tolist()

In [48]:
df.isna().sum()
df = df.dropna()

In [79]:
# Defining our Features and Target
# For X, we want to remove all of the other CRASH types and keep the features.
y = df['FIRST_CRASH_TYPE_REAR END']
X = df.drop(columns= crash_list, axis=1)

In [88]:
X.head()

Unnamed: 0,POSTED_SPEED_LIMIT,TRAFFIC_CONTROL_DEVICE,DEVICE_CONDITION,WEATHER_CONDITION,LIGHTING_CONDITION,TRAFFICWAY_TYPE,ALIGNMENT,ROADWAY_SURFACE_COND,ROAD_DEFECT,INTERSECTION_RELATED_I,NOT_RIGHT_OF_WAY_I,HIT_AND_RUN_I,PRIM_CONTRIBUTORY_CAUSE,SEC_CONTRIBUTORY_CAUSE,DOORING_I,WORK_ZONE_I,WORK_ZONE_TYPE,CRASH_HOUR,CRASH_DAY_OF_WEEK,CRASH_MONTH
0,6.0,4.0,3.0,2.0,3.0,8.0,3.0,0.0,1.0,1.0,1.0,1.0,18.0,26.0,1.0,1.0,3.0,16.0,3.0,2.0
1,6.0,4.0,3.0,2.0,3.0,8.0,3.0,0.0,1.0,1.0,1.0,1.0,36.0,36.0,1.0,1.0,3.0,15.0,2.0,2.0
2,6.0,16.0,6.0,2.0,3.0,8.0,3.0,0.0,1.0,1.0,1.0,2.0,36.0,36.0,1.0,1.0,3.0,14.0,2.0,9.0
3,6.0,4.0,3.0,2.0,3.0,6.0,3.0,0.0,1.0,2.0,1.0,1.0,36.0,36.0,1.0,1.0,3.0,16.0,2.0,0.0
4,6.0,16.0,1.0,2.0,3.0,8.0,3.0,0.0,1.0,2.0,1.0,1.0,19.0,24.0,1.0,1.0,3.0,12.0,5.0,9.0


## Baseline - RandomForestClassifier: Rear End 

In [89]:
rf = RandomForestClassifier(class_weight='balanced', random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42, stratify=y)
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight='balanced',
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [90]:
y_test_pred = rf.predict(X_test)

In [91]:
print(f'Accuracy Score Test: {accuracy_score(y_test, y_test_pred)}')
print(f'ROC_AUC Test: {roc_auc_score(y_test, y_test_pred)}')
# ROC_AUC: It tells how much a model is capable of distinguishing between classes.

Accuracy Score Test: 0.7288035487640901
ROC_AUC Test: 0.5003334332365476


### Feature Importance: RandomForestClassifier

In [92]:
importances = rf.feature_importances_
indices = np.argsort(importances)[::-1]

# Print the feature ranking
print("Feature ranking:")

for f in range(X.shape[1]):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))

Feature ranking:
1. feature 17 (0.173927)
2. feature 19 (0.129984)
3. feature 12 (0.118599)
4. feature 18 (0.097468)
5. feature 5 (0.087582)
6. feature 13 (0.079994)
7. feature 0 (0.059994)
8. feature 4 (0.039330)
9. feature 1 (0.033137)
10. feature 11 (0.032292)
11. feature 3 (0.027313)
12. feature 9 (0.025760)
13. feature 7 (0.025237)
14. feature 2 (0.024634)
15. feature 8 (0.023550)
16. feature 6 (0.008611)
17. feature 10 (0.008134)
18. feature 14 (0.001593)
19. feature 15 (0.001583)
20. feature 16 (0.001277)


In [None]:
X
#PRIM_CONTRIBUTORY_CAUSE, TRAFFICWAY_TYPE, NOT_RIGHT_OF_WAY_I

## RandomForestClassifier: Hyperparameters (GridSearchCV)

In [76]:
# Here we set a some parameters that the GridSearch will run through.
grid_p = {"n_estimators": [50, 100],
          "criterion": ["gini", "entropy"],
          "max_depth": [4, 6, 10],
          "min_samples_split": [5, 10],
          "min_samples_leaf": [5, 10],
         "max_samples": [400]}

grid_search = GridSearchCV(rf, grid_p, n_jobs=-1, cv=3, scoring='roc_auc')
grid_search.fit(X_train, y_train)

GridSearchCV(cv=3, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight='balanced',
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False, random_state=1,
                             

In [77]:
grid_search.best_score_

0.5012858565476507

In [78]:
grid_search.best_params_

{'criterion': 'gini',
 'max_depth': 6,
 'max_samples': 400,
 'min_samples_leaf': 10,
 'min_samples_split': 5,
 'n_estimators': 50}

## Data Visual: Rear End Crashes

In [None]:
# Feature 4
df['TRAFFICWAY_TYPE'].value_counts().head(10)

In [None]:
# Feature 11 top 10 
df['PRIM_CONTRIBUTORY_CAUSE'].value_counts().head(10)

In [None]:
# Selecting the columns with only "REAR END" crashes
rear_end = df[df['FIRST_CRASH_TYPE'] == 'REAR END']
rear_end.TRAFFICWAY_TYPE.value_counts().index
rear_end.PRIM_CONTRIBUTORY_CAUSE.value_counts().index[:10]

In [None]:
# Bar graph of the top 10 PRIMARY_CONTRIBUTORY

plt.figure(figsize=(9,9))

y= rear_end.PRIM_CONTRIBUTORY_CAUSE.value_counts().values[:10]
x=rear_end.PRIM_CONTRIBUTORY_CAUSE.value_counts().index[:10]

sn.barplot(y, x)
plt.title('Top 10 Rear End Crashes Primary Contributory', size=30)
plt.ylabel("(Contributors)", size=25, rotation=0)
plt.xlabel("(Quantity of Crashes)", size=25)
plt.xticks(size=15)
plt.yticks(size=15);

In [None]:
# Bar graph of top 10 TrafficWay Types

plt.figure(figsize=(8,8))

y= rear_end.TRAFFICWAY_TYPE.value_counts().values[:10]
x=rear_end.TRAFFICWAY_TYPE.value_counts().index[:10]

sn.barplot(y, x)
plt.title('Top 10 Rear End Crashes Traffic Way Type', size=25)
plt.ylabel("(Contributory)", size=20, rotation=0)
plt.xlabel("(Quantity of Crashes)", size=20)
plt.xticks(size=10)
plt.yticks(size=10);