### Predicting the Severity of Automobile Accidents in Seattle, Washington ###

In this first week, you will discover your
project objectives, find your dataset that you will use for this capstone project, and publish your
dataset on GitHub.

In the second week, you will build your machine
learning solution.

In the third week,
you will finalize your model and be ready
to submit your work.

To complete the capstone,
you will work on a case study: predicting the severity
of an automobile accident.
Now, wouldn't it be great if there were something in place that could warn you, 
given the weather and the road conditions,
about the possibility of you getting into a car accident and how severe it would be,
so that you would drive more carefully or even change your travel plans?
Let's use the shared collision data for Seattle, Washington, as an example of how to work with accident data.

In [1]:
# Import common packages for Data Science applications.
import io
import itertools
import matplotlib as mpl
import matplotlib.image as mpimg
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np
import os
import pandas as pd
import pylab as pl
import scipy
import scipy.optimize as opt
import seaborn as sns
import sklearn
import sklearn.linear_model
import sys
from matplotlib.ticker import NullFormatter
from scipy import optimize
from scipy.optimize import curve_fit
from sklearn import linear_model
from sklearn import metrics
from sklearn import pipeline
from sklearn import preprocessing
from sklearn import svm
from sklearn import tree
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import jaccard_score 
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
%matplotlib inline

In [2]:
# Create a list of display options.
list_of_display_options_fully_qualified_names = str(\
"pd.options.display.chop_threshold, pd.options.display.float_format, pd.options.display.max_info_columns, pd.options.display.notebook_repr_html, \
pd.options.display.colheader_justify, pd.options.display.html, pd.options.display.max_info_rows, pd.options.display.pprint_nest_depth, \
pd.options.display.column_space, pd.options.display.large_repr, pd.options.display.max_rows, pd.options.display.precision, \
pd.options.display.date_dayfirst, pd.options.display.latex, pd.options.display.max_seq_items, pd.options.display.show_dimensions, \
pd.options.display.date_yearfirst, pd.options.display.max_categories, pd.options.display.memory_usage, pd.options.display.unicode, \
pd.options.display.encoding, pd.options.display.max_columns, pd.options.display.min_rows, pd.options.display.width, \
pd.options.display.expand_frame_repr, pd.options.display.max_colwidth, pd.options.display.multi_sparse").split(sep=', ')

# Initialize an empty list to store all the short names for display options.
list_of_display_options_short_names = list()
# For each fully qualified option name,
# get the option's short name and add it to the list of short names.
for fully_qualified_option_name in list_of_display_options_fully_qualified_names:
    # Get short option name.
    short_option_name = fully_qualified_option_name.split(sep='.')[-1]
    
    # Add short option name to list of display option short names.
    list_of_display_options_short_names.append(short_option_name)

# Define dictionary of display option settings.
dict_of_display_option_settings_short_names=\
{"max_info_columns": 500,\
"colheader_justify": "right",\
"max_info_rows": 1000,\
"column_space": 500,\
"max_rows": 1000,\
"precision": 9,\
"max_seq_items": 1000000000,\
"show_dimensions": True,\
"max_categories": 100,\
"memory_usage": True,\
"max_columns": 500,\
"max_colwidth": 500,\
"float_format": lambda x: '%.9f' % x}

# Set pandas display options using dictionary of short names,
# and display the options/value pairs.
print("Setting display options...")
for key in list(dict_of_display_option_settings_short_names.keys()):
    # Set display option.
    pd.set_option(key, dict_of_display_option_settings_short_names[key])
    # Print display option name and value.
    print(key, ": ", pd.get_option(key), sep='')

Setting display options...
max_info_columns: 500
colheader_justify: right
max_info_rows: 1000
column_space: 500
max_rows: 1000
precision: 9
max_seq_items: 1000000000
show_dimensions: True
max_categories: 100
memory_usage: True
max_columns: 500
max_colwidth: 500
float_format: <function <lambda> at 0x7f4f9f88d040>


In [3]:
# Data dictionary for the attributes:
# https://www.seattle.gov/Documents/Departments/SDOT/GIS/Collisions_OD.pdf
# The download URL
# https://opendata.arcgis.com/datasets/5b5c745e0f1f48e7a53acec63a0022ab_0.csv
# returned an HTTPError at 202009151050, so a local copy of the CSV is read instead.
local_path_to_csv = "../Collisions.csv"

# Load the collisions data into a DataFrame.
df = pd.read_csv(filepath_or_buffer=local_path_to_csv, low_memory=False)

In [4]:
# Preview the first five rows of the collisions DataFrame.
df.head(n=5)

Unnamed: 0,X,Y,OBJECTID,INCKEY,COLDETKEY,REPORTNO,STATUS,ADDRTYPE,INTKEY,LOCATION,EXCEPTRSNCODE,EXCEPTRSNDESC,SEVERITYCODE,SEVERITYDESC,COLLISIONTYPE,PERSONCOUNT,PEDCOUNT,PEDCYLCOUNT,VEHCOUNT,INJURIES,SERIOUSINJURIES,FATALITIES,INCDATE,INCDTTM,JUNCTIONTYPE,SDOT_COLCODE,SDOT_COLDESC,INATTENTIONIND,UNDERINFL,WEATHER,ROADCOND,LIGHTCOND,PEDROWNOTGRNT,SDOTCOLNUM,SPEEDING,ST_COLCODE,ST_COLDESC,SEGLANEKEY,CROSSWALKKEY,HITPARKEDCAR
0,-122.320757054,47.609407946,1,328476,329976,EA08706,Matched,Block,,BROADWAY BETWEEN E COLUMBIA ST AND BOYLSTON AVE,,,1,Property Damage Only Collision,Sideswipe,2,0,0,2,0,0,0,2020/01/22 00:00:00+00,1/22/2020 3:21:00 PM,Mid-Block (not related to intersection),11.0,"MOTOR VEHICLE STRUCK MOTOR VEHICLE, FRONT END AT ANGLE",,N,Raining,Wet,Dark - Street Lights On,,,,11.0,From same direction - both going straight - both moving - sideswipe,0,0,N
1,-122.319560827,47.662220664,2,328142,329642,EA06882,Matched,Block,,8TH AVE NE BETWEEN NE 45TH E ST AND NE 47TH ST,,,1,Property Damage Only Collision,Parked Car,2,0,0,2,0,0,0,2020/01/07 00:00:00+00,1/7/2020 8:00:00 AM,Mid-Block (not related to intersection),15.0,"MOTOR VEHICLE STRUCK MOTOR VEHICLE, RIGHT SIDE SIDESWIPE",,N,Clear,Dry,Daylight,,,,32.0,One parked--one moving,0,0,Y
2,-122.327524508,47.604393273,3,20700,20700,1181833,Unmatched,Block,,JAMES ST BETWEEN 6TH AVE AND 7TH AVE,,,0,Unknown,,0,0,0,0,0,0,0,2004/01/30 00:00:00+00,1/30/2004,Mid-Block (but intersection related),11.0,"MOTOR VEHICLE STRUCK MOTOR VEHICLE, FRONT END AT ANGLE",,,,,,,4030032.0,,,,0,0,N
3,-122.327524934,47.708621579,4,332126,333626,M16001640,Unmatched,Block,,NE NORTHGATE WAY BETWEEN 1ST AVE NE AND NE NORTHGATE DR,,,0,Unknown,,0,0,0,0,0,0,0,2016/01/23 00:00:00+00,1/23/2016,Mid-Block (not related to intersection),11.0,"MOTOR VEHICLE STRUCK MOTOR VEHICLE, FRONT END AT ANGLE",,,,,,,,,,,0,0,N
4,-122.292120049,47.55900908,5,328238,329738,3857118,Unmatched,Block,,M L KING JR ER WAY S BETWEEN S ANGELINE ST AND S EDMUNDS ST,,,0,Unknown,,0,0,0,0,0,0,0,2020/01/26 00:00:00+00,1/26/2020,Mid-Block (not related to intersection),28.0,MOTOR VEHICLE RAN OFF ROAD - HIT FIXED OBJECT,,,,,,,,,,,0,0,N


In [5]:
# Summarize every column with its non-null count and dtype.
# `show_counts` replaces the deprecated `null_counts` keyword; counts are
# requested explicitly because the frame has more rows than max_info_rows.
df.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 221389 entries, 0 to 221388
Data columns (total 40 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   X                213918 non-null  float64
 1   Y                213918 non-null  float64
 2   OBJECTID         221389 non-null  int64  
 3   INCKEY           221389 non-null  int64  
 4   COLDETKEY        221389 non-null  int64  
 5   REPORTNO         221389 non-null  object 
 6   STATUS           221389 non-null  object 
 7   ADDRTYPE         217677 non-null  object 
 8   INTKEY           71884 non-null   float64
 9   LOCATION         216801 non-null  object 
 10  EXCEPTRSNCODE    100986 non-null  object 
 11  EXCEPTRSNDESC    11779 non-null   object 
 12  SEVERITYCODE     221388 non-null  object 
 13  SEVERITYDESC     221389 non-null  object 
 14  COLLISIONTYPE    195159 non-null  object 
 15  PERSONCOUNT      221389 non-null  int64  
 16  PEDCOUNT         221389 non-null  int6

In [6]:
# Walk the collisions DataFrame one column at a time and report the column's
# dtype followed by the relative frequency of each value (NaN included).
for column_label, column_values in df.items():
    print(column_label, ":", column_values.dtype)
    relative_frequencies = column_values.value_counts(normalize=True, dropna=False)
    print(relative_frequencies)
    print()

X : float64
nan              0.033746031
-122.332653349   0.001337013
-122.344896079   0.001273776
-122.328078578   0.001246674
-122.344996835   0.001219573
                     ...    
-122.372757223   0.000004517
-122.305825420   0.000004517
-122.385337171   0.000004517
-122.397974101   0.000004517
-122.358295798   0.000004517
Name: X, Length: 24973, dtype: float64

Y : float64
nan            0.033746031
47.708654503   0.001337013
47.717173101   0.001273776
47.604161235   0.001246674
47.725035552   0.001219573
                   ...    
47.669143854   0.000004517
47.592493078   0.000004517
47.560592450   0.000004517
47.658522767   0.000004517
47.541978750   0.000004517
Name: Y, Length: 24973, dtype: float64

OBJECTID : int64
2047     0.000004517
39494    0.000004517
8785     0.000004517
10832    0.000004517
53839    0.000004517
             ...    
21920    0.000004517
109983   0.000004517
107934   0.000004517
114077   0.000004517
2049     0.000004517
Name: OBJECTID, Length: 221389, 

<h2 id="data_wrangling">Data Wrangling</h2>

Steps for working with missing data:
<ol>
    <li>Identify missing data.</li>
    <li>Deal with missing data.</li>
    <li>Correct data format.</li>
</ol>

<h3 id="identifying_missing_data">Identifying Missing Data</h3>

The metadata document that accompanied the data set indicates that certain columns have "sentinel" values
that indicate an unknown or missing value. Each of these missing values will first be converted into NaN.
Subsequently, the NaN values will be dropped from the DataFrame.

In [7]:
# Replace the sentinel values that stand for "unknown" with NaN.
# The sentinels are listed in the metadata form that accompanied the dataset.
# Each key is a column label; each value is the sentinel to look for in that
# column only.
sentinel_for_unknown_by_column = {
    "EXCEPTRSNCODE": " ",
    "EXCEPTRSNDESC": "Not Enough Information, or Insufficient Location Information",
    "SEVERITYCODE": "0",
    "SEVERITYDESC": "Unknown",
    "JUNCTIONTYPE": "Unknown",
    "WEATHER": "Unknown",
    "ROADCOND": "Unknown",
    "LIGHTCOND": "Unknown",
    "SDOT_COLCODE": float(0),
    "SDOT_COLDESC": "NOT ENOUGH INFORMATION / NOT APPLICABLE",
    "ST_COLCODE": " ",
    "ST_COLDESC": "Not stated",
}

# The deprecated `method` and `limit` keywords are no longer passed: they have
# no effect when an explicit replacement value is given. The second replace is
# chained (instead of mutating in place) so that re-running this cell always
# rebuilds the result from the untouched `df`.
# ST_COLCODE has a second sentinel ("0"), hence the separate second pass.
df_unknowns_converted_to_nan = (
    df
    .replace(to_replace=sentinel_for_unknown_by_column, value=np.nan)
    .replace(to_replace={"ST_COLCODE": "0"}, value=np.nan)
)

<h3 id="deal_with_missing_data">Deal with Missing Data</h3>

<ol>
    <li>Drop the Data
        <ol>
            <li>Drop entire row.</li>
            <li>Drop entire column.</li>
        </ol>
    </li>
    <li>Replace the Data
        <ol>
            <li>Replace data by mean.</li>
            <li>Replace data by frequency.</li>
            <li>Replace data based on other functions.</li>
        </ol>
    </li>
        
</ol>

Whole columns should be dropped only if most entries in the column are empty.

In [8]:
# Collect the label of every column that holds at least one NaN.
list_of_columns_with_missing_data = [
    label
    for label in df_unknowns_converted_to_nan.columns
    if df_unknowns_converted_to_nan[label].hasnans
]

# Report all column labels first, then only the labels of columns with gaps.
all_column_labels = list(df_unknowns_converted_to_nan.columns)
print("Number of columns: %d" % len(all_column_labels))
print("List of labels for columns:")
print(all_column_labels)
print()
print("Number of columns that are missing data: %d" % len(list_of_columns_with_missing_data))
print("List of labels for columns that are missing data:")
print(list_of_columns_with_missing_data)

Number of columns: 40
List of labels for columns:
['X', 'Y', 'OBJECTID', 'INCKEY', 'COLDETKEY', 'REPORTNO', 'STATUS', 'ADDRTYPE', 'INTKEY', 'LOCATION', 'EXCEPTRSNCODE', 'EXCEPTRSNDESC', 'SEVERITYCODE', 'SEVERITYDESC', 'COLLISIONTYPE', 'PERSONCOUNT', 'PEDCOUNT', 'PEDCYLCOUNT', 'VEHCOUNT', 'INJURIES', 'SERIOUSINJURIES', 'FATALITIES', 'INCDATE', 'INCDTTM', 'JUNCTIONTYPE', 'SDOT_COLCODE', 'SDOT_COLDESC', 'INATTENTIONIND', 'UNDERINFL', 'WEATHER', 'ROADCOND', 'LIGHTCOND', 'PEDROWNOTGRNT', 'SDOTCOLNUM', 'SPEEDING', 'ST_COLCODE', 'ST_COLDESC', 'SEGLANEKEY', 'CROSSWALKKEY', 'HITPARKEDCAR']

Number of columns that are missing data: 23
List of labels for columns that are missing data:
['X', 'Y', 'ADDRTYPE', 'INTKEY', 'LOCATION', 'EXCEPTRSNCODE', 'EXCEPTRSNDESC', 'SEVERITYCODE', 'SEVERITYDESC', 'COLLISIONTYPE', 'JUNCTIONTYPE', 'SDOT_COLCODE', 'SDOT_COLDESC', 'INATTENTIONIND', 'UNDERINFL', 'WEATHER', 'ROADCOND', 'LIGHTCOND', 'PEDROWNOTGRNT', 'SDOTCOLNUM', 'SPEEDING', 'ST_COLCODE', 'ST_COLDESC']


In [9]:
# After the sentinels have been converted to NaN, report each column's dtype
# and the relative frequency of each of its values (NaN included).
for column_label, column_values in df_unknowns_converted_to_nan.items():
    print(column_label, column_values.dtype, "Relative Frequencies:")
    relative_frequencies = column_values.value_counts(normalize=True, dropna=False)
    print(relative_frequencies)
    print()

X float64 Relative Frequencies:
nan              0.033746031
-122.332653349   0.001337013
-122.344896079   0.001273776
-122.328078578   0.001246674
-122.344996835   0.001219573
                     ...    
-122.372757223   0.000004517
-122.305825420   0.000004517
-122.385337171   0.000004517
-122.397974101   0.000004517
-122.358295798   0.000004517
Name: X, Length: 24973, dtype: float64

Y float64 Relative Frequencies:
nan            0.033746031
47.708654503   0.001337013
47.717173101   0.001273776
47.604161235   0.001246674
47.725035552   0.001219573
                   ...    
47.669143854   0.000004517
47.592493078   0.000004517
47.560592450   0.000004517
47.658522767   0.000004517
47.541978750   0.000004517
Name: Y, Length: 24973, dtype: float64

OBJECTID int64 Relative Frequencies:
2047     0.000004517
39494    0.000004517
8785     0.000004517
10832    0.000004517
53839    0.000004517
             ...    
21920    0.000004517
109983   0.000004517
107934   0.000004517
114077   0.000

In [10]:
# Show the column labels of the raw collisions DataFrame as a plain list.
print(df.columns.tolist())

['X', 'Y', 'OBJECTID', 'INCKEY', 'COLDETKEY', 'REPORTNO', 'STATUS', 'ADDRTYPE', 'INTKEY', 'LOCATION', 'EXCEPTRSNCODE', 'EXCEPTRSNDESC', 'SEVERITYCODE', 'SEVERITYDESC', 'COLLISIONTYPE', 'PERSONCOUNT', 'PEDCOUNT', 'PEDCYLCOUNT', 'VEHCOUNT', 'INJURIES', 'SERIOUSINJURIES', 'FATALITIES', 'INCDATE', 'INCDTTM', 'JUNCTIONTYPE', 'SDOT_COLCODE', 'SDOT_COLDESC', 'INATTENTIONIND', 'UNDERINFL', 'WEATHER', 'ROADCOND', 'LIGHTCOND', 'PEDROWNOTGRNT', 'SDOTCOLNUM', 'SPEEDING', 'ST_COLCODE', 'ST_COLDESC', 'SEGLANEKEY', 'CROSSWALKKEY', 'HITPARKEDCAR']


In [11]:
# A column is dropped from the collisions DataFrame when at least one of the
# following holds:
# 1) more than 15% of the column's data is NaN;
# 2) the column only contains unique identification keys;
# 3) the column's data is naturally categorical but does not fit into a small (< 50) number of categories;
# 4) information in the column is redundant because another column already represents it;
# 5) it is not clear how to interpret the column's data.
list_of_columns_to_drop = [
    "STATUS", "OBJECTID", "INCKEY", "COLDETKEY", "REPORTNO",
    "INTKEY", "LOCATION", "EXCEPTRSNCODE", "EXCEPTRSNDESC", "SEVERITYDESC",
    "INCDATE", "SDOT_COLDESC", "INATTENTIONIND", "UNDERINFL", "PEDROWNOTGRNT",
    "SDOTCOLNUM", "SPEEDING", "ST_COLDESC", "SEGLANEKEY", "CROSSWALKKEY",
]

In [12]:
# Remove the selected columns from the NaN-converted DataFrame.
# `drop` returns a new DataFrame, so the source frame is left unchanged.
df_drop_columns = df_unknowns_converted_to_nan.drop(columns=list_of_columns_to_drop)

In [13]:
# Report whether any NaN remains once the columns have been dropped.
has_nan = df_drop_columns.isna().to_numpy().any()
print("DataFrame has NaN." if has_nan else "DataFrame has no NaN.")

DataFrame has NaN.


In [14]:
# Drop any row that contains at least one NaN.
# Only `how` is passed: pandas 1.5 and later reject a call that sets both
# `how` and `thresh`, even when `thresh` is None. `dropna` returns a new
# DataFrame, so the source frame is left unchanged.
df_drop_columns_and_rows = df_drop_columns.dropna(axis="index", how="any")

In [15]:
# Report whether any NaN remains once both columns and rows have been dropped.
has_nan = df_drop_columns_and_rows.isna().to_numpy().any()
print("DataFrame has NaN." if has_nan else "DataFrame has no NaN.")

DataFrame has no NaN.


In [16]:
# With columns and rows dropped, report the relative frequency of each value
# in every remaining column.
for column_label, column_values in df_drop_columns_and_rows.items():
    print(column_label, "Relative Frequencies:")
    relative_frequencies = column_values.value_counts(normalize=True, dropna=False)
    print(relative_frequencies)
    print()

X Relative Frequencies:
-122.332653349   0.001495688
-122.328078578   0.001471564
-122.344896079   0.001465533
-122.344996835   0.001356975
-122.299159660   0.001302696
                     ...    
-122.286246828   0.000006031
-122.316763085   0.000006031
-122.345973623   0.000006031
-122.327227856   0.000006031
-122.284585561   0.000006031
Name: X, Length: 22114, dtype: float64

Y Relative Frequencies:
47.708654503   0.001495688
47.604161235   0.001471564
47.717173101   0.001465533
47.725035552   0.001356975
47.579673463   0.001302696
                   ...    
47.683999566   0.000006031
47.688207483   0.000006031
47.635798771   0.000006031
47.585916170   0.000006031
47.690588615   0.000006031
Name: Y, Length: 22114, dtype: float64

ADDRTYPE Relative Frequencies:
Block          0.630034377
Intersection   0.369965623
Name: ADDRTYPE, Length: 2, dtype: float64

SEVERITYCODE Relative Frequencies:
1    0.666298776
2    0.318026657
2b   0.014239189
3    0.001435378
Name: SEVERITYCODE, Lengt

In [17]:
# Summarize the cleaned DataFrame: non-null count and dtype per column.
# `show_counts` replaces the deprecated `null_counts` keyword.
df_drop_columns_and_rows.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 165810 entries, 0 to 221388
Data columns (total 20 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   X                165810 non-null  float64
 1   Y                165810 non-null  float64
 2   ADDRTYPE         165810 non-null  object 
 3   SEVERITYCODE     165810 non-null  object 
 4   COLLISIONTYPE    165810 non-null  object 
 5   PERSONCOUNT      165810 non-null  int64  
 6   PEDCOUNT         165810 non-null  int64  
 7   PEDCYLCOUNT      165810 non-null  int64  
 8   VEHCOUNT         165810 non-null  int64  
 9   INJURIES         165810 non-null  int64  
 10  SERIOUSINJURIES  165810 non-null  int64  
 11  FATALITIES       165810 non-null  int64  
 12  INCDTTM          165810 non-null  object 
 13  JUNCTIONTYPE     165810 non-null  object 
 14  SDOT_COLCODE     165810 non-null  float64
 15  WEATHER          165810 non-null  object 
 16  ROADCOND         165810 non-null  obje

<h3 id="correct_data_format">Correct Data Format</h3>

Ensure that each data type is appropriate for the corresponding feature.
Convert integer data to "ordered" categorical types, e.g. SEVERITYCODE,
especially if the "integer ordering" of the original data is inappropriate.

If data represents date, time, or date/time information, then convert the data to the appropriate datetime representation.

In [32]:
# Build a new DataFrame whose columns carry the corrected data types.
df_converted = pd.DataFrame()

for column in df_drop_columns_and_rows.columns:
    source_column = df_drop_columns_and_rows[column]
    if column == "SDOT_COLCODE":
        # Cast "SDOT_COLCODE" to integer first, then to category, so the
        # categories are integer codes rather than floats.
        df_converted[column] = source_column.astype('int64').astype('category')
    elif column == "INCDTTM":
        # Cast "INCDTTM" to datetime. The deprecated `infer_datetime_format`
        # keyword is no longer passed.
        # NOTE(review): the raw data shows both date-time and date-only strings
        # in this column; on pandas >= 2.0 mixed formats may need
        # format="mixed" -- confirm against the cleaned data.
        df_converted[column] = pd.to_datetime(source_column)
    elif source_column.dtype == np.dtype('object'):
        # Cast every remaining object column to category.
        df_converted[column] = source_column.astype('category')
    else:
        # Copy all other columns without changing their types.
        df_converted[column] = source_column

In [33]:
# Confirm the dtype of "SDOT_COLCODE" after conversion.
# `show_counts` replaces the deprecated `null_counts` keyword.
df_converted[["SDOT_COLCODE"]].info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 165810 entries, 0 to 221388
Data columns (total 1 columns):
 #   Column        Non-Null Count   Dtype   
---  ------        --------------   -----   
 0   SDOT_COLCODE  165810 non-null  category
dtypes: category(1)
memory usage: 1.4 MB


In [34]:
# Report whether the type conversion introduced any NaN.
has_nan = df_converted.isna().to_numpy().any()
print("DataFrame has NaN." if has_nan else "DataFrame has no NaN.")

DataFrame has no NaN.


In [35]:
# Display info about the new DataFrame after casting objects to category or datetime.
# `show_counts` replaces the deprecated `null_counts` keyword.
df_converted.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 165810 entries, 0 to 221388
Data columns (total 20 columns):
 #   Column           Non-Null Count   Dtype         
---  ------           --------------   -----         
 0   X                165810 non-null  float64       
 1   Y                165810 non-null  float64       
 2   ADDRTYPE         165810 non-null  category      
 3   SEVERITYCODE     165810 non-null  category      
 4   COLLISIONTYPE    165810 non-null  category      
 5   PERSONCOUNT      165810 non-null  int64         
 6   PEDCOUNT         165810 non-null  int64         
 7   PEDCYLCOUNT      165810 non-null  int64         
 8   VEHCOUNT         165810 non-null  int64         
 9   INJURIES         165810 non-null  int64         
 10  SERIOUSINJURIES  165810 non-null  int64         
 11  FATALITIES       165810 non-null  int64         
 12  INCDTTM          165810 non-null  datetime64[ns]
 13  JUNCTIONTYPE     165810 non-null  category      
 14  SDOT_COLCODE     165

In [36]:
# Keep only the columns whose dtype is category.
df_categorical = df_converted.select_dtypes(include=["category"])

In [37]:
# Summarize the categorical columns: non-null count and dtype per column.
# `show_counts` replaces the deprecated `null_counts` keyword.
df_categorical.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 165810 entries, 0 to 221388
Data columns (total 10 columns):
 #   Column         Non-Null Count   Dtype   
---  ------         --------------   -----   
 0   ADDRTYPE       165810 non-null  category
 1   SEVERITYCODE   165810 non-null  category
 2   COLLISIONTYPE  165810 non-null  category
 3   JUNCTIONTYPE   165810 non-null  category
 4   SDOT_COLCODE   165810 non-null  category
 5   WEATHER        165810 non-null  category
 6   ROADCOND       165810 non-null  category
 7   LIGHTCOND      165810 non-null  category
 8   ST_COLCODE     165810 non-null  category
 9   HITPARKEDCAR   165810 non-null  category
dtypes: category(10)
memory usage: 2.9 MB


In [40]:
# Report the dtype and the relative frequency of each value for every
# column of the categorical DataFrame.
for column_label, column_values in df_categorical.items():
    print(column_label, ":", column_values.dtype)
    relative_frequencies = column_values.value_counts(normalize=True, dropna=False)
    print(relative_frequencies)
    print()

ADDRTYPE : category
Block          0.630034377
Intersection   0.369965623
Name: ADDRTYPE, Length: 2, dtype: float64

SEVERITYCODE : category
1    0.666298776
2    0.318026657
2b   0.014239189
3    0.001435378
Name: SEVERITYCODE, Length: 4, dtype: float64

COLLISIONTYPE : category
Angles       0.207834268
Parked Car   0.194819372
Rear Ended   0.193613172
Other        0.130733973
Sideswipe    0.104040770
Left Turn    0.082540257
Cycles       0.033677100
Pedestrian   0.023219347
Right Turn   0.017025511
Head On      0.012496231
Name: COLLISIONTYPE, Length: 10, dtype: float64

JUNCTIONTYPE : category
Mid-Block (not related to intersection)             0.442036065
At Intersection (intersection related)              0.359146010
Mid-Block (but intersection related)                0.127163621
Driveway Junction                                   0.060189373
At Intersection (but not related to intersection)   0.010662807
Ramp Junction                                       0.000802123
Name: JUNCTI

#### Features before One Hot Encoding

In [45]:
# Every categorical column except the target "SEVERITYCODE" is a feature.
features = df_categorical.drop(columns=["SEVERITYCODE"])
features.head()

Unnamed: 0,ADDRTYPE,COLLISIONTYPE,JUNCTIONTYPE,SDOT_COLCODE,WEATHER,ROADCOND,LIGHTCOND,ST_COLCODE,HITPARKEDCAR
0,Block,Sideswipe,Mid-Block (not related to intersection),11,Raining,Wet,Dark - Street Lights On,11,N
1,Block,Parked Car,Mid-Block (not related to intersection),15,Clear,Dry,Daylight,32,Y
5,Block,Rear Ended,Mid-Block (not related to intersection),14,Clear,Dry,Daylight,14,N
6,Block,Other,Mid-Block (but intersection related),28,Clear,Wet,Daylight,50,N
8,Intersection,Sideswipe,At Intersection (intersection related),14,Overcast,Dry,Daylight,81,N


#### Use the one-hot encoding technique to convert categorical variables to binary variables and append them to the features DataFrame

In [47]:
# One-hot encode every categorical feature and append the indicator columns
# to the features DataFrame.
# Only the category-typed columns are kept as the starting point, so running
# this cell again does not append a second set of indicator columns.
categorical_features = features.select_dtypes(include="category")

# Encoding the whole frame in one call prefixes each indicator column with its
# feature label and the '_' separator, in feature order. The explicit uint8
# dtype keeps the indicators as 0/1 integers regardless of the pandas default.
one_hot_encoded_features = pd.get_dummies(
    categorical_features,
    prefix_sep='_',
    dummy_na=False,
    sparse=False,
    drop_first=False,
    dtype=np.uint8,
)

# Original categorical columns first, then their one-hot encoded columns.
features = pd.concat([categorical_features, one_hot_encoded_features], axis=1)

features.head()

Unnamed: 0,ADDRTYPE,COLLISIONTYPE,JUNCTIONTYPE,SDOT_COLCODE,WEATHER,ROADCOND,LIGHTCOND,ST_COLCODE,HITPARKEDCAR,ADDRTYPE_Block,ADDRTYPE_Intersection,COLLISIONTYPE_Angles,COLLISIONTYPE_Cycles,COLLISIONTYPE_Head On,COLLISIONTYPE_Left Turn,COLLISIONTYPE_Other,COLLISIONTYPE_Parked Car,COLLISIONTYPE_Pedestrian,COLLISIONTYPE_Rear Ended,COLLISIONTYPE_Right Turn,COLLISIONTYPE_Sideswipe,JUNCTIONTYPE_At Intersection (but not related to intersection),JUNCTIONTYPE_At Intersection (intersection related),JUNCTIONTYPE_Driveway Junction,JUNCTIONTYPE_Mid-Block (but intersection related),JUNCTIONTYPE_Mid-Block (not related to intersection),JUNCTIONTYPE_Ramp Junction,SDOT_COLCODE_11,SDOT_COLCODE_12,SDOT_COLCODE_13,SDOT_COLCODE_14,SDOT_COLCODE_15,SDOT_COLCODE_16,SDOT_COLCODE_18,SDOT_COLCODE_21,SDOT_COLCODE_22,SDOT_COLCODE_23,SDOT_COLCODE_24,SDOT_COLCODE_25,SDOT_COLCODE_26,SDOT_COLCODE_27,SDOT_COLCODE_28,SDOT_COLCODE_29,SDOT_COLCODE_31,SDOT_COLCODE_32,SDOT_COLCODE_33,SDOT_COLCODE_34,SDOT_COLCODE_35,SDOT_COLCODE_36,SDOT_COLCODE_44,SDOT_COLCODE_46,SDOT_COLCODE_47,SDOT_COLCODE_48,SDOT_COLCODE_51,SDOT_COLCODE_52,SDOT_COLCODE_53,SDOT_COLCODE_54,SDOT_COLCODE_55,SDOT_COLCODE_56,SDOT_COLCODE_58,SDOT_COLCODE_61,SDOT_COLCODE_64,SDOT_COLCODE_66,SDOT_COLCODE_68,SDOT_COLCODE_69,WEATHER_Blowing Sand/Dirt,WEATHER_Clear,WEATHER_Fog/Smog/Smoke,WEATHER_Other,WEATHER_Overcast,WEATHER_Partly Cloudy,WEATHER_Raining,WEATHER_Severe Crosswind,WEATHER_Sleet/Hail/Freezing Rain,WEATHER_Snowing,ROADCOND_Dry,ROADCOND_Ice,ROADCOND_Oil,ROADCOND_Other,ROADCOND_Sand/Mud/Dirt,ROADCOND_Snow/Slush,ROADCOND_Standing Water,ROADCOND_Wet,LIGHTCOND_Dark - No Street Lights,LIGHTCOND_Dark - Street Lights Off,LIGHTCOND_Dark - Street Lights On,LIGHTCOND_Dark - Unknown 
Lighting,LIGHTCOND_Dawn,LIGHTCOND_Daylight,LIGHTCOND_Dusk,LIGHTCOND_Other,ST_COLCODE_1,ST_COLCODE_10,ST_COLCODE_11,ST_COLCODE_12,ST_COLCODE_13,ST_COLCODE_14,ST_COLCODE_15,ST_COLCODE_16,ST_COLCODE_17,ST_COLCODE_18,ST_COLCODE_19,ST_COLCODE_2,ST_COLCODE_20,ST_COLCODE_21,ST_COLCODE_22,ST_COLCODE_23,ST_COLCODE_24,ST_COLCODE_25,ST_COLCODE_26,ST_COLCODE_27,ST_COLCODE_28,ST_COLCODE_29,ST_COLCODE_3,ST_COLCODE_30,ST_COLCODE_31,ST_COLCODE_32,ST_COLCODE_4,ST_COLCODE_40,ST_COLCODE_41,ST_COLCODE_42,ST_COLCODE_43,ST_COLCODE_45,ST_COLCODE_48,ST_COLCODE_49,ST_COLCODE_5,ST_COLCODE_50,ST_COLCODE_51,ST_COLCODE_52,ST_COLCODE_53,ST_COLCODE_54,ST_COLCODE_56,ST_COLCODE_57,ST_COLCODE_6,ST_COLCODE_60,ST_COLCODE_64,ST_COLCODE_65,ST_COLCODE_66,ST_COLCODE_67,ST_COLCODE_7,ST_COLCODE_71,ST_COLCODE_72,ST_COLCODE_73,ST_COLCODE_74,ST_COLCODE_8,ST_COLCODE_81,ST_COLCODE_82,ST_COLCODE_83,ST_COLCODE_84,ST_COLCODE_85,ST_COLCODE_87,ST_COLCODE_88,HITPARKEDCAR_N,HITPARKEDCAR_Y
0,Block,Sideswipe,Mid-Block (not related to intersection),11,Raining,Wet,Dark - Street Lights On,11,N,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
1,Block,Parked Car,Mid-Block (not related to intersection),15,Clear,Dry,Daylight,32,Y,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
5,Block,Rear Ended,Mid-Block (not related to intersection),14,Clear,Dry,Daylight,14,N,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
6,Block,Other,Mid-Block (but intersection related),28,Clear,Wet,Daylight,50,N,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
8,Intersection,Sideswipe,At Intersection (intersection related),14,Overcast,Dry,Daylight,81,N,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0


In [50]:
# Summarize the encoded features: non-null count and dtype per column.
# `show_counts` replaces the deprecated `null_counts` keyword.
features.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 165810 entries, 0 to 221388
Data columns (total 154 columns):
 #   Column                                                          Non-Null Count   Dtype   
---  ------                                                          --------------   -----   
 0   ADDRTYPE                                                        165810 non-null  category
 1   COLLISIONTYPE                                                   165810 non-null  category
 2   JUNCTIONTYPE                                                    165810 non-null  category
 3   SDOT_COLCODE                                                    165810 non-null  category
 4   WEATHER                                                         165810 non-null  category
 5   ROADCOND                                                        165810 non-null  category
 6   LIGHTCOND                                                       165810 non-null  category
 7   ST_COLCODE                  