<h1 id="title">Code for Predicting Motor Vehicle Accident Severity in Seattle, Washington</h1>

This notebook contains the Python code for the Applied Data Science Capstone Project.
In this project, we use a publicly available data set to build several kinds of models to predict the severity of motor vehicle accidents in Seattle, Washington.

In [None]:
# Import packages and modules.
import io
import itertools
import matplotlib as mpl
import matplotlib.image as mpimg
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np
import os
import pandas as pd
import pylab as pl
import scipy
import scipy.optimize as opt
import seaborn as sns
import sklearn
import sys
import timeit
import warnings
from matplotlib.ticker import NullFormatter
from scipy import optimize
from scipy.optimize import curve_fit
from scipy.sparse import csr_matrix
from sklearn import linear_model
from sklearn import metrics
from sklearn import pipeline
from sklearn import preprocessing
from sklearn import svm
from sklearn import tree
from sklearn import utils
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import jaccard_score
from sklearn.metrics import log_loss
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import recall_score
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import GroupKFold
from sklearn.model_selection import KFold
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_text
from sklearn.tree import plot_tree
from sklearn.utils import resample
from sklearn.utils import shuffle
from sklearn.exceptions import ConvergenceWarning
from timeit import default_timer
%matplotlib inline

In [None]:
warnings.filterwarnings("ignore", category=ConvergenceWarning, module="sklearn")

In [None]:
# Define the starting time for this notebook. 
notebook_start_time = default_timer()

In [None]:
def elapsed_time(start_time = notebook_start_time):
    """Return the time elapsed since ``start_time``.

    Parameters
    ----------
    start_time : float
        A reading taken earlier from ``default_timer()`` (imported from
        ``timeit`` in the imports cell). Defaults to ``notebook_start_time``.
        The default is evaluated once, when this cell runs, so re-run this
        cell after redefining ``notebook_start_time``.

    Returns
    -------
    float
        ``default_timer() - start_time``, i.e. the elapsed time in seconds.
    """
    return default_timer() - start_time

In [None]:
def print_elapsed_time(start_time = notebook_start_time):
    """Print the time elapsed since ``start_time``.

    Parameters
    ----------
    start_time : float
        A reading taken earlier from ``default_timer()``. Defaults to
        ``notebook_start_time``; the default is evaluated once, when this
        cell runs.

    Returns
    -------
    None
        The elapsed time is obtained from ``elapsed_time(start_time)`` and
        written to standard output.
    """
    print("Elapsed time is", elapsed_time(start_time), "seconds.")
    return None

In [None]:
# Create a list of display options.
list_of_display_options_fully_qualified_names = str(\
"pd.options.display.chop_threshold, pd.options.display.float_format, pd.options.display.max_info_columns, pd.options.display.notebook_repr_html, \
pd.options.display.colheader_justify, pd.options.display.html, pd.options.display.max_info_rows, pd.options.display.pprint_nest_depth, \
pd.options.display.column_space, pd.options.display.large_repr, pd.options.display.max_rows, pd.options.display.precision, \
pd.options.display.date_dayfirst, pd.options.display.latex, pd.options.display.max_seq_items, pd.options.display.show_dimensions, \
pd.options.display.date_yearfirst, pd.options.display.max_categories, pd.options.display.memory_usage, pd.options.display.unicode, \
pd.options.display.encoding, pd.options.display.max_columns, pd.options.display.min_rows, pd.options.display.width, \
pd.options.display.expand_frame_repr, pd.options.display.max_colwidth, pd.options.display.multi_sparse").split(sep=', ')

# Initialize an empty list to store all the short names for display options.
list_of_display_options_short_names = list()
# For each fully qualified option name,
# get the option's short name and add it to the list of short names.
for fully_qualified_option_name in list_of_display_options_fully_qualified_names:
    # Get short option name.
    short_option_name = fully_qualified_option_name.split(sep='.')[-1]
    
    # Add short option name to list of display option short names.
    list_of_display_options_short_names.append(short_option_name)

# Define dictionary of display option settings.
dict_of_display_option_settings_short_names=\
{"max_info_columns": 1000,\
"colheader_justify": "right",\
"max_info_rows": 1000000,\
"column_space": 1000,\
"max_rows": 1000000,\
"precision": 9,\
"max_seq_items": 1000000000000,\
"show_dimensions": True,\
"max_categories": 100,\
"memory_usage": True,\
"max_columns": 1000,\
"max_colwidth": 1000,\
"float_format": lambda x: '%.9f' % x}

# Set pandas display options using dictionary of short names,
# and display the options/value pairs.
print("Setting display options...")
for key in list(dict_of_display_option_settings_short_names.keys()):
    # Set display option.
    pd.set_option(key, dict_of_display_option_settings_short_names[key])
    # Print display option name and value.
    print(key, ": ", pd.get_option(key), sep='')

In [None]:
# Set seed for random number generator.
# A fixed seed makes the train/test splits, sampling and shuffling that take
# random_state=seed further down repeatable between runs.
# seed = int(os.times()[4]) # Use this line for a time-based seed instead (the np.int alias no longer exists in current NumPy).
seed = 42
print(seed)

In [None]:
# Attribute Information URL: https://www.seattle.gov/Documents/Departments/SDOT/GIS/Collisions_OD.pdf
# Read the Collisions Data CSV file and store it as a DataFrame.
# url="https://opendata.arcgis.com/datasets/5b5c745e0f1f48e7a53acec63a0022ab_0.csv" # HTTPError at 202009151050, using local copy of .csv instead.
# print(os.listdir("..")) # Print list of contents of current working directory.
local_path_to_csv = '~/IBM Data Science Professional Certificate Course/Course 9 - Applied Data Science Capstone/projects/Collisions.csv'
df=pd.read_csv(local_path_to_csv, low_memory=False)

In [None]:
# View the first few rows of the collisions DataFrame.
df.head()

In [None]:
df.info()

In [None]:
print_elapsed_time(notebook_start_time)

<h2 id="data_wrangling">Data Wrangling</h2>

Steps for working with missing data:
<ol>
    <li>Identify missing data.</li>
    <li>Deal with missing data.</li>
    <li>Correct data format.</li>
</ol>

<h3 id="identifying_missing_data">Identifying Missing Data</h3>

The metadata document that accompanied the data set indicates that certain columns have "sentinel" values
that indicate an unknown or missing value. Each of these missing values will first be converted into NaN.
Subsequently, the NaN values will be dropped from the DataFrame.

In [None]:
# If any row of the collisions DataFrame contains a sentinel value representing "unknown" or "other",
# then replace it with NaN.
# Sentinels for "unknown" are listed in the metadata document that accompanies the dataset.
#
# Each dict maps a column name to ONE sentinel value for that column. Columns
# with several sentinels (WEATHER, ROADCOND, LIGHTCOND, ST_COLCODE) therefore
# appear in more than one pass. Every replace() call returns a new DataFrame,
# so the raw frame `df` is left untouched and the cell can be re-run safely.
# The `method`/`limit` keywords are omitted: they have no effect when `value`
# is given, and they are deprecated in recent pandas releases.
df_unknowns_converted_to_nan = (
    df
    .replace(to_replace={
        "EXCEPTRSNCODE": " ",
        "EXCEPTRSNDESC": "Not Enough Information, or Insufficient Location Information",
        "COLLISIONTYPE": "Other",
        "SEVERITYCODE": "0",
        "SEVERITYDESC": "Unknown",
        "JUNCTIONTYPE": "Unknown",
        "WEATHER": "Unknown",
        "ROADCOND": "Unknown",
        "LIGHTCOND": "Unknown",
        "SDOT_COLCODE": float(0),
        "SDOT_COLDESC": "NOT ENOUGH INFORMATION / NOT APPLICABLE",
        "ST_COLCODE": " ",
        "ST_COLDESC": "Not stated"}, value=np.nan, regex=False)
    .replace(to_replace={
        "ST_COLCODE": "0",
        "WEATHER": "Other",
        "ROADCOND": "Other",
        "LIGHTCOND": "Other"}, value=np.nan, regex=False)
    .replace(to_replace={
        "LIGHTCOND": "Dark - Unknown Lighting"}, value=np.nan, regex=False)
)

In [None]:
print_elapsed_time(notebook_start_time)

<h3 id="deal_with_missing_data">Deal with Missing Data</h3>

<ol>
    <li>Drop the Data
        <ol>
            <li>Drop entire row.</li>
            <li>Drop entire column.</li>
        </ol>
    </li>
    <li>Replace the Data
        <ol>
            <li>Replace data by mean.</li>
            <li>Replace data by frequency.</li>
            <li>Replace data based on other functions.</li>
        </ol>
    </li>
        
</ol>

Whole columns should be dropped only if most entries in the column are empty.

In [None]:
print(list(df.columns))

In [None]:
# Columns removed from the collisions DataFrame. A column is listed here when
# at least one of the following holds:
# 1) more than 15% of the column's data is NaN;
# 2) the column only contains unique identification keys, or information not useful for model building;
# 3) the column's data is categorical but does not fit into a small (< 15) number of categories;
# 4) information in the column is redundant because it is already represented by another column;
# 5) it is not clear how to interpret the column's data.
list_of_columns_to_drop = [
    "ADDRTYPE", "STATUS", "OBJECTID", "INCKEY",
    "COLDETKEY", "REPORTNO", "INTKEY", "LOCATION",
    "EXCEPTRSNCODE", "EXCEPTRSNDESC", "SEVERITYDESC", "PERSONCOUNT",
    "VEHCOUNT", "INJURIES", "SERIOUSINJURIES", "FATALITIES",
    "INCDATE", "INCDTTM", "JUNCTIONTYPE", "SDOT_COLCODE",
    "SDOT_COLDESC", "INATTENTIONIND", "UNDERINFL", "PEDROWNOTGRNT",
    "SDOTCOLNUM", "SPEEDING", "ST_COLCODE", "ST_COLDESC",
    "SEGLANEKEY", "CROSSWALKKEY", "HITPARKEDCAR",
]

In [None]:
# Drop the selected columns from the DataFrame after converting unknowns to NaN.
# Store the result in a new DataFrame.
df_drop_columns = df_unknowns_converted_to_nan.drop(columns=list_of_columns_to_drop, inplace=False)

In [None]:
# Drop any row that contains at least one NaN.
# Only `how` is passed: recent pandas versions raise a TypeError when `how`
# and `thresh` are both supplied, even when `thresh` is None. dropna returns
# a new DataFrame, so df_drop_columns itself is not modified.
df_drop_columns_and_rows = df_drop_columns.dropna(axis="index", how="any")

In [None]:
minor_severity_labels = ['1', '2']
major_severity_labels = ['2b', '3']

In [None]:
series_is_severe = df_drop_columns_and_rows['SEVERITYCODE'].isin(major_severity_labels)

In [None]:
series_is_severe.value_counts()

In [None]:
# Define a new column called 'IS_SEVERE'.
# For each row of the DataFrame, if 'SEVERITYCODE' is '2b' or '3', then 'IS_SEVERE' gets the boolean value of True.
# If 'SEVERITYCODE' is '1' or '2', then 'IS_SEVERE' gets the boolean value of False.
# DataFrame.insert modifies the frame in place and refuses to add a column
# name that already exists, so the guard lets this cell be executed again
# without raising a ValueError.
if 'IS_SEVERE' not in df_drop_columns_and_rows.columns:
    df_drop_columns_and_rows.insert(0, 'IS_SEVERE', series_is_severe)

In [None]:
# Drop column SEVERITYCODE, because severity is now represented by column IS_SEVERE.
# drop(..., inplace=False) returns a new DataFrame, which is bound back to the same name.
df_drop_columns_and_rows = df_drop_columns_and_rows.drop(columns=['SEVERITYCODE'], inplace=False)

In [None]:
df_drop_columns_and_rows.info()

In [None]:
df_drop_columns_and_rows.head()

In [None]:
print_elapsed_time(notebook_start_time)

<h3 id="correct_data_format">Correct Data Format</h3>

Ensure that each data type is appropriate for the corresponding feature.
Cast columns of type "object" as type "category", but leave all other column types unaltered.

In [None]:
# Names of the columns currently stored with the generic "object" dtype.
object_column_names = [
    column_name
    for column_name in df_drop_columns_and_rows.columns
    if df_drop_columns_and_rows[column_name].dtype == np.dtype('object')
]

# Build the converted DataFrame in one step: the "object" columns are cast to
# "category", every other column is carried over with its dtype unchanged.
# astype returns a new DataFrame and leaves df_drop_columns_and_rows as it was.
df_converted = df_drop_columns_and_rows.astype(
    {column_name: 'category' for column_name in object_column_names}
)

In [None]:
# Create DataFrame of categorical or integer columns, inclusive.
df_categorical = df_converted.select_dtypes(include=['bool', 'category', 'integer'])

In [None]:
print_elapsed_time(notebook_start_time)

<h2 id="feature_selection">Feature selection</h2>

#### Features before One-Hot Encoding

In [None]:
df_categorical.head()

In [None]:
df_categorical.info()

In [None]:
# Severity vs. Collision Type

In [None]:
print('IS_SEVERE Relative Frequencies:\n')
df_categorical['IS_SEVERE'].value_counts(normalize=True, dropna=False)

In [None]:
print('IS_SEVERE Value Counts:\n')
df_categorical['IS_SEVERE'].value_counts(normalize=False, dropna=False)

In [None]:
len(df_categorical)

In [None]:
# For each class of COLLISIONTYPE, get the relative frequencies for IS_SEVERE, i.e.
# for each COLLISIONTYPE group, compute the number of rows with IS_SEVERE=True
# divided by the size of this COLLISIONTYPE group.

# Create a GroupBy object on COLLISIONTYPE.
# observed=False is spelled out because COLLISIONTYPE was cast to "category"
# earlier in the notebook: it pins the grouping behaviour this analysis was
# written against instead of relying on a default that pandas has deprecated.
groupby_collisiontype = df_categorical[['COLLISIONTYPE', 'IS_SEVERE']].groupby(by=['COLLISIONTYPE'], observed=False)

# Create a GroupBy object on COLLISIONTYPE, IS_SEVERE.
groupby_collisiontype_is_severe = df_categorical[['COLLISIONTYPE', 'IS_SEVERE']].groupby(by=['COLLISIONTYPE', 'IS_SEVERE'], observed=False)

print('IS_SEVERE relative frequencies:')
print(df_categorical['IS_SEVERE'].value_counts(normalize=True, dropna=False))
print()

# Number of rows in every (COLLISIONTYPE, IS_SEVERE) combination.
df_value_counts_by_collisiontype = groupby_collisiontype_is_severe.size().to_frame(name='Value Counts')
print('IS_SEVERE value counts over each COLLISIONTYPE group:\n', df_value_counts_by_collisiontype)
print()

# The cutoff is the overall share of severe accidents. The mean of a boolean
# column is the fraction of True values, so the number matches the label
# printed below whichever class happens to be the minority.
severity_frequency_cutoff = df_categorical['IS_SEVERE'].mean()
print('Proportion of all data with IS_SEVERE=True: %f' % (severity_frequency_cutoff))
print()

# Divide each (COLLISIONTYPE, IS_SEVERE) count by the size of its COLLISIONTYPE group.
df_frequencies_by_collisiontype = (groupby_collisiontype_is_severe.size() / groupby_collisiontype.size())\
    .to_frame(name='Relative Frequencies')
print('IS_SEVERE relative frequencies normalized over each COLLISIONTYPE group:\n', df_frequencies_by_collisiontype)
print()

# xs(True, level=1) picks the IS_SEVERE=True rows, indexed by COLLISIONTYPE.
# Indexing with the resulting boolean frame blanks the rows that fail the
# comparison and dropna() removes them.
# NOTE(review): presumably the mask is aligned on the COLLISIONTYPE level, so
# both the True and the False row of a qualifying group survive - confirm
# against the printed table.
print('IS_SEVERE relative frequencies normalized over each COLLISIONTYPE group,\n',\
      'given the proportion of IS_SEVERE=True > %f:\n' % (severity_frequency_cutoff),\
      df_frequencies_by_collisiontype[df_frequencies_by_collisiontype.xs(True, level=1, axis=0) > severity_frequency_cutoff].dropna(), sep='')

In [None]:
ax = df_frequencies_by_collisiontype.plot.bar(alpha=.5, title='Frequency normalized by COLLISIONTYPE vs. (COLLISIONTYPE, IS_SEVERE)')

In [None]:
ax = df_frequencies_by_collisiontype[
    df_frequencies_by_collisiontype.xs(True, level=1, axis=0) > severity_frequency_cutoff]\
    .dropna().plot.bar(alpha=.5,\
    title='Frequency normalized by COLLISIONTYPE vs. (COLLISIONTYPE, IS_SEVERE),\nCondition: Frequency of IS_SEVERE=True > %f' %\
    (severity_frequency_cutoff))

In [None]:
ax = df_frequencies_by_collisiontype[\
    df_frequencies_by_collisiontype.xs(True, level=1, axis=0) > severity_frequency_cutoff]\
    .dropna().xs(True, level=1, axis=0).plot.bar(alpha=.5,\
    title='Frequency normalized by COLLISIONTYPE vs. COLLISIONTYPE\nConditions: IS_SEVERE=True and frequency of IS_SEVERE=True > %f' %\
    (severity_frequency_cutoff))

In [None]:
print('IS_SEVERE Relative Frequencies:\n')
df_categorical['IS_SEVERE'].value_counts(normalize=True, dropna=False)

In [None]:
print('IS_SEVERE Value Counts:\n')
df_categorical['IS_SEVERE'].value_counts(normalize=False, dropna=False)

In [None]:
len(df_categorical)

In [None]:
# For each class of WEATHER, get the relative frequencies for IS_SEVERE, i.e.
# for each WEATHER group, compute the number of rows with IS_SEVERE=True
# divided by the size of this WEATHER group.

# Create a GroupBy object on WEATHER.
# observed=False is spelled out because WEATHER was cast to "category"
# earlier in the notebook: it pins the grouping behaviour this analysis was
# written against instead of relying on a default that pandas has deprecated.
groupby_weather = df_categorical[['WEATHER', 'IS_SEVERE']].groupby(by=['WEATHER'], observed=False)

# Create a GroupBy object on WEATHER, IS_SEVERE.
groupby_weather_is_severe = df_categorical[['WEATHER', 'IS_SEVERE']].groupby(by=['WEATHER', 'IS_SEVERE'], observed=False)

print('IS_SEVERE relative frequencies:')
print(df_categorical['IS_SEVERE'].value_counts(normalize=True, dropna=False))
print()

# Number of rows in every (WEATHER, IS_SEVERE) combination.
df_value_counts_by_weather = groupby_weather_is_severe.size().to_frame(name='Value Counts')
print('IS_SEVERE value counts for each WEATHER class:\n', df_value_counts_by_weather)
print()

# The cutoff is the overall share of severe accidents. The mean of a boolean
# column is the fraction of True values, so the number matches the label
# printed below whichever class happens to be the minority.
severity_frequency_cutoff = df_categorical['IS_SEVERE'].mean()
print('Proportion of all data with IS_SEVERE=True: %f' % (severity_frequency_cutoff))
print()

# Divide each (WEATHER, IS_SEVERE) count by the size of its WEATHER group.
df_frequencies_by_weather = (groupby_weather_is_severe.size() / groupby_weather.size()).to_frame(name='Relative Frequencies')
print('IS_SEVERE relative frequencies normalized by WEATHER class:\n', df_frequencies_by_weather)
print()

# xs(True, level=1) picks the IS_SEVERE=True rows, indexed by WEATHER.
# Indexing with the resulting boolean frame blanks the rows that fail the
# comparison and dropna() removes them.
# NOTE(review): presumably the mask is aligned on the WEATHER level, so both
# the True and the False row of a qualifying group survive - confirm against
# the printed table.
print('IS_SEVERE relative frequencies normalized by WEATHER class,\n',\
      'given the proportion of IS_SEVERE=True > %f:\n' % (severity_frequency_cutoff),\
      df_frequencies_by_weather[df_frequencies_by_weather.xs(True, level=1, axis=0) > severity_frequency_cutoff].dropna(), sep='')
print()

In [None]:
ax = df_frequencies_by_weather.plot.bar(alpha=.5, title='Frequency normalized by WEATHER vs. (WEATHER, IS_SEVERE)')

In [None]:
ax = df_frequencies_by_weather[df_frequencies_by_weather.xs(True, level=1, axis=0) > severity_frequency_cutoff].dropna().plot.bar(alpha=.5,\
    title='Frequency normalized by WEATHER vs. (WEATHER, IS_SEVERE),\nCondition: Frequency of IS_SEVERE=True > %f' %
    (severity_frequency_cutoff))

In [None]:
ax = df_frequencies_by_weather[\
    df_frequencies_by_weather.xs(True, level=1, axis=0) > severity_frequency_cutoff]\
    .dropna().xs(True, level=1, axis=0).plot.bar(alpha=.5,\
    title='Frequency normalized by WEATHER vs. WEATHER\nConditions: IS_SEVERE=True and Frequency of IS_SEVERE=True > %f' %\
    (severity_frequency_cutoff))

In [None]:
# Severity vs. Road Conditions

In [None]:
print('IS_SEVERE Relative Frequencies:\n')
df_categorical['IS_SEVERE'].value_counts(normalize=True, dropna=False)

In [None]:
print('IS_SEVERE Value Counts:\n')
df_categorical['IS_SEVERE'].value_counts(normalize=False, dropna=False)

In [None]:
len(df_categorical)

In [None]:
# For each class of ROADCOND, get the relative frequencies for IS_SEVERE, i.e.
# for each ROADCOND group, compute the number of rows with IS_SEVERE=True
# divided by the size of this ROADCOND group.

# Create a GroupBy object on ROADCOND.
# observed=False is spelled out because ROADCOND was cast to "category"
# earlier in the notebook: it pins the grouping behaviour this analysis was
# written against instead of relying on a default that pandas has deprecated.
groupby_roadcond = df_categorical[['ROADCOND', 'IS_SEVERE']].groupby(by=['ROADCOND'], observed=False)

# Create a GroupBy object on ROADCOND, IS_SEVERE.
groupby_roadcond_is_severe = df_categorical[['ROADCOND', 'IS_SEVERE']].groupby(by=['ROADCOND', 'IS_SEVERE'], observed=False)

print('IS_SEVERE relative frequencies:')
print(df_categorical['IS_SEVERE'].value_counts(normalize=True, dropna=False))
print()

# Number of rows in every (ROADCOND, IS_SEVERE) combination.
df_value_counts_by_roadcond = groupby_roadcond_is_severe.size().to_frame(name='Value Counts')
print('IS_SEVERE value counts over each ROADCOND group:\n', df_value_counts_by_roadcond)
print()

# The cutoff is the overall share of severe accidents. The mean of a boolean
# column is the fraction of True values, so the number matches the label
# printed below whichever class happens to be the minority.
severity_frequency_cutoff = df_categorical['IS_SEVERE'].mean()
print('Proportion of all data with IS_SEVERE=True: %f' % (severity_frequency_cutoff))
print()

# Divide each (ROADCOND, IS_SEVERE) count by the size of its ROADCOND group.
df_frequencies_by_roadcond = (groupby_roadcond_is_severe.size() / groupby_roadcond.size()).to_frame(name='Relative Frequencies')
print('IS_SEVERE relative frequencies normalized over each ROADCOND group:\n', df_frequencies_by_roadcond)
print()

# xs(True, level=1) picks the IS_SEVERE=True rows, indexed by ROADCOND.
# Indexing with the resulting boolean frame blanks the rows that fail the
# comparison and dropna() removes them.
# NOTE(review): presumably the mask is aligned on the ROADCOND level, so both
# the True and the False row of a qualifying group survive - confirm against
# the printed table.
print('IS_SEVERE relative frequencies normalized over each ROADCOND group,\n',\
      'given the proportion of IS_SEVERE=True > %f:\n' % (severity_frequency_cutoff),\
      df_frequencies_by_roadcond[df_frequencies_by_roadcond.xs(True, level=1, axis=0) > severity_frequency_cutoff].dropna(), sep='')

In [None]:
ax = df_frequencies_by_roadcond.plot.bar(alpha=.5, title='Frequency normalized by ROADCOND vs. (ROADCOND, IS_SEVERE)')

In [None]:
ax = df_frequencies_by_roadcond[df_frequencies_by_roadcond.xs(True, level=1, axis=0) > severity_frequency_cutoff].dropna().plot.bar(alpha=.5,\
    title='Frequency normalized by ROADCOND vs. (ROADCOND, IS_SEVERE),\nCondition: Frequency of IS_SEVERE=True > %f' %
    (severity_frequency_cutoff))

In [None]:
ax = df_frequencies_by_roadcond[\
    df_frequencies_by_roadcond.xs(True, level=1, axis=0) > severity_frequency_cutoff]\
    .dropna().xs(True, level=1, axis=0).plot.bar(alpha=.5,\
    title='Frequency normalized by ROADCOND vs. ROADCOND\nConditions: IS_SEVERE=True and frequency of IS_SEVERE=True > %f' %\
    (severity_frequency_cutoff))

In [None]:
# Severity vs. Light Conditions

In [None]:
print('IS_SEVERE Relative Frequencies:\n')
df_categorical['IS_SEVERE'].value_counts(normalize=True, dropna=False)

In [None]:
print('IS_SEVERE Value Counts:\n')
df_categorical['IS_SEVERE'].value_counts(normalize=False, dropna=False)

In [None]:
len(df_categorical)

In [None]:
# For each class of LIGHTCOND, get the relative frequencies for IS_SEVERE, i.e.
# for each LIGHTCOND group, compute the number of rows with IS_SEVERE=True
# divided by the size of this LIGHTCOND group.

# Create a GroupBy object on LIGHTCOND.
# observed=False is spelled out because LIGHTCOND was cast to "category"
# earlier in the notebook: it pins the grouping behaviour this analysis was
# written against instead of relying on a default that pandas has deprecated.
groupby_lightcond = df_categorical[['LIGHTCOND', 'IS_SEVERE']].groupby(by=['LIGHTCOND'], observed=False)

# Create a GroupBy object on LIGHTCOND, IS_SEVERE.
groupby_lightcond_is_severe = df_categorical[['LIGHTCOND', 'IS_SEVERE']].groupby(by=['LIGHTCOND', 'IS_SEVERE'], observed=False)

print('IS_SEVERE relative frequencies:')
print(df_categorical['IS_SEVERE'].value_counts(normalize=True, dropna=False))
print()

# Number of rows in every (LIGHTCOND, IS_SEVERE) combination.
df_value_counts_by_lightcond = groupby_lightcond_is_severe.size().to_frame(name='Value Counts')
print('IS_SEVERE value counts over each LIGHTCOND group:\n', df_value_counts_by_lightcond)
print()

# The cutoff is the overall share of severe accidents. The mean of a boolean
# column is the fraction of True values, so the number matches the label
# printed below whichever class happens to be the minority.
severity_frequency_cutoff = df_categorical['IS_SEVERE'].mean()
print('Proportion of all data with IS_SEVERE=True: %f' % (severity_frequency_cutoff))
print()

# Divide each (LIGHTCOND, IS_SEVERE) count by the size of its LIGHTCOND group.
df_frequencies_by_lightcond = (groupby_lightcond_is_severe.size() / groupby_lightcond.size()).to_frame(name='Relative Frequencies')
print('IS_SEVERE relative frequencies normalized over each LIGHTCOND group:\n', df_frequencies_by_lightcond)
print()

# xs(True, level=1) picks the IS_SEVERE=True rows, indexed by LIGHTCOND.
# Indexing with the resulting boolean frame blanks the rows that fail the
# comparison and dropna() removes them.
# NOTE(review): presumably the mask is aligned on the LIGHTCOND level, so both
# the True and the False row of a qualifying group survive - confirm against
# the printed table.
print('IS_SEVERE relative frequencies normalized over each LIGHTCOND group,\n',\
      'given the proportion of IS_SEVERE=True > %f:\n' % (severity_frequency_cutoff),\
      df_frequencies_by_lightcond[df_frequencies_by_lightcond.xs(True, level=1, axis=0) > severity_frequency_cutoff].dropna(), sep='')

In [None]:
ax = df_frequencies_by_lightcond.plot.bar(alpha=.5, title='Frequency normalized by LIGHTCOND vs. (LIGHTCOND, IS_SEVERE)')

In [None]:
ax = df_frequencies_by_lightcond[df_frequencies_by_lightcond.xs(True, level=1, axis=0) > severity_frequency_cutoff].dropna().plot.bar(alpha=.5,\
    title='Frequency normalized by LIGHTCOND vs. (LIGHTCOND, IS_SEVERE),\nCondition: Frequency of IS_SEVERE=True > %f' %
    (severity_frequency_cutoff))

In [None]:
ax = df_frequencies_by_lightcond[\
    df_frequencies_by_lightcond.xs(True, level=1, axis=0) > severity_frequency_cutoff]\
    .dropna().xs(True, level=1, axis=0).plot.bar(alpha=.5,\
    title='Frequency normalized by LIGHTCOND vs. LIGHTCOND\nConditions: IS_SEVERE=True and frequency of IS_SEVERE=True > %f' %\
    (severity_frequency_cutoff))

In [None]:
print_elapsed_time(notebook_start_time)

<h2 id="imbalanced_data">Dealing with Imbalanced Data</h2>

Because the data is imbalanced, we split the DataFrame into two DataFrames, one for each value of IS_SEVERE.

In [None]:
print('IS_SEVERE relative frequencies:')
print(df_categorical['IS_SEVERE'].value_counts(normalize=True, dropna=False))

In [None]:
print("IS_SEVERE value counts:")
print(df_categorical['IS_SEVERE'].value_counts(normalize=False, dropna=False))

In [None]:
df_class_major_severity = df_categorical[df_categorical['IS_SEVERE']]
df_class_minor_severity = df_categorical[~df_categorical['IS_SEVERE']]

In [None]:
type(df_class_major_severity)

In [None]:
df_class_major_severity.info()

In [None]:
type(df_class_minor_severity)

In [None]:
df_class_minor_severity.info()

In [None]:
# normalize=False yields raw counts, so the heading says "value counts".
print("IS_SEVERE value counts:")
print(df_class_major_severity["IS_SEVERE"].value_counts(normalize=False, dropna=False))

In [None]:
# normalize=False yields raw counts, so the heading says "value counts".
print("IS_SEVERE value counts:")
print(df_class_minor_severity["IS_SEVERE"].value_counts(normalize=False, dropna=False))

In [None]:
# Store and print the size of each class.
class_major_severity_size = len(df_class_major_severity)
class_minor_severity_size = len(df_class_minor_severity)
print('class_major_severity_size =', class_major_severity_size)
print('class_minor_severity_size =', class_minor_severity_size)
print()
# Store and print the size of the minority class.
# Taking the smaller of the two sizes keeps this correct even if the class
# balance of the input data changes.
minority_class_size = min(class_major_severity_size, class_minor_severity_size)
print('minority_class_size =', minority_class_size)

In [None]:
print_elapsed_time(notebook_start_time)

<h2 id='sample_size_class_3'>Sample the Accident Severity Classes Equally to Create a Balanced Training Set</h2>

In [None]:
# Number of folds for cross-validation
number_of_folds = 2
print('number_of_folds = %d' % (number_of_folds))

In [None]:
# Fraction of each class that is held out for testing.
# Must lie strictly between 0 and 1, i.e. 0 < test_size_ratio < 1.
test_size_ratio = .5
print('test_size_ratio = %f' % (test_size_ratio))

# Split every severity class on its own, using the same ratio and seed for
# both. The "train" halves are sampled further below; the "test" halves are
# kept as they are.
(df_class_minor_severity_train_pre_sampling,
 df_class_minor_severity_test) = train_test_split(
    df_class_minor_severity, test_size=test_size_ratio, random_state=seed)

(df_class_major_severity_train_pre_sampling,
 df_class_major_severity_test) = train_test_split(
    df_class_major_severity, test_size=test_size_ratio, random_state=seed)

In [None]:
# Build a balanced training set: draw the same number of rows from the
# pre-sampling training data of each severity class.

sampling_ratio = 1
print('sampling_ratio = %f' % (sampling_ratio))

# --- Major-severity class ---
# A sampling ratio above 1 asks for more rows than exist, which is only
# possible when sampling with replacement; otherwise sample without it.
sample_with_replacement = sampling_ratio > 1
print('Sample class major severity with replacement: %s' % (sample_with_replacement))

df_class_major_severity_train = df_class_major_severity_train_pre_sampling.sample(
    frac=sampling_ratio,
    replace=sample_with_replacement,
    axis='index',
    random_state=seed)

# Number of rows drawn for the major-severity class; the other class is
# sampled down (or up) to exactly this size.
df_class_major_severity_train_size = len(df_class_major_severity_train)

# --- Minor-severity class ---
# Replacement is needed only when more rows are requested than are available.
sample_with_replacement = df_class_major_severity_train_size > len(df_class_minor_severity_train_pre_sampling)
print('Sample class minor severity with replacement: %s' % (sample_with_replacement))

df_class_minor_severity_train = df_class_minor_severity_train_pre_sampling.sample(
    n=df_class_major_severity_train_size,
    replace=sample_with_replacement,
    axis='index',
    random_state=seed)

<h2>Generate a Balanced Training Set and an Unbalanced Test Set</h2>

In [None]:
# Make a balanced, unshuffled training set by concatenating the equal sized samples of the training sets for each class.
df_train_balanced_not_shuffled = pd.concat([df_class_minor_severity_train, df_class_major_severity_train], axis='index')

# Make a not necessarily balanced testing set by concatenating the testing sets for each class.
df_test_not_shuffled = pd.concat([df_class_minor_severity_test, df_class_major_severity_test], axis='index')

In [None]:
# Shuffle the training set and store it for tuning hyper-parameters and for cross-validation.
df_train = shuffle(df_train_balanced_not_shuffled, random_state=seed)

# Shuffle the unbalanced DataFrame and store it for validation and for comparing the models.
df_test = shuffle(df_test_not_shuffled, random_state=seed)

In [None]:
# Verify the training DataFrame is balanced.
# normalize=False yields raw counts, so the heading says "value counts".
print("IS_SEVERE value counts:")
print(df_train["IS_SEVERE"].value_counts(normalize=False, dropna=False))

In [None]:
# Verify the new DataFrame is balanced.
print("IS_SEVERE relative frequencies:")
print(df_train["IS_SEVERE"].value_counts(normalize=True, dropna=False))

Let's define a feature set for the training data represented by a DataFrame. 

In [None]:
# Define a data frame to store the features for the training data.
df_features = df_train.drop(columns=['IS_SEVERE'], inplace=False)

In [None]:
# Display information about the features DataFrame.
df_features.info()

The boolean array y represents the target variable IS_SEVERE for the training data.

In [None]:
# Convert the training data target into a numpy array.
y = df_train['IS_SEVERE'].to_numpy()

In [None]:
print_elapsed_time(notebook_start_time)

## Transform the Data 

We one-hot encode the categorical features so that the data is compatible
with the machine learning estimators we use in this notebook.
The encoded features are stored in sparse matrix format.

In [None]:
# Create a OneHotEncoder and fit it to the features.
# The fit is performed on the full feature set (df_categorical without the
# target), i.e. on the data before any test/train split, so every category
# is known to the encoder; the training features are then transformed.
# The sparse-output keyword is deliberately left at its default (sparse
# output): its name differs between scikit-learn releases, the default does not.
start_time = default_timer()
print("Fitting OneHotEncoder to training data...")
encoder = OneHotEncoder(handle_unknown='error')
encoder.fit(df_categorical.drop(columns=['IS_SEVERE'], inplace=False))
X = encoder.transform(df_features)
print("Completed in", elapsed_time(start_time), "seconds.")
# Display the names of the encoded output features.
# get_feature_names_out supersedes get_feature_names in newer scikit-learn;
# fall back to the old method where the new one does not exist.
# NOTE(review): the generated names may be prefixed differently by the two
# methods - this only affects the printed text.
if hasattr(encoder, 'get_feature_names_out'):
    print(encoder.get_feature_names_out())
else:
    print(encoder.get_feature_names())

The sparse matrix X represents the one-hot encoded feature set for the training data.

In [None]:
# Display the type for the training data feature set.
type(X)

In [None]:
# Display the shape of the training data feature set.
X.shape

In [None]:
# Display the type for the training data target array.
type(y)

In [None]:
# Display the shape of the training data target array.
y.shape

In [None]:
print_elapsed_time(notebook_start_time)

In [None]:
# Display summary information about the test DataFrame.
df_test.info()

In [None]:
# Print the absolute class counts for the test DataFrame.
# normalize=False yields raw counts, so the printed label says "counts";
# dropna=False also tallies missing values.
print("IS_SEVERE counts:")
print(df_test['IS_SEVERE'].value_counts(normalize=False, dropna=False))

In [None]:
# Print the share of test DataFrame rows held by each IS_SEVERE value
# (missing values are tallied too).
print("IS_SEVERE relative frequencies:")
test_is_severe_shares = df_test['IS_SEVERE'].value_counts(normalize=True, dropna=False)
print(test_is_severe_shares)

In [None]:
# Encode the test features with the same, already-fitted OneHotEncoder instance
# so that the columns of X_test line up with the columns of X.
start_time = default_timer()
print('Transforming features using OneHotEncoder...')
df_test_features = df_test.drop(columns=['IS_SEVERE'])
X_test = encoder.transform(df_test_features)
print("Encoding completed in", elapsed_time(start_time), "seconds.")

In [None]:
# Extract the IS_SEVERE target column of the test data as a NumPy array.
y_test = df_test['IS_SEVERE'].to_numpy()

In [None]:
# Display the shape of X_test, the one-hot encoded test feature set.
X_test.shape

In [None]:
# Display the shape of y_test, the test target array.
y_test.shape

In [None]:
# Progress checkpoint: presumably reports the time elapsed since notebook_start_time
# (print_elapsed_time is a helper defined elsewhere in this notebook).
print_elapsed_time(notebook_start_time)

<h2 id="logistic_regression">Build and Test a Logistic Regression Model</h2>

In [None]:
t0 = default_timer()

# Make a pipeline: scale the one-hot encoded features, then fit a logistic regression.
# with_mean=False disables centering, which StandardScaler cannot apply to sparse input.
logistic_regression_pipe = make_pipeline(StandardScaler(with_mean=False), LogisticRegression(solver='saga', random_state=seed), verbose=False)

# Fit the model to the balanced training data.
logistic_regression_pipe.fit(X, y)

# Use the fitted model to generate predictions based on test data.
y_pred = logistic_regression_pipe.predict(X_test)

# Set display labels. labels and target_names are paired by position:
# True -> 'Major', False -> 'Minor'.
labels = [True, False]
target_names = ['Major', 'Minor']
display_labels = target_names

# Display the classification report for Logistic Regression model applied to unbalanced test data.
print()
print('Classification Report for Logistic Regression on Unbalanced Test Data')
print()
print(classification_report(y_test, y_pred, labels=labels, target_names=target_names, digits=6))
print()

# Display the confusion matrices based on the unbalanced test data.
# Create a figure holding a 2 x 2 grid of panels.
fig = plt.figure(figsize=(6.4 * 2, 4.8 * 2))
fig.suptitle('Confusion Matrices for Logistic Regression on Unbalanced Test Data', fontsize=20)

# One panel per normalization mode accepted by plot_confusion_matrix.
# The last panel uses normalize=None (raw counts), so it is titled 'Not Normalized'.
confusion_matrix_panels = [
    ('Normalized over True Severity', 'true'),
    ('Normalized over Predicted Severity', 'pred'),
    ('Normalized', 'all'),
    ('Not Normalized', None),
]
for panel_index, (panel_title, normalize_mode) in enumerate(confusion_matrix_panels, start=1):
    ax = plt.subplot(2, 2, panel_index)
    ax.set_title(panel_title, fontsize=12)
    plot_confusion_matrix(logistic_regression_pipe, X_test, y_test, labels=labels, display_labels=display_labels, normalize=normalize_mode, ax=ax)

plt.show()

print()
print('Logistic Regression Classification Report and Confusion Matrices generated in %f seconds.' % elapsed_time(t0))

In [None]:
# Progress checkpoint: presumably reports the time elapsed since notebook_start_time
# (print_elapsed_time is a helper defined elsewhere in this notebook).
print_elapsed_time(notebook_start_time)

<h2 id="support_vector_machine">Build and Test a Support Vector Machine</h2>

In [None]:
t0 = default_timer()

# Make a pipeline: scale the one-hot encoded features, then fit a linear SVM.
# with_mean=False disables centering, which StandardScaler cannot apply to sparse input.
support_vector_machine_pipe = make_pipeline(StandardScaler(with_mean=False), LinearSVC(dual=False, random_state=seed), verbose=False)

# Fit the model to the balanced training data.
support_vector_machine_pipe.fit(X, y)

# Use the fitted model to generate predictions based on test data.
y_pred = support_vector_machine_pipe.predict(X_test)

# Set display labels. labels and target_names are paired by position:
# True -> 'Major', False -> 'Minor'.
labels = [True, False]
target_names = ['Major', 'Minor']
display_labels = target_names

# Display the classification report for Support Vector Machine model applied to unbalanced test data.
print()
print('Classification Report for Support Vector Machine for Unbalanced Test Data')
print()
print(classification_report(y_test, y_pred, labels=labels, target_names=target_names, digits=6))
print()

# Display the confusion matrices based on the unbalanced test data.
# Create a figure holding a 2 x 2 grid of panels.
fig = plt.figure(figsize=(6.4 * 2, 4.8 * 2))
fig.suptitle('Confusion Matrices for Unbalanced Test Data', fontsize=20)

# One panel per normalization mode accepted by plot_confusion_matrix.
# The last panel uses normalize=None (raw counts), so it is titled 'Not Normalized'.
confusion_matrix_panels = [
    ('Normalized over True Severity', 'true'),
    ('Normalized over Predicted Severity', 'pred'),
    ('Normalized', 'all'),
    ('Not Normalized', None),
]
for panel_index, (panel_title, normalize_mode) in enumerate(confusion_matrix_panels, start=1):
    ax = plt.subplot(2, 2, panel_index)
    ax.set_title(panel_title, fontsize=12)
    plot_confusion_matrix(support_vector_machine_pipe, X_test, y_test, labels=labels, display_labels=display_labels, normalize=normalize_mode, ax=ax)

plt.show()

print()
print('Support Vector Machine Classification Report and Confusion Matrices generated in %f seconds.' % elapsed_time(t0))

In [None]:
# Progress checkpoint: presumably reports the time elapsed since notebook_start_time
# (print_elapsed_time is a helper defined elsewhere in this notebook).
print_elapsed_time(notebook_start_time)

<h2 id="knn_classifier">Build and Test a k-Nearest Neighbors Classifier</h2>

In [None]:
t0 = default_timer()

# Cap the number of neighbors to prevent a ValueError when a fold has too few samples.
# The cap is the smaller of a user-chosen upper bound and X.shape[0] / number_of_folds
# (the size of one fold, which is a conservative bound on the per-split training size).
# Both candidates are forced to an odd integer via '2 * (N // 2) - 1' to avoid ties.
# The builtin int() is used because the np.int alias was removed from NumPy.
upper_bound = 10 # Set by user based on system resources.
maximum_number_of_neighbors = min(
    2 * (upper_bound // 2) - 1,
    int(2 * ((X.shape[0] / number_of_folds) // 2) - 1),
)
print('maximum_number_of_neighbors = %d' % (maximum_number_of_neighbors))

# Make a pipeline.
# with_mean=False disables centering, which StandardScaler cannot apply to sparse input.
k_neighbors_pipeline = make_pipeline(StandardScaler(with_mean=False), KNeighborsClassifier(n_jobs=-1), verbose=False)

# Use only odd numbers of neighbors to avoid ties.
# Number of neighbors range starts high and ends low, allowing user to monitor kernel messages for possible local maxima in score.
# The range stops before 1, so the smallest candidate is 3.
grid_parameters = {'kneighborsclassifier__n_neighbors': range(maximum_number_of_neighbors, 1, -2)}
scoring = ['recall_weighted']
grid_search_cv = GridSearchCV(k_neighbors_pipeline, param_grid=grid_parameters, scoring=scoring, n_jobs=-1, refit='recall_weighted',
                              cv=number_of_folds, verbose=50, error_score='raise', return_train_score=False)

# Run the cross-validated search; refit='recall_weighted' retrains the best pipeline on all of X, y.
grid_search_cv.fit(X, y)

print()
print('Completed grid search in %f seconds' % (elapsed_time(t0)))

In [None]:
# Keep the best estimator found by GridSearchCV (the refitted scaler + k-NN pipeline)
# and list every parameter it was built with.
k_neighbors_clf = grid_search_cv.best_estimator_
print('Best KNeighborsClassifier parameters:')
for parameter_name, parameter_value in k_neighbors_clf.get_params().items():
    print(parameter_name, ':', parameter_value)

In [None]:
t0 = default_timer()

# Predict the test-set classes with the best k-NN pipeline from the grid search.
y_pred = k_neighbors_clf.predict(X_test)

# Class labels and their display names, paired by position:
# True -> 'Major', False -> 'Minor'.
labels = [True, False]
target_names = ['Major', 'Minor']
display_labels = target_names

# Text report: per-class precision, recall and F1 on the unbalanced test data.
print()
print('Classification Report for k-Nearest Neighbors for Unbalanced Test Data')
print()
knn_report = classification_report(y_test, y_pred, labels=labels, target_names=target_names, digits=6)
print(knn_report)
print()

# Figure: a 2 x 2 grid of confusion matrices, one per normalization mode.
fig = plt.figure(figsize=(6.4 * 2, 4.8 * 2))
fig.suptitle('Confusion Matrices for Unbalanced Test Data', fontsize=20)

# (panel title, normalize argument) in subplot order 1..4.
confusion_matrix_panels = (
    ('Normalized over True Severity', 'true'),
    ('Normalized over Predicted Severity', 'pred'),
    ('Normalized', 'all'),
    ('Not Normalized', None),
)
for panel_index, (panel_title, normalize_mode) in enumerate(confusion_matrix_panels, start=1):
    ax = plt.subplot(2, 2, panel_index)
    ax.set_title(panel_title, fontsize=12)
    plot_confusion_matrix(k_neighbors_clf, X_test, y_test, labels=labels, display_labels=display_labels, normalize=normalize_mode, ax=ax)

plt.show()

print()
print('k-Nearest Neighbors Classification Report and Confusion Matrices generated in %f seconds.' % elapsed_time(t0))

In [None]:
# Progress checkpoint: presumably reports the time elapsed since notebook_start_time
# (print_elapsed_time is a helper defined elsewhere in this notebook).
print_elapsed_time(notebook_start_time)

<h2 id="decision_tree_classifier">Build and Test a Decision Tree Classifier</h2>

In [None]:
t0 = default_timer()

# Assemble the pipeline from its two steps: a scaler (no centering) and a decision tree.
feature_scaler = StandardScaler(with_mean=False)
tree_classifier = DecisionTreeClassifier(random_state=seed)
decision_tree_pipe = make_pipeline(feature_scaler, tree_classifier, verbose=False)

# Train on the balanced training data.
decision_tree_pipe.fit(X, y)

# Predict the test-set classes with the trained pipeline.
y_pred = decision_tree_pipe.predict(X_test)

# Class labels and their display names, paired by position:
# True -> 'Major', False -> 'Minor'.
labels = [True, False]
target_names = ['Major', 'Minor']
display_labels = target_names

# Text report: per-class precision, recall and F1 on the unbalanced test data.
print()
print('Classification Report for Decision Tree for Unbalanced Test Data')
print()
decision_tree_report = classification_report(y_test, y_pred, labels=labels, target_names=target_names, digits=6)
print(decision_tree_report)
print()

# Figure: a 2 x 2 grid of confusion matrices, one per normalization mode.
fig = plt.figure(figsize=(6.4 * 2, 4.8 * 2))
fig.suptitle('Confusion Matrices for Unbalanced Test Data', fontsize=20)

# (panel title, normalize argument) in subplot order 1..4.
confusion_matrix_panels = (
    ('Normalized over True Severity', 'true'),
    ('Normalized over Predicted Severity', 'pred'),
    ('Normalized', 'all'),
    ('Not Normalized', None),
)
for panel_index, (panel_title, normalize_mode) in enumerate(confusion_matrix_panels, start=1):
    ax = plt.subplot(2, 2, panel_index)
    ax.set_title(panel_title, fontsize=12)
    plot_confusion_matrix(decision_tree_pipe, X_test, y_test, labels=labels, display_labels=display_labels, normalize=normalize_mode, ax=ax)

plt.show()

print()
print('Decision Tree Classification Report and Confusion Matrices generated in %f seconds.' % elapsed_time(t0))

In [None]:
# Progress checkpoint: presumably reports the time elapsed since notebook_start_time
# (print_elapsed_time is a helper defined elsewhere in this notebook).
print_elapsed_time(notebook_start_time)

## Compare the Models

In [None]:
# Progress checkpoint: presumably reports the time elapsed since notebook_start_time
# (print_elapsed_time is a helper defined elsewhere in this notebook).
print_elapsed_time(notebook_start_time)