### Predicting the Severity of Automobile Accidents in Seattle, Washington ###

In this first week, you will discover your
project objectives, find your dataset that you will use for this capstone project, and publish your
dataset on GitHub.

In the second week, you will build your machine
learning solution.

In the third week,
you will finalize your model and be ready
to submit your work.

To complete the capstone,
you will work on a case study: predicting the severity
of an automobile accident.
Now, wouldn't it be great if there were something in place that could warn you, 
given the weather and the road conditions,
about the possibility of you getting into a car accident and how severe it would be,
so that you would drive more carefully or even change your travel plans?
Let's use our shared data for Seattle, Washington as an example of how to deal with the accidents data.

In [1]:
# Import packages and modules.
import io
import itertools
import os
import sys
import time

import matplotlib as mpl
import matplotlib.image as mpimg
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np
import pandas as pd
import pylab as pl
import scipy
import scipy.optimize as opt
import seaborn as sns
import sklearn
from matplotlib.ticker import NullFormatter
from scipy import optimize
from scipy.optimize import curve_fit
from scipy.sparse import csr_matrix
from sklearn import linear_model
from sklearn import metrics
from sklearn import pipeline
from sklearn import preprocessing
from sklearn import svm
from sklearn import tree
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import jaccard_score
from sklearn.metrics import log_loss
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
%matplotlib inline

In [2]:
# This function takes no arguments and returns a float: the current reading,
# in fractional seconds, of a monotonic clock.
# The clock's reference point is arbitrary, so only the difference between
# two readings is meaningful.
# time.monotonic() is used rather than os.times()[4] because the "elapsed"
# field of os.times() is always zero on Windows, which would make every
# elapsed time reported by this notebook zero on that platform.
# This function requires the time module to be imported.
def time_now():
    return time.monotonic()

In [3]:
# Store the clock reading taken when the notebook starts running.
# A name assigned at the top level of a cell is already a module-level
# (global) variable, so no `global` statement is needed here.
notebook_start_time = time_now()

In [4]:
# This function returns the time elapsed, in seconds, between the clock
# reading given by the parameter start_time and the moment of the call.
# When start_time is omitted (None), the notebook start time is used.
# The default is looked up at call time instead of being bound in the
# signature, so re-running the cell that sets notebook_start_time takes
# effect without re-running this cell.
def elapsed_time(start_time=None):
    if start_time is None:
        start_time = notebook_start_time
    return time_now() - start_time

In [5]:
# This function prints the time elapsed, in seconds, between the clock
# reading given by the parameter start_time and the moment of the call.
# When start_time is omitted (None), the notebook start time is used; it is
# resolved here, at call time, so that a re-run of the cell that sets
# notebook_start_time is honoured.
# The function returns None.
def print_elapsed_time(start_time=None):
    if start_time is None:
        start_time = notebook_start_time
    print("Elapsed time is", elapsed_time(start_time), "seconds.")

In [6]:
print_elapsed_time()

Elapsed time is 0.2800000011920929 seconds.


In [7]:
# Fully qualified names of the pandas display options, one list entry each.
list_of_display_options_fully_qualified_names = [
    "pd.options.display.chop_threshold",
    "pd.options.display.float_format",
    "pd.options.display.max_info_columns",
    "pd.options.display.notebook_repr_html",
    "pd.options.display.colheader_justify",
    "pd.options.display.html",
    "pd.options.display.max_info_rows",
    "pd.options.display.pprint_nest_depth",
    "pd.options.display.column_space",
    "pd.options.display.large_repr",
    "pd.options.display.max_rows",
    "pd.options.display.precision",
    "pd.options.display.date_dayfirst",
    "pd.options.display.latex",
    "pd.options.display.max_seq_items",
    "pd.options.display.show_dimensions",
    "pd.options.display.date_yearfirst",
    "pd.options.display.max_categories",
    "pd.options.display.memory_usage",
    "pd.options.display.unicode",
    "pd.options.display.encoding",
    "pd.options.display.max_columns",
    "pd.options.display.min_rows",
    "pd.options.display.width",
    "pd.options.display.expand_frame_repr",
    "pd.options.display.max_colwidth",
    "pd.options.display.multi_sparse",
]

# The short name of an option is the part after the last dot of its
# fully qualified name.
list_of_display_options_short_names = [
    qualified_name.rsplit('.', 1)[-1]
    for qualified_name in list_of_display_options_fully_qualified_names
]

# Define dictionary of display option settings.
dict_of_display_option_settings_short_names=\
{"max_info_columns": 1000,\
"colheader_justify": "right",\
"max_info_rows": 1000000,\
"column_space": 1000,\
"max_rows": 1000000,\
"precision": 9,\
"max_seq_items": 1000000000000,\
"show_dimensions": True,\
"max_categories": 100,\
"memory_usage": True,\
"max_columns": 1000,\
"max_colwidth": 1000,\
"float_format": lambda x: '%.9f' % x}

# Set pandas display options using dictionary of short names,
# and display the options/value pairs.
print("Setting display options...")
for key in list(dict_of_display_option_settings_short_names.keys()):
    # Set display option.
    pd.set_option(key, dict_of_display_option_settings_short_names[key])
    # Print display option name and value.
    print(key, ": ", pd.get_option(key), sep='')

Setting display options...
max_info_columns: 1000
colheader_justify: right
max_info_rows: 1000000
column_space: 1000
max_rows: 1000000
precision: 9
max_seq_items: 1000000000000
show_dimensions: True
max_categories: 100
memory_usage: True
max_columns: 1000
max_colwidth: 1000
float_format: <function <lambda> at 0x7fd0942621f0>


In [8]:
# The columns of this dataset are described in the attribute information document:
# https://www.seattle.gov/Documents/Departments/SDOT/GIS/Collisions_OD.pdf
# The data was originally available at the URL below, which returned an HTTPError
# at 202009151050, so a local copy of the .csv is read instead.
# url="https://opendata.arcgis.com/datasets/5b5c745e0f1f48e7a53acec63a0022ab_0.csv"
# print(os.listdir(".."))  # Uncomment to list the parent directory, where the .csv is expected.
local_path_to_csv = "../Collisions.csv"
# Read the Collisions Data CSV file and store it as a DataFrame.
df = pd.read_csv(filepath_or_buffer=local_path_to_csv, low_memory=False)

In [9]:
# View the first few rows of the collisions DataFrame.
df.head()

Unnamed: 0,X,Y,OBJECTID,INCKEY,COLDETKEY,REPORTNO,STATUS,ADDRTYPE,INTKEY,LOCATION,EXCEPTRSNCODE,EXCEPTRSNDESC,SEVERITYCODE,SEVERITYDESC,COLLISIONTYPE,PERSONCOUNT,PEDCOUNT,PEDCYLCOUNT,VEHCOUNT,INJURIES,SERIOUSINJURIES,FATALITIES,INCDATE,INCDTTM,JUNCTIONTYPE,SDOT_COLCODE,SDOT_COLDESC,INATTENTIONIND,UNDERINFL,WEATHER,ROADCOND,LIGHTCOND,PEDROWNOTGRNT,SDOTCOLNUM,SPEEDING,ST_COLCODE,ST_COLDESC,SEGLANEKEY,CROSSWALKKEY,HITPARKEDCAR
0,-122.320757054,47.609407946,1,328476,329976,EA08706,Matched,Block,,BROADWAY BETWEEN E COLUMBIA ST AND BOYLSTON AVE,,,1,Property Damage Only Collision,Sideswipe,2,0,0,2,0,0,0,2020/01/22 00:00:00+00,1/22/2020 3:21:00 PM,Mid-Block (not related to intersection),11.0,"MOTOR VEHICLE STRUCK MOTOR VEHICLE, FRONT END AT ANGLE",,N,Raining,Wet,Dark - Street Lights On,,,,11.0,From same direction - both going straight - both moving - sideswipe,0,0,N
1,-122.319560827,47.662220664,2,328142,329642,EA06882,Matched,Block,,8TH AVE NE BETWEEN NE 45TH E ST AND NE 47TH ST,,,1,Property Damage Only Collision,Parked Car,2,0,0,2,0,0,0,2020/01/07 00:00:00+00,1/7/2020 8:00:00 AM,Mid-Block (not related to intersection),15.0,"MOTOR VEHICLE STRUCK MOTOR VEHICLE, RIGHT SIDE SIDESWIPE",,N,Clear,Dry,Daylight,,,,32.0,One parked--one moving,0,0,Y
2,-122.327524508,47.604393273,3,20700,20700,1181833,Unmatched,Block,,JAMES ST BETWEEN 6TH AVE AND 7TH AVE,,,0,Unknown,,0,0,0,0,0,0,0,2004/01/30 00:00:00+00,1/30/2004,Mid-Block (but intersection related),11.0,"MOTOR VEHICLE STRUCK MOTOR VEHICLE, FRONT END AT ANGLE",,,,,,,4030032.0,,,,0,0,N
3,-122.327524934,47.708621579,4,332126,333626,M16001640,Unmatched,Block,,NE NORTHGATE WAY BETWEEN 1ST AVE NE AND NE NORTHGATE DR,,,0,Unknown,,0,0,0,0,0,0,0,2016/01/23 00:00:00+00,1/23/2016,Mid-Block (not related to intersection),11.0,"MOTOR VEHICLE STRUCK MOTOR VEHICLE, FRONT END AT ANGLE",,,,,,,,,,,0,0,N
4,-122.292120049,47.55900908,5,328238,329738,3857118,Unmatched,Block,,M L KING JR ER WAY S BETWEEN S ANGELINE ST AND S EDMUNDS ST,,,0,Unknown,,0,0,0,0,0,0,0,2020/01/26 00:00:00+00,1/26/2020,Mid-Block (not related to intersection),28.0,MOTOR VEHICLE RAN OFF ROAD - HIT FIXED OBJECT,,,,,,,,,,,0,0,N


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 221389 entries, 0 to 221388
Data columns (total 40 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   X                213918 non-null  float64
 1   Y                213918 non-null  float64
 2   OBJECTID         221389 non-null  int64  
 3   INCKEY           221389 non-null  int64  
 4   COLDETKEY        221389 non-null  int64  
 5   REPORTNO         221389 non-null  object 
 6   STATUS           221389 non-null  object 
 7   ADDRTYPE         217677 non-null  object 
 8   INTKEY           71884 non-null   float64
 9   LOCATION         216801 non-null  object 
 10  EXCEPTRSNCODE    100986 non-null  object 
 11  EXCEPTRSNDESC    11779 non-null   object 
 12  SEVERITYCODE     221388 non-null  object 
 13  SEVERITYDESC     221389 non-null  object 
 14  COLLISIONTYPE    195159 non-null  object 
 15  PERSONCOUNT      221389 non-null  int64  
 16  PEDCOUNT         221389 non-null  int6

<h2 id="data_wrangling">Data Wrangling</h2>

Steps for working with missing data:
<ol>
    <li>Identify missing data.</li>
    <li>Deal with missing data.</li>
    <li>Correct data format.</li>
</ol>

<h3 id="identifying_missing_data">Identifying Missing Data</h3>

The metadata document that accompanied the data set indicates that certain columns have "sentinel" values
that indicate an unknown or missing value. Each of these missing values will first be converted into NaN.
Subsequently, the NaN values will be dropped from the DataFrame.

In [11]:
# Sentinel values that stand for "unknown", keyed by the column in which each
# one appears. They are listed in the metadata document that accompanies the dataset.
unknown_sentinel_by_column = {
    "EXCEPTRSNCODE": " ",
    "EXCEPTRSNDESC": "Not Enough Information, or Insufficient Location Information",
    "SEVERITYCODE": "0",
    "SEVERITYDESC": "Unknown",
    "JUNCTIONTYPE": "Unknown",
    "WEATHER": "Unknown",
    "ROADCOND": "Unknown",
    "LIGHTCOND": "Unknown",
    "SDOT_COLCODE": float(0),
    "SDOT_COLDESC": "NOT ENOUGH INFORMATION / NOT APPLICABLE",
    "ST_COLCODE": " ",
    "ST_COLDESC": "Not stated",
}

# Replace every sentinel with NaN and store the result in a new DataFrame;
# df itself is left unmodified.
# Only to_replace and value are passed: the `limit` and `method` arguments of
# DataFrame.replace are deprecated, and they had no effect here because an
# explicit replacement value is supplied.
df_unknowns_converted_to_nan = (
    df.replace(to_replace=unknown_sentinel_by_column, value=np.nan)
    # ST_COLCODE has a second sentinel ("0"), handled by a second pass.
    .replace(to_replace={"ST_COLCODE": "0"}, value=np.nan)
)

<h3 id="deal_with_missing_data">Deal with Missing Data</h3>

<ol>
    <li>Drop the Data
        <ol>
            <li>Drop entire row.</li>
            <li>Drop entire column.</li>
        </ol>
    </li>
    <li>Replace the Data
        <ol>
            <li>Replace data by mean.</li>
            <li>Replace data by frequency.</li>
            <li>Replace data based on other functions.</li>
        </ol>
    </li>
        
</ol>

Whole columns should be dropped only if most entries in the column are empty.

In [12]:
# Collect the label of every column of the collisions DataFrame
# (after sentinel conversion) that contains at least one NaN.
list_of_columns_with_missing_data = [
    label
    for label in df_unknowns_converted_to_nan.columns
    if df_unknowns_converted_to_nan[label].hasnans
]

In [13]:
print(list(df.columns))

['X', 'Y', 'OBJECTID', 'INCKEY', 'COLDETKEY', 'REPORTNO', 'STATUS', 'ADDRTYPE', 'INTKEY', 'LOCATION', 'EXCEPTRSNCODE', 'EXCEPTRSNDESC', 'SEVERITYCODE', 'SEVERITYDESC', 'COLLISIONTYPE', 'PERSONCOUNT', 'PEDCOUNT', 'PEDCYLCOUNT', 'VEHCOUNT', 'INJURIES', 'SERIOUSINJURIES', 'FATALITIES', 'INCDATE', 'INCDTTM', 'JUNCTIONTYPE', 'SDOT_COLCODE', 'SDOT_COLDESC', 'INATTENTIONIND', 'UNDERINFL', 'WEATHER', 'ROADCOND', 'LIGHTCOND', 'PEDROWNOTGRNT', 'SDOTCOLNUM', 'SPEEDING', 'ST_COLCODE', 'ST_COLDESC', 'SEGLANEKEY', 'CROSSWALKKEY', 'HITPARKEDCAR']


In [14]:
# A column is removed from the collisions DataFrame when at least one of these holds:
# 1) more than 15% of the column's data is NaN;
# 2) the column only contains unique identification keys, or information not useful for model building;
# 3) the column's data is categorical but does not fit into a small (< 15) number of categories;
# 4) information in the column is redundant because it is already represented by another column;
# 5) it is not clear how to interpret the column's data.
list_of_columns_to_drop = [
    "ADDRTYPE", "STATUS", "OBJECTID", "INCKEY", "COLDETKEY", "REPORTNO",
    "INTKEY", "LOCATION", "EXCEPTRSNCODE", "EXCEPTRSNDESC", "SEVERITYDESC",
    "INCDATE", "INCDTTM", "JUNCTIONTYPE", "SDOT_COLCODE", "SDOT_COLDESC",
    "INATTENTIONIND", "UNDERINFL", "PEDROWNOTGRNT", "SDOTCOLNUM", "SPEEDING",
    "ST_COLCODE", "ST_COLDESC", "SEGLANEKEY", "CROSSWALKKEY", "HITPARKEDCAR",
]

In [15]:
# Remove the selected columns from the sentinel-converted DataFrame.
# drop() returns a new DataFrame, so the input DataFrame is left intact.
df_drop_columns = df_unknowns_converted_to_nan.drop(
    columns=list_of_columns_to_drop,
)

In [16]:
# Drop any row that contains at least one NaN and store the result in a new DataFrame.
# Only `how` is passed: recent pandas versions raise TypeError when `how` and
# `thresh` are both supplied, even if thresh is None.
df_drop_columns_and_rows = df_drop_columns.dropna(axis="index", how="any")

In [17]:
print_elapsed_time()

Elapsed time is 17.929999999701977 seconds.


<h3 id="correct_data_format">Correct Data Format</h3>

Ensure that each data type is appropriate for the corresponding feature.
Cast columns of type "object" as type "category", but leave all other column types unaltered.

In [18]:
# Build a new DataFrame in which every column of dtype object is cast to
# "category"; all other columns are carried over with their dtype unchanged.
object_dtype = np.dtype('object')
df_converted = df_drop_columns_and_rows.astype(
    {
        label: 'category'
        for label in df_drop_columns_and_rows.columns
        if df_drop_columns_and_rows[label].dtype == object_dtype
    }
)

In [19]:
print_elapsed_time()

Elapsed time is 18.41000000014901 seconds.


In [20]:
# Cast columns "INCDTTM" to type datetime.
    #if column in ["INCDTTM"]:
    #    df_converted[column] = pd.to_datetime(df_drop_columns_and_rows[column], infer_datetime_format=True)
    # Cast columns of type object to type category

In [21]:
# Keep only the columns whose dtype is "category".
df_categorical = df_converted.select_dtypes(include=["category"])

#### Features before One Hot Encoding

In [22]:
list(df_categorical.columns)

['SEVERITYCODE', 'COLLISIONTYPE', 'WEATHER', 'ROADCOND', 'LIGHTCOND']

In [23]:
df_categorical.head(10)

Unnamed: 0,SEVERITYCODE,COLLISIONTYPE,WEATHER,ROADCOND,LIGHTCOND
0,1,Sideswipe,Raining,Wet,Dark - Street Lights On
1,1,Parked Car,Clear,Dry,Daylight
5,1,Rear Ended,Clear,Dry,Daylight
6,1,Other,Clear,Wet,Daylight
8,1,Sideswipe,Overcast,Dry,Daylight
9,1,Sideswipe,Clear,Dry,Daylight
10,1,Rear Ended,Overcast,Dry,Daylight
11,1,Angles,Overcast,Dry,Daylight
12,1,Parked Car,Clear,Wet,Dark - Street Lights On
13,2,Parked Car,Overcast,Dry,Dark - Street Lights On


In [24]:
# Select the categorical columns used as model features.
# Alternative subsets kept for reference:
#   ["WEATHER", "ROADCOND", "LIGHTCOND"]
#   ["COLLISIONTYPE", "WEATHER"]
feature_column_labels = ["COLLISIONTYPE", "WEATHER", "ROADCOND", "LIGHTCOND"]
features = df_categorical[feature_column_labels]

In [25]:
# Column labels of the selected features, as a plain Python list.
list_of_features = features.columns.tolist()

In [26]:
print("SEVERITYCODE relative frequencies:")
print(df_categorical["SEVERITYCODE"].value_counts(normalize=True, dropna=False))

SEVERITYCODE relative frequencies:
1    0.657943120
2    0.323036911
2b   0.017111571
3    0.001908397
Name: SEVERITYCODE, Length: 4, dtype: float64


In [27]:
for feature in list_of_features:
    print(df_categorical.groupby(feature)["SEVERITYCODE"].value_counts(normalize=True, dropna=False))
    print()

COLLISIONTYPE  SEVERITYCODE
Angles         1              0.594234777
               2              0.392278305
               2b             0.012763371
               3              0.000723547
Cycles         2              0.815928270
               1              0.108649789
               2b             0.071026723
               3              0.004395218
Head On        1              0.530622010
               2              0.412918660
               2b             0.047368421
               3              0.009090909
Left Turn      1              0.587973761
               2              0.391399417
               2b             0.019460641
               3              0.001166181
Other          1              0.714686327
               2              0.259758469
               2b             0.021665385
               3              0.003889819
Parked Car     1              0.918220226
               2              0.077850326
               2b             0.003722634
      

In [28]:
for feature in list_of_features:
    print(df_categorical.groupby("SEVERITYCODE")[feature].value_counts(normalize=True, dropna=False))
    #print(df_categorical.groupby("SEVERITYCODE")[feature].value_counts(normalize=False, dropna=False))
    print()

SEVERITYCODE  COLLISIONTYPE
1             Parked Car      0.274835960
              Angles          0.181567358
              Rear Ended      0.159813233
              Other           0.139730461
              Sideswipe       0.132169576
              Left Turn       0.071337613
              Right Turn      0.019622928
              Head On         0.009807043
              Pedestrian      0.005650767
              Cycles          0.005465061
2             Rear Ended      0.252066786
              Angles          0.244123845
              Other           0.103438339
              Pedestrian      0.103078115
              Left Turn       0.096720160
              Cycles          0.083589993
              Parked Car      0.047459520
              Sideswipe       0.043190865
              Head On         0.015543668
              Right Turn      0.010788711
2b            Pedestrian      0.278136688
              Other           0.162869772
              Angles          0.149948997
      

In [29]:
features.head(10)

Unnamed: 0,COLLISIONTYPE,WEATHER,ROADCOND,LIGHTCOND
0,Sideswipe,Raining,Wet,Dark - Street Lights On
1,Parked Car,Clear,Dry,Daylight
5,Rear Ended,Clear,Dry,Daylight
6,Other,Clear,Wet,Daylight
8,Sideswipe,Overcast,Dry,Daylight
9,Sideswipe,Clear,Dry,Daylight
10,Rear Ended,Overcast,Dry,Daylight
11,Angles,Overcast,Dry,Daylight
12,Parked Car,Clear,Wet,Dark - Street Lights On
13,Parked Car,Overcast,Dry,Dark - Street Lights On


In [30]:
print_elapsed_time()

Elapsed time is 20.519999999552965 seconds.


### Feature selection

Let's define the feature set, represented by the DataFrame `X` of categorical features:

In [31]:
# X is the feature set passed to train_test_split further down.
# It is the same object as `features`: four categorical columns, not yet encoded.
X = features

In [32]:
X.shape

(171872, 4)

In [33]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 171872 entries, 0 to 221388
Data columns (total 4 columns):
 #   Column         Non-Null Count   Dtype   
---  ------         --------------   -----   
 0   COLLISIONTYPE  171872 non-null  category
 1   WEATHER        171872 non-null  category
 2   ROADCOND       171872 non-null  category
 3   LIGHTCOND      171872 non-null  category
dtypes: category(4)
memory usage: 2.0 MB


We also define the labels for the target variable, SEVERITYCODE:

In [34]:
# Target labels as a NumPy array. X and y both come from df_categorical,
# so entry i of y is the label for row i of X.
y = df_categorical["SEVERITYCODE"].to_numpy()

In [35]:
y.shape

(171872,)

In [36]:
print_elapsed_time()

Elapsed time is 21.059999998658895 seconds.


## Split the Data into Training and Testing Sets

In [37]:
# Hold out 20% of the rows as the test set.
# A fixed random_state makes the shuffle, and therefore the split, repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=4,
)

In [38]:
print_elapsed_time()

Elapsed time is 21.309999998658895 seconds.


## Encode Categorical Features as a One-Hot Numeric Array

#### Use the one-hot encoding technique to convert categorical variables to binary variables, stored in a separate sparse matrix

In [39]:
# Create a OneHotEncoder and fit it to the features of the training data only.
# With its default settings the encoder's transform() returns a SciPy sparse matrix.
# handle_unknown="ignore" makes transform() encode a category that was never
# seen during fitting as an all-zero group instead of raising an error; with a
# random split, a rare category can end up only in the test set.
start_time = time_now()
print("Fitting OneHotEncoder to training data...")
encoder = OneHotEncoder(handle_unknown="ignore")
encoder.fit(X_train)
print("Completed in", elapsed_time(start_time), "seconds.")
# Display the categories of the encoder.
print(encoder.categories_)

Fitting OneHotEncoder to training data...
Completed in 0.14000000059604645 seconds.
[array(['Angles', 'Cycles', 'Head On', 'Left Turn', 'Other', 'Parked Car',
       'Pedestrian', 'Rear Ended', 'Right Turn', 'Sideswipe'],
      dtype=object), array(['Blowing Sand/Dirt', 'Clear', 'Fog/Smog/Smoke', 'Other',
       'Overcast', 'Partly Cloudy', 'Raining', 'Severe Crosswind',
       'Sleet/Hail/Freezing Rain', 'Snowing'], dtype=object), array(['Dry', 'Ice', 'Oil', 'Other', 'Sand/Mud/Dirt', 'Snow/Slush',
       'Standing Water', 'Wet'], dtype=object), array(['Dark - No Street Lights', 'Dark - Street Lights Off',
       'Dark - Street Lights On', 'Dark - Unknown Lighting', 'Dawn',
       'Daylight', 'Dusk', 'Other'], dtype=object)]


In [40]:
# Transform the training data features using the OneHotEncoder fitted above.
# start_time is taken immediately before the work being timed, so the
# "Completed in" line reports the duration of this cell only.
start_time = time_now()
print()
X_train_one_hot_encoded = encoder.transform(X_train)
print("Completed in", elapsed_time(start_time), "seconds.")


Completed in 0.5 seconds.


In [41]:
type(X_train_one_hot_encoded)

scipy.sparse.csr.csr_matrix

In [42]:
X_train.shape

(137497, 4)

In [43]:
X_train_one_hot_encoded.shape

(137497, 36)

In [44]:
# Transform the test data features using the same instance of the OneHotEncoder
# that was applied to the training data features.
# The encoder is not re-fitted here, so the test data is encoded with the
# categories found in the training data.
start_time = time_now()
print()
X_test_one_hot_encoded = encoder.transform(X_test)
print("Completed in", elapsed_time(start_time), "seconds.")


Completed in 0.12999999895691872 seconds.


In [45]:
type(X_test_one_hot_encoded)

scipy.sparse.csr.csr_matrix

In [46]:
X_test.shape

(34375, 4)

In [47]:
X_test_one_hot_encoded.shape

(34375, 36)

In [48]:
print_elapsed_time()

Elapsed time is 22.87000000104308 seconds.


## Transform the Data 

We normalize the data by transforming it so that it is compatible
with the machine learning estimators we use in this notebook.
We use special care with sparse matrix data so as to not destroy the
structure.

In [49]:
# Construct a StandardScaler applicable to sparse CSR or CSC matrix data.
# Pass with_mean=False to the constructor to avoid breaking the sparsity structure
# of the data: the scaler then does not center the features before scaling them.
# To avoid unnecessary memory copies, use CSR or CSC representation upstream.
scaler = StandardScaler(with_mean=False)

In [50]:
# Fit the scaler to the sparse one-hot encoded training data and transform
# that data in the same call, then store the transformed data.
# start_time is passed to print_elapsed_time so that the duration of this
# cell, not the time since the notebook started, is printed.
start_time = time_now()
X_train_transformed = scaler.fit_transform(X_train_one_hot_encoded)
print_elapsed_time(start_time)

Elapsed time is 0.1600000001490116 seconds.


In [51]:
X_train_transformed.shape

(137497, 36)

In [52]:
# Transform the sparse one-hot encoded test data
# using the same StandardScaler instance that was used to
# transform the sparse, one-hot encoded training data.
# Only transform() is called: the scaler is not re-fitted on the test data.
start_time = time_now()
X_test_transformed = scaler.transform(X_test_one_hot_encoded)
print_elapsed_time(start_time)

AttributeError: 'StandardScaler' object has no attribute 'ransform'

In [None]:
X_test_transformed.shape

In [None]:
print_elapsed_time()

# Classification 

The data and target labels were split into a training set and a test set above, before encoding and scaling.
We use the training set to build an accurate model.
Afterwards, we use the test set to report the accuracy of the model.

We apply the following algorithms to produce various kinds of models.
- K Nearest Neighbor(KNN)
- Decision Tree
- Support Vector Machine
- Logistic Regression

## Build a K-Nearest Neighbors (KNN) Model
For each integer $1 \le k \lt 50$, we can build a KNN classifier with $k$ neighbors and compute the Jaccard score
for the classifier. The best value of $k$ corresponds to the classifier with the maximum Jaccard score.
The upper bound of $50$ was chosen as a matter of convenience, since each KNN classifier requires significant
system resources to construct. The cell below builds only the classifier for $k = 10$.

In [None]:
# Fit a K-Nearest Neighbors classifier on the scaled training data.
# The number of neighbors is fixed at 10 in this cell.
print("Building KNeighborsClassifier with k = 10 neighbors...")
start_time = time_now()
knn_clf = KNeighborsClassifier(n_neighbors=10)
knn_clf.fit(X_train_transformed, y_train)
print("Completed in", elapsed_time(start_time), "seconds.")

In [None]:
print_elapsed_time()

## Evaluate the K Nearest Neighbor Model

# Build a Decision Tree Model

In [None]:
# Build a decision tree model from the training data previously generated.
# random_state is fixed so that the fitted tree is the same on every run;
# without it, the order in which features are considered at each split is
# random, so the tree can differ between runs on identical data.
# The value 4 matches the random_state used for the train/test split.
start_time = time_now()
decision_tree = DecisionTreeClassifier(criterion="entropy", random_state=4)
decision_tree.fit(X_train_transformed, y_train)
print("Built Decision Tree Model in", elapsed_time(start_time), "seconds.")
print()

In [None]:
print_elapsed_time()

# Build a Support Vector Machine Model

In [None]:
# Fit a support vector classifier (RBF kernel, gamma='auto') on the scaled
# training data. fit() returns the estimator itself, so clf is the fitted model.
start_time = time_now()
clf = svm.SVC(gamma='auto', kernel='rbf').fit(X_train_transformed, y_train)
print("Built Support Vector Machine Model in", elapsed_time(start_time), "seconds.", end="\n\n")

In [None]:
print_elapsed_time()

# Build a Logistic Regression Model

In [None]:
# Fit a logistic regression classifier (liblinear solver, C=0.01) on the
# scaled training data.
start_time = time_now()
lr = LogisticRegression(solver='liblinear', C=0.01)
lr.fit(X_train_transformed, y_train)
print("Built Logistic Regression Model in", elapsed_time(start_time), "seconds.", end="\n\n")

In [None]:
print_elapsed_time()

# Evaluate the Various Models

In [None]:
pd.Series(y_test).value_counts(normalize=True, dropna=False)

In [None]:
start_time = time_now()
# Apply KNN to the test set, generate predictions for KNN.
# The predictions are assigned to y_knn_predictions, which later cells read.
# (The statement was previously written as an attribute access on the
# not-yet-defined name y_knn_predictions, which raises NameError.)
print("Running command: y_knn_predictions=knn_clf.predict(X_test)")
y_knn_predictions = knn_clf.predict(X_test_transformed)
print("Completed in", elapsed_time(start_time), "seconds.")
print()

In [None]:
y_knn_predictions.shape

In [None]:
pd.Series(y_knn_predictions).value_counts(normalize=True, dropna=False)

In [None]:
# Predict a severity label for every row of the scaled test set with the
# fitted decision tree, and report how long the prediction took.
print("Running command: y_tree_predictions = decision_tree.predict(X_test)")
start_time = time_now()
y_tree_predictions = decision_tree.predict(X_test_transformed)
print("Completed in", elapsed_time(start_time), "seconds", end="\n\n")

In [None]:
pd.Series(y_tree_predictions).value_counts(normalize=True, dropna=False)

In [None]:
# Predict a severity label for every row of the scaled test set with the
# fitted support vector classifier, and report how long the prediction took.
print("Running command: y_svm_predictions = clf.predict(X_test)")
start_time = time_now()
y_svm_predictions = clf.predict(X_test_transformed)
print("Completed in", elapsed_time(start_time), "seconds.", end="\n\n")

In [None]:
pd.Series(y_svm_predictions).value_counts(normalize=True, dropna=False)

In [None]:
# Apply Logistic Regression to the test set, generate predictions and probabilities for Logistic Regression.
# Each of the two steps is timed separately.
print("Running command: y_lr_predictions = lr.predict(X_test)")
start_time = time_now()
y_lr_predictions = lr.predict(X_test_transformed)
print("Completed in", elapsed_time(start_time), "seconds.")
print()

# The message below previously ended in "\)", an invalid escape sequence that
# printed a stray backslash and triggers a warning on recent Python versions.
print("Running command: y_lr_probabilities = lr.predict_proba(X_test)")
start_time = time_now()
y_lr_probabilities = lr.predict_proba(X_test_transformed)
print("Completed in", elapsed_time(start_time), "seconds.")
print()

In [None]:
pd.Series(y_lr_predictions).value_counts(normalize=True, dropna=False)

In [None]:
# Result tables for the model evaluation, one row per model:
#   row 0 => KNN
#   row 1 => Decision Tree
#   row 2 => SVM
#   row 3 => Logistic Regression
# Each table is 4 x 4 and starts out filled with zeros, except that the
# log-loss rows of the first three models are NaN.
jaccard = np.zeros((4, 4))
f1 = np.zeros((4, 4))
logloss = np.full((4, 4), np.nan)
logloss[3] = 0.0

In [None]:
# Score the KNN predictions per severity class.
# The same explicit label list is passed to both metrics so that each returns
# exactly one value per class, in this order. Without `labels`, f1_score
# returns one value per class actually present in y_test or the predictions,
# which would not fit the 4-element row if a class were absent.
severity_labels = ["1", "2", "2b", "3"]
# For KNN model, compute Jaccard score.
jaccard[0] = jaccard_score(y_test, y_knn_predictions, labels=severity_labels, average=None)
print("KNN Jaccard score is", jaccard[0,:])
# For KNN model, compute F1-score.
f1[0] = f1_score(y_test, y_knn_predictions, labels=severity_labels, average=None)
print("KNN F1-score is", f1[0])
print()

In [None]:
# Score the Decision Tree predictions per severity class.
# The same explicit label list is passed to both metrics so that each returns
# exactly one value per class, in this order. Without `labels`, f1_score
# returns one value per class actually present in y_test or the predictions,
# which would not fit the 4-element row if a class were absent.
severity_labels = ["1", "2", "2b", "3"]
# For Decision Tree model, compute Jaccard score.
jaccard[1] = jaccard_score(y_test, y_tree_predictions, labels=severity_labels, average=None)
print("Decision Tree Jaccard score is: ", jaccard[1])
# For Decision Tree model, compute F1-score.
f1[1] = f1_score(y_test, y_tree_predictions, labels=severity_labels, average=None)
print("Decision Tree F1-score is: ", f1[1])
print()

In [None]:
# Score the SVM predictions per severity class.
# The same explicit label list is passed to both metrics so that each returns
# exactly one value per class, in this order. Without `labels`, f1_score
# returns one value per class actually present in y_test or the predictions,
# which would not fit the 4-element row if a class were absent.
severity_labels = ["1", "2", "2b", "3"]
# For SVM algorithm, compute Jaccard score.
jaccard[2] = jaccard_score(y_test, y_svm_predictions, labels=severity_labels, average=None)
print("SVM Jaccard score is", jaccard[2])
# For SVM algorithm, compute F1-score.
f1[2] = f1_score(y_test, y_svm_predictions, labels=severity_labels, average=None)
print("SVM F1-score is", f1[2])
print()

In [None]:
# Score the logistic regression predictions per severity class.
# The same explicit label list is passed to both per-class metrics so that
# each returns exactly one value per class, in this order. Without `labels`,
# f1_score returns one value per class actually present in y_test or the
# predictions, which would not fit the 4-element row if a class were absent.
severity_labels = ["1", "2", "2b", "3"]
# For logistic regression algorithm, compute Jaccard score.
jaccard[3] = jaccard_score(y_test, y_lr_predictions, labels=severity_labels, average=None)
print("Logistic Regression Jaccard similarity score is", jaccard[3])
# For logistic regression algorithm, compute F1-score.
f1[3] = f1_score(y_test, y_lr_predictions, labels=severity_labels, average=None)
print("Logistic Regression F1-score is", f1[3])
# For logistic regression algorithm, compute log loss.
# labels=lr.classes_ tells log_loss which class each probability column
# belongs to, rather than leaving it to be inferred from y_test.
# log_loss returns a single number, so the assignment fills all four
# entries of row 3 with that same value.
logloss[3] = log_loss(y_test, y_lr_probabilities, labels=lr.classes_)
print("Logistic Regression log loss is", logloss[3])

In [None]:
print("Notebook total elapsed time:", elapsed_time(), "seconds.")