# Dataset Classifications and EDA

# Project EDA
 - We first explored predicting either the Severity or the Collision Type of vehicle collisions
     - The collision type is more evenly distributed
     - The severity is predicted more accurately (possibly, to be expected due to the distribution)

In [20]:
# some useful mysklearn package import statements and reloads
import importlib

import mysklearn.myutils
importlib.reload(mysklearn.myutils)
import mysklearn.myutils as myutils

import mysklearn.mypytable
importlib.reload(mysklearn.mypytable)
from mysklearn.mypytable import MyPyTable 

import mysklearn.juputils as juputils
importlib.reload(juputils)

import mysklearn.myclassifiers
importlib.reload(mysklearn.myclassifiers)
from mysklearn.myclassifiers import MyKNeighborsClassifier, MySimpleLinearRegressor, MyNaiveBayesClassifier, MyDecisionTreeClassifier, MyRandomForestClassifier

import mysklearn.myevaluation
importlib.reload(mysklearn.myevaluation)
import mysklearn.myevaluation as myevaluation

## Distribution of Attribute Labels
The first distribution is from Severity Description, the second is from Collision Type (below).

In [21]:
import os
importlib.reload(juputils)

fname = os.path.join("input_data", "collisions.csv")
collisions_data = MyPyTable().load_from_file(fname)

class_labels = collisions_data.get_column('SEVERITYDESC')

severity_unique_labels = juputils.get_unique(class_labels)
print(severity_unique_labels)

print(juputils.get_counts(class_labels, severity_unique_labels))

class_labels = collisions_data.get_column('COLLISIONTYPE')

collision_unique_labels = juputils.get_unique(class_labels)
print(collision_unique_labels)

print(juputils.get_counts(class_labels, collision_unique_labels))

['Injury Collision', 'Property Damage Only Collision', 'Serious Injury Collision', 'Unknown', 'Fatality Collision']
[1399, 3206, 74, 810, 11]
['Angles', 'Parked Car', 'Left Turn', 'Other', 'Sideswipe', 'Rear Ended', '', 'Pedestrian', 'Cycles', 'Right Turn', 'Head On']
[828, 1113, 338, 614, 430, 870, 909, 162, 117, 63, 56]


## Predicting Collision Severity Description: Decision Tree Classifier

In [22]:
import os
importlib.reload(juputils)

print('TREE FOR COLLISION SEVERITY')

fname = os.path.join("input_data", "collisions.csv")
collisions_data = MyPyTable().load_from_file(fname)

weather = collisions_data.get_column('WEATHER')
road_condition = collisions_data.get_column('ROADCOND')
light_condition = collisions_data.get_column('LIGHTCOND')
junction_type = collisions_data.get_column('JUNCTIONTYPE')

X_train = [[weather[i],road_condition[i],light_condition[i],junction_type[i]] for i in range(len(weather))]
y_train = collisions_data.get_column('SEVERITYDESC')

for i,val in enumerate(y_train):
    if val == 'Unknown':
        del y_train[i]
        del X_train[i]

strattrain_folds, strattest_folds = myevaluation.stratified_kfold_cross_validation(X_train, y_train, 10)
strat_xtrain, strat_ytrain, strat_xtest, strat_ytest = myutils.get_from_folds(X_train, y_train, strattrain_folds, strattest_folds)

myt = MyDecisionTreeClassifier()
myt.fit(strat_xtrain, strat_ytrain)

predicted = myt.predict(strat_xtest)
accuracy = juputils.get_accuracy(strat_ytest, predicted)
print('Tree: accuracy =', accuracy, 'error rate = ', (1-accuracy))

TREE FOR COLLISION SEVERITY
Tree: accuracy = 0.6523364485981309 error rate =  0.3476635514018691


In [23]:
print('==========================================================\n\
Confusion Matrix: Decision Tree and Severity Description\n\
==========================================================\n')
matrix = myevaluation.confusion_matrix(strat_ytest, predicted, severity_unique_labels)
table_header = ['Severity', 1,2, 3 ,4, 'Total', 'Recognition (%)']
juputils.add_conf_stats(matrix)
juputils.print_tabulate(matrix, table_header)

print("\nMATRIX KEY")
for i in range(len(severity_unique_labels)):
    if severity_unique_labels[i] != "":
        print(i+1, ":", severity_unique_labels[i])
    else:
        print(i+1, ": Unknown")

Confusion Matrix: Decision Tree and Severity Description

  Severity     1    2    3    4    Total    Recognition (%)
         1  3056    0    0    0     3057              99.97
         2    71    0    0    0       73               0
         3   125    0    0    0      128               0
         4    10    0    0    0       14               0

MATRIX KEY
1 : Injury Collision
2 : Property Damage Only Collision
3 : Serious Injury Collision
4 : Unknown
5 : Fatality Collision


## Predicting Collision Severity Description: Naive Bayes Classifier

In [24]:
import os
importlib.reload(juputils)

print('NAIVE BAYES FOR COLLISION SEVERITY')

fname = os.path.join("input_data", "collisions.csv")
collisions_data = MyPyTable().load_from_file(fname)

weather = collisions_data.get_column('WEATHER')
road_condition = collisions_data.get_column('ROADCOND')
light_condition = collisions_data.get_column('LIGHTCOND')
junction_type = collisions_data.get_column('JUNCTIONTYPE')

X_train = [[weather[i],road_condition[i],light_condition[i],junction_type[i]] for i in range(len(weather))]
y_train = collisions_data.get_column('SEVERITYDESC')

for i,val in enumerate(y_train):
    if val == 'Unknown':
        del y_train[i]
        del X_train[i]

strattrain_folds, strattest_folds = myevaluation.stratified_kfold_cross_validation(X_train, y_train, 10)
strat_xtrain, strat_ytrain, strat_xtest, strat_ytest = myutils.get_from_folds(X_train, y_train, strattrain_folds, strattest_folds)

myb = MyNaiveBayesClassifier()
myb.fit(strat_xtrain, strat_ytrain)

predicted = myb.predict(strat_xtest)
accuracy = juputils.get_accuracy(strat_ytest, predicted)
print('Naive Bayes: accuracy =', accuracy, 'error rate = ', (1-accuracy))

NAIVE BAYES FOR COLLISION SEVERITY
Naive Bayes: accuracy = 0.6610591900311527 error rate =  0.33894080996884735


In [25]:
print('===========================================\n\
Confusion Matrix: Naive Bayes and Severity Description\n\
===========================================\n')
matrix = myevaluation.confusion_matrix(strat_ytest, predicted, severity_unique_labels)
table_header = ['Severity', 1,2, 3 ,4, 'Total', 'Recognition (%)']
juputils.add_conf_stats(matrix)
juputils.print_tabulate(matrix, table_header)

print("\nMATRIX KEY")
for i in range(len(severity_unique_labels)):
    if severity_unique_labels[i] != "":
        print(i+1, ":", severity_unique_labels[i])
    else:
        print(i+1, ": Unknown")

Confusion Matrix: Naive Bayes and Severity Description

  Severity     1    2    3    4    Total    Recognition (%)
         1  2728    0   79    0     2808              97.15
         2    45    0    3    0       50               0
         3     0    0  125    0      128              97.66
         4    10    0    0    0       14               0

MATRIX KEY
1 : Injury Collision
2 : Property Damage Only Collision
3 : Serious Injury Collision
4 : Unknown
5 : Fatality Collision


## Predicting Collision Type: Decision Tree Classifier

In [26]:
import os
importlib.reload(juputils)

print('DECISION TREE FOR COLLISION TYPE')

fname = os.path.join("input_data", "collisions.csv")
collisions_data = MyPyTable().load_from_file(fname)

weather = collisions_data.get_column('WEATHER')
road_condition = collisions_data.get_column('ROADCOND')
light_condition = collisions_data.get_column('LIGHTCOND')
junction_type = collisions_data.get_column('JUNCTIONTYPE')
severity = collisions_data.get_column('SEVERITYDESC')

X_train = [[weather[i],road_condition[i],light_condition[i],junction_type[i],severity[i]] for i in range(len(weather))]
y_train = collisions_data.get_column('COLLISIONTYPE')

for i,val in enumerate(y_train):
    if val == 'Unknown':
        del y_train[i]
        del X_train[i]

strattrain_folds, strattest_folds = myevaluation.stratified_kfold_cross_validation(X_train, y_train, 10)
strat_xtrain, strat_ytrain, strat_xtest, strat_ytest = myutils.get_from_folds(X_train, y_train, strattrain_folds, strattest_folds)

myt = MyDecisionTreeClassifier()
myt.fit(strat_xtrain, strat_ytrain)

predicted = myt.predict(strat_xtest)
accuracy = juputils.get_accuracy(strat_ytest, predicted)
print('Tree: accuracy =', accuracy, 'error rate = ', (1-accuracy))

DECISION TREE FOR COLLISION TYPE
Tree: accuracy = 0.20672727272727273 error rate =  0.7932727272727272


In [27]:
print('===================================================\n\
Confusion Matrix: Decision Tree and Collision Type\n\
===================================================\n')
matrix = myevaluation.confusion_matrix(strat_ytest, predicted, collision_unique_labels)
table_header = ['Collision Type', 1,2, 3 ,4, 5, 6, 7, 8,9,10, 'Total', 'Recognition (%)']
juputils.add_conf_stats(matrix)
juputils.print_tabulate(matrix, table_header)

print("\nMATRIX KEY")
for i in range(len(collision_unique_labels)):
    if collision_unique_labels[i] != "":
        print(i+1, ":", collision_unique_labels[i])
    else:
        print(i+1, ": Unknown")

Confusion Matrix: Decision Tree and Collision Type

  Collision Type    1    2    3    4    5    6    7    8    9    10    Total    Recognition (%)
               1  243    8  316    0   49    0    4  459    0     0     1080              22.5
               2    4   10   18    0   10    0   74   28    0     0      146               6.85
               3   16    4  231    0   25    0   40  240    0     0      559              41.32
               4   11    6  126    0   18    0   19  185    0     0      369               0
               5   17   10  387    0   53    0   10  335    0     0      817               6.49
               6  909    0    0    0    0    0    0    0    0     0      915               0
               7    8    2   15    0    8    0   32   30    0     0      102              31.37
               8    3    1    7    0    5    0    7   33    0     0       64              51.56
               9    0    1   14    0    2    0    8   12    0     0       46               

## Predicting Collision Type: Naive Bayes Classifier

In [28]:
import os
importlib.reload(juputils)

print('NAIVE BAYES FOR COLLISION TYPE')

fname = os.path.join("input_data", "collisions.csv")
collisions_data = MyPyTable().load_from_file(fname)

weather = collisions_data.get_column('WEATHER')
road_condition = collisions_data.get_column('ROADCOND')
light_condition = collisions_data.get_column('LIGHTCOND')
junction_type = collisions_data.get_column('JUNCTIONTYPE')
severity = collisions_data.get_column('SEVERITYDESC')

X_train = [[weather[i],road_condition[i],light_condition[i],junction_type[i],severity[i]] for i in range(len(weather))]
y_train = collisions_data.get_column('COLLISIONTYPE')

for i,val in enumerate(y_train):
    if val == 'Unknown':
        del y_train[i]
        del X_train[i]

strattrain_folds, strattest_folds = myevaluation.stratified_kfold_cross_validation(X_train, y_train, 10)
strat_xtrain, strat_ytrain, strat_xtest, strat_ytest = myutils.get_from_folds(X_train, y_train, strattrain_folds, strattest_folds)

myb = MyNaiveBayesClassifier()
myb.fit(strat_xtrain, strat_ytrain)

predicted = myb.predict(strat_xtest)
accuracy = juputils.get_accuracy(strat_ytest, predicted)
print('Naive Bayes: accuracy =', accuracy, 'error rate = ', (1-accuracy))

NAIVE BAYES FOR COLLISION TYPE
Naive Bayes: accuracy = 0.5887272727272728 error rate =  0.41127272727272723


In [29]:
print('===============================================\n\
Confusion Matrix: Naive Bayes and Collision Type\n\
===============================================\n')
matrix = myevaluation.confusion_matrix(strat_ytest, predicted, collision_unique_labels)
table_header = ['Collision Type', 1,2, 3 ,4, 5, 6, 7, 8,9,10, 'Total', 'Recognition (%)']
juputils.add_conf_stats(matrix)
juputils.print_tabulate(matrix, table_header)

print("\nMATRIX KEY")
for i in range(len(collision_unique_labels)):
    if collision_unique_labels[i] != "":
        print(i+1, ":", collision_unique_labels[i])
    else:
        print(i+1, ": Unknown")

Confusion Matrix: Naive Bayes and Collision Type

  Collision Type    1    2    3    4    5    6    7    8    9    10    Total    Recognition (%)
               1  839    0   83   39  129    1    1    2    0     0     1095              76.62
               2   13    2   15    0   24    0    4    6    0     0       66               3.03
               3  180    1  173   21  152    0    0    2    0     1      533              32.46
               4  164    0   26   34  125    0    0    0    0     0      353               9.63
               5  168    0   34   40  559    0    2    3    0     0      811              68.93
               6    0    0    0    0    0  909    0    0    0     0      915              99.34
               7    8    1   10    0   35    2    6    4    0     0       73               8.22
               8    8    0   10    0   24    0    2    9    0     0       61              14.75
               9   12    0    7    0   11    0    0    1    0     0       40          

## Predicting Collision Type: Random Forest Classifier

In [30]:
import os
importlib.reload(juputils)

print('FOREST CLASSIFIER FOR COLLISION TYPE')

fname = os.path.join("input_data", "collisions.csv")
collisions_data = MyPyTable().load_from_file(fname)

weather = collisions_data.get_column('WEATHER')
road_condition = collisions_data.get_column('ROADCOND')
light_condition = collisions_data.get_column('LIGHTCOND')
junction_type = collisions_data.get_column('JUNCTIONTYPE')
severity = collisions_data.get_column('SEVERITYDESC')

X_train = [[weather[i],road_condition[i],light_condition[i],junction_type[i],severity[i]] for i in range(len(weather))]
y_train = collisions_data.get_column('COLLISIONTYPE')

for i,val in enumerate(y_train):
    if val == 'Unknown':
        del y_train[i]
        del X_train[i]

strattrain_folds, strattest_folds = myevaluation.stratified_kfold_cross_validation(X_train, y_train, 10)
strat_xtrain, strat_ytrain, strat_xtest, strat_ytest = myutils.get_from_folds(X_train, y_train, strattrain_folds, strattest_folds)

myf = MyRandomForestClassifier(2, 5, 3)
myf.fit(strat_xtrain, strat_ytrain)

predicted = myf.predict(strat_xtest)
accuracy = juputils.get_accuracy(strat_ytest, predicted)
print('Forest Classifier: accuracy =', accuracy, 'error rate = ', (1-accuracy))

FOREST CLASSIFIER FOR COLLISION TYPE
Forest Classifier: accuracy = 0.13272727272727272 error rate =  0.8672727272727273


In [31]:
print('==================================================\n\
Confusion Matrix: Random Forest and Collision Type\n\
==================================================\n')
matrix = myevaluation.confusion_matrix(strat_ytest, predicted, collision_unique_labels)
table_header = ['Collision Type', 1,2, 3 ,4, 5, 6, 7, 8,9,10, 'Total', 'Recognition (%)']
juputils.add_conf_stats(matrix)
juputils.print_tabulate(matrix, table_header)

print("\nMATRIX KEY")
for i in range(len(collision_unique_labels)):
    if collision_unique_labels[i] != "":
        print(i+1, ":", collision_unique_labels[i])
    else:
        print(i+1, ": Unknown")

Confusion Matrix: Random Forest and Collision Type

  Collision Type    1    2    3    4    5    6    7    8    9    10    Total    Recognition (%)
               1   17    8   38    0   32    0  290  705    0     0     1091               1.56
               2    3    9   12    0    6    0   81   33    0     0      146               6.16
               3    1    4   52    0   31    0  221  251    0     0      563               9.24
               4    0    6   56    0   13    0   90  201    0     0      370               0
               5    3   10  240    0   35    0  158  366    0     0      817               4.28
               6  909    0    0    0    0    0    0    0    0     0      915               0
               7    3    2    0    0    2    0   47   41    0     0      102              46.08
               8    0    1    3    0    3    0   11   38    0     0       64              59.38
               9    0    1    7    0    2    0   15   12    0     0       46              