In [1]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import export_text
from imblearn.over_sampling import RandomOverSampler
from sklearn.tree import _tree

In [2]:
#Importing the data that is already semi encoded from Excel 
#the encoding made YES 1, and eliminated NULLs in target
df = pd.read_csv('seriously_injured_rl_encoded.csv')
df.head()

Unnamed: 0,ACCNUM,ObjectId,INVTYPE,INVAGE,INJURY,VEHTYPE,MANOEUVER,PEDESTRIAN,CYCLIST,AUTOMOBILE,...,AG_DRIV,REDLIGHT,ALCOHOL,DISABILITY,HOUR,TIME,VISIBILITY,LIGHT,collision_lat,collision_long
0,56815,16515,Driver,45 to 49,,Delivery Van,Going Ahead,0,0,1,...,1,1,0,0,11,1100,Clear,Daylight,-79.5755,43.7018
1,56815,16516,Driver,50 to 54,Major,"Automobile, Station Wagon",Going Ahead,0,0,1,...,1,1,0,0,11,1100,Clear,Daylight,-79.5755,43.7018
2,56815,16517,Driver,55 to 59,,Pick Up Truck,Going Ahead,0,0,1,...,1,1,0,0,11,1100,Clear,Daylight,-79.5755,43.7018
3,56815,16518,Driver,35 to 39,,"Automobile, Station Wagon",Stopped,0,0,1,...,1,1,0,0,11,1100,Clear,Daylight,-79.5755,43.7018
4,93249,16533,Driver,35 to 39,,Pick Up Truck,Turning Left,0,0,1,...,0,0,0,0,14,1356,Clear,Daylight,-79.3497,43.6519


In [3]:
#Find the target counts -- checking for classification imbalance
print(df.groupby(['INJURY'])['INJURY'].count())

#With imbalance, oversample
ros = RandomOverSampler(random_state=1)
#Drop columns that may not matter in overall Decision Tree 
#-- should drop those that do not help in telling a story
#-- should also drop redundant columns
df2 = df.drop(['VEHTYPE', 'MANOEUVER','INVAGE', 'collision_lat', 'collision_long','TIME', "HOUR"],axis=1)
x_ros, y_ros = ros.fit_resample(df2.loc[:,df2.columns != 'INJURY'], df['INJURY'])

#For checking if over-sampling worked
df3 = pd.concat([x_ros,y_ros],axis=1)
print(df3.groupby(['INJURY'])['INJURY'].count())

INJURY
Fatal       821
Major      5668
Minimal    1042
Minor      1311
None       6406
Name: INJURY, dtype: int64
INJURY
Fatal      6406
Major      6406
Minimal    6406
Minor      6406
None       6406
Name: INJURY, dtype: int64


In [4]:

#Encoded categorical columns for Decision Tree usage.
label_encoded = pd.get_dummies(x_ros,drop_first=True)
print(label_encoded)

           ACCNUM  ObjectId  PEDESTRIAN  CYCLIST  AUTOMOBILE  MOTORCYCLE  \
0           56815     16515           0        0           1           0   
1           56815     16516           0        0           1           0   
2           56815     16517           0        0           1           0   
3           56815     16518           0        0           1           0   
4           93249     16533           0        0           1           0   
...           ...       ...         ...      ...         ...         ...   
32025  7001234920     13945           0        0           1           0   
32026     1252228      7166           0        0           1           0   
32027  3001029733     10361           0        0           1           0   
32028  5001109964     11746           0        0           1           0   
32029     1298747      8508           0        0           1           0   

       TRUCK  TRSN_CITY_VEH  EMERG_VEH  PASSENGER  ...  VISIBILITY_Snow  \
0          0

In [5]:
#Drop any columns that are still not needed in model
X = label_encoded.drop(['ACCNUM', 'ObjectId'],axis=1)
Y = y_ros

In [6]:
#Splitting into test and train dataset
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.30, random_state = 1)
print(X_train)
print(Y_train)

       PEDESTRIAN  CYCLIST  AUTOMOBILE  MOTORCYCLE  TRUCK  TRSN_CITY_VEH  \
22423           0        0           1           0      0              0   
12329           0        0           1           0      0              0   
27806           0        0           1           0      0              0   
6086            0        0           1           0      0              0   
26352           0        0           1           0      0              0   
...           ...      ...         ...         ...    ...            ...   
17289           1        0           1           0      0              0   
5192            0        0           1           0      0              0   
12172           1        0           1           0      0              0   
235             0        0           1           1      0              0   
29733           1        0           1           0      0              0   

       EMERG_VEH  PASSENGER  SPEEDING  AG_DRIV  ...  VISIBILITY_Snow  \
22423          

# Decision Tree Model

In [7]:
DT_model = DecisionTreeClassifier(criterion = "entropy")

In [8]:
DT_model.fit(X_train, Y_train)

DecisionTreeClassifier(criterion='entropy')

In [9]:
DT_prediction = DT_model.predict(X_test)

In [10]:
#Test accuracy of prediction
#For Fatal and Major it is important we have high recall (FN is worse)
#For Minimal to Minor, it is more important to check precision

print(accuracy_score(DT_prediction,Y_test))
print(confusion_matrix(DT_prediction,Y_test))
print(classification_report(DT_prediction,Y_test))

0.5659277760432927
[[1271  647   78   94   35]
 [ 283  740  119  166  119]
 [ 132  242  996  489  226]
 [ 208  278  534 1062  211]
 [   0   43  177   90 1369]]
              precision    recall  f1-score   support

       Fatal       0.67      0.60      0.63      2125
       Major       0.38      0.52      0.44      1427
     Minimal       0.52      0.48      0.50      2085
       Minor       0.56      0.46      0.51      2293
        None       0.70      0.82      0.75      1679

    accuracy                           0.57      9609
   macro avg       0.57      0.57      0.57      9609
weighted avg       0.57      0.57      0.57      9609



In [11]:
# Rules Interpretation of Decision Tree
text_representation = export_text(DT_model, feature_names = list(X_train.columns))
print(text_representation)

|--- INVTYPE_Pedestrian <= 0.50
|   |--- PEDESTRIAN <= 0.50
|   |   |--- PASSENGER <= 0.50
|   |   |   |--- INVTYPE_Driver <= 0.50
|   |   |   |   |--- INVTYPE_Truck Driver <= 0.50
|   |   |   |   |   |--- INVTYPE_Vehicle Owner <= 0.50
|   |   |   |   |   |   |--- INVTYPE_Other <= 0.50
|   |   |   |   |   |   |   |--- SPEEDING <= 0.50
|   |   |   |   |   |   |   |   |--- AUTOMOBILE <= 0.50
|   |   |   |   |   |   |   |   |   |--- LIGHT_Daylight <= 0.50
|   |   |   |   |   |   |   |   |   |   |--- LIGHT_Dusk, artificial <= 0.50
|   |   |   |   |   |   |   |   |   |   |   |--- truncated branch of depth 9
|   |   |   |   |   |   |   |   |   |   |--- LIGHT_Dusk, artificial >  0.50
|   |   |   |   |   |   |   |   |   |   |   |--- truncated branch of depth 3
|   |   |   |   |   |   |   |   |   |--- LIGHT_Daylight >  0.50
|   |   |   |   |   |   |   |   |   |   |--- INVTYPE_Driver - Not Hit <= 0.50
|   |   |   |   |   |   |   |   |   |   |   |--- truncated branch of depth 10
|   |   |   |   |

# Function to Print Rules in Readable Form

In [12]:
def get_rules(tree, feature_names, class_names):
    tree_ = tree.tree_
    feature_name = [
        feature_names[i] if i != _tree.TREE_UNDEFINED else "undefined!"
        for i in tree_.feature
    ]

    paths = []
    path = []
    
    def recurse(node, path, paths):
        
        if tree_.feature[node] != _tree.TREE_UNDEFINED:
            name = feature_name[node]
            threshold = tree_.threshold[node]
            p1, p2 = list(path), list(path)
            p1 += [f"({name} <= {np.round(threshold, 3)})"]
            recurse(tree_.children_left[node], p1, paths)
            p2 += [f"({name} > {np.round(threshold, 3)})"]
            recurse(tree_.children_right[node], p2, paths)
        else:
            path += [(tree_.value[node], tree_.n_node_samples[node])]
            paths += [path]
            
    recurse(0, path, paths)

    # sort by samples count
    samples_count = [p[-1][1] for p in paths]
    ii = list(np.argsort(samples_count))
    paths = [paths[i] for i in reversed(ii)]
    
    rules = []
    for path in paths:
        rule = "if "
        
        for p in path[:-1]:
            if rule != "if ":
                rule += " and "
            rule += str(p)
        rule += " then "
        if class_names is None:
            rule += "response: "+str(np.round(path[-1][0][0][0],3))
        else:
            classes = path[-1][0][0]
            l = np.argmax(classes)
            rule += f"class: {class_names[l]} (proba: {np.round(100.0*classes[l]/np.sum(classes),2)}%)"
        rule += f" | based on {path[-1][1]:,} samples"
        rules += [rule]
        
    return rules

In [13]:
#print rules
rules = get_rules(DT_model, list(X_train.columns), df['INJURY'])
for r in rules:
    print(r + '\n')

if (INVTYPE_Pedestrian > 0.5) and (TRUCK <= 0.5) and (AG_DRIV <= 0.5) and (MOTORCYCLE <= 0.5) and (LIGHT_Daylight > 0.5) and (PASSENGER <= 0.5) and (CYCLIST <= 0.5) and (VISIBILITY_Rain <= 0.5) and (VISIBILITY_Snow <= 0.5) and (AUTOMOBILE > 0.5) and (DISABILITY <= 0.5) and (ALCOHOL <= 0.5) and (VISIBILITY_Other <= 0.5) and (VISIBILITY_Strong wind <= 0.5) and (VISIBILITY_Freezing Rain <= 0.5) then class: None (proba: 48.91%) | based on 875 samples

if (INVTYPE_Pedestrian > 0.5) and (TRUCK <= 0.5) and (AG_DRIV > 0.5) and (SPEEDING <= 0.5) and (TRSN_CITY_VEH <= 0.5) and (REDLIGHT <= 0.5) and (VISIBILITY_Rain <= 0.5) and (LIGHT_Dusk, artificial <= 0.5) and (LIGHT_Daylight > 0.5) and (PASSENGER <= 0.5) and (MOTORCYCLE <= 0.5) and (VISIBILITY_Other <= 0.5) and (VISIBILITY_Snow <= 0.5) and (ALCOHOL <= 0.5) and (AUTOMOBILE > 0.5) and (VISIBILITY_Freezing Rain <= 0.5) then class: Major (proba: 45.65%) | based on 644 samples

if (INVTYPE_Pedestrian <= 0.5) and (PEDESTRIAN <= 0.5) and (PASSENGER 