In [2]:
# Common imports
import numpy as np
import pandas as pd

# Seed NumPy's legacy global RNG once, before any stochastic step runs.
# NOTE(review): scikit-learn calls left at random_state=None presumably draw
# from this global state, so results then depend on cell execution order —
# confirm, and prefer explicit random_state arguments where it matters.
np.random.seed(42)

In [3]:
# Load the FARS extract. The path is relative, so the CSV must be reachable
# from the kernel's working directory (the recorded run raised
# FileNotFoundError because it was not).
FARS = pd.read_csv(filepath_or_buffer="FARSDATA.csv")

# Preview the first five rows as a quick check on the load.
FARS.head(n=5)

FileNotFoundError: [Errno 2] No such file or directory: 'FARSDATA.csv'

In [3]:
# Split the data: 70% for training, 30% held out for testing.

from sklearn.model_selection import train_test_split

# The shuffle seed is pinned on the call itself so the partition does not
# depend on how many draws earlier cells (or re-runs of this cell) have already
# taken from the global NumPy RNG.
train, test = train_test_split(FARS, test_size=0.3, random_state=42)

In [4]:
# Remove the case-number column (ST_CASE) from both partitions.
# NOTE: this rebinds `train`/`test`, so re-running the cell without re-running
# the split raises KeyError because the column is already gone.
train = train.drop(columns=['ST_CASE'])
test = test.drop(columns=['ST_CASE'])

In [5]:
# Count missing values per column in the training partition.
train.isna().sum(axis=0)

STATENAME       0
MONTHNAME       0
HOUR            0
RUR_URBNAME     0
FUNC_SYSNAME    0
MOD_YEARNAME    0
ROLLOVERNAME    0
IMPACT1NAME     0
FIRE_EXPNAME    0
AGE             0
SEXNAME         0
INJ_SEVNAME     0
REST_USENAME    0
REST_MISNAME    0
AIR_BAGNAME     0
EJECTIONNAME    0
ALC_RESNAME     0
Year            0
Overlimit       0
dtype: int64

In [6]:
# Count missing values per column in the test partition.
test.isna().sum(axis=0)

STATENAME       0
MONTHNAME       0
HOUR            0
RUR_URBNAME     0
FUNC_SYSNAME    0
MOD_YEARNAME    0
ROLLOVERNAME    0
IMPACT1NAME     0
FIRE_EXPNAME    0
AGE             0
SEXNAME         0
INJ_SEVNAME     0
REST_USENAME    0
REST_MISNAME    0
AIR_BAGNAME     0
EJECTIONNAME    0
ALC_RESNAME     0
Year            0
Overlimit       0
dtype: int64

In [7]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

from sklearn.preprocessing import FunctionTransformer

In [8]:
# Injury severity is the label to predict; everything else is a predictor.
target_column = 'INJ_SEVNAME'

# Training partition: predictors and label.
train_inputs = train.drop(columns=[target_column])
train_y = train[target_column]

# Test partition: predictors and label.
test_inputs = test.drop(columns=[target_column])
test_y = test[target_column]

In [10]:
# Inspect the predictor dtypes before assigning columns to preprocessing groups.
train_inputs.dtypes

STATENAME       object
MONTHNAME       object
HOUR             int64
RUR_URBNAME      int64
FUNC_SYSNAME    object
MOD_YEARNAME     int64
ROLLOVERNAME    object
IMPACT1NAME     object
FIRE_EXPNAME    object
AGE              int64
SEXNAME         object
REST_USENAME    object
REST_MISNAME    object
AIR_BAGNAME     object
EJECTIONNAME    object
ALC_RESNAME     object
Year             int64
Overlimit       object
dtype: object

In [11]:
# Assign every predictor to a preprocessing group.
#
# - No separate "binary" group is used. The columns originally earmarked for it
#   (RUR_URBNAME, ROLLOVERNAME, FIRE_EXPNAME, SEXNAME) would first have to be
#   recoded to 0/1 in the source spreadsheet, so they are listed with the other
#   categoricals instead.
# - TODO(review): HOUR, MOD_YEARNAME, AGE and Year are kept numeric; confirm
#   whether HOUR and the year columns would be better treated as categorical.
#
# The order of each list is significant: it fixes the column order of the
# transformed feature matrix.
categorical_columns = [
    'RUR_URBNAME',
    'ROLLOVERNAME',
    'FIRE_EXPNAME',
    'SEXNAME',
    'STATENAME',
    'MONTHNAME',
    'FUNC_SYSNAME',
    'IMPACT1NAME',
    'REST_USENAME',
    'REST_MISNAME',
    'AIR_BAGNAME',
    'EJECTIONNAME',
    'ALC_RESNAME',
    'Overlimit',
]

numeric_columns = [
    'HOUR',
    'MOD_YEARNAME',
    'AGE',
    'Year',
]

In [12]:
# Echo the categorical feature list as a quick check of the grouping.
categorical_columns

['RUR_URBNAME',
 'ROLLOVERNAME',
 'FIRE_EXPNAME',
 'SEXNAME',
 'STATENAME',
 'MONTHNAME',
 'FUNC_SYSNAME',
 'IMPACT1NAME',
 'REST_USENAME',
 'REST_MISNAME',
 'AIR_BAGNAME',
 'EJECTIONNAME',
 'ALC_RESNAME',
 'Overlimit']

In [13]:
# Numeric features: fill any gaps with the column median, then standardise.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

In [14]:
# Categorical features: replace missing values with the literal 'unknown',
# then one-hot encode. handle_unknown='ignore' keeps transform() from failing
# on categories that were not present when the encoder was fitted.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='constant', fill_value='unknown')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

In [15]:
#binary_transformer = Pipeline(steps=[
#    ('imputer', SimpleImputer(strategy='most_frequent'))])

In [17]:
# Route each column group through its own pipeline. Columns named in neither
# list are passed through untouched (remainder='passthrough').
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_columns),
        ('cat', categorical_transformer, categorical_columns),
    ],
    remainder='passthrough',
)

In [18]:
# Fit the preprocessing on the training predictors and transform them in one
# pass; the fitted `preprocessor` is reused unchanged on the test predictors.
train_x = preprocessor.fit_transform(X=train_inputs)

# Display the resulting matrix summary.
train_x

<173379x624 sparse matrix of type '<class 'numpy.float64'>'
	with 3120822 stored elements in Compressed Sparse Row format>

In [19]:
# Apply the already-fitted preprocessing to the test predictors (no refit).
test_x = preprocessor.transform(X=test_inputs)

# The column count should match the training matrix.
test_x.shape

(74306, 624)

In [20]:
# Baseline: share of each injury-severity class in the training labels.
# The largest share is the accuracy of always predicting the majority class.
train_y.value_counts().div(len(train_y))

Fatal Injury (K)                0.460782
No Apparent Injury (O)          0.280715
Suspected Minor Injury (B)      0.177092
Suspected Serious Injury (A)    0.081411
Name: INJ_SEVNAME, dtype: float64

In [29]:
# Basic decision tree model.
# The import comes first: the original cell used DecisionTreeClassifier before
# importing it, which raises NameError on a fresh kernel.
from sklearn.tree import DecisionTreeClassifier

# Entropy-based splits; every leaf must hold at least 20 training samples,
# which limits how closely the tree can fit individual rows.
tree_clf = DecisionTreeClassifier(min_samples_leaf=20, criterion='entropy')

# Last expression of the cell, so the fitted estimator is displayed.
tree_clf.fit(train_x, train_y)




DecisionTreeClassifier(criterion='entropy', min_samples_leaf=10)

In [22]:
from sklearn.metrics import accuracy_score



In [30]:
# Accuracy of the single decision tree on the rows it was trained on.
train_y_pred = tree_clf.predict(train_x)
print(accuracy_score(y_true=train_y, y_pred=train_y_pred))

0.7441616343386454


In [31]:
# Accuracy of the single decision tree on the held-out test rows.
test_y_pred = tree_clf.predict(test_x)
print(accuracy_score(y_true=test_y, y_pred=test_y_pred))

0.661696229106667


In [32]:
from sklearn.metrics import confusion_matrix

# Test-set confusion matrix for whichever model last wrote `test_y_pred`.
# Rows are true classes and columns are predicted classes, in sorted label order.
confusion_matrix(y_true=test_y, y_pred=test_y_pred)

array([[29009,  2082,  2305,   717],
       [ 2472, 15553,  2378,   374],
       [ 4316,  4142,  4058,   825],
       [ 2891,   988,  1648,   548]], dtype=int64)

In [33]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 on the test set for the current `test_y_pred`.
report_text = classification_report(y_true=test_y, y_pred=test_y_pred)
print(report_text)

                              precision    recall  f1-score   support

            Fatal Injury (K)       0.75      0.85      0.80     34113
      No Apparent Injury (O)       0.68      0.75      0.71     20777
  Suspected Minor Injury (B)       0.39      0.30      0.34     13341
Suspected Serious Injury (A)       0.22      0.09      0.13      6075

                    accuracy                           0.66     74306
                   macro avg       0.51      0.50      0.50     74306
                weighted avg       0.62      0.66      0.64     74306



In [1]:
# Plot the fitted decision tree for visualisation purposes.
# NOTE: this cell needs `tree_clf`, `train_y` and `np` from earlier cells; the
# recorded NameError comes from running it on a kernel where the tree had not
# been fitted yet.

from sklearn.tree import plot_tree
import matplotlib.pyplot as plt

# Very large canvas so the individual nodes of a big tree stay legible.
plt.figure(figsize=(125, 50))

plot_options = dict(
    # feature_names is deliberately not passed: the preprocessed matrix no
    # longer carries the original column names.
    class_names=np.unique(train_y),
    filled=True,
    rounded=True,
    fontsize=14,
)

tree = plot_tree(tree_clf, **plot_options)

NameError: name 'tree_clf' is not defined

<Figure size 9000x3600 with 0 Axes>

In [35]:
# Ensemble model with hard voting.
# The imports come first: the original cell placed them after the code that
# uses them, which raises NameError on a fresh kernel.
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier

# Three different base learners.
# NOTE(review): the recorded run logged an lbfgs "TOTAL NO. of ITERATIONS
# REACHED LIMIT" warning for log_clf even with max_iter=1000, so the logistic
# model in that run had not converged.
dtree_clf = DecisionTreeClassifier(max_depth=6)
log_clf = LogisticRegression(multi_class='multinomial', solver='lbfgs', C=10, max_iter=1000)
sgd_clf = SGDClassifier(max_iter=10000, tol=1e-3)

# Hard voting: the predicted class is the majority vote of the base learners.
voting_clf = VotingClassifier(
            estimators=[('dt', dtree_clf),
                        ('lr', log_clf),
                        ('sgd', sgd_clf)],
            voting='hard')

# Last expression of the cell, so the fitted ensemble is displayed.
voting_clf.fit(train_x, train_y)




STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


VotingClassifier(estimators=[('dt', DecisionTreeClassifier(max_depth=6)),
                             ('lr',
                              LogisticRegression(C=10, max_iter=1000,
                                                 multi_class='multinomial')),
                             ('sgd', SGDClassifier(max_iter=10000))])

In [36]:
from sklearn.metrics import accuracy_score

# Training accuracy of the current `voting_clf`.
train_y_pred = voting_clf.predict(train_x)
train_acc = accuracy_score(y_true=train_y, y_pred=train_y_pred)
print('Train acc: {}'.format(train_acc))

Train acc: 0.6939133343715214


In [37]:
# Held-out accuracy of the current `voting_clf`.
test_y_pred = voting_clf.predict(test_x)
test_acc = accuracy_score(y_true=test_y, y_pred=test_y_pred)
print('Test acc: {}'.format(test_acc))

Test acc: 0.6914381072860872


In [38]:
# Ensemble model with soft voting.
# Only the decision tree and the logistic regression take part here; the SGD
# classifier is left out (presumably because its default loss provides no
# class probabilities — TODO confirm).
# NOTE: this rebinds `voting_clf`, replacing the hard-voting ensemble.
soft_estimators = [
    ('dt', dtree_clf),
    ('lr', log_clf),
]
voting_clf = VotingClassifier(estimators=soft_estimators, voting='soft')

voting_clf.fit(train_x, train_y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


VotingClassifier(estimators=[('dt', DecisionTreeClassifier(max_depth=6)),
                             ('lr',
                              LogisticRegression(C=10, max_iter=1000,
                                                 multi_class='multinomial'))],
                 voting='soft')

In [40]:
# Training accuracy of the current `voting_clf`.
train_y_pred = voting_clf.predict(train_x)
train_acc = accuracy_score(y_true=train_y, y_pred=train_y_pred)
print('Train acc: {}'.format(train_acc))

Train acc: 0.6945247117586328


In [41]:
# Held-out accuracy of the current `voting_clf`.
test_y_pred = voting_clf.predict(test_x)
test_acc = accuracy_score(y_true=test_y, y_pred=test_y_pred)
print('Test acc: {}'.format(test_acc))

Test acc: 0.6927166043118994


In [45]:
# Random forest model.

from sklearn.ensemble import RandomForestClassifier

# 100 trees, each leaf holding at least 10 samples; n_jobs=-1 uses every
# available core for fitting and prediction.
rnd_clf = RandomForestClassifier(
    n_estimators=100,
    min_samples_split=10,
    min_samples_leaf=10,
    n_jobs=-1,
)

rnd_clf.fit(train_x, train_y)

RandomForestClassifier(min_samples_leaf=10, min_samples_split=10, n_jobs=-1)

In [46]:
# Training accuracy of the random forest.
train_y_pred = rnd_clf.predict(train_x)
train_acc = accuracy_score(y_true=train_y, y_pred=train_y_pred)
print('Train acc: {}'.format(train_acc))

Train acc: 0.7033666130269525


In [47]:
# Held-out accuracy of the random forest.
test_y_pred = rnd_clf.predict(test_x)
test_acc = accuracy_score(y_true=test_y, y_pred=test_y_pred)
print('Test acc: {}'.format(test_acc))

Test acc: 0.6968347105213576


In [48]:
# Feature importances of the fitted random forest, one value per column of the
# preprocessed matrix (so one per one-hot level, not per original feature).
# NOTE(review): this displays the whole array; consider showing only the
# largest values paired with their feature names.

rnd_clf.feature_importances_

array([9.90279996e-03, 1.45437975e-02, 2.41768567e-02, 4.79047980e-03,
       1.29003514e-02, 1.17021411e-02, 2.76238013e-02, 3.04191528e-02,
       5.96488845e-03, 6.01721369e-03, 3.27422468e-03, 3.11629757e-03,
       8.26436772e-04, 1.19094125e-05, 5.36293486e-04, 5.44135409e-04,
       2.19452097e-03, 3.62574602e-04, 1.01111324e-04, 1.17083142e-04,
       1.17641642e-03, 7.79695463e-04, 2.41385119e-05, 4.81157364e-05,
       5.92050536e-04, 3.72339250e-03, 1.77837552e-04, 6.37269578e-04,
       3.60395642e-04, 1.17276818e-03, 6.70895493e-05, 4.92297877e-04,
       4.21935278e-04, 1.33173155e-03, 2.58661933e-04, 4.60158578e-04,
       1.12867498e-03, 7.23845116e-05, 2.11234809e-04, 1.66596397e-04,
       3.77293900e-05, 4.21602899e-04, 2.68460282e-04, 1.52836480e-03,
       1.39378335e-03, 1.84595713e-05, 5.62989546e-04, 2.58845376e-04,
       3.13619160e-04, 5.58861815e-04, 3.72114258e-06, 1.18650601e-03,
       9.75365222e-05, 9.95708832e-04, 2.01905408e-03, 1.36453618e-04,
      

In [49]:
# Same importances rounded to two decimals, so the non-negligible ones stand out.
rnd_clf.feature_importances_.round(2)

array([0.01, 0.01, 0.02, 0.  , 0.01, 0.01, 0.03, 0.03, 0.01, 0.01, 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.01, 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.01, 0.  , 0.  , 0.  , 0.  ,
       0.01, 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.01, 0.  , 0.01, 0.  , 0.01, 0.02, 0.02, 0.01, 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.1 , 0.  , 0.  , 0.01, 0.01, 0.  ,
       0.1 , 0.09, 0.  , 0.01, 0.02, 0.09, 0.04, 0.  , 0.02, 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.

In [50]:
# A second hard-voting ensemble, this time weighted towards decision trees.
# NOTE: this rebinds `dtree_clf`, `log_clf`, `sgd_clf` and `voting_clf` from the
# earlier ensemble cells. The recorded run also logged an lbfgs
# iteration-limit warning for the logistic model here (max_iter=100).

# Three differently regularised trees.
dtree_clf = DecisionTreeClassifier(min_samples_leaf=30, criterion='entropy')
dtree_clf2 = DecisionTreeClassifier(min_samples_leaf=10, criterion='gini')
dtree_clf3 = DecisionTreeClassifier(max_depth=35, min_samples_split=5)

# Two linear models.
log_clf = LogisticRegression(multi_class='multinomial', solver='lbfgs', C=1, max_iter=100)
sgd_clf = SGDClassifier(max_iter=10000, tol=1e-3)

tree_heavy_estimators = [
    ('dt', dtree_clf),
    ('dt2', dtree_clf2),
    ('dt3', dtree_clf3),
    ('lr', log_clf),
    ('sgd', sgd_clf),
]
voting_clf = VotingClassifier(estimators=tree_heavy_estimators, voting='hard')

voting_clf.fit(train_x, train_y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


VotingClassifier(estimators=[('dt',
                              DecisionTreeClassifier(criterion='entropy',
                                                     min_samples_leaf=30)),
                             ('dt2',
                              DecisionTreeClassifier(min_samples_leaf=10)),
                             ('dt3',
                              DecisionTreeClassifier(max_depth=35,
                                                     min_samples_split=5)),
                             ('lr',
                              LogisticRegression(C=1,
                                                 multi_class='multinomial')),
                             ('sgd', SGDClassifier(max_iter=10000))])

In [51]:
# Training accuracy of the current `voting_clf`.
train_y_pred = voting_clf.predict(train_x)
train_acc = accuracy_score(y_true=train_y, y_pred=train_y_pred)
print('Train acc: {}'.format(train_acc))

Train acc: 0.7412489401830672


In [52]:
# Held-out accuracy of the current `voting_clf`.
test_y_pred = voting_clf.predict(test_x)
test_acc = accuracy_score(y_true=test_y, y_pred=test_y_pred)
print('Test acc: {}'.format(test_acc))

Test acc: 0.6943988372405997
