In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier

from metrics import classification_summary
from pandas_util import normalize_columns

In [2]:
# Load the TinyData example and fit a small network on every row of it.
df = pd.read_csv("./datasets/dmba/TinyData.csv")

predictors = ["Fat", "Salt"]
outcome = "Acceptance"

X = df[predictors]
y = df[outcome]

# Sorted distinct outcome labels.
classes = sorted(y.unique())

# hidden_layer_sizes takes one entry per hidden layer. `(3)` is just the
# int 3, so spell the single 3-node layer as a real one-element tuple.
clf = MLPClassifier(
    hidden_layer_sizes=(3,), activation="logistic", solver="lbfgs", random_state=1
)

clf.fit(X, y)
# Last expression of the cell: show the predicted label for each row.
clf.predict(X)

array(['like', 'dislike', 'dislike', 'dislike', 'like', 'like'],
      dtype='<U7')

In [3]:
print("Intercepts:", clf.intercepts_)
print("Weights:", clf.coefs_)

# predict_proba orders its columns by clf.classes_, so label them from the
# fitted model itself, and reuse X's index so the rows line up with df.
probabilities = pd.DataFrame(
    clf.predict_proba(X), columns=clf.classes_, index=X.index
)
print(pd.concat([df, probabilities], axis=1))

Intercepts: [array([0.13368045, 4.07247552, 7.00768104]), array([14.30748676])]
Weights: [array([[ -1.30656481,  -4.20427792, -13.29587332],
       [ -0.04399727,  -4.91606924,  -6.03356987]]), array([[ -0.27348313],
       [ -9.01211573],
       [-17.63504694]])]
   Obs.  Fat  Salt Acceptance   dislike      like
0     1  0.2   0.9       like  0.000490  0.999510
1     2  0.1   0.1    dislike  0.999994  0.000006
2     3  0.2   0.4    dislike  0.999741  0.000259
3     4  0.2   0.5    dislike  0.997368  0.002632
4     5  0.4   0.5       like  0.002133  0.997867
5     6  0.3   0.8       like  0.000075  0.999925


In [4]:
# In-sample check: X and y are the same data the network was fitted on.
tiny_pred = clf.predict(X)
classification_summary(y_true=y, y_pred=tiny_pred)

Accuracy: 1.0

Confusion matrix:
[[3 0]
 [0 3]]

Classification report:
              precision    recall  f1-score   support

     dislike       1.00      1.00      1.00         3
        like       1.00      1.00      1.00         3

    accuracy                           1.00         6
   macro avg       1.00      1.00      1.00         6
weighted avg       1.00      1.00      1.00         6



## Classifying Accident Severity

In [5]:
# Read the accidents data, then hand the frame to the project helper.
# NOTE(review): normalize_columns presumably renames the columns in place
# (later cells use lower-case names such as sur_cond) - confirm in pandas_util.
accidents_path = "./datasets/dmba/accidentsnn.csv"
accidents_df = pd.read_csv(accidents_path)
normalize_columns(accidents_df)

In [6]:
# Cast sur_cond and max_sev_ir to the pandas category dtype, then summarise
# the categorical columns (count / unique / top / freq).
for col in ("sur_cond", "max_sev_ir"):
    accidents_df[col] = accidents_df[col].astype("category")
accidents_df.describe(include="category")

Unnamed: 0,sur_cond,max_sev_ir
count,999,999
unique,5,3
top,1,0
freq,782,551


In [7]:
# Convert the categorical data into dummy variables
# Exclude the column for sur_cond 9 = unknown
# dtype=int pins the indicator columns to 0/1 integers; without it the dtype
# follows the installed pandas default (bool on pandas >= 2.0).
processed = pd.get_dummies(accidents_df, columns=["sur_cond"], dtype=int)
processed = processed.drop(columns=["sur_cond_9"])
processed.head()

Unnamed: 0,alchl_i,profil_i_r,veh_invl,max_sev_ir,sur_cond_1,sur_cond_2,sur_cond_3,sur_cond_4
0,2,0,1,0,1,0,0,0
1,2,1,1,2,1,0,0,0
2,1,0,1,0,1,0,0,0
3,2,0,2,1,0,1,0,0
4,2,1,2,1,1,0,0,0


In [8]:
# max_sev_ir is the target; every other column of `processed` is a predictor.
outcome = "max_sev_ir"
is_predictor = processed.columns != outcome
predictors = processed.columns[is_predictor].tolist()

In [9]:
# Hold out 40% of the rows for validation; the fixed random_state keeps the
# partition repeatable across runs.
X, y = processed[predictors], processed[outcome]
train_X, valid_X, train_y, valid_y = train_test_split(X, y, test_size=0.4, random_state=1)

In [10]:
# Fit a network with 2 hidden nodes (logistic activation, L-BFGS solver)
# on the training partition.
clf = MLPClassifier(
    hidden_layer_sizes=2,
    activation="logistic",
    solver="lbfgs",
    random_state=1,
)
train_labels = train_y.values
clf.fit(train_X, train_labels)

MLPClassifier(activation='logistic', hidden_layer_sizes=2, random_state=1,
              solver='lbfgs')

In [11]:
# Training performance. clf.predict already returns class labels, so no
# idxmax decoding of probabilities is needed before scoring.
train_pred = clf.predict(train_X)
classification_summary(y_true=train_y, y_pred=train_pred)

Accuracy: 0.8664440734557596

Confusion matrix:
[[331   0   1]
 [  0 180   0]
 [ 30  49   8]]

Classification report:
              precision    recall  f1-score   support

           0       0.92      1.00      0.96       332
           1       0.79      1.00      0.88       180
           2       0.89      0.09      0.17        87

    accuracy                           0.87       599
   macro avg       0.86      0.70      0.67       599
weighted avg       0.87      0.87      0.82       599



In [12]:
# Validation performance: score the held-out partition.
valid_pred = clf.predict(valid_X)
classification_summary(y_true=valid_y, y_pred=valid_pred)

Accuracy: 0.855

Confusion matrix:
[[218   0   1]
 [  0 119   0]
 [ 24  33   5]]

Classification report:
              precision    recall  f1-score   support

           0       0.90      1.00      0.95       219
           1       0.78      1.00      0.88       119
           2       0.83      0.08      0.15        62

    accuracy                           0.85       400
   macro avg       0.84      0.69      0.66       400
weighted avg       0.86      0.85      0.80       400

