# Neural Network Classifier 
---

# Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn import metrics
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, GRU
from tensorflow.keras.layers import Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

from imblearn.under_sampling import NearMiss, RandomUnderSampler
from imblearn.over_sampling import SMOTE, RandomOverSampler
from collections import Counter

import pickle

In [2]:
from numpy.random import seed
seed(1)

# Set random seed
tf.random.set_seed(42)

# Read in Data

Import data and observe the basics

In [3]:
csv_file = "../data/drugs_2020_simply_imputed.csv"
df = pd.read_csv(csv_file)
print(df.shape)
df.head()

(16829, 64)


Unnamed: 0,accgdln,age,altdum,amttotal,casetype,citwhere,combdrg2,crimhist,disposit,district,...,typemony,typeoths,unit1,mwgt1,wgt1,xcrhissr,xfolsor,xmaxsor,xminsor,sentrnge
0,1.0,20.0,0,0,1.0,211.0,6.0,1.0,1,43,...,1.0,0,1.0,63560990.0,85104.433315,1.0,17.0,30.0,24.0,8.0
1,1.0,64.0,0,0,1.0,211.0,1.0,1.0,1,51,...,1.0,0,1.0,1193400.0,5967.0,3.0,27.0,108.0,87.0,0.0
2,1.0,28.0,0,0,1.0,211.0,3.0,1.0,1,48,...,1.0,0,2.0,2000000.0,2000.0,6.0,27.0,162.0,130.0,2.0
3,2.0,55.0,0,0,1.0,211.0,77.0,1.0,1,65,...,1.0,0,1.0,10300.0,4.12,5.0,13.0,37.0,30.0,0.0
4,1.0,30.0,0,0,1.0,211.0,6.0,1.0,1,87,...,1.0,0,1.0,169200.0,84.6,6.0,25.0,137.0,110.0,2.0


> **16829 rows and 67 columns**
>> **However some of these columns are dropped and one is our target columns, PRISDUM**

In [4]:
df.columns

Index(['accgdln', 'age', 'altdum', 'amttotal', 'casetype', 'citwhere',
       'combdrg2', 'crimhist', 'disposit', 'district', 'drugmin', 'dsplea',
       'educatn', 'intdum', 'methmin', 'monrace', 'monsex', 'mweight',
       'newcit', 'newcnvtn', 'neweduc', 'newrace', 'nodrug', 'numdepen',
       'offguide', 'prisdum', 'probatn', 'probdum', 'quarter', 'reas1',
       'reas2', 'reas3', 'regsxmin', 'relmin', 'restdet1', 'restdum', 'safe',
       'safety', 'senspcap', 'sensplt0', 'sentimp', 'smax1', 'smin1',
       'sources', 'statmax', 'statmin', 'supermax', 'supermin', 'suprdum',
       'suprel', 'timservc', 'totchpts', 'totrest', 'totunit', 'typemony',
       'typeoths', 'unit1', 'mwgt1', 'wgt1', 'xcrhissr', 'xfolsor', 'xmaxsor',
       'xminsor', 'sentrnge'],
      dtype='object')

- Drop the index columns created from saving a DataFrame to a csv.
- Also drop the columns we have identified as either too correlated or not useful for our model.

In [5]:
features = ['accgdln', 'casetype', 'combdrg2', 'crimhist', 'disposit', 
            'district', 'drugmin', 'dsplea', 'intdum', 'methmin', 'mweight','nodrug',
            'offguide', 'quarter', 'reas1', 'reas2', 'reas3', 'sources', 'statmax', 'statmin',
            
            'age', 'newrace', 'monsex', 'monrace', 'neweduc', 'newcnvtn', 'citwhere', 'newcit'
           ]

## Train Test Split

Set our X and Y

In [6]:
X = df[features]
y = df['prisdum']

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

In [8]:
X_no = X.drop(columns=['age', 'newrace', 'monsex', 'monrace', 'neweduc', 'newcnvtn', 'citwhere', 'newcit'])

In [9]:
X_no_train, X_no_test, y_no_train, y_no_test = train_test_split(X_no, y, stratify=y)

# Scale Data for Neural Network Classifier

In [10]:
sc = StandardScaler()
X_train_sc = sc.fit_transform(X_train)
X_test_sc = sc.transform(X_test)

In [11]:
sc = StandardScaler()
X_no_train_sc = sc.fit_transform(X_no_train)
X_no_test_sc = sc.transform(X_no_test)

### Null Model

In [12]:
y.value_counts(normalize=True)

1    0.955196
0    0.044804
Name: prisdum, dtype: float64

> We see that we have a very imblanced dataset.

In [13]:
y_test.value_counts()

1    4019
0     189
Name: prisdum, dtype: int64

## Model on Imblanced Data

In [14]:
model = Sequential()
model.add(Dense(64, input_shape=(X_train_sc.shape[1],), activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='bce', 
              optimizer='adam', 
              metrics=['accuracy'])

early_stop = EarlyStopping(monitor='val_loss', patience=10, verbose=1)

history = model.fit(
    X_train_sc,
    y_train,
    validation_data=(X_test_sc, y_test),
    epochs=200,
    callbacks=[early_stop]
)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 00017: early stopping


In [15]:
preds = np.round(model.predict(X_test_sc),0)
tn, fp, fn, tp = metrics.confusion_matrix(y_test, preds).ravel()


cm = pd.DataFrame(metrics.confusion_matrix(y_test, preds), 
                  columns=['predicted_no_prison', 'predicted_prison'], 
                  index=['actual_no_prison', 'actual_prison']
                 )
cm

Unnamed: 0,predicted_no_prison,predicted_prison
actual_no_prison,41,148
actual_prison,26,3993


In [16]:
misclass1 = []
for row_index, (input, prediction, label) in enumerate(zip (X_test_sc, preds, y_test)):
    if prediction != label:
        misclass1.append(row_index)
print(len(misclass1))

174


#### Analysis:

>loss: 0.1126 - accuracy: 0.9609 - val_loss: 0.1429 - val_accuracy: 0.9587

As expected, a very accurate model. However, likely suffering because of the test-set class imbalance as it hardly beats our Null Model of about 95.5%

This very high accuracy leads to only 174 misclassifications, 26 of which are in the position we are most interested in; predicted no-prison, actual prison.

### Without Demographic information

In [17]:
model = Sequential()
model.add(Dense(64, input_shape=(X_no_train_sc.shape[1],), activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='bce', 
              optimizer='adam', 
              metrics=['accuracy'])

early_stop = EarlyStopping(monitor='val_loss', patience=10, verbose=1)

history = model.fit(
    X_no_train_sc,
    y_no_train,
    validation_data=(X_no_test_sc, y_no_test),
    epochs=200,
    callbacks=[early_stop]
)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 00024: early stopping


In [18]:
preds = np.round(model.predict(X_no_test_sc),0)
tn, fp, fn, tp = metrics.confusion_matrix(y_no_test, preds).ravel()



cm = pd.DataFrame(metrics.confusion_matrix(y_no_test, preds), 
                  columns=['predicted_no_prison', 'predicted_prison'], 
                  index=['actual_no_prison', 'actual_prison']
                 )
cm

Unnamed: 0,predicted_no_prison,predicted_prison
actual_no_prison,33,156
actual_prison,18,4001


In [19]:
misclass1_no = []
for row_index, (input, prediction, label) in enumerate(zip (X_no_test_sc, preds, y_no_test)):
    if prediction != label:
        misclass1_no.append(row_index)
print(len(misclass1_no))

174


#### Analysis:
>loss: 0.1251 - accuracy: 0.9608 - val_loss: 0.1591 - val_accuracy: 0.9587

As expected, we again have a very accurate model. However, likely suffering because of the test-set class imbalance as it barely beats our null model.

This very high accuracy leads to only 174 misclassifications, 18 of which are in the position we are most interested in; predicted no-prison, actual prison.

---

# Balance Imbalanced Data

---

## Under Sample Majority

In [20]:
nm = RandomUnderSampler()
X_train_under, y_train_under = nm.fit_resample(X_train_sc, y_train)

In [21]:
y_train_under.value_counts(normalize=True)

0    0.5
1    0.5
Name: prisdum, dtype: float64

In [22]:
model = Sequential()
model.add(Dense(64, input_shape=(X_train_under.shape[1],), activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='bce', 
              optimizer='adam', 
              metrics=['accuracy'])

early_stop = EarlyStopping(monitor='val_loss', patience=10, verbose=1)

history = model.fit(
    X_train_under,
    y_train_under,
    validation_data=(X_test_sc, y_test),
    epochs=200,
    callbacks=[early_stop]
)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 00023: early stopping


In [23]:
preds = np.round(model.predict(X_test_sc),0)
tn, fp, fn, tp = metrics.confusion_matrix(y_test, preds).ravel()



cm = pd.DataFrame(metrics.confusion_matrix(y_test, preds), 
                  columns=['predicted_no_prison', 'predicted_prison'], 
                  index=['actual_no_prison', 'actual_prison']
                 )
cm

Unnamed: 0,predicted_no_prison,predicted_prison
actual_no_prison,160,29
actual_prison,1098,2921


In [24]:
misclass2 = []
for row_index, (input, prediction, label) in enumerate(zip (X_test_sc, preds, y_test)):
    if prediction != label:
        misclass2.append(row_index)
print(len(misclass2))

1127


#### Analysis 

>loss: 0.3535 - accuracy: 0.8451 - val_loss: 0.5578 - val_accuracy: 0.7322

After balancing, we have a much more reasonable model. There is evidence of overfitting when looking at the train accuracy of 84.51% versus a test accuracy of 73.22%, but we are more concerned with what was misclassified.

We see a large relative increase in misclassifications, as expected with such a large loss in accuracy, with a total of 1127 misclassifications, 1098 of which are in the position we are most interested in; predicted no-prison, actual prison.

### Without Demographic information

In [25]:
nm = RandomUnderSampler()
X_no_train_under, y_no_train_under = nm.fit_resample(X_no_train_sc, y_no_train)

In [26]:
model = Sequential()
model.add(Dense(64, input_shape=(X_no_train_under.shape[1],), activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='bce', 
              optimizer='adam', 
              metrics=['accuracy'])

early_stop = EarlyStopping(monitor='val_loss', patience=10, verbose=1)

history = model.fit(
    X_no_train_under,
    y_no_train_under,
    validation_data=(X_no_test_sc, y_no_test),
    epochs=200,
    callbacks=[early_stop]
)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 00014: early stopping


In [27]:
preds = np.round(model.predict(X_no_test_sc),0)
tn, fp, fn, tp = metrics.confusion_matrix(y_no_test, preds).ravel()



cm = pd.DataFrame(metrics.confusion_matrix(y_no_test, preds), 
                  columns=['predicted_no_prison', 'predicted_prison'], 
                  index=['actual_no_prison', 'actual_prison']
                 )
cm

Unnamed: 0,predicted_no_prison,predicted_prison
actual_no_prison,165,24
actual_prison,1161,2858


In [28]:
misclass2_no = []
for row_index, (input, prediction, label) in enumerate(zip (X_no_test_sc, preds, y_no_test)):
    if prediction != label:
        misclass2_no.append(row_index)
print(len(misclass2_no))

1185


#### Analysis 

> loss: 0.4105 - accuracy: 0.8265 - val_loss: 0.5280 - val_accuracy: 0.7184

After removing demographics, we see a small drop in accuracy, however this can be attributed to simple a loss of features. 

We do see a small relative increase in misclassifications from the similar model that included demographics, a total of 1185 misclassifications, 1161 of which are in the position we are most interested in; predicted no-prison, actual prison.

#### Save model for Application

In [29]:
model.save('NN_under_nodem')

INFO:tensorflow:Assets written to: NN_under_nodem\assets


## SMOTE

In [30]:
smo = SMOTE()

X_train_smote, y_train_smote = smo.fit_resample(X_train_sc, y_train)

In [31]:
model = Sequential()
model.add(Dense(64, input_shape=(X_train_smote.shape[1],), activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='bce', 
              optimizer='adam', 
              metrics=['accuracy'])

early_stop = EarlyStopping(monitor='val_loss', patience=10, verbose=1)

history = model.fit(
    X_train_smote,
    y_train_smote,
    validation_data=(X_test_sc, y_test),
    epochs=200,
    callbacks=[early_stop]
)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 00015: early stopping


In [32]:
preds = np.round(model.predict(X_test_sc),0)
tn, fp, fn, tp = metrics.confusion_matrix(y_test, preds).ravel()



cm = pd.DataFrame(metrics.confusion_matrix(y_test, preds), 
                  columns=['predicted_no_prison', 'predicted_prison'], 
                  index=['actual_no_prison', 'actual_prison']
                 )
cm

Unnamed: 0,predicted_no_prison,predicted_prison
actual_no_prison,119,70
actual_prison,396,3623


In [33]:
misclass3 = []
for row_index, (input, prediction, label) in enumerate(zip (X_test_sc, preds, y_test)):
    if prediction != label:
        misclass3.append(row_index)
print(len(misclass3))

466


#### Analysis 

> loss: 0.1826 - accuracy: 0.9312 - val_loss: 0.3377 - val_accuracy: 0.8893

SMOTE produced a much stronger model, with higher accuracy for both train and test sets as well as a smaller difference between the two accuracies.

Due to this higher accuracy, we have a lower misclassification rate with a total of 466, 396 of which are in the position we are most interested in; predicted no-prison, actual prison.

### Without Demographic information

In [34]:
nm = RandomUnderSampler()
X_no_train_under, y_no_train_under = nm.fit_resample(X_no_train_sc, y_no_train)

In [35]:
model = Sequential()
model.add(Dense(64, input_shape=(X_no_train_under.shape[1],), activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='bce', 
              optimizer='adam', 
              metrics=['accuracy'])

early_stop = EarlyStopping(monitor='val_loss', patience=10, verbose=1)

history = model.fit(
    X_no_train_under,
    y_no_train_under,
    validation_data=(X_no_test_sc, y_no_test),
    epochs=200,
    callbacks=[early_stop]
)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 00027: early stopping


In [36]:
preds = np.round(model.predict(X_no_test_sc),0)
tn, fp, fn, tp = metrics.confusion_matrix(y_no_test, preds).ravel()



cm = pd.DataFrame(metrics.confusion_matrix(y_no_test, preds), 
                  columns=['predicted_no_prison', 'predicted_prison'], 
                  index=['actual_no_prison', 'actual_prison']
                 )
cm

Unnamed: 0,predicted_no_prison,predicted_prison
actual_no_prison,159,30
actual_prison,1060,2959


In [37]:
misclass3_no = []
for row_index, (input, prediction, label) in enumerate(zip (X_no_test_sc, preds, y_no_test)):
    if prediction != label:
        misclass3_no.append(row_index)
print(len(misclass3_no))

1090


#### Analysis 

> loss: 0.3958 - accuracy: 0.8292 - val_loss: 0.5057 - val_accuracy: 0.7410

Now we start to see some interesting comparisons. When removing demographics, we see a significant loss in accuracy; part of which can be explained by the removal of features in general. We also see evidence of overfitting, but not to the level of the UnderSample balancing technique.

We do see a large relative increase in misclassifications from the similar model that included demographics, a total of 1090 misclassifications, 1060 of which are in the position we are most interested in; predicted no-prison, actual prison.

#### Save model for Application

In [38]:
model.save('NN_smote_nodem')

INFO:tensorflow:Assets written to: NN_smote_nodem\assets


---

# Misclassifications

In [39]:
import warnings
warnings.filterwarnings("ignore")

In [40]:
tar_misclass_set = [misclass3_no]        
tar_misclass_ids = {}

for misclass in tar_misclass_set:
    for ids in misclass:
        if ids in tar_misclass_ids.keys():
            tar_misclass_ids[ids] += 1
        else:
            tar_misclass_ids[ids] = 1

tar_misclass_df = df.iloc[[item for sublist in tar_misclass_set for item in sublist]]
tar_misclass_df['no_of_misclass'] = 0

for ids in tar_misclass_df.index:
    tar_misclass_df['no_of_misclass'].loc[ids] = tar_misclass_ids[ids]

In [41]:
print(tar_misclass_df.shape)
tar_misclass_df.head()

(1090, 65)


Unnamed: 0,accgdln,age,altdum,amttotal,casetype,citwhere,combdrg2,crimhist,disposit,district,...,typeoths,unit1,mwgt1,wgt1,xcrhissr,xfolsor,xmaxsor,xminsor,sentrnge,no_of_misclass
0,1.0,20.0,0,0,1.0,211.0,6.0,1.0,1,43,...,0,1.0,63560990.0,85104.433315,1.0,17.0,30.0,24.0,8.0,1
2,1.0,28.0,0,0,1.0,211.0,3.0,1.0,1,48,...,0,2.0,2000000.0,2000.0,6.0,27.0,162.0,130.0,2.0,1
5,8.0,22.0,0,125,2.0,211.0,4.0,1.0,1,87,...,0,1.0,63560990.0,85104.433315,1.0,2.0,6.0,0.0,0.0,1
7,1.0,30.0,0,0,1.0,49.0,3.0,1.0,1,39,...,0,1.0,8900000.0,8900.0,1.0,29.0,108.0,87.0,2.0,1
10,1.0,31.0,0,0,1.0,211.0,6.0,1.0,1,62,...,0,1.0,715400.0,35.77,5.0,25.0,125.0,100.0,8.0,1


---
## Save Desired Model's Misclassifications to CSV

We will save the misclassifications from our saved models, SMOTE and Undersample Majority, to a CSV for EDA purposes.

In [42]:
tar_misclass_df.to_csv('NN_misclass_df.csv')