# Train and Test BlackBox Model

In [26]:
# Ignore all warnings
from warnings import simplefilter
simplefilter(action='ignore')

# Import dependencies
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from matplotlib import pyplot as plt
import pickle

# Import the models from SKLearn (Model 1 through Model 6)
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

# Local module
from ml_classification import model_performance

## Load data

In [27]:
# Read both source tables: the shooter records first, then the
# general-population sample. Paths are relative to the notebook's directory.
shooters_df, genpop_df = (
    pd.read_csv(Path(csv_name))
    for csv_name in (
        'model_blackbox_shooters.csv',
        'model_blackbox_genpop_1000.csv',
    )
)

### Create a single table

In [28]:
# Up-sampling: number of extra copies of the shooter rows appended after
# the combined table
dupl = 4

# Stack everything in a single concat instead of re-concatenating inside a
# loop (each pass would copy the whole growing frame again). The row order
# is unchanged: shooters, general population, then `dupl` shooter copies.
# The original index labels are kept, so labels repeat across the pieces.
#
# NOTE(review): the duplicates are added before train_test_split runs, so
# copies of the same shooter row can end up in both the training and the
# testing split, which would inflate test scores. Consider splitting first
# and up-sampling the training portion only.
all_data = pd.concat([shooters_df, genpop_df] + [shooters_df] * dupl, axis=0)

all_data

Unnamed: 0,Age,Gender,Race,Immigrant,Education,RelStatus,Employed,Work,MilService,Arrested,ParentDivorce,SES,MentalIllness,MentalIllnessHistory,Autism,HealthIssues,Classification
0,25,Male,White,No,Some college/trade school,Married,Not working,Unknown,Yes,Yes,No evidence,Middle class,Yes,No evidence,No evidence,Yes,1
1,18,Male,White,No,Less than high school,Single,Not working,Unknown,No,No,No evidence,Middle class,Yes,No evidence,No evidence,No evidence,1
2,39,Male,White,No,Some college/trade school,Married,Working,In between,Yes,No,No evidence,Middle class,No evidence,No evidence,No evidence,No evidence,1
3,56,Male,White,No,Unknown,Single,Working,Blue collar,No,No,No evidence,Unknown,No evidence,No evidence,No evidence,No evidence,1
4,31,Male,Black,No,Some college/trade school,Married,Not working,In between,Yes,No,Yes,Middle class,Yes,No evidence,No evidence,No evidence,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
188,72,Male,Asian,Yes,Unknown,Divorced/separated/widowed,Not working,Unknown,No,Yes,No evidence,Lower class,No evidence,No evidence,No evidence,No evidence,1
189,66,Male,Asian,Yes,Unknown,Married,Working,Blue collar,No,No,No evidence,Lower class,No evidence,No evidence,No evidence,No evidence,1
190,28,Other,White,No,Bachelor's degree,Single,Working,White collar,No,No,No evidence,Middle class,Yes,No evidence,Diagnosed or extremely likely,No evidence,1
191,25,Male,White,No,Graduate school/advanced degree,Unknown,Working,White collar,No,No,No evidence,Middle class,Yes,No evidence,No evidence,Yes,1


In [29]:
# Features (X): every column of the combined table except the target
X = all_data.drop(columns=['Classification'])

# Labels (y): the 'Classification' target column
y = all_data.loc[:, 'Classification']

### Feature engineering

In [30]:
# One-hot encode every feature except 'Age', which is passed through as-is
non_age_features = X.drop(columns=['Age'])
dummies_df = pd.get_dummies(non_age_features)

# Put 'Age' back in front of the encoded columns (column-wise concat)
X = pd.concat([X['Age'], dummies_df], axis=1)

### Split the data

In [31]:
# Stratified train/test split with a fixed seed (random_state=1)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=1,
)

# Fit the standard scaler on the training features only
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the fitted scaler to both splits
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

## Train and test model

### Model 1: Logistic Regression (no scaling)

In [32]:
# Build the logistic-regression classifier and train it on the unscaled features
model_1 = LogisticRegression(
    solver='lbfgs',
    random_state=10,
    max_iter=40,
).fit(X_train, y_train)

# Report the score on each split
model_1_train_score = model_1.score(X_train, y_train)
print(f"Training Data Score: {model_1_train_score}")
model_1_test_score = model_1.score(X_test, y_test)
print(f"Testing Data Score: {model_1_test_score}")
print()

# Predict the test labels and evaluate them with the project helper
predictions_1 = model_1.predict(X_test)
model_1_metrics = model_performance(y_test, predictions_1, True)

Training Data Score: 0.9504412763068567
Testing Data Score: 0.9349593495934959

Confusion Matrix:
          Predicted 0  Predicted 1
Actual 0          241            9
Actual 1           23          219
---
Accuracy Score : 0.9349593495934959
---
Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.96      0.94       250
           1       0.96      0.90      0.93       242

    accuracy                           0.93       492
   macro avg       0.94      0.93      0.93       492
weighted avg       0.94      0.93      0.93       492



### Model 2: Logistic Regression (with scaling)

In [33]:
# Build the logistic-regression classifier and train it on the scaled features
model_2 = LogisticRegression(
    solver='lbfgs',
    random_state=10,
    max_iter=42,
).fit(X_train_scaled, y_train)

# Report the score on each split
model_2_train_score = model_2.score(X_train_scaled, y_train)
print(f"Training Data Score: {model_2_train_score}")
model_2_test_score = model_2.score(X_test_scaled, y_test)
print(f"Testing Data Score: {model_2_test_score}")
print()

# Predict the test labels and evaluate them with the project helper
# TODO: use predict_proba instead of hard class predictions
predictions_2 = model_2.predict(X_test_scaled)
model_2_metrics = model_performance(y_test, predictions_2, True)

Training Data Score: 0.9694501018329938
Testing Data Score: 0.9369918699186992

Confusion Matrix:
          Predicted 0  Predicted 1
Actual 0          241            9
Actual 1           22          220
---
Accuracy Score : 0.9369918699186992
---
Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.96      0.94       250
           1       0.96      0.91      0.93       242

    accuracy                           0.94       492
   macro avg       0.94      0.94      0.94       492
weighted avg       0.94      0.94      0.94       492



### Model 3: SVM

In [34]:
# Build a linear-kernel support vector classifier and train it on the scaled features
model_3 = SVC(kernel='linear').fit(X_train_scaled, y_train)

# Report the score on each split
model_3_train_score = model_3.score(X_train_scaled, y_train)
print(f"Training Data Score: {model_3_train_score}")
model_3_test_score = model_3.score(X_test_scaled, y_test)
print(f"Testing Data Score: {model_3_test_score}")
print()

# Predict the test labels and evaluate them with the project helper
predictions_3 = model_3.predict(X_test_scaled)
model_3_metrics = model_performance(y_test, predictions_3, True)

Training Data Score: 0.9701289884589274
Testing Data Score: 0.9451219512195121

Confusion Matrix:
          Predicted 0  Predicted 1
Actual 0          243            7
Actual 1           20          222
---
Accuracy Score : 0.9451219512195121
---
Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.97      0.95       250
           1       0.97      0.92      0.94       242

    accuracy                           0.95       492
   macro avg       0.95      0.94      0.95       492
weighted avg       0.95      0.95      0.95       492



### Model 4: Decision tree

In [35]:
# Maximum depth allowed for the decision tree
depth = 11

# Create the decision tree classifier.
# random_state is pinned (same seed as the split and the random forest):
# scikit-learn permutes the candidate features at each split, so an unseeded
# tree can differ from run to run and the reported scores would not be
# reproducible on Restart & Run All.
model_4 = DecisionTreeClassifier(max_depth=depth, random_state=1)

# Fit the model on the scaled training features
model_4 = model_4.fit(X_train_scaled, y_train)

# Score the model on each split
print(f"Training Data Score: {model_4.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {model_4.score(X_test_scaled, y_test)}")
print('')

# Make predictions using the testing data
predictions_4 = model_4.predict(X_test_scaled)

# Evaluate the predictions with the project helper
model_4_metrics = model_performance(y_test, predictions_4, True)


Training Data Score: 0.9823489477257298
Testing Data Score: 0.9634146341463414

Confusion Matrix:
          Predicted 0  Predicted 1
Actual 0          247            3
Actual 1           15          227
---
Accuracy Score : 0.9634146341463414
---
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.99      0.96       250
           1       0.99      0.94      0.96       242

    accuracy                           0.96       492
   macro avg       0.96      0.96      0.96       492
weighted avg       0.96      0.96      0.96       492



### Model 5: Random Forest

In [36]:
# Number of trees in the forest
estim = 220

# Build the seeded random forest and train it on the scaled features
model_5 = RandomForestClassifier(
    n_estimators=estim,
    random_state=1,
).fit(X_train_scaled, y_train)

# Report the score on each split
model_5_train_score = model_5.score(X_train_scaled, y_train)
print(f"Training Data Score: {model_5_train_score}")
model_5_test_score = model_5.score(X_test_scaled, y_test)
print(f"Testing Data Score: {model_5_test_score}")
print()

# Predict the test labels and evaluate them with the project helper
predictions_5 = model_5.predict(X_test_scaled)
model_5_metrics = model_performance(y_test, predictions_5, True)

Training Data Score: 1.0
Testing Data Score: 0.991869918699187

Confusion Matrix:
          Predicted 0  Predicted 1
Actual 0          246            4
Actual 1            0          242
---
Accuracy Score : 0.991869918699187
---
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.98      0.99       250
           1       0.98      1.00      0.99       242

    accuracy                           0.99       492
   macro avg       0.99      0.99      0.99       492
weighted avg       0.99      0.99      0.99       492



### Model 6: KNN

In [37]:
# Number of neighbours consulted for each prediction
knn = 1

# Build the k-nearest-neighbours classifier and train it on the scaled features
model_6 = KNeighborsClassifier(n_neighbors=knn).fit(X_train_scaled, y_train)

# Report the score on each split
model_6_train_score = model_6.score(X_train_scaled, y_train)
print(f"Training Data Score: {model_6_train_score}")
model_6_test_score = model_6.score(X_test_scaled, y_test)
print(f"Testing Data Score: {model_6_test_score}")
print()

# Predict the test labels and evaluate them with the project helper
predictions_6 = model_6.predict(X_test_scaled)
model_6_metrics = model_performance(y_test, predictions_6, True)

Training Data Score: 1.0
Testing Data Score: 0.9776422764227642

Confusion Matrix:
          Predicted 0  Predicted 1
Actual 0          239           11
Actual 1            0          242
---
Accuracy Score : 0.9776422764227642
---
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.96      0.98       250
           1       0.96      1.00      0.98       242

    accuracy                           0.98       492
   macro avg       0.98      0.98      0.98       492
weighted avg       0.98      0.98      0.98       492



## Overview of performance

In [42]:
# Display names and metrics for Models 2-6, in matching order
# (Model 1, the unscaled logistic regression, is not included)
models = ['Logisitcs regression', 'SVM', 'Decision tree', 'Random forest', 'KNN']
performance = [
    model_2_metrics,
    model_3_metrics,
    model_4_metrics,
    model_5_metrics,
    model_6_metrics,
]

# One row per model, restricted to the summary columns and ranked by
# recall on class 1 (highest first)
summary_columns = ['model', 'accuracy', 'precision_0', 'precision_1', 'recall_0', 'recall_1']
models_performance = (
    pd.DataFrame(performance)
    .assign(model=models)
    [summary_columns]
    .sort_values('recall_1', ascending=False)
)

# Shift the 0-based row labels so each label equals the model number (2-6)
models_performance.index += 2
models_performance

Unnamed: 0,model,accuracy,precision_0,precision_1,recall_0,recall_1
5,Random forest,0.99187,1.0,0.98374,0.984,1.0
6,KNN,0.977642,1.0,0.956522,0.956,1.0
4,Decision tree,0.963415,0.942748,0.986957,0.988,0.938017
3,SVM,0.945122,0.923954,0.969432,0.972,0.917355
2,Logisitcs regression,0.936992,0.91635,0.960699,0.964,0.909091


## Pickle model and scaler
While other models provide better performance, they do not directly provide probability estimates. Since these are important for giving finer-grained information to the user, Model 2 (Logistic Regression with scaling) is kept instead.

In [43]:
# Serialize the selected classifier (Model 2) and the fitted scaler into the
# ../Server directory, one pickle file each
artifacts = (
    (model_2, '../Server/blackbox.model'),
    (X_scaler, '../Server/blackbox.scaler'),
)
for artifact, destination in artifacts:
    with open(destination, 'wb') as handle:
        pickle.dump(artifact, handle)