# Train and Test BlackBox Model

In [1]:
# Ignore all warnings
from warnings import simplefilter
simplefilter(action='ignore')

# Import dependencies
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from matplotlib import pyplot as plt
import pickle

# Import the models from SKLearn (Model 1 through Model 6)
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

# Local module
from ml_classification import model_performance

## Load data

In [2]:
# Shooter data
shooters_df = pd.read_csv(Path('model_blackbox_shooters_v8.csv'))

# General Population data
genpop_df = pd.read_csv(Path('model_blackbox_genpop_1000.csv'))

### Create a single table

In [3]:
all_data = pd.concat([shooters_df, genpop_df], axis=0)


# Up-sampling
dupl = 4

for i in range(dupl):
    all_data = pd.concat([all_data, shooters_df], axis=0)

all_data

Unnamed: 0,Age,Gender,Race,Immigrant,Education,RelStatus,Employed,Work,MilService,Arrested,ParentDivorce,SES,MentalIllness,MentalIllnessHistory,Autism,HealthIssues,Classification
0,25,Male,White,No,Some college/trade school,Married,Not working,Unknown,Yes,Yes,No evidence,Middle class,Yes,No evidence,No evidence,Yes,1
1,18,Male,White,No,Less than high school,Single,Not working,Unknown,No,No,No evidence,Middle class,Yes,No evidence,No evidence,No evidence,1
2,39,Male,White,No,Some college/trade school,Married,Working,In between,Yes,No,No evidence,Middle class,No evidence,No evidence,No evidence,No evidence,1
3,56,Male,White,No,Unknown,Single,Working,Blue collar,No,Yes,No evidence,Unknown,No evidence,No evidence,No evidence,No evidence,1
4,31,Male,Black,No,Some college/trade school,Married,Not working,In between,Yes,No,Yes,Middle class,Yes,No evidence,No evidence,No evidence,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
192,25,Male,White,No,Graduate school/advanced degree,Boyfriend/girlfriend,Working,White collar,No,No,No evidence,Middle class,Yes,No evidence,No evidence,No evidence,1
193,33,Male,Latinx,No,Unknown,Single,Not working,Blue collar,No,No,No evidence,Unknown,No evidence,No evidence,No evidence,No evidence,1
194,40,Male,Black,No,Some college/trade school,Single,Not working,White collar,No,Yes,No evidence,Middle class,No evidence,No evidence,No evidence,No evidence,1
195,40,Male,Black,Yes,High school/GED,Single,Not working,White collar,Yes,No,No evidence,Unknown,Yes,No evidence,No evidence,No evidence,1


In [4]:
# Separate the data into labels and features
# Separate the y variable, the labels
y = all_data['Classification']

# Separate the X variable, the features
X = all_data.drop(columns='Classification')

### Feature engineering

In [5]:
# Get dummies
dummies_df = pd.get_dummies(X.drop(columns=['Age']))
X = pd.concat([X['Age'], dummies_df], axis=1)

### Split the data

In [6]:
# Split the data using train_test_split
# Assign a random_state of 1 to the function
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=1,stratify=y)

# Creating StandardScaler instance
scaler = StandardScaler()

# Fitting Standard Scaller
X_scaler = scaler.fit(X_train)

# Scaling data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

## Train and test model

### Model 1: Logistics Regression (no scaling)

In [7]:
# Instantiate and fit the model
model_1 = LogisticRegression(solver='lbfgs', random_state=10, max_iter=40)
model_1 = model_1.fit(X_train, y_train)

# Score the model
print(f"Training Data Score: {model_1.score(X_train, y_train)}")
print(f"Testing Data Score: {model_1.score(X_test, y_test)}")
print('')

# Test model performance
predictions_1 = model_1.predict(X_test)
model_1_metrics = model_performance(y_test, predictions_1, True)

Training Data Score: 0.9435483870967742
Testing Data Score: 0.9275653923541247

Confusion Matrix:
          Predicted 0  Predicted 1
Actual 0          238           12
Actual 1           24          223
---
Accuracy Score : 0.9275653923541247
---
Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.95      0.93       250
           1       0.95      0.90      0.93       247

    accuracy                           0.93       497
   macro avg       0.93      0.93      0.93       497
weighted avg       0.93      0.93      0.93       497



### Model 2: Logistics Regression (with scaling)

In [8]:
# Instantiate and fit the model
model_2 = LogisticRegression(solver='lbfgs', random_state=10, max_iter=42)
model_2 = model_2.fit(X_train_scaled, y_train)

# Score the model
print(f"Training Data Score: {model_2.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {model_2.score(X_test_scaled, y_test)}")
print('')

# Test model performance
predictions_2 = model_2.predict(X_test_scaled) # Use predict_proba instead!
model_2_metrics = model_performance(y_test, predictions_2, True)

Training Data Score: 0.9657258064516129
Testing Data Score: 0.9537223340040242

Confusion Matrix:
          Predicted 0  Predicted 1
Actual 0          242            8
Actual 1           15          232
---
Accuracy Score : 0.9537223340040242
---
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.97      0.95       250
           1       0.97      0.94      0.95       247

    accuracy                           0.95       497
   macro avg       0.95      0.95      0.95       497
weighted avg       0.95      0.95      0.95       497



### Model 3: SVM

In [9]:
# Instantiate and fit the model
model_3 = SVC(kernel='linear')
model_3 = model_3.fit(X_train_scaled, y_train)

# Score the model
print(f"Training Data Score: {model_3.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {model_3.score(X_test_scaled, y_test)}")
print('')

# Test model performance
predictions_3 = model_3.predict(X_test_scaled)
model_3_metrics = model_performance(y_test, predictions_3, True)

Training Data Score: 0.9690860215053764
Testing Data Score: 0.9517102615694165

Confusion Matrix:
          Predicted 0  Predicted 1
Actual 0          240           10
Actual 1           14          233
---
Accuracy Score : 0.9517102615694165
---
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.96      0.95       250
           1       0.96      0.94      0.95       247

    accuracy                           0.95       497
   macro avg       0.95      0.95      0.95       497
weighted avg       0.95      0.95      0.95       497



### Model 4: Decision tree

In [10]:
# Creating the decision tree classifier instance
depth = 11
model_4 = DecisionTreeClassifier(max_depth=depth)

# Fitting the model
model_4 = model_4.fit(X_train_scaled, y_train)

# Score the model
print(f"Training Data Score: {model_4.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {model_4.score(X_test_scaled, y_test)}")
print('')

# Making predictions using the testing data
predictions_4 = model_4.predict(X_test_scaled)

# Model performance
model_4_metrics = model_performance(y_test, predictions_4, True)

# proba_4 = model_4.predict_proba(X_test_scaled)


Training Data Score: 0.9899193548387096
Testing Data Score: 0.971830985915493

Confusion Matrix:
          Predicted 0  Predicted 1
Actual 0          241            9
Actual 1            5          242
---
Accuracy Score : 0.971830985915493
---
Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.96      0.97       250
           1       0.96      0.98      0.97       247

    accuracy                           0.97       497
   macro avg       0.97      0.97      0.97       497
weighted avg       0.97      0.97      0.97       497



### Model 5: Random Forest

In [11]:
# Create a random forest classifier
estim = 220
model_5 = RandomForestClassifier(n_estimators=estim, random_state=1)

# Fitting the model
model_5 = model_5.fit(X_train_scaled, y_train)

# Score the model
print(f"Training Data Score: {model_5.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {model_5.score(X_test_scaled, y_test)}")
print('')

# Making predictions
predictions_5 = model_5.predict(X_test_scaled)

# Model performance
model_5_metrics = model_performance(y_test, predictions_5, True)

Training Data Score: 1.0
Testing Data Score: 0.993963782696177

Confusion Matrix:
          Predicted 0  Predicted 1
Actual 0          247            3
Actual 1            0          247
---
Accuracy Score : 0.993963782696177
---
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.99      0.99       250
           1       0.99      1.00      0.99       247

    accuracy                           0.99       497
   macro avg       0.99      0.99      0.99       497
weighted avg       0.99      0.99      0.99       497



### Model 6: KNN

In [12]:
# Instantiate the model
knn = 1
model_6 = KNeighborsClassifier(n_neighbors=knn)

# Train the model
model_6.fit(X_train_scaled,y_train)

# Score the model
print(f"Training Data Score: {model_6.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {model_6.score(X_test_scaled, y_test)}")
print('')

# Create predictions
predictions_6 = model_6.predict(X_test_scaled)

# Model performance
model_6_metrics = model_performance(y_test, predictions_6, True)

Training Data Score: 1.0
Testing Data Score: 0.9798792756539235

Confusion Matrix:
          Predicted 0  Predicted 1
Actual 0          240           10
Actual 1            0          247
---
Accuracy Score : 0.9798792756539235
---
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.96      0.98       250
           1       0.96      1.00      0.98       247

    accuracy                           0.98       497
   macro avg       0.98      0.98      0.98       497
weighted avg       0.98      0.98      0.98       497



## Overview of performance

In [13]:
models = ['Logisitcs regression', 'SVM', 'Decision tree', 'Random forest', 'KNN']
performance = [model_2_metrics, model_3_metrics, model_4_metrics, model_5_metrics, model_6_metrics]

models_performance = pd.DataFrame(performance)
models_performance['model'] = models
models_performance = models_performance[['model', 'accuracy', 'precision_0', 'precision_1', 'recall_0', 'recall_1']]
models_performance = models_performance.sort_values('recall_1', ascending=False)
models_performance.index += 2
models_performance

Unnamed: 0,model,accuracy,precision_0,precision_1,recall_0,recall_1
5,Random forest,0.993964,1.0,0.988,0.988,1.0
6,KNN,0.979879,1.0,0.961089,0.96,1.0
4,Decision tree,0.971831,0.979675,0.964143,0.964,0.979757
3,SVM,0.95171,0.944882,0.958848,0.96,0.94332
2,Logisitcs regression,0.953722,0.941634,0.966667,0.968,0.939271


## Pickle model and scaler
While other models provide better performance, they do not directly provide probability estimates. Since this is important to give finer information to the user, Model 2 (Logistics Regression with scaling) is kept instead.

In [43]:
# Save best model
with open('../Server/blackbox.model','wb') as f:
    pickle.dump(model_2,f)

# Save scaling
with open('../Server/blackbox.scaler','wb') as f:
    pickle.dump(X_scaler,f)