# Train and Test BlackBox Model

In [1]:
# Ignore all warnings
from warnings import simplefilter
simplefilter(action='ignore')

# Import dependencies
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from matplotlib import pyplot as plt
import pickle

# Import the models from SKLearn (Model 1 through Model 6)
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

# Local module
from ml_classification import model_performance

## Load data

In [2]:
# Shooter data
shooters_df = pd.read_csv(Path('model_blackbox_shooters.csv'))

# General Population data
genpop_df = pd.read_csv(Path('model_blackbox_genpop.csv'))

### Create a single table

In [3]:
all_data = pd.concat([shooters_df, genpop_df], axis=0)
all_data

Unnamed: 0,Age,Gender,Race,Immigrant,Education,RelStatus,Employed,Work,MilService,Arrested,ParentDivorce,SES,MentalIllness,MentalIllnessHistory,Autism,HealthIssues,Classification
0,25,Male,White,No,Some college/trade school,Married,Not working,Unknown,Yes,Yes,No evidence,Middle class,Yes,No evidence,No evidence,Yes,1
1,18,Male,White,No,Less than high school,Single,Not working,Unknown,No,No,No evidence,Middle class,Yes,No evidence,No evidence,No evidence,1
2,39,Male,White,No,Some college/trade school,Married,Working,In between,Yes,No,No evidence,Middle class,No evidence,No evidence,No evidence,No evidence,1
3,56,Male,White,No,Unknown,Single,Working,Blue collar,No,No,No evidence,Unknown,No evidence,No evidence,No evidence,No evidence,1
4,31,Male,Black,No,Some college/trade school,Married,Not working,In between,Yes,No,Yes,Middle class,Yes,No evidence,No evidence,No evidence,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,20,Male,White,No,Bachelor's degree,Divorced/separated/widowed,Not Working,White collar,No,No,No evidence,Upper class,No evidence,No evidence,No evidence,No evidence,0
9996,53,Female,Black,No,Less than high school,Single,Working,Blue collar,No,No,No evidence,Lower class,No evidence,No evidence,No evidence,No evidence,0
9997,40,Female,White,No,High school/GED,Boyfriend/girlfriend,Working,Blue collar,No,No,No evidence,Upper class,No evidence,No evidence,No evidence,No evidence,0
9998,24,Other,White,No,High school/GED,Boyfriend/girlfriend,Not Working,Blue collar,No,Yes,No evidence,Upper class,No evidence,No evidence,No evidence,No evidence,0


In [4]:
# Separate the data into labels and features
# Separate the y variable, the labels
y = all_data['Classification']

# Separate the X variable, the features
X = all_data.drop(columns='Classification')

### Feature engineering

In [5]:
# Get dummies
dummies_df = pd.get_dummies(X.drop(columns=['Age']))
X = pd.concat([X['Age'], dummies_df], axis=1)

### Split the data

In [6]:
# Split the data using train_test_split
# Assign a random_state of 1 to the function
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=1,stratify=y)

# Creating StandardScaler instance
scaler = StandardScaler()

# Fitting Standard Scaller
X_scaler = scaler.fit(X_train)

# Scaling data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

## Train and test model

### Model 1: Logistics Regression (no scaling)

In [36]:
# Instantiate and fit the model
model_1 = LogisticRegression(solver='lbfgs', random_state=10, max_iter=40)
model_1 = model_1.fit(X_train, y_train)

# Score the model
print(f"Training Data Score: {model_1.score(X_train, y_train)}")
print(f"Testing Data Score: {model_1.score(X_test, y_test)}")
print('')

# Test model performance
predictions_1 = model_1.predict(X_test)
model_1_metrics = model_performance(y_test, predictions_1, True)

Training Data Score: 0.9935897435897436
Testing Data Score: 0.9929384072185171

Confusion Matrix:
          Predicted 0  Predicted 1
Actual 0         2500            1
Actual 1           17           31
---
Accuracy Score : 0.9929384072185171
---
Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      1.00      2501
           1       0.97      0.65      0.78        48

    accuracy                           0.99      2549
   macro avg       0.98      0.82      0.89      2549
weighted avg       0.99      0.99      0.99      2549



### Model 2: Logistics Regression (with scaling)

In [38]:
# Instantiate and fit the model
model_2 = LogisticRegression(solver='lbfgs', random_state=10, max_iter=42)
model_2 = model_2.fit(X_train_scaled, y_train)

# Score the model
print(f"Training Data Score: {model_2.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {model_2.score(X_test_scaled, y_test)}")
print('')

# Test model performance
predictions_2 = model_2.predict(X_test_scaled) # Use predict_proba instead!
model_2_metrics = model_performance(y_test, predictions_2, True)

Training Data Score: 0.9959445316588174
Testing Data Score: 0.9945076500588466

Confusion Matrix:
          Predicted 0  Predicted 1
Actual 0         2498            3
Actual 1           11           37
---
Accuracy Score : 0.9945076500588466
---
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2501
           1       0.93      0.77      0.84        48

    accuracy                           0.99      2549
   macro avg       0.96      0.88      0.92      2549
weighted avg       0.99      0.99      0.99      2549



### Model 3: SVM

In [9]:
# Instantiate and fit the model
model_3 = SVC(kernel='linear')
model_3 = model_3.fit(X_train_scaled, y_train)

# Score the model
print(f"Training Data Score: {model_3.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {model_3.score(X_test_scaled, y_test)}")
print('')

# Test model performance
predictions_3 = model_3.predict(X_test_scaled)
model_3_metrics = model_performance(y_test, predictions_3, True)

Training Data Score: 0.9958137100994244
Testing Data Score: 0.9941153393487642

Confusion Matrix:
          Predicted 0  Predicted 1
Actual 0         2498            3
Actual 1           12           36
---
Accuracy Score : 0.9941153393487642
---
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2501
           1       0.92      0.75      0.83        48

    accuracy                           0.99      2549
   macro avg       0.96      0.87      0.91      2549
weighted avg       0.99      0.99      0.99      2549



### Model 4: Decision tree

In [39]:
# Creating the decision tree classifier instance
depth = 11
model_4 = DecisionTreeClassifier(max_depth=depth)

# Fitting the model
model_4 = model_4.fit(X_train_scaled, y_train)

# Score the model
print(f"Training Data Score: {model_4.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {model_4.score(X_test_scaled, y_test)}")
print('')

# Making predictions using the testing data
predictions_4 = model_4.predict(X_test_scaled)

# Model performance
model_4_metrics = model_performance(y_test, predictions_4, True)

# proba_4 = model_4.predict_proba(X_test_scaled)


Training Data Score: 0.9982993197278912
Testing Data Score: 0.9972538250294233

Confusion Matrix:
          Predicted 0  Predicted 1
Actual 0         2498            3
Actual 1            4           44
---
Accuracy Score : 0.9972538250294233
---
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2501
           1       0.94      0.92      0.93        48

    accuracy                           1.00      2549
   macro avg       0.97      0.96      0.96      2549
weighted avg       1.00      1.00      1.00      2549



### Model 5: Random Forest

In [34]:
# Create a random forest classifier
estim = 220
model_5 = RandomForestClassifier(n_estimators=estim, random_state=1)

# Fitting the model
model_5 = model_5.fit(X_train_scaled, y_train)

# Score the model
print(f"Training Data Score: {model_5.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {model_5.score(X_test_scaled, y_test)}")
print('')

# Making predictions
predictions_5 = model_5.predict(X_test_scaled)

# Model performance
model_5_metrics = model_performance(y_test, predictions_5, True)

Training Data Score: 1.0
Testing Data Score: 0.994899960768929

Confusion Matrix:
          Predicted 0  Predicted 1
Actual 0         2498            3
Actual 1           10           38
---
Accuracy Score : 0.994899960768929
---
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2501
           1       0.93      0.79      0.85        48

    accuracy                           0.99      2549
   macro avg       0.96      0.90      0.93      2549
weighted avg       0.99      0.99      0.99      2549



### Model 6: KNN

In [35]:
# Instantiate the model
knn = 1
model_6 = KNeighborsClassifier(n_neighbors=knn)

# Train the model
model_6.fit(X_train_scaled,y_train)

# Score the model
print(f"Training Data Score: {model_6.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {model_6.score(X_test_scaled, y_test)}")
print('')

# Create predictions
predictions_6 = model_6.predict(X_test_scaled)

# Model performance
model_6_metrics = model_performance(y_test, predictions_6, True)

Training Data Score: 1.0
Testing Data Score: 0.9909768536681052

Confusion Matrix:
          Predicted 0  Predicted 1
Actual 0         2491           10
Actual 1           13           35
---
Accuracy Score : 0.9909768536681052
---
Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      1.00      2501
           1       0.78      0.73      0.75        48

    accuracy                           0.99      2549
   macro avg       0.89      0.86      0.87      2549
weighted avg       0.99      0.99      0.99      2549



## Pickle model and scaler
While SVM (Model 3) provide better performance, SVMs do not directly provide probability estimates. Since this is important to give finer information to the user, Model 2 (Logistics Regression with scaling) is kept instead.

In [10]:
# Save best model
with open('../Server/blackbox.model','wb') as f:
    pickle.dump(model_2,f)

# Save scaling
with open('../Server/blackbox.scaler','wb') as f:
    pickle.dump(X_scaler,f)