# Train and Test BlackBox Model

In [1]:
# Ignore all warnings
from warnings import simplefilter
simplefilter(action='ignore')

# Import dependencies
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from matplotlib import pyplot as plt

# Import the models from SKLearn (Model 1 through Model 6)
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

# Local module
from ml_classification import model_performance

## Load data

In [2]:
# Shooter data
shooters_df = pd.read_csv(Path('model_blackbox_shooters.csv'))

# General Population data
genpop_df = pd.read_csv(Path('model_blackbox_genpop.csv'))

In [3]:
all_data = pd.concat([shooters_df, genpop_df], axis=0)
all_data

Unnamed: 0,Age,Gender,Race,Immigrant,Education,RelStatus,Employed,Work,MilService,Arrested,ParentDivorce,SES,MentalIllness,MentalIllnessHistory,Autism,HealthIssues,Classification
0,25,Male,White,No,Some college/trade school,Married,Not working,Unknown,Yes,Yes,No evidence,Middle class,Yes,No evidence,No evidence,Yes,1
1,18,Male,White,No,Less than high school,Single,Not working,Unknown,No,No,No evidence,Middle class,Yes,No evidence,No evidence,No evidence,1
2,39,Male,White,No,Some college/trade school,Married,Working,In between,Yes,No,No evidence,Middle class,No evidence,No evidence,No evidence,No evidence,1
3,56,Male,White,No,Unknown,Single,Working,Blue collar,No,No,No evidence,Unknown,No evidence,No evidence,No evidence,No evidence,1
4,31,Male,Black,No,Some college/trade school,Married,Not working,In between,Yes,No,Yes,Middle class,Yes,No evidence,No evidence,No evidence,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,63,Male,White,No,High school/GED,Divorced/separated/widowed,Working,Blue collar,No,No,No evidence,Lower class,Yes,Yes,No evidence,No evidence,0
9996,11,Male,Black,No,High school/GED,Single,Not working,Blue collar,No,No,No evidence,Middle class,No evidence,No evidence,No evidence,No evidence,0
9997,42,Male,White,No,Some college/trade school,Married,Not Working,In between,No,No,No evidence,Lower class,Yes,Yes,No evidence,No evidence,0
9998,13,Male,Asian,No,Some college/trade school,Single,Not working,In between,No,Yes,No evidence,Upper class,No evidence,No evidence,No evidence,No evidence,0


In [4]:
# Separate the data into labels and features
# Separate the y variable, the labels
y = all_data['Classification']

# Separate the X variable, the features
X = all_data.drop(columns='Classification')

# Get dummies
dummies_df = pd.get_dummies(X.drop(columns=['Age']))
X = pd.concat([X['Age'], dummies_df], axis=1)


In [5]:
# Split the data using train_test_split
# Assign a random_state of 1 to the function
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=1,stratify=y)

In [6]:
# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
model_1 = LogisticRegression(solver='lbfgs', random_state=1, max_iter=100)

# Fit the model using training data
model_1 = model_1.fit(X_train, y_train)

# Score the model
print(f"Training Data Score: {model_1.score(X_train, y_train)}")
print(f"Testing Data Score: {model_1.score(X_test, y_test)}")

Training Data Score: 0.9943746729461015
Testing Data Score: 0.99176147508827


## Pickle model

In [7]:
# Save model
import pickle

# save
with open('../Server/blackbox.model','wb') as f:
    pickle.dump(model_1,f)

In [8]:
X_test.columns

Index(['Age', 'Gender_Female', 'Gender_Male', 'Gender_Other', 'Race_Asian',
       'Race_Black', 'Race_Latinx', 'Race_Other', 'Race_White', 'Immigrant_No',
       'Immigrant_Unknown', 'Immigrant_Yes', 'Education_Bachelor's degree',
       'Education_Graduate school/advanced degree',
       'Education_High school/GED', 'Education_Less than high school',
       'Education_Some college/trade school', 'Education_Unknown',
       'RelStatus_Boyfriend/girlfriend',
       'RelStatus_Divorced/separated/widowed', 'RelStatus_Married',
       'RelStatus_Single', 'RelStatus_Unknown', 'Employed_Not Working',
       'Employed_Not working', 'Employed_Unknown', 'Employed_Working',
       'Work_Blue collar', 'Work_In between', 'Work_Unknown',
       'Work_White collar', 'MilService_No', 'MilService_Yes', 'Arrested_No',
       'Arrested_Yes', 'ParentDivorce_No evidence', 'ParentDivorce_Yes',
       'SES_Lower class', 'SES_Middle class', 'SES_Unknown', 'SES_Upper class',
       'MentalIllness_No evidence

In [9]:
# Make a prediction using the testing data
print('---')
print("Predictions vs Actual classification:")
print(f"Number of features: {len(X_test.columns)}")
predictions_1 = model_1.predict(X_test)
classification_df = pd.DataFrame({'Prediction': predictions_1, "Actual": y_test})
classification_df.head(10)

---
Predictions vs Actual classification:
Number of features: 49


Unnamed: 0,Prediction,Actual
1199,0,0
7131,0,0
8798,0,0
79,1,1
4160,0,0
4773,0,0
148,0,0
2250,0,0
4725,0,0
2511,0,0


In [10]:
model_1_metrics = model_performance(y_test, predictions_1, True)

Confusion Matrix:
          Predicted 0  Predicted 1
Actual 0         2500            1
Actual 1           20           28
---
Accuracy Score : 0.99176147508827
---
Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      1.00      2501
           1       0.97      0.58      0.73        48

    accuracy                           0.99      2549
   macro avg       0.98      0.79      0.86      2549
weighted avg       0.99      0.99      0.99      2549



In [11]:
false_neg = classification_df.loc[(classification_df['Actual']==1) & (classification_df['Prediction']==0),:]

print(f"{len(false_neg)} False Negative")
false_neg

20 False Negative


Unnamed: 0,Prediction,Actual
181,0,1
121,0,1
163,0,1
120,0,1
103,0,1
152,0,1
132,0,1
134,0,1
140,0,1
191,0,1


In [12]:
false_pos = classification_df.loc[(classification_df['Actual']==0) & (classification_df['Prediction']==1),:]

print(f"{len(false_pos)} False Negative")
false_pos

1 False Negative


Unnamed: 0,Prediction,Actual
4163,1,0


In [13]:
true_pos = classification_df.loc[(classification_df['Actual']==1) & (classification_df['Prediction']==1),:]

print(f"{len(true_pos)} True Positive")
true_pos

28 True Positive


Unnamed: 0,Prediction,Actual
79,1,1
68,1,1
44,1,1
138,1,1
40,1,1
28,1,1
130,1,1
38,1,1
8,1,1
101,1,1
