In [7]:
# Ignore all warnings
from warnings import simplefilter
simplefilter(action='ignore')

# Import dependencies
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from matplotlib import pyplot as plt

# Import the model classes from SKLearn
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
import pickle

# Local module
from ml_classification import model_performance

## Load data

In [9]:
# Read the general-population records from disk
genpop_df = pd.read_csv(Path('model_blackbox_shooters.csv'))

# Keep only the features: everything except the 'Classification' column
X = genpop_df.drop('Classification', axis=1)

# Run get_dummies over the non-'Age' columns, then put 'Age' back in front
dummies_df = pd.get_dummies(X.drop('Age', axis=1))
X = pd.concat([X[['Age']], dummies_df], axis='columns')

# Column labels of the encoded dataset, compared against the model's later on
dataset_features = X.columns

## Load model

In [10]:
# Deserialize the fitted model from disk.
# NOTE(review): unpickling executes code embedded in the file, so
# 'blackbox.model' must come from a trusted source.
with Path('blackbox.model').open('rb') as model_file:
    model_1 = pickle.load(model_file)

# Feature names recorded on the fitted model
model_features = model_1.feature_names_in_

## Prepare dataset for predictions

In [11]:
# Compare the dataset's columns with the columns the model was fitted on.
# Both directions are always checked, whatever the column counts are: a dataset
# with fewer (or more) columns than the model can still be missing some model
# features AND carry some features the model never saw.
dataset_feature_set = set(dataset_features)
model_feature_set = set(model_features)

# Features the model expects but the dataset does not provide (model order)
missing_features = [
    feature for feature in model_features if feature not in dataset_feature_set
]

# Features the dataset provides but the model was not fitted on (dataset order)
extra_features = [
    feature for feature in dataset_features if feature not in model_feature_set
]

# Align the dataset with the model in one step:
#   - extra features are dropped,
#   - missing features are added with a value of 0,
#   - columns are put in exactly the order of `model_features`.
# The order matters: X is later converted to a plain array, which carries no
# column names, so the model can only match features by position.
X = X.reindex(columns=list(model_features), fill_value=0)

In [12]:
# Convert the feature table to a plain NumPy array for the model
X = X.to_numpy()

## Make predictions

In [13]:
# Predicted label for every row
predictions_1 = model_1.predict(X)

# Second column of predict_proba for every row
positive_class_proba = model_1.predict_proba(X)[:, 1]

# Store the label and the probability (as a percentage) on the DataFrame
genpop_df['Classification'] = predictions_1
genpop_df['Probability'] = positive_class_proba * 100

In [14]:
# Rows predicted as class 1, highest probability first
genpop_df.loc[genpop_df['Classification'] == 1, :].sort_values('Probability', ascending=False)

Unnamed: 0,Age,Gender,Race,Immigrant,Education,RelStatus,Employed,Work,MilService,Arrested,ParentDivorce,SES,MentalIllness,MentalIllnessHistory,Autism,HealthIssues,Classification,Probability
191,25,Male,White,No,Graduate school/advanced degree,Unknown,Working,White collar,No,No,No evidence,Middle class,Yes,No evidence,No evidence,Yes,1,99.969954
187,31,Male,Black,No,Unknown,Single,Working,Blue collar,No,No,No evidence,Lower class,Yes,No evidence,No evidence,No evidence,1,99.951639
149,45,Male,Asian,Yes,Unknown,Divorced/separated/widowed,Working,Blue collar,No,No,No evidence,Lower class,No evidence,No evidence,No evidence,No evidence,1,99.935583
189,66,Male,Asian,Yes,Unknown,Married,Working,Blue collar,No,No,No evidence,Lower class,No evidence,No evidence,No evidence,No evidence,1,99.925876
112,39,Male,Other,No,Graduate school/advanced degree,Single,Working,White collar,Yes,No,No evidence,Middle class,No evidence,No evidence,No evidence,No evidence,1,99.920191
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99,32,Male,White,No,High school/GED,Married,Working,Blue collar,No,No,No evidence,Middle class,Yes,No evidence,No evidence,No evidence,1,57.394669
162,28,Male,White,No,Some college/trade school,Divorced/separated/widowed,Not working,Blue collar,Yes,No,No evidence,Middle class,No evidence,No evidence,No evidence,No evidence,1,53.427603
7,33,Male,White,No,Some college/trade school,Unknown,Working,Blue collar,Yes,Yes,No evidence,Middle class,No evidence,No evidence,No evidence,No evidence,1,52.466414
93,44,Male,White,No,High school/GED,Single,Working,In between,Yes,No,Yes,Middle class,Yes,No evidence,No evidence,Yes,1,51.537476
