In [66]:
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides warnings raised by the libraries used below,
# so problems they would normally report will not be shown.
from warnings import simplefilter
simplefilter('ignore')

# Import dependencies
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from matplotlib import pyplot as plt

# Import the models from SKLearn (Model 1 through Model 6)
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
import pickle

# Local module
from ml_classification import model_performance

## Load data

In [67]:
# Read the general-population records from disk
genpop_df = pd.read_csv(Path('model_blackbox_genpop.csv'))

# Every column except the target is a candidate feature
X = genpop_df.drop('Classification', axis=1)

# One-hot encode everything except Age, then put Age back as the first column
dummies_df = pd.get_dummies(X.drop('Age', axis=1))
X = pd.concat((X[['Age']], dummies_df), axis='columns')

# Column labels of the encoded dataset, compared against the model's features later
dataset_features = X.columns

## Load model

In [68]:
# Deserialize the previously saved model.
# NOTE(review): unpickling can execute arbitrary code; only load model files
# that come from a trusted source.
with Path('blackbox.model').open('rb') as model_file:
    model_1 = pickle.load(model_file)

# Feature names stored on the loaded model; used to align the dataset columns.
# Presumably these are the column names the model was fitted with -- confirm.
model_features = model_1.feature_names_in_

## Prepare dataset for predictions

In [69]:
# Align the dataset's columns with the features stored on the model.
#
# The comparison is done by name in both directions, regardless of how many
# columns each side has: a dataset can lack some model features and carry
# unknown ones at the same time, which a column-count comparison cannot detect.
model_feature_names = list(model_features)
dataset_feature_names = set(dataset_features)
model_feature_set = set(model_feature_names)

# Model features that the dataset does not provide (in the model's order)
missing_features = [
    name for name in model_feature_names if name not in dataset_feature_names
]

# Dataset features that the model does not know about (in the dataset's order)
extra_features = [
    name for name in dataset_features if name not in model_feature_set
]

# Build the final feature table in one step:
#   * columns listed in extra_features are dropped,
#   * columns listed in missing_features are created and filled with 0,
#   * columns are put in exactly the order given by model_features.
# The ordering matters because the feature names are stripped before
# prediction, so the model can only match features by position.
X = X.reindex(columns=model_feature_names, fill_value=0)

In [70]:
# Data Preparation
# Convert the aligned feature table to a plain NumPy array for the model.
# np.asarray is used instead of the `.values` attribute so that re-running this
# cell after X has already been converted is a no-op instead of an AttributeError.
X = np.asarray(X)

## Make predictions

In [94]:
# Score the prepared feature matrix with the loaded model
predictions_1 = model_1.predict(X)
class_probabilities = model_1.predict_proba(X)

# Store the predicted label on the original records
genpop_df['Classification'] = predictions_1

# Store the second predict_proba column as a percentage.
# NOTE(review): presumably column 1 corresponds to class 1 -- verify against model_1.classes_
genpop_df['Probability'] = class_probabilities[:, 1] * 100

In [95]:
# Records predicted as class 1, highest probability first
predicted_positive = genpop_df[genpop_df['Classification'] == 1]
predicted_positive.sort_values(by='Probability', ascending=False)

Unnamed: 0,Age,Gender,Race,Immigrant,Education,RelStatus,Employed,Work,MilService,Arrested,ParentDivorce,SES,MentalIllness,MentalIllnessHistory,Autism,HealthIssues,Classification,Probability
7058,39,Male,White,No,Bachelor's degree,Boyfriend/girlfriend,Working,White collar,No,Yes,No evidence,Lower class,No evidence,Yes,Diagnosed or extremely likely,No evidence,1,99.979946
7858,46,Male,Latinx,No,Bachelor's degree,Boyfriend/girlfriend,Not Working,White collar,No,Yes,No evidence,Lower class,No evidence,Yes,Diagnosed or extremely likely,No evidence,1,99.943663
3685,18,Male,White,No,Bachelor's degree,Divorced/separated/widowed,Working,White collar,No,Yes,No evidence,Lower class,Yes,No evidence,No evidence,No evidence,1,99.930192
4881,24,Male,Asian,No,Bachelor's degree,Divorced/separated/widowed,Working,White collar,No,Yes,No evidence,Upper class,No evidence,No evidence,No evidence,No evidence,1,99.877975
4974,24,Male,White,No,Bachelor's degree,Divorced/separated/widowed,Working,White collar,No,Yes,No evidence,Lower class,No evidence,Yes,No evidence,No evidence,1,99.867040
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8775,50,Male,White,No,High school/GED,Boyfriend/girlfriend,Working,Blue collar,No,No,No evidence,Upper class,No evidence,Yes,Diagnosed or extremely likely,No evidence,1,50.711020
178,59,Male,White,No,Bachelor's degree,Divorced/separated/widowed,Not Working,White collar,No,No,No evidence,Lower class,No evidence,No evidence,No evidence,No evidence,1,50.654316
8169,45,Male,Other,Yes,High school/GED,Divorced/separated/widowed,Working,Blue collar,No,No,No evidence,Lower class,No evidence,No evidence,No evidence,No evidence,1,50.535135
6815,33,Male,White,Yes,Graduate school/advanced degree,Divorced/separated/widowed,Not Working,White collar,No,No,No evidence,Upper class,No evidence,No evidence,No evidence,No evidence,1,50.215142
