# BlackBox

## Shooter clustering

In [42]:
# Import required libraries and dependencies
import sys
from pathlib import Path

import pandas as pd
import hvplot.pandas
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Extend the module search path BEFORE importing any project-local module, so
# the imports below resolve on a fresh kernel regardless of which of them live
# in ../Local_Modules/ (appending cannot shadow modules found earlier on the path).
sys.path.append("../Local_Modules/")

# Import local modules
import ml_commons as ml
import codebook
from codebook import get_distribution

In [32]:
# Read the shooter records from the CSV file (path is relative to the working directory)
shooters_df = pd.read_csv(Path('model_blackbox_shooters.csv'))

# Clustering here is unsupervised, so the 'Classification' column is left out
# of the feature frame X
X = shooters_df.drop(labels=['Classification'], axis='columns')
X.head(3)

Unnamed: 0,Age,Gender,Race,Immigrant,Education,RelStatus,Employed,Work,MilService,Arrested,ParentDivorce,SES,MentalIllness,MentalIllnessHistory,Autism,HealthIssues
0,25,Male,White,No,Some college/trade school,Married,Not working,Unknown,Yes,Yes,No evidence,Middle class,Yes,No evidence,No evidence,Yes
1,18,Male,White,No,Less than high school,Single,Not working,Unknown,No,No,No evidence,Middle class,Yes,No evidence,No evidence,No evidence
2,39,Male,White,No,Some college/trade school,Married,Working,In between,Yes,No,No evidence,Middle class,No evidence,No evidence,No evidence,No evidence


In [33]:
# Names of the columns to encode: every feature column except 'Age',
# which is kept as a plain number
non_num_cols = X.columns.tolist()
non_num_cols.remove('Age')

In [34]:
# Expand each selected column into indicator (dummy) columns, one per category value
dummies_df = pd.get_dummies(data=X.loc[:, non_num_cols])

In [35]:
# Put the numeric 'Age' column in front of the dummy columns.
# Any 'Age' column already present in dummies_df is dropped first, so
# re-running this cell gives the same frame instead of stacking a duplicate
# 'Age' column on every execution (on the first run the drop is a no-op).
dummies_df = pd.concat(
    [X['Age'], dummies_df.drop(columns='Age', errors='ignore')],
    axis=1,
)
dummies_df.head(3)

Unnamed: 0,Age,Gender_Female,Gender_Male,Gender_Other,Race_Asian,Race_Black,Race_Latinx,Race_Other,Race_White,Immigrant_No,...,SES_Unknown,SES_Upper class,MentalIllness_No evidence,MentalIllness_Yes,MentalIllnessHistory_No evidence,MentalIllnessHistory_Yes,Autism_Diagnosed or extremely likely,Autism_No evidence,HealthIssues_No evidence,HealthIssues_Yes
0,25,0,1,0,0,0,0,0,1,1,...,0,0,0,1,1,0,0,1,0,1
1,18,0,1,0,0,0,0,0,1,1,...,0,0,0,1,1,0,0,1,1,0
2,39,0,1,0,0,0,0,0,1,1,...,0,0,1,0,1,0,0,1,1,0


### Elbow method

In [36]:
# Ask the project helper for the elbow-curve data computed from the encoded features
df_elbow = ml.get_elbow(dummies_df)

# Draw inertia against k as a line chart; the bend in the curve is read off
# visually to choose the number of clusters.
elbow_plot = df_elbow.hvplot.line(
    x="k",
    y="inertia",
    title="Elbow Curve (with original features)",
)
# Optional export, left disabled:
# hvplot.save(elbow_plot, 'img/elbow_plot.png')
elbow_plot

## Create 3 clusters

In [37]:
# Fit the clustering model through the project helper, asking for three clusters.
# NOTE(review): no scaling step is visible in this notebook before this call.
# Confirm whether ml.fit_model scales its input internally; if it does not,
# 'Age' (tens of years) will outweigh the 0/1 dummy columns in the distances.
n_clusters = 3
clusters_df = ml.fit_model(dummies_df, n_clusters)

In [39]:
# Display the cluster label assigned to each row by the fitted model
clusters_df['cluster']

0      1
1      1
2      2
3      0
4      1
      ..
188    0
189    0
190    1
191    1
192    2
Name: cluster, Length: 193, dtype: int32

In [41]:
# Attach each row's cluster label to the readable (un-encoded) feature frame.
# An explicit copy is taken instead of the X[:] slice: assigning a new column
# to a slice can raise pandas' SettingWithCopyWarning and leaves it unclear
# whether X is affected. With .copy(), X is guaranteed to stay untouched.
shooters_clusters_df = X.copy()
# Column assignment aligns on the index, so labels are matched to rows by index label
shooters_clusters_df['Cluster'] = clusters_df['cluster']
shooters_clusters_df

Unnamed: 0,Age,Gender,Race,Immigrant,Education,RelStatus,Employed,Work,MilService,Arrested,ParentDivorce,SES,MentalIllness,MentalIllnessHistory,Autism,HealthIssues,Cluster
0,25,Male,White,No,Some college/trade school,Married,Not working,Unknown,Yes,Yes,No evidence,Middle class,Yes,No evidence,No evidence,Yes,1
1,18,Male,White,No,Less than high school,Single,Not working,Unknown,No,No,No evidence,Middle class,Yes,No evidence,No evidence,No evidence,1
2,39,Male,White,No,Some college/trade school,Married,Working,In between,Yes,No,No evidence,Middle class,No evidence,No evidence,No evidence,No evidence,2
3,56,Male,White,No,Unknown,Single,Working,Blue collar,No,No,No evidence,Unknown,No evidence,No evidence,No evidence,No evidence,0
4,31,Male,Black,No,Some college/trade school,Married,Not working,In between,Yes,No,Yes,Middle class,Yes,No evidence,No evidence,No evidence,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
188,72,Male,Asian,Yes,Unknown,Divorced/separated/widowed,Not working,Unknown,No,Yes,No evidence,Lower class,No evidence,No evidence,No evidence,No evidence,0
189,66,Male,Asian,Yes,Unknown,Married,Working,Blue collar,No,No,No evidence,Lower class,No evidence,No evidence,No evidence,No evidence,0
190,28,Other,White,No,Bachelor's degree,Single,Working,White collar,No,No,No evidence,Middle class,Yes,No evidence,Diagnosed or extremely likely,No evidence,1
191,25,Male,White,No,Graduate school/advanced degree,Unknown,Working,White collar,No,No,No evidence,Middle class,Yes,No evidence,No evidence,Yes,1


In [57]:
# Compare the distribution of one field across the three clusters.
# NOTE(review): the result is stored as MilService_df although it holds the
# distribution of `field`; the name is kept so the notebook namespace is unchanged.
field = 'Gender'

# Cluster 0 seeds the table; clusters 1 and 2 are outer-joined onto it on the
# category column ('index'), so a category absent from a cluster shows as NaN.
# The third argument (0) is passed to get_distribution unchanged; see codebook for its meaning.
MilService_df = None
for cluster_id in (0, 1, 2):
    members = shooters_clusters_df.loc[shooters_clusters_df['Cluster'] == cluster_id, :]
    dist = get_distribution(members, field, 0).reset_index()
    if MilService_df is None:
        MilService_df = dist
    else:
        MilService_df = MilService_df.merge(dist, how='outer', on='index')
MilService_df

Unnamed: 0,index,Gender_x,Percent_x,Gender_y,Percent_y,Gender,Percent
0,Male,22.0,95.652174,90,96.774194,75.0,97.402597
1,Female,1.0,4.347826,1,1.075269,2.0,2.597403
2,Other,,,2,2.150538,,


In [58]:
# Compare the distribution of one field across the three clusters.
# NOTE(review): the result is stored as MilService_df although it holds the
# distribution of `field`; the name is kept so the notebook namespace is unchanged.
field = 'Race'

# Cluster 0 seeds the table; clusters 1 and 2 are outer-joined onto it on the
# category column ('index'), so a category absent from a cluster shows as NaN.
# The third argument (0) is passed to get_distribution unchanged; see codebook for its meaning.
MilService_df = None
for cluster_id in (0, 1, 2):
    members = shooters_clusters_df.loc[shooters_clusters_df['Cluster'] == cluster_id, :]
    dist = get_distribution(members, field, 0).reset_index()
    if MilService_df is None:
        MilService_df = dist
    else:
        MilService_df = MilService_df.merge(dist, how='outer', on='index')
MilService_df

Unnamed: 0,index,Race_x,Percent_x,Race_y,Percent_y,Race,Percent
0,White,11,47.826087,55,59.139785,37,48.051948
1,Black,4,17.391304,18,19.354839,18,23.376623
2,Other,3,13.043478,11,11.827957,6,7.792208
3,Asian,3,13.043478,4,4.301075,6,7.792208
4,Latinx,2,8.695652,5,5.376344,10,12.987013


In [59]:
# Compare the distribution of one field across the three clusters.
# NOTE(review): the result is stored as MilService_df although it holds the
# distribution of `field`; the name is kept so the notebook namespace is unchanged.
field = 'Education'

# Cluster 0 seeds the table; clusters 1 and 2 are outer-joined onto it on the
# category column ('index'), so a category absent from a cluster shows as NaN.
# The third argument (0) is passed to get_distribution unchanged; see codebook for its meaning.
MilService_df = None
for cluster_id in (0, 1, 2):
    members = shooters_clusters_df.loc[shooters_clusters_df['Cluster'] == cluster_id, :]
    dist = get_distribution(members, field, 0).reset_index()
    if MilService_df is None:
        MilService_df = dist
    else:
        MilService_df = MilService_df.merge(dist, how='outer', on='index')
MilService_df

Unnamed: 0,index,Education_x,Percent_x,Education_y,Percent_y,Education,Percent
0,Unknown,11,47.826087,14,15.053763,28,36.363636
1,Graduate school/advanced degree,3,13.043478,6,6.451613,3,3.896104
2,Bachelor's degree,3,13.043478,3,3.225806,9,11.688312
3,Some college/trade school,3,13.043478,31,33.333333,18,23.376623
4,High school/GED,2,8.695652,16,17.204301,16,20.779221
5,Less than high school,1,4.347826,23,24.731183,3,3.896104


In [60]:
# Compare the distribution of one field across the three clusters.
# NOTE(review): the result is stored as MilService_df although it holds the
# distribution of `field`; the name is kept so the notebook namespace is unchanged.
field = 'RelStatus'

# Cluster 0 seeds the table; clusters 1 and 2 are outer-joined onto it on the
# category column ('index'), so a category absent from a cluster shows as NaN.
# The third argument (0) is passed to get_distribution unchanged; see codebook for its meaning.
MilService_df = None
for cluster_id in (0, 1, 2):
    members = shooters_clusters_df.loc[shooters_clusters_df['Cluster'] == cluster_id, :]
    dist = get_distribution(members, field, 0).reset_index()
    if MilService_df is None:
        MilService_df = dist
    else:
        MilService_df = MilService_df.merge(dist, how='outer', on='index')
MilService_df

Unnamed: 0,index,RelStatus_x,Percent_x,RelStatus_y,Percent_y,RelStatus,Percent
0,Divorced/separated/widowed,7.0,30.434783,5,5.376344,19,24.675325
1,Married,7.0,30.434783,8,8.602151,15,19.480519
2,Single,5.0,21.73913,63,67.741935,29,37.662338
3,Boyfriend/girlfriend,4.0,17.391304,12,12.903226,12,15.584416
4,Unknown,,,5,5.376344,2,2.597403


In [54]:
# Compare the distribution of military service across the three clusters.
field = 'MilService'

# Cluster 0 seeds the table; clusters 1 and 2 are outer-joined onto it on the
# category column ('index'), so a category absent from a cluster shows as NaN.
# The third argument (0) is passed to get_distribution unchanged; see codebook for its meaning.
MilService_df = None
for cluster_id in (0, 1, 2):
    members = shooters_clusters_df.loc[shooters_clusters_df['Cluster'] == cluster_id, :]
    dist = get_distribution(members, field, 0).reset_index()
    if MilService_df is None:
        MilService_df = dist
    else:
        MilService_df = MilService_df.merge(dist, how='outer', on='index')
MilService_df

Unnamed: 0,index,MilService_x,Percent_x,MilService_y,Percent_y,MilService,Percent
0,No,20,86.956522,78,83.870968,51,66.233766
1,Yes,3,13.043478,15,16.129032,26,33.766234


In [55]:
# Compare the distribution of one field across the three clusters.
# NOTE(review): the result is stored as MilService_df although it holds the
# distribution of `field`; the name is kept so the notebook namespace is unchanged.
field = 'Immigrant'

# Cluster 0 seeds the table; clusters 1 and 2 are outer-joined onto it on the
# category column ('index'), so a category absent from a cluster shows as NaN.
# The third argument (0) is passed to get_distribution unchanged; see codebook for its meaning.
MilService_df = None
for cluster_id in (0, 1, 2):
    members = shooters_clusters_df.loc[shooters_clusters_df['Cluster'] == cluster_id, :]
    dist = get_distribution(members, field, 0).reset_index()
    if MilService_df is None:
        MilService_df = dist
    else:
        MilService_df = MilService_df.merge(dist, how='outer', on='index')
MilService_df

Unnamed: 0,index,Immigrant_x,Percent_x,Immigrant_y,Percent_y,Immigrant,Percent
0,No,18.0,78.26087,82.0,88.172043,62,80.519481
1,Yes,5.0,21.73913,11.0,11.827957,14,18.181818
2,Unknown,,,,,1,1.298701


In [61]:
# Compare the distribution of one field across the three clusters.
# NOTE(review): the result is stored as MilService_df although it holds the
# distribution of `field`; the name is kept so the notebook namespace is unchanged.
field = 'Employed'

# Cluster 0 seeds the table; clusters 1 and 2 are outer-joined onto it on the
# category column ('index'), so a category absent from a cluster shows as NaN.
# The third argument (0) is passed to get_distribution unchanged; see codebook for its meaning.
MilService_df = None
for cluster_id in (0, 1, 2):
    members = shooters_clusters_df.loc[shooters_clusters_df['Cluster'] == cluster_id, :]
    dist = get_distribution(members, field, 0).reset_index()
    if MilService_df is None:
        MilService_df = dist
    else:
        MilService_df = MilService_df.merge(dist, how='outer', on='index')
MilService_df

Unnamed: 0,index,Employed_x,Percent_x,Employed_y,Percent_y,Employed,Percent
0,Not working,12.0,52.173913,54,58.064516,39,50.649351
1,Working,11.0,47.826087,32,34.408602,33,42.857143
2,Unknown,,,7,7.526882,5,6.493506


In [62]:
# Compare the distribution of one field across the three clusters.
# NOTE(review): the result is stored as MilService_df although it holds the
# distribution of `field`; the name is kept so the notebook namespace is unchanged.
field = 'Arrested'

# Cluster 0 seeds the table; clusters 1 and 2 are outer-joined onto it on the
# category column ('index'), so a category absent from a cluster shows as NaN.
# The third argument (0) is passed to get_distribution unchanged; see codebook for its meaning.
MilService_df = None
for cluster_id in (0, 1, 2):
    members = shooters_clusters_df.loc[shooters_clusters_df['Cluster'] == cluster_id, :]
    dist = get_distribution(members, field, 0).reset_index()
    if MilService_df is None:
        MilService_df = dist
    else:
        MilService_df = MilService_df.merge(dist, how='outer', on='index')
MilService_df

Unnamed: 0,index,Arrested_x,Percent_x,Arrested_y,Percent_y,Arrested,Percent
0,No,12,52.173913,45,48.387097,34,44.155844
1,Yes,11,47.826087,48,51.612903,43,55.844156
