In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import PolynomialFeatures, StandardScaler, MinMaxScaler, LabelEncoder

from sklearn.model_selection import train_test_split, cross_val_score, cross_validate
from sklearn.metrics import mean_squared_error, roc_auc_score, accuracy_score, roc_curve
from sklearn.inspection import permutation_importance

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

# !pip install prettytable
from prettytable import PrettyTable


## Basic Modeling Part 1a
* Predicting race from police dept features (police demographics, policies, etc.)
* Database: victim_lemas_only1208.csv

In [2]:
# Seed shared by the train/test split and the seeded estimators.
random_state = 109

# Name of the target column, and whether it should be label-encoded as multi-class.
response = 'race'
multi_class = True

# Read the raw table and remember its row count before any cleaning.
data = pd.read_csv('data/victim_lemas_only1208.csv')
total = data.shape[0]

# Columns removed before modeling: date/location/agency identifiers
# and the PERS_* columns.
cols_to_drop = [
    'date', 'city_wapo', 'state', 'AGENCYNAME',
    'PERS_TOTAL', 'PERS_WHITE', 'PERS_BLACK',
    'PERS_HISP', 'PERS_AMIND', 'PERS_ASIAN',
    'PERS_HAWPI', 'PERS_MULTI', 'PERS_UNK',
]

# Remove those columns, then discard every row that still has a missing value.
data = data.drop(columns=cols_to_drop).dropna()
print(f'Number of victims missing values: {total-len(data)}')

# (disabled) convert booleans to integers
# bools_to_ints = ['signs_of_mental_illness', 'body_camera']
# data[bools_to_ints] = data[bools_to_ints].astype(int)

# Map the response labels onto integer codes 0 .. n_classes-1.
if multi_class:
    le = LabelEncoder().fit(data[response])
    data[response] = le.transform(data[response])
    print(f'Response variable classes: {le.classes_}')

# (disabled) convert other variables to dummies
# data = pd.get_dummies(data)

Number of victims missing values: 0
Response variable classes: ['A' 'B' 'H' 'N' 'O' 'W']


In [3]:
# splits
# Fraction of rows placed in the training partition.
train_size = 0.8

# Split predictors (everything except the response) and the response itself.
x_train, x_test, y_train, y_test = train_test_split(
    data.drop(columns=response),
    data[response],
    train_size=train_size,
    random_state=random_state,
)

# Keep the column names now: scaling below replaces the frames with plain arrays.
col_labels = x_train.columns

# Standardize using statistics learned from the training partition only.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

print(f'Train size: {len(x_train)/len(data):.2f}')
print(f'Test size: {len(x_test)/len(data):.2f}')

Train size: 0.80
Test size: 0.20


In [4]:
# Naive baseline: always predict the most frequent class in the training labels.
majority_class = y_train.value_counts().idxmax()
majority_pred = np.full_like(y_train, majority_class)
baseline_train_score = accuracy_score(majority_pred, y_train)
print(f'Naive baseline accuracy: {baseline_train_score:.3f}')

Naive baseline accuracy: 0.498


In [5]:
# model fitting
class Model():
    """Fit an estimator and evaluate it against one fixed train/test split.

    The data used for fitting is stored on the instance, so `cv`,
    `test_accuracy` and `importance` always evaluate against the same split the
    estimator was fit on, even if the module-level `x_train`, `x_test`, ...
    names are later rebound to a different dataset.
    """

    def __init__(self, model, x_train=None, y_train=None, x_test=None,
                 y_test=None, col_labels=None):
        """Fit `model` on the training data and remember the split.

        Parameters
        ----------
        model : estimator exposing `fit(X, y)` (returning the fitted estimator)
            and `score(X, y)`.
        x_train, y_train, x_test, y_test, col_labels : optional
            Data to use. Any argument left as None falls back to the
            module-level name of the same spelling *at construction time*, so
            `Model(estimator)` keeps working as before.

        Raises
        ------
        NameError
            If no training data is passed and none is defined at module level.
        """
        module_ns = globals()

        def pick(explicit, name):
            # Prefer the explicit argument; otherwise snapshot the module-level value.
            return explicit if explicit is not None else module_ns.get(name)

        self.x_train = pick(x_train, 'x_train')
        self.y_train = pick(y_train, 'y_train')
        self.x_test = pick(x_test, 'x_test')
        self.y_test = pick(y_test, 'y_test')
        self.col_labels = pick(col_labels, 'col_labels')

        if self.x_train is None or self.y_train is None:
            raise NameError('x_train and y_train must be passed or defined at module level')

        self.model = model.fit(self.x_train, self.y_train)

    def cv(self):
        """Return (mean train accuracy, mean validation accuracy) from 5-fold CV."""
        scores = cross_validate(self.model, self.x_train, self.y_train, cv=5,
                                return_train_score=True, scoring='accuracy')
        train_score = np.mean(scores['train_score'])
        val_score = np.mean(scores['test_score'])
        return train_score, val_score

    def test_accuracy(self):
        """Return the fitted estimator's `score` on the stored test split."""
        return self.model.score(self.x_test, self.y_test)

    def importance(self, top_n=10):
        """Draw a horizontal bar chart of the `top_n` largest permutation importances.

        Importances are computed on the stored test split, seeded with the
        module-level `random_state`. Bars are ordered from smallest (bottom)
        to largest (top) mean importance.

        Raises
        ------
        ValueError
            If `top_n` is smaller than 1.
        """
        if top_n < 1:
            raise ValueError('top_n must be at least 1')

        pi = permutation_importance(self.model, self.x_test, self.y_test, random_state=random_state)

        # argsort is ascending, so the tail holds the most important features.
        top_idx = np.argsort(pi.importances_mean)[-top_n:]
        # Bar centres at 0.5, 1.5, ... so the bars sit inside the (0, n) y-range.
        positions = np.arange(len(top_idx)) + 0.5

        plt.figure(figsize=(5,5))
        plt.barh(positions, pi.importances_mean[top_idx], height=0.7)
        plt.yticks(positions, self.col_labels[top_idx])
        plt.ylim((0, len(top_idx)))
        plt.xlabel("Permutation Feature Importance")
        plt.title(f'{type(self.model).__name__}')


# Settings shared by both one-vs-rest logistic regressions.
ovr_logit_args = dict(multi_class='ovr', max_iter=1000, random_state=random_state)

# Each Model(...) call fits its estimator immediately.
knn = Model(KNeighborsClassifier())
logit = Model(LogisticRegression(**ovr_logit_args))
logit_lasso = Model(
    LogisticRegression(penalty='l1', solver='liblinear', **ovr_logit_args)
)
dtree = Model(DecisionTreeClassifier(random_state=random_state))
forest = Model(RandomForestClassifier(random_state=random_state))
boost = Model(AdaBoostClassifier(random_state=random_state))

In [6]:
# Cross-validate every fitted model. Each result is stored under its own *_cv
# name so that the Model objects (logit_lasso in particular) are not replaced
# by score tuples and this cell can be re-run safely.
knn_cv = knn.cv()
logit_cv = logit.cv()
logit_lasso_cv = logit_lasso.cv()
dtree_cv = dtree.cv()
forest_cv = forest.cv()
boost_cv = boost.cv()

# One row per model: label, mean CV train accuracy, mean CV validation accuracy.
x = PrettyTable()
x.field_names = ["Model", "CV Train Accuracy", "CV Validation Accuracy"]
x.add_rows(
    [
        ["KNN", *knn_cv],
        ["Logistic Regression", *logit_cv],
        ["Lasso-Logistic Regression", *logit_lasso_cv],
        ["Decision Tree", *dtree_cv],
        ["Random Forest", *forest_cv],
        ["AdaBoost", *boost_cv],
    ]
)

x.align = "l"
x.float_format = '.3'

print('Accuracies for victim_lemas_only1208.csv')
print(x)

Accuracies for victim_lemas_only1208.csv
+---------------------------+-------------------+------------------------+
| Model                     | CV Train Accuracy | CV Validation Accuracy |
+---------------------------+-------------------+------------------------+
| KNN                       | 0.672             | 0.569                  |
| Logistic Regression       | 0.593             | 0.586                  |
| Lasso-Logistic Regression | 0.592             | 0.586                  |
| Decision Tree             | 0.772             | 0.545                  |
| Random Forest             | 0.772             | 0.597                  |
| AdaBoost                  | 0.545             | 0.544                  |
+---------------------------+-------------------+------------------------+


<hr>

## Basic Modeling Part 1b
* Predicting race from police dept features + WaPo features. If the victim demographics are better predictors, we can tie this back to the EDA.
* Database: victim_wapo_lemas_clean1208.csv

In [7]:
# Seed shared by the train/test split and the seeded estimators.
random_state = 109

# Name of the target column, and whether it should be label-encoded as multi-class.
response = 'race'
multi_class = True

# Read the raw table and remember its row count before any cleaning.
data = pd.read_csv('data/victim_wapo_lemas_clean1208.csv')
total = data.shape[0]

# Columns removed before modeling. The race_* indicator columns go so that
# race stays a single multi-class column; every column whose name matches
# 'PERS' is appended as well.
cols_to_drop = [
    'id', 'name', 'date', 'armed_wapo', 'gender', 'city_wapo', 'state',
    'threat_level', 'longitude', 'latitude', 'is_geocoding_exact',
    'street_address', 'zipcode', 'county', 'ORI_agency_id',
    'ORI9', 'ORI9_match_type', 'race_A', 'race_B',
    'race_H', 'race_N', 'race_O', 'race_W', 'AGENCYNAME',
    'PERS_TOTAL', 'PERS_WHITE', 'PERS_BLACK',
    'PERS_HISP', 'PERS_AMIND', 'PERS_ASIAN',
    'PERS_HAWPI', 'PERS_MULTI', 'PERS_UNK',
] + data.filter(regex='PERS').columns.tolist()

# Remove those columns, then discard every row that still has a missing value.
data = data.drop(columns=cols_to_drop).dropna()
print(f'Number of victims missing values: {total-len(data)}')

# (disabled) convert booleans to integers
# bools_to_ints = ['signs_of_mental_illness', 'body_camera']
# data[bools_to_ints] = data[bools_to_ints].astype(int)

# Map the response labels onto integer codes 0 .. n_classes-1.
if multi_class:
    le = LabelEncoder().fit(data[response])
    data[response] = le.transform(data[response])
    print(f'Response variable classes: {le.classes_}')

# Expand the remaining variables into dummy columns.
data = pd.get_dummies(data)

Number of victims missing values: 324
Response variable classes: ['A' 'B' 'H' 'N' 'O' 'W']


In [8]:
# splits
# Fraction of rows placed in the training partition.
train_size = 0.8

# Split predictors (everything except the response) and the response itself.
x_train, x_test, y_train, y_test = train_test_split(
    data.drop(columns=response),
    data[response],
    train_size=train_size,
    random_state=random_state,
)

# Keep the column names now: scaling below replaces the frames with plain arrays.
col_labels = x_train.columns

# Standardize using statistics learned from the training partition only.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

print(f'Train size: {len(x_train)/len(data):.2f}')
print(f'Test size: {len(x_test)/len(data):.2f}')

Train size: 0.80
Test size: 0.20


In [9]:
# Naive baseline: always predict the most frequent class in the training labels.
majority_class = y_train.value_counts().idxmax()
majority_pred = np.full_like(y_train, majority_class)
baseline_train_score = accuracy_score(majority_pred, y_train)
print(f'Naive baseline accuracy: {baseline_train_score:.3f}')

Naive baseline accuracy: 0.503


In [10]:
# Fit the same six classifiers on the current split.
# Settings shared by both one-vs-rest logistic regressions.
ovr_logit_args = dict(multi_class='ovr', max_iter=1000, random_state=random_state)

# Each Model(...) call fits its estimator immediately.
knn = Model(KNeighborsClassifier())
logit = Model(LogisticRegression(**ovr_logit_args))
logit_lasso = Model(
    LogisticRegression(penalty='l1', solver='liblinear', **ovr_logit_args)
)
dtree = Model(DecisionTreeClassifier(random_state=random_state))
forest = Model(RandomForestClassifier(random_state=random_state))
boost = Model(AdaBoostClassifier(random_state=random_state))

In [None]:
# Cross-validate every fitted model. Each result is stored under its own *_cv
# name so that the Model objects (logit_lasso in particular) are not replaced
# by score tuples and this cell can be re-run safely.
knn_cv = knn.cv()
logit_cv = logit.cv()
logit_lasso_cv = logit_lasso.cv()
dtree_cv = dtree.cv()
forest_cv = forest.cv()
boost_cv = boost.cv()

# One row per model: label, mean CV train accuracy, mean CV validation accuracy.
x = PrettyTable()
x.field_names = ["Model", "CV Train Accuracy", "CV Validation Accuracy"]
x.add_rows(
    [
        ["KNN", *knn_cv],
        ["Logistic Regression", *logit_cv],
        ["Lasso-Logistic Regression", *logit_lasso_cv],
        ["Decision Tree", *dtree_cv],
        ["Random Forest", *forest_cv],
        ["AdaBoost", *boost_cv],
    ]
)

x.align = "l"
x.float_format = '.3'

# Heading names the dataset loaded for this section (Part 1b).
print('Accuracies for victim_wapo_lemas_clean1208.csv')
print(x)