# Using Machine Learning to Make Predictions

In [59]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [3]:
wapo = pd.read_csv("https://raw.githubusercontent.com/washingtonpost/data-police-shootings/master/fatal-police-shootings-data.csv")

In [4]:
wapo.head()

Unnamed: 0,id,name,date,manner_of_death,armed,age,gender,race,city,state,signs_of_mental_illness,threat_level,flee,body_camera
0,3,Tim Elliot,2015-01-02,shot,gun,53.0,M,A,Shelton,WA,True,attack,Not fleeing,False
1,4,Lewis Lee Lembke,2015-01-02,shot,gun,47.0,M,W,Aloha,OR,False,attack,Not fleeing,False
2,5,John Paul Quintero,2015-01-03,shot and Tasered,unarmed,23.0,M,H,Wichita,KS,False,other,Not fleeing,False
3,8,Matthew Hoffman,2015-01-04,shot,toy weapon,32.0,M,W,San Francisco,CA,True,attack,Not fleeing,False
4,9,Michael Rodriguez,2015-01-04,shot,nail gun,39.0,M,H,Evans,CO,False,attack,Not fleeing,False


It seems like "armed" is a good category to predict

In [5]:
wapo.armed.value_counts().head(10)

gun               2016
knife              522
unarmed            247
undetermined       146
toy weapon         133
unknown weapon      41
machete             32
Taser               14
ax                  14
sword               13
Name: armed, dtype: int64

In [6]:
wapo.armed = wapo.armed.apply(lambda x: "armed" if x != "unarmed" and x != "undetermined" else x)

In [24]:
wapo.armed.value_counts()

armed           3218
unarmed          247
undetermined     146
Name: armed, dtype: int64

The baseline accuracy to beat is 89% (if the algorithm guessed armed every time)

In [25]:
wapo.threat_level.value_counts()

attack          2260
other           1165
undetermined     186
Name: threat_level, dtype: int64

## Guess if a person is armed from gender, race, and age

### Logistic Regression

In [8]:
wapo_log = wapo.dropna(subset=['age'])
X_train = pd.get_dummies(wapo_log[['gender', 'race', 'age']])
X_train.drop(['gender_F', 'race_A'], axis=1, inplace=True)
Y_train = wapo_log['armed']

In [9]:
from sklearn.linear_model import LogisticRegression

In [10]:
x_train, x_test, y_train, y_test = train_test_split(X_train, Y_train, test_size=0.2)

In [11]:
log = LogisticRegression()
log.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [12]:
predictions = log.predict(x_test)

In [13]:
log.score(x_test, y_test)

0.8920863309352518

In [14]:
pd.crosstab(y_test, predictions, rownames=['Actual'], colnames=['Predicted:'], margins=True)

Predicted:,armed,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
armed,620,620
unarmed,49,49
undetermined,26,26
All,695,695


In [61]:
log.coef_

array([[ 0.02925652,  0.13156463, -0.50143601, -0.37020441, -0.18074461,
        -0.59873334, -0.17979033],
       [-0.03704123, -0.56177932,  0.98357467,  0.59005146, -0.15544645,
         1.04540414,  0.58567177],
       [-0.02101471,  0.30634612, -0.51745219, -0.12871371,  0.18160461,
        -0.31683051, -0.55319415]])

Looks like logistic regression was only able to get the baseline score of 89.4%

### Decision Tree

In [15]:
from sklearn import tree

In [16]:
wapo_tree = wapo.dropna(subset=['age'])
X_train = pd.get_dummies(wapo_tree[['gender', 'race', 'age']])
X_train.drop(['gender_F', 'race_A'], axis=1, inplace=True)
Y_train = wapo_tree['armed']

In [17]:
x_train, x_test, y_train, y_test = train_test_split(X_train, Y_train, test_size=0.2)

In [18]:
clf = tree.DecisionTreeClassifier(max_depth=10, min_samples_leaf=3)
clf.fit(x_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=10,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=3, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [19]:
predictions = clf.predict(x_test)
clf.score(x_test, y_test)

0.897841726618705

In [20]:
pd.crosstab(y_test, predictions, rownames=['Actual'], colnames=['Predicted:'], margins=True)

Predicted:,armed,unarmed,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
armed,623,2,625
unarmed,35,1,36
undetermined,34,0,34
All,692,3,695


In [21]:
import graphviz
dot_data = tree.export_graphviz(clf, out_file=None)
graph = graphviz.Source(dot_data)
graph.render('decision_tree')

'decision_tree.pdf'

The decision tree is a little better than the baseline accuracy. There are more false positives (predicting someone is armed when they're not) than false negatives (predicting someone is unarmed when they're armed), which is good from the police point of view since they want to err on the side of caution and always be defensive.

### Random Forest

In [22]:
from sklearn.ensemble import RandomForestClassifier

In [23]:
wapo_forest = wapo.dropna(subset=['age'])
X_train = pd.get_dummies(wapo_forest[['gender', 'race', 'age']])
X_train.drop(['gender_F', 'race_A'], axis=1, inplace=True)
Y_train = wapo_forest['armed']

In [39]:
x_train, x_test, y_train, y_test = train_test_split(X_train, Y_train, test_size=0.2, stratify=Y_train)

In [52]:
clf = RandomForestClassifier()
clf.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [53]:
predictions = clf.predict(x_test)
clf.score(x_test, y_test)

0.8762589928057554

In [54]:
pd.crosstab(y_test, predictions, rownames=['Actual'], colnames=['Predicted:'], margins=True)

Predicted:,armed,unarmed,undetermined,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
armed,609,4,6,619
unarmed,49,0,0,49
undetermined,27,0,0,27
All,685,4,6,695


The random forest performed worse than the Decision Tree and Logistic Regression

### Neural Network

In [55]:
import keras

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


There may not be enough data for a neural network, but hopefully the hidden layers will be able to recognize patterns.