# Modeling

## Objectives:
* Create a model that will predict the likelihood that an arrest will occur during a Terry Stop
* Determine which modeling method will bring about the best performance 

# Import Data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score, recall_score, accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.neighbors import KNeighborsClassifier
from scipy.spatial.distance import euclidean
import xgboost as xgb
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline



Let's load the data and take a look at it.

In [2]:
df = pd.read_csv('CSV_Files/Terry_Stops_with_dummies.csv')
df.drop('Unnamed: 0', axis=1, inplace=True)
df.describe()

Unnamed: 0,stop_resolution,weapon_type,officer_id,officer_race,subject_perceived_race,initial_call_type,final_call_type,call_type,precinct,sector,...,frisk_flag__Y,dif_race__Y,dif_gender__Y,repeat_offenders__Y,subject_age_group__1_17,subject_age_group__26_35,subject_age_group__36_45,subject_age_group__46_55,subject_age_group__56_up,subject_age_group__Unknown
count,30364.0,30364.0,30364.0,30364.0,30364.0,30364.0,30364.0,30364.0,30364.0,30364.0,...,30364.0,30364.0,30364.0,30364.0,30364.0,30364.0,30364.0,30364.0,30364.0,30364.0
mean,2.174219,3.45106,633.057502,5.885094,4.400211,94.143426,69.522691,0.839679,2.817646,16.561586,...,0.258629,0.610789,0.278389,0.28076,0.048511,0.334409,0.209492,0.126235,0.049499,0.028619
std,1.002457,1.001798,256.704876,2.173229,2.77232,50.121144,35.351272,1.123204,2.353513,9.586147,...,0.437888,0.487579,0.448213,0.449378,0.214848,0.471791,0.406952,0.33212,0.216912,0.166737
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,3.0,472.0,7.0,1.0,48.0,42.0,0.0,1.0,7.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,2.0,3.0,647.0,7.0,5.0,116.0,83.0,0.0,2.0,17.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,3.0,5.0,835.0,7.0,7.0,136.0,94.0,2.0,6.0,24.0,...,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
max,4.0,5.0,1094.0,7.0,7.0,162.0,198.0,3.0,6.0,34.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30364 entries, 0 to 30363
Data columns (total 32 columns):
stop_resolution                                  30364 non-null int64
weapon_type                                      30364 non-null int64
officer_id                                       30364 non-null int64
officer_race                                     30364 non-null int64
subject_perceived_race                           30364 non-null int64
initial_call_type                                30364 non-null int64
final_call_type                                  30364 non-null int64
call_type                                        30364 non-null int64
precinct                                         30364 non-null int64
sector                                           30364 non-null int64
beat                                             30364 non-null int64
incident_year                                    30364 non-null int64
incident_month                                   30

## Change to Floats
Ok, first order of business is to turn all of these columns into floats so that we can get more accurate statistical results.

In [4]:
for col in df.columns:
    df[col] = df[col].astype('float')

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30364 entries, 0 to 30363
Data columns (total 32 columns):
stop_resolution                                  30364 non-null float64
weapon_type                                      30364 non-null float64
officer_id                                       30364 non-null float64
officer_race                                     30364 non-null float64
subject_perceived_race                           30364 non-null float64
initial_call_type                                30364 non-null float64
final_call_type                                  30364 non-null float64
call_type                                        30364 non-null float64
precinct                                         30364 non-null float64
sector                                           30364 non-null float64
beat                                             30364 non-null float64
incident_year                                    30364 non-null float64
incident_month             

Nice! Time to split up our data into targets and variables.

In [6]:
y = df['arrest_flag__Y']
X = df.drop('arrest_flag__Y', axis=1)

In [7]:
# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, 
                                                    random_state=42)

Next, we'll normalize the data so that the label encoded data isn't weighted as more important.

In [8]:
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

With that out of the way, lets start modeling!

# Logistic Regression
We'll start with a basic Logistic Regression

In [9]:
logreg = LogisticRegression(fit_intercept=False, C=1e16)

# Fitting the model

logreg.fit(X_train_scaled, y_train)

LogisticRegression(C=1e+16, class_weight=None, dual=False, fit_intercept=False,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

## Predictions
We'll make a function that will evaluate the predictions.

In [10]:
def logistic_predictions(X_train, X_test, y_train, y_test):
    """
    Input: Training and test sets of predictors 
    Output: The accuracy_score of each respective set. Percent correct(labeled 
    0.0) and percent incorrect(labeled as 1.0). 
    """
    
    y_hat_train = logreg.predict(X_train)
    y_hat_test = logreg.predict(X_test)
    
    train_residuals = np.abs(y_train - y_hat_train)
    print('------------------------------------')
    print('Training Accuracy',
          pd.Series(train_residuals).value_counts(normalize=True))
    
    test_residuals = np.abs(y_test - y_hat_test)
    print('------------------------------------')
    print('Testing Accuracy: ',
          pd.Series(test_residuals).value_counts(normalize=True))

In [11]:
logistic_predictions(X_train_scaled, X_test_scaled, y_train, y_test)

------------------------------------
Training Accuracy 0.0    0.659641
1.0    0.340359
Name: arrest_flag__Y, dtype: float64
------------------------------------
Testing Accuracy:  0.0    0.649506
1.0    0.350494
Name: arrest_flag__Y, dtype: float64


### Interpretation
Results show that both the Training and Test sets are a little bit better than a coin flip, which is nothing to get excited over.  On the bright side, however, the accuracy between the two models is consistent (65-66%) so the model is not overfit or underfit, which is great!

## Confusion Matrix
We'll plot a Confusion Matrix to see the results

In [12]:
from sklearn.metrics import plot_confusion_matrix

plot_confusion_matrix(logreg, X_test_scaled, y_test, cmap=plt.cm.Blues)

plt.show()

ImportError: cannot import name 'plot_confusion_matrix'