In [1]:
import math

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier

%pylab inline
pylab.rcParams['figure.figsize'] = (20, 6)
pd.options.mode.chained_assignment = None  # default='warn'

Populating the interactive namespace from numpy and matplotlib


In [2]:
# Load the traffic-stop CSV, using its 'id' column as the row index.
# low_memory=False makes pandas read the file in one pass rather than in chunks.
df = pd.read_csv(
    'data/VT-clean.csv',
    index_col='id',
    low_memory=False,
)

In [3]:
# Restrict the raw frame to the columns used in this analysis.
df_clean = df[[
    'state', 'stop_date', 'stop_time', 'location_raw', 'county_name',
    'county_fips', 'fine_grained_location', 'police_department',
    'driver_gender', 'driver_age', 'driver_race', 'violation',
    'search_conducted', 'search_type', 'contraband_found', 'stop_outcome',
    'is_arrested', 'officer_id',
]]

# Keep only stops with a recorded outcome (the prediction target), then move
# the 'id' index back into an ordinary column with a fresh positional index.
df_temp = df_clean[df_clean['stop_outcome'].notnull()].reset_index()

## K nearest neighbors

In this section I will be using the k-nearest neighbors algorithm to predict the stop outcome for a traffic stop from the driver's gender, age, and race. I will use dummy variables to get rid of the categorical data in my data set, and I will use GridSearchCV to tune the n_neighbors parameter. Let's see how well this algorithm performs.

In [4]:
# Features: driver demographics only. Rows with any missing value are dropped.
df_learn = df_temp[['driver_gender', 'driver_age', 'driver_race',
                    'stop_outcome']].dropna()

# Target: stop outcome, with the two warning types merged into one 'Warning'
# class. Series.replace returns a new Series, so df_learn keeps the raw labels.
y = df_learn['stop_outcome'].replace(
    {'Written Warning': 'Warning', 'Verbal Warning': 'Warning'}
).values

# One-hot encode the categorical features; drop_first=True drops one level per
# feature so the dummy columns are not perfectly collinear.
df_x = pd.get_dummies(df_learn.drop(['stop_outcome'], axis=1), drop_first=True)
X = df_x.values

# Tune n_neighbors over 3..11 with 5-fold cross-validation.
# NOTE(review): features are not rescaled, so driver_age is presumably on a
# much larger scale than the 0/1 dummies in the distance metric - consider
# standardising before KNN.
param_grid = {'n_neighbors': np.arange(3, 12)}
knn = KNeighborsClassifier()
knn_cv = GridSearchCV(knn, param_grid, cv=5)

# Hold out 30% of the rows for the final accuracy estimate.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=21)

knn_cv.fit(X_train, y_train)
score = knn_cv.score(X_test, y_test)

print(score)

0.57460174733


We see an accuracy of 57%. This is alright, but let's see if we can get a better score by adding a few more features.

In [5]:
# Features: driver demographics plus the violation and whether contraband was
# found. Rows with any missing value are dropped.
df_learn = df_temp[['violation', 'contraband_found', 'driver_gender',
                    'driver_age', 'driver_race', 'stop_outcome']].dropna()

# Target: stop outcome, with the two warning types merged into one 'Warning'
# class. Series.replace returns a new Series, so df_learn keeps the raw labels.
y = df_learn['stop_outcome'].replace(
    {'Written Warning': 'Warning', 'Verbal Warning': 'Warning'}
).values

# One-hot encode the categorical features; drop_first=True drops one level per
# feature so the dummy columns are not perfectly collinear.
df_x = pd.get_dummies(df_learn.drop(['stop_outcome'], axis=1), drop_first=True)
X = df_x.values

# Tune n_neighbors over 3..11 with 5-fold cross-validation.
param_grid = {'n_neighbors': np.arange(3, 12)}
knn = KNeighborsClassifier()
knn_cv = GridSearchCV(knn, param_grid, cv=5)

# Hold out 30% of the rows for the final accuracy estimate.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=21)

knn_cv.fit(X_train, y_train)
score = knn_cv.score(X_test, y_test)

print(score)

0.594299472263


Including the violation and contraband_found columns in our feature set increases accuracy to ~60%, meaning we can predict outcomes better with those features. This still isn't great, so now we will try different techniques.

## Logistic Regression

I will now repeat the same process with logistic regression, this time also including the police_department column as a feature.

In [6]:
# Features: as in the second KNN cell, plus police_department. Rows with any
# missing value are dropped.
df_learn = df_temp[['police_department', 'violation', 'contraband_found',
                    'driver_gender', 'driver_age', 'driver_race',
                    'stop_outcome']].dropna()

# Target: stop outcome, with the two warning types merged into one 'Warning'
# class. Series.replace returns a new Series, so df_learn keeps the raw labels.
y = df_learn['stop_outcome'].replace(
    {'Written Warning': 'Warning', 'Verbal Warning': 'Warning'}
).values

# One-hot encode the categorical features; drop_first=True drops one level per
# feature so the dummy columns are not perfectly collinear.
df_x = pd.get_dummies(df_learn.drop(['stop_outcome'], axis=1), drop_first=True)
X = df_x.values

# Hold out 30% of the rows for the final accuracy estimate.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=21)

# Tune the inverse regularisation strength C with 5-fold cross-validation.
logreg = LogisticRegression()
param_grid = {'C': [0.01, 0.1, 1, 10, 100]}
logreg_cv = GridSearchCV(logreg, param_grid, cv=5)

logreg_cv.fit(X_train, y_train)
score = logreg_cv.score(X_test, y_test)

print(score)

0.639473236217


With logistic regression we see a significant jump in accuracy to 64%. Still room for improvement so we try again with a different technique.

## Naive Bayes

In [7]:
# Same feature set as the logistic-regression cell. Rows with any missing
# value are dropped.
df_learn = df_temp[['police_department', 'violation', 'contraband_found',
                    'driver_gender', 'driver_age', 'driver_race',
                    'stop_outcome']].dropna()

# Target: stop outcome, with the two warning types merged into one 'Warning'
# class. Series.replace returns a new Series, so df_learn keeps the raw labels.
y = df_learn['stop_outcome'].replace(
    {'Written Warning': 'Warning', 'Verbal Warning': 'Warning'}
).values

# One-hot encode the categorical features; drop_first=True drops one level per
# feature so the dummy columns are not perfectly collinear.
df_x = pd.get_dummies(df_learn.drop(['stop_outcome'], axis=1), drop_first=True)
X = df_x.values

# Hold out 30% of the rows for the final accuracy estimate.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=21)

# Tune the additive smoothing parameter alpha with 5-fold cross-validation.
# NOTE(review): MultinomialNB requires non-negative features; this presumably
# holds here (ages and 0/1 dummies) - confirm against the data.
mnb = MultinomialNB()
param_grid = {'alpha': [.1, 1, 5, 10, 50]}
mnb_cv = GridSearchCV(mnb, param_grid, cv=5)

mnb_cv.fit(X_train, y_train)
print(mnb_cv.score(X_test, y_test))

0.634560665386


Naive Bayes produces a slight drop in accuracy to 63%. At this point we have tried multiple machine learning algorithms and used GridSearchCV to tune the relevant hyperparameters. So lastly we will categorize stop_outcome into two outcomes (Warning / Not Warning) rather than the several outcomes used so far, and see whether logistic regression can predict the outcome more accurately.

In [12]:
# Same feature set as the multi-class logistic regression. Rows with any
# missing value are dropped.
df_learn = df_temp[['police_department', 'violation', 'contraband_found',
                    'driver_gender', 'driver_age', 'driver_race',
                    'stop_outcome']].dropna()

# Binary target: either kind of warning -> 'Warning', everything else ->
# 'Not Warning'. Built as a new array, so df_learn keeps the raw labels.
is_warning = df_learn['stop_outcome'].isin(['Written Warning', 'Verbal Warning'])
y = np.where(is_warning, 'Warning', 'Not Warning')

# One-hot encode the categorical features; drop_first=True drops one level per
# feature so the dummy columns are not perfectly collinear.
df_x = pd.get_dummies(df_learn.drop(['stop_outcome'], axis=1), drop_first=True)
X = df_x.values

# Hold out 30% of the rows for the final accuracy estimate.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=21)

# Tune the inverse regularisation strength C with 5-fold cross-validation.
logreg = LogisticRegression()
param_grid = {'C': [0.01, 0.1, 1, 10, 100]}
logreg_cv = GridSearchCV(logreg, param_grid, cv=5)

logreg_cv.fit(X_train, y_train)
# Score on both splits: a large train/test gap would indicate overfitting.
score_train = logreg_cv.score(X_train, y_train)
score_test = logreg_cv.score(X_test, y_test)

print("training set accuracy:", score_train)
# Label fixed: this line reports the held-out test score.
print("test set accuracy:", score_test)

training set accuracy: 0.647146497678
training set accuracy: 0.646562416401


With that alteration accuracy improved to 65%.


## Final Evaluation

Overall, one could say from this statistic that these features are not good features for predicting stop outcome. However, if we compare this to just guessing, then this model performs significantly better than that. Let's now take a look at how well a model would do if it guessed one value for all tests.

In [26]:
# Same rows as the final models: identical column subset, missing values dropped.
df_learn = df_temp[['police_department', 'violation', 'contraband_found',
                    'driver_gender', 'driver_age', 'driver_race',
                    'stop_outcome']].dropna()

# Raw (unmerged) outcome labels, and the distinct values a constant
# "always guess X" classifier could pick, in order of first appearance.
possible_guess = df_learn['stop_outcome'].unique()
y = df_learn['stop_outcome'].values

# Feature matrix rebuilt for parity with the modelling cells; the baseline
# below does not use it.
df_x = pd.get_dummies(df_learn.drop(['stop_outcome'], axis=1), drop_first=True)
X = df_x.values

# Accuracy of always predicting `guess` equals the share of rows carrying
# that label.
for guess in possible_guess:
    percentage = np.count_nonzero(y == guess) / len(y)
    print("Percentage for guessing only", guess, percentage)

Percentage for guessing only Citation 0.3786849065207478
Percentage for guessing only Arrest for Violation 0.011782945736434108
Percentage for guessing only Warrant Arrest 0.0002772457820337437


From the above evaluation we see that our model performs only slightly better than guessing Written Warning for all predictions. This would suggest that there is a slight correlation between the above features and stop outcome.