In [1]:
import os
import datetime
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn import neighbors, cross_validation
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, mean_squared_error



In [2]:
os.chdir("/Users/adeniyiharrison/Desktop/Traffic Stop Data")
df = pd.read_csv("connecticut-r2.csv", low_memory = False)

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 586849 entries, 0 to 586848
Data columns (total 42 columns):
CreatedDate                              586849 non-null object
ProfileNo                                586849 non-null int64
OrganizationIdentificationID             586849 non-null object
Department Name                          586849 non-null object
OrganizationActivityText                 586849 non-null object
ReportingOfficerIdentificationID         586849 non-null object
InterventionIdentificationID             586825 non-null object
IdentificationCategoryDescriptionText    586849 non-null object
InterventionDate                         586849 non-null object
Month                                    586849 non-null object
SubjectRaceCode                          586849 non-null object
SubjectEthnicityCode                     586849 non-null object
SubjectSexCode                           586849 non-null object
SubjectAge                               586783 non-null f

#### Data Clean

In [4]:
dateTime = []
for x in df["CreatedDate"]:
    try:
        dateTime.append(datetime.datetime.strptime(x, "%m/%d/%y %H:%M"))
    except:
        dateTime.append(datetime.datetime.strptime(x, "%m-%d-%y"))

df["CreatedDate"] = dateTime

In [7]:
df["CreatedDate"].describe()

count                  586849
unique                  36891
top       2015-05-29 10:52:00
freq                    10086
first     2014-10-01 00:16:00
last      2016-03-08 11:53:00
Name: CreatedDate, dtype: object

In [9]:
df["ProfileNo"] = df["ProfileNo"].apply(lambda x: str(x))
df["ID"] = df["Department Name"]+ "_" + df["ProfileNo"]

df.set_index("ID", inplace = True)

In [10]:
df["InterventionDateTime"] = df["InterventionDateTime"].apply(lambda x: datetime.datetime.strptime(x[:19], "%Y-%m-%d %H:%M:%S"))
df["InterventionDate"] = df["InterventionDate"].apply(lambda x: datetime.datetime.strptime(x[:10], "%Y-%m-%d"))
df["InterventionTime"] = df['InterventionTime'].apply(lambda x: datetime.datetime.strptime(x, "%H:%M").time())

In [11]:
df["InterventionDateTime"].describe()

count                  586849
unique                 310818
top       2015-01-20 00:00:00
freq                       77
first     2014-10-01 00:00:00
last      2015-09-30 23:59:00
Name: InterventionDateTime, dtype: object

In [12]:
df["Month"].value_counts()

April        67684
August       59848
May          59425
July         54432
October      46853
March        46590
November     46530
September    45424
June         44672
December     44253
January      40965
February     30173
Name: Month, dtype: int64

In [13]:
dow = {
    0: "Sunday",
    1: "Monday",
    2: "Tuesday",
    3: "Wednesday",
    4: "Thursday",
    5: "Friday",
    6: "Saturday"
}

df["Day of Week"] = df["InterventionDate"].apply(lambda x: dow[x.weekday()])

df["Day of Week"].value_counts()

Thursday     98486
Wednesday    92122
Tuesday      87978
Friday       83655
Monday       83066
Sunday       78113
Saturday     63429
Name: Day of Week, dtype: int64

In [14]:
race = {
    "W": "White",
    "B": "Black",
    "A": "Asian",
    "I": "American Indian"
}


df["SubjectRaceCode"] = df["SubjectRaceCode"].apply(lambda x: race[x])
df["SubjectRaceCode"].value_counts()

White              484540
Black               85345
Asian               11929
American Indian      5035
Name: SubjectRaceCode, dtype: int64

In [15]:
ethnic = {
    "H": "Hispanic",
    "N": "Non Hispanic",
    "M": "Middle Eastern"
}

df["SubjectEthnicityCode"] = df["SubjectEthnicityCode"].apply(lambda x: ethnic[x])
df["SubjectEthnicityCode"].value_counts()

Non Hispanic      500907
Hispanic           73136
Middle Eastern     12806
Name: SubjectEthnicityCode, dtype: int64

In [16]:
df["SubjectSexCode"].value_counts()

M    370630
F    216219
Name: SubjectSexCode, dtype: int64

In [17]:
age = []
for x in df["SubjectAge"]:
    if x < 15:
        age.append(np.nan)
    elif x >= 15 and x <= 85:
        age.append(x)
    else:
        age.append(np.nan)
        
df["SubjectAge"] = age

In [18]:
df["SubjectAge"].describe()

count    582796.000000
mean         38.438363
std          14.622587
min          15.000000
25%          26.000000
50%          36.000000
75%          49.000000
max          85.000000
Name: SubjectAge, dtype: float64

In [19]:
# Resident of Connecticut
df["ResidentIndicator"].value_counts()

True     511460
False     75389
Name: ResidentIndicator, dtype: int64

In [20]:
# Resident of town they were pulled over in
df["TownRecidentIndicator"].value_counts()

False    419706
True     167142
Name: TownRecidentIndicator, dtype: int64

In [21]:
df[pd.isnull(df["InterventionLocationName"])]["ProfileNo"].count()

1

In [22]:
reason = {
    "E": "Equipment Violation",
    "I": "Investigative Stop",
    "V": "Motor Vehicle Violation"
}

df["InterventionReasonCode"] = df["InterventionReasonCode"].apply(lambda x: reason[x])
df["InterventionReasonCode"].value_counts()

Motor Vehicle Violation    520004
Equipment Violation         55409
Investigative Stop          11436
Name: InterventionReasonCode, dtype: int64

In [23]:
duration = {
    1: "0 - 15 Minutes",
    2: "16 - 30 Minutes",
    3: "30+ Minutes"
}

df["InterventionDurationCode"] = df["InterventionDurationCode"].apply(lambda x: duration[x])
df["InterventionDurationCode"].value_counts()

0 - 15 Minutes     526477
16 - 30 Minutes     47451
30+ Minutes         12921
Name: InterventionDurationCode, dtype: int64

In [24]:
df["TowedIndicator"].value_counts()

False    556407
True      30442
Name: TowedIndicator, dtype: int64

In [25]:
df["StatuteReason"].value_counts()

Speed Related             152901
Other                      99934
Cell Phone                 61186
Registration               55484
Defective Lights           49351
Moving Violation           42905
Traffic Control Signal     39305
Stop Sign                  33852
Seatbelt                   21590
Display of Plates          15256
Suspended License           8167
Window Tint                 5251
Equipment Violation         1665
Stop Sign                      2
Name: StatuteReason, dtype: int64

In [26]:
df["VehicleSearchedIndicator"].value_counts()

False    569770
True      17079
Name: VehicleSearchedIndicator, dtype: int64

In [27]:
search = {
    "C": "Consent",
    "I": "Inventory",
    "N": "Not Applicable",
    "O": "Other"
}
df["SearchAuthorizationCode"] = df["SearchAuthorizationCode"].apply(lambda x: search[x])
df["SearchAuthorizationCode"].value_counts()

Not Applicable    566859
Other              10151
Consent             6349
Inventory           3490
Name: SearchAuthorizationCode, dtype: int64

In [28]:
df["ContrabandIndicator"].value_counts()

False    580441
True       6408
Name: ContrabandIndicator, dtype: int64

In [29]:
disposition = {
    "I": "Infraction",
    "M": "Misdemeanor Summons",
    "N": "No Disposition",
    "U": "Uniform Arrest Report",
    "V": "Verbal Warning",
    "W": "Written Warning"
}

df["InterventionDispositionCode"] = df["InterventionDispositionCode"].apply(lambda x: disposition[x])
df["InterventionDispositionCode"].value_counts()

Infraction               276194
Misdemeanor Summons       31585
No Disposition             9491
Uniform Arrest Report      5309
Name: InterventionDispositionCode, dtype: int64

In [30]:
df[df["CustodialArrestIndicator"] == True]["InterventionDispositionCode"].value_counts()

Misdemeanor Summons      4515
Uniform Arrest Report    4069
Infraction               1426
No Disposition             83
Name: InterventionDispositionCode, dtype: int64

In [31]:
df[df["InterventionDispositionCode"] == "Uniform Arrest Report"]["CustodialArrestIndicator"].value_counts()

True     4069
False    1240
Name: CustodialArrestIndicator, dtype: int64

In [32]:
df["CustodialArrestIndicator"].value_counts()

False    576085
True      10764
Name: CustodialArrestIndicator, dtype: int64

In [33]:
df.drop(['InterventionDispositionReasonText', 'IsStatuteCodeValid','IsStatutatoryCitationValid',
        'IsaTest', 'RecordCount', 'SourceId', 'SourceReferenceId',"InterventionLocationLatitude", 
         "InterventionLocationLongitude", "OrganizationActivityText"], axis = 1, inplace = True)

In [34]:
cols = ['InterventionDateTime',
 'InterventionDate',
 'InterventionTime',
 'Month',
 'Day of Week',
 'CreatedDate',
 'Department Name',
 'ProfileNo',
 'OrganizationIdentificationID',
 'ReportingOfficerIdentificationID',
 'InterventionIdentificationID',
 'IdentificationCategoryDescriptionText',
 'InterventionLocationName',
 'ResidentIndicator',
 'TownRecidentIndicator',
 'SubjectRaceCode',
 'SubjectEthnicityCode',
 'SubjectSexCode',
 'SubjectAge',
 'InterventionLocationDescriptionText',
 'InterventionReasonCode',
 'InterventionTechniqueCode',
 'InterventionDurationCode',
 'TowedIndicator',
 'StatuteCodeIdentificationID',
 'StatuteReason',
 'StatutatoryCitation',
 'VehicleSearchedIndicator',
 'SearchAuthorizationCode',
 'ContrabandIndicator',
 'CustodialArrestIndicator',
 'InterventionDispositionCode',
 'InterventionDispositionDate']
        
df = df[cols]

#### Predict the length of the stop

In [35]:
# Using Race, Sex, Age, and Reason code use KNN, Logistic Regression and Random Forest to predict Length of Stop
X = pd.DataFrame(index = df.index ,columns = ["SubjectSexCode", "SubjectRaceCode", "InterventionReasonCode"])
le = LabelEncoder()

for x in ["SubjectSexCode", "SubjectRaceCode", "InterventionReasonCode"]:
    X[x] = le.fit_transform(df[x])

X = X.join(df[["SubjectAge", "InterventionDurationCode"]])
X.dropna(inplace = True)
y = X["InterventionDurationCode"]

X.drop("InterventionDurationCode", axis = 1, inplace = True)

oneHot = OneHotEncoder(categorical_features = [True, True, True, False], sparse = True)
X = oneHot.fit_transform(X)

In [36]:
df[["SubjectSexCode", "SubjectRaceCode", "InterventionReasonCode", "InterventionDurationCode"]].info()

<class 'pandas.core.frame.DataFrame'>
Index: 586849 entries, Ansonia_1038902 to Milford_3177
Data columns (total 4 columns):
SubjectSexCode              586849 non-null object
SubjectRaceCode             586849 non-null object
InterventionReasonCode      586849 non-null object
InterventionDurationCode    586849 non-null object
dtypes: object(4)
memory usage: 22.4+ MB


In [42]:
X

<582796x10 sparse matrix of type '<class 'numpy.float64'>'
	with 2331184 stored elements in COOrdinate format>

In [44]:
X.A

array([[  1.,   0.,   0., ...,   0.,   0.,  24.],
       [  0.,   1.,   0., ...,   0.,   1.,  31.],
       [  0.,   1.,   0., ...,   0.,   1.,  60.],
       ..., 
       [  0.,   1.,   0., ...,   0.,   1.,  45.],
       [  0.,   1.,   0., ...,   0.,   1.,  45.],
       [  0.,   1.,   0., ...,   0.,   0.,  25.]])

In [49]:
from sklearn.model_selection import cross_val_score
knn = neighbors.KNeighborsClassifier(n_neighbors = 10)
cross_val_score(knn, X.A[:1000], y[:1000], cv=10, scoring='accuracy').mean()

0.88106788325891416

In [60]:
kf = cross_validation.KFold(len(y[:1000]),n_folds = 3)
gs = GridSearchCV(
    estimator = neighbors.KNeighborsClassifier(), 
    param_grid = {"n_neighbors" : np.arange(2,20),
                  "weights" : ["distance", "uniform"]},
    cv = kf)

gs.fit(X.A[:1000],y[:1000])
test = gs.best_estimator_

In [63]:
test.score(X.A[:1000], y[:1000])

0.88100000000000001