In [1]:
import utils
import numpy as np
import pandas as pd
import datetime as dt
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from pygam import LogisticGAM

%matplotlib notebook

# Automating the proess of finding the filename
dataFolder = 'data'
plotsFolder = 'plots'
state = 'TX'
policeDept = 'statewide' # 'statewide' means state patrol

# Choose column names to load
colNamesList = ['date',
                'time',
#                 'location',
#                 'lat', # about 40% are nan
#                 'lng',
                'county_name', # just use categorical metro, micro, other for all location vars
#                 'district',
#                 'precinct',
#                 'region',
                'subject_race',
                'subject_sex',
#                 'officer_id_hash',
#                 'type',
                'violation',
                'citation_issued',
                'warning_issued', 
                'contraband_found',
                'contraband_drugs',
                'contraband_weapons',
                'search_conducted',
                'search_vehicle']
                
# Too many nans or categories
#                 'outcome',
#                 'search_basis',
#                 'vehicle_color',
#                 'vehicle_make',
#                 'vehicle_model',
#                 'vehicle_type',
#                 'vehicle_year']

In [2]:
# Automating the creation of rowName:dtype dict
# This throws up a lot of stupid warnings/errors right now!
# df = pd.read_csv(csvFilepath, nrows=5, names=colNamesList)
# dtypeDict = df.dtypes.to_dict()
# dtypeDict

dtypeDict = {'raw_row_number': 'int64', 
             'date': 'object',
             'time': 'object',
             'location': 'object',
             'lat': 'float64',
             'lng': 'float64',
             'county_name': 'object',
             'district': 'object',
             'precinct': 'object',
             'region': 'float64',
             'subject_race': 'object',
             'subject_sex': 'object',
             'officer_id_hash': 'object',
             'type': 'object',
             'violation': 'object',
             'citation_issued': 'bool',
             'warning_issued': 'bool',
             'outcome': 'object',
             'contraband_found': 'object',
             'contraband_drugs': 'object',
             'contraband_weapons': 'object',
             'search_conducted': 'object',
             'search_vehicle': 'object',
             'search_basis': 'object',
             'vehicle_color': 'object',
             'vehicle_make': 'object',
             'vehicle_model': 'object',
             'vehicle_type': 'object',
             'vehicle_year': 'float64',
             'raw_HA_RACE_SEX': 'object',
             'raw_HA_SEARCH_PC_boolean': 'object',
             'raw_HA_SEARCH_CONCENT_boolean': 'object',
             'raw_HA_INCIDTO_ARREST_boolean': 'object',
             'raw_HA_VEHICLE_INVENT_boolean': 'object'}


        
# search_vehicle only has False and nan
# type is just vehicular
# violation column is of interest - reason for stop (split by |?)

In [42]:
df = utils.load_data(dataFolder, state, policeDept, dtypeDict=dtypeDict, colNames=colNamesList)
df = utils.remove_empty_rows(df, 'subject_race')

# Preprocessing
violations

In [46]:
df = utils.remove_empty_rows(df, 'violation')

In [47]:
df['violation'] = [s.lower() for s in df['violation']]
df = df[df['violation'].str.contains('speed', regex=False)]
len(df)

16048469

In [48]:
df.drop('violation', 1, inplace=True)

date & time

In [49]:
df['year'] = pd.to_datetime(df['date']).dt.year
df['month'] = pd.to_datetime(df['date']).dt.month
df['day'] = pd.to_datetime(df['date']).dt.day

In [50]:
df['minute'] = pd.to_datetime(df['time']).dt.minute
df['hour'] = pd.to_datetime(df['time']).dt.hour

In [51]:
df.drop(['date', 'time'], 1, inplace=True)

county name

In [52]:
county_df = pd.read_csv('2014-2018.csv')
county_df = county_df[county_df['State']=='Texas']

In [53]:
county_df = county_df.filter(items=['Metropolitan Status', 'County Name'])
county_df['Metropolitan Status'].unique()

array(['Micropolitan', 'Metropolitan', 'Non core'], dtype=object)

In [54]:
df = utils.remove_empty_rows(df, 'county_name')

In [55]:
df['county_name'] = [name[:-7] for name in df['county_name']]

In [56]:
df = df.join(county_df.set_index('County Name'), on='county_name')
df.drop('county_name', 1, inplace=True)
df.rename({'Metropolitan Status':'county_type'}, inplace=True)

In [57]:
df.head()

Unnamed: 0,subject_race,subject_sex,citation_issued,warning_issued,contraband_found,contraband_drugs,contraband_weapons,search_conducted,search_vehicle,year,month,day,minute,hour,Metropolitan Status
1,white,male,True,False,,,,False,False,2006,1,1,0,0,Non core
3,white,male,False,True,,,,False,False,2006,1,1,0,0,Metropolitan
5,hispanic,male,False,True,,,,False,False,2006,1,1,0,0,Non core
6,hispanic,male,False,True,,,,False,False,2006,1,1,0,0,Non core
8,hispanic,male,True,False,,,,False,False,2006,1,1,0,0,Non core


# One hot encode

In [58]:
y = np.array(df['citation_issued']).astype(int)
X = np.array(df.drop('citation_issued', 1))
X.shape # n by 14

(16048467, 14)

In [59]:
for col in df.columns:
    df[col].fillna('na', inplace=True)

df.head()

Unnamed: 0,subject_race,subject_sex,citation_issued,warning_issued,contraband_found,contraband_drugs,contraband_weapons,search_conducted,search_vehicle,year,month,day,minute,hour,Metropolitan Status
1,white,male,True,False,na,na,na,False,False,2006,1,1,0,0,Non core
3,white,male,False,True,na,na,na,False,False,2006,1,1,0,0,Metropolitan
5,hispanic,male,False,True,na,na,na,False,False,2006,1,1,0,0,Non core
6,hispanic,male,False,True,na,na,na,False,False,2006,1,1,0,0,Non core
8,hispanic,male,True,False,na,na,na,False,False,2006,1,1,0,0,Non core


In [61]:
X = np.array(df.drop('citation_issued', 1))

enc = OneHotEncoder()
enc.fit(X)
X = enc.transform(X)

enc.categories_

[array(['asian/pacific islander', 'black', 'hispanic', 'other', 'white'],
       dtype=object),
 array(['female', 'male', 'na'], dtype=object),
 array([False, True], dtype=object),
 array(['FALSE', 'TRUE', 'na'], dtype=object),
 array(['FALSE', 'TRUE', 'na'], dtype=object),
 array(['FALSE', 'TRUE', 'na'], dtype=object),
 array(['FALSE', 'TRUE', 'na'], dtype=object),
 array(['FALSE', 'na'], dtype=object),
 array([2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016,
        2017], dtype=object),
 array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], dtype=object),
 array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
        20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31], dtype=object),
 array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
        19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
        36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52,
        53, 54, 55, 56, 57, 58, 59], dtype=object),


In [62]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=0)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(13641196, 167) (13641196,) (2407271, 167) (2407271,)


In [63]:
clf = LogisticRegressionCV(random_state=0)
clf.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

LogisticRegressionCV(random_state=0)

In [64]:
# training accuracy
y_train_pred = clf.predict(X_train)
np.mean(y_train_pred == y_train)

0.8746834221867349

In [65]:
# test accuracy
y_test_pred = clf.predict(X_test)
np.mean(y_test_pred == y_test)

0.8743872210482326

In [66]:
clf.coef_

array([[-5.76114413e-02,  1.21710141e-01,  2.88114415e-01,
        -2.00090241e-02, -3.32482019e-01, -6.28737914e-02,
         6.24200777e-02,  1.75785695e-04,  3.15757467e+00,
        -3.15785260e+00, -2.34102020e-01,  5.62321043e-01,
        -3.28496951e-01,  4.28308724e-02,  2.85388151e-01,
        -3.28496951e-01,  3.10702512e-01,  1.75165113e-02,
        -3.28496951e-01, -1.92762210e-01,  3.28219023e-01,
        -1.35734741e-01, -5.38680878e-02,  5.35901598e-02,
         2.56188752e-01,  2.28409279e-01,  1.00522355e-01,
        -4.38994598e-02, -1.08076760e-01, -1.10718851e-01,
        -5.31717712e-02, -3.25480122e-02, -1.67131153e-02,
        -7.78341051e-02, -1.27023999e-02, -1.29733840e-01,
         3.13775544e-02,  2.44692150e-02,  2.92557493e-03,
        -2.12591105e-02, -6.86290406e-04, -8.28377789e-03,
        -9.71267279e-03, -4.40998340e-03, -1.18669831e-02,
        -4.35301283e-03,  1.20760227e-02, -1.05544640e-02,
         2.14929160e-02,  1.30320711e-02,  8.20768887e-0

In [67]:
clf.intercept_

array([2.7836543])