In [1]:
import utils
import time
import numpy as np
import pandas as pd
import datetime as dt
import matplotlib.pyplot as plt
from PyAstronomy import pyasl
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from pygam import LogisticGAM

%matplotlib notebook

# Automating the proess of finding the filename
dataFolder = 'data'
plotsFolder = 'plots'
state = 'TX'
policeDept = 'statewide' # 'statewide' means state patrol

# Choose column names to load
colNamesList = ['date',
                'time',
#                 'location',
#                 'lat', # about 40% are nan
#                 'lng',
                'county_name', # just use categorical metro, micro, other for all location vars
#                 'district',
#                 'precinct',
#                 'region',
                'subject_race',
                'subject_sex',
#                 'officer_id_hash',
#                 'type',
                'violation',
                'citation_issued',
                'warning_issued', 
                'contraband_found',
                'contraband_drugs',
                'contraband_weapons',
                'search_conducted',
                'search_vehicle']
                
# Too many nans or categories
#                 'outcome',
#                 'search_basis',
#                 'vehicle_color',
#                 'vehicle_make',
#                 'vehicle_model',
#                 'vehicle_type',
#                 'vehicle_year']

In [2]:
# Automating the creation of rowName:dtype dict
# This throws up a lot of stupid warnings/errors right now!
# df = pd.read_csv(csvFilepath, nrows=5, names=colNamesList)
# dtypeDict = df.dtypes.to_dict()
# dtypeDict

dtypeDict = {'raw_row_number': 'int64', 
             'date': 'object',
             'time': 'object',
             'location': 'object',
             'lat': 'float64',
             'lng': 'float64',
             'county_name': 'object',
             'district': 'object',
             'precinct': 'object',
             'region': 'float64',
             'subject_race': 'object',
             'subject_sex': 'object',
             'officer_id_hash': 'object',
             'type': 'object',
             'violation': 'object',
             'citation_issued': 'bool',
             'warning_issued': 'bool',
             'outcome': 'object',
             'contraband_found': 'object',
             'contraband_drugs': 'object',
             'contraband_weapons': 'object',
             'search_conducted': 'object',
             'search_vehicle': 'object',
             'search_basis': 'object',
             'vehicle_color': 'object',
             'vehicle_make': 'object',
             'vehicle_model': 'object',
             'vehicle_type': 'object',
             'vehicle_year': 'float64',
             'raw_HA_RACE_SEX': 'object',
             'raw_HA_SEARCH_PC_boolean': 'object',
             'raw_HA_SEARCH_CONCENT_boolean': 'object',
             'raw_HA_INCIDTO_ARREST_boolean': 'object',
             'raw_HA_VEHICLE_INVENT_boolean': 'object'}


        
# search_vehicle only has False and nan
# type is just vehicular
# violation column is of interest - reason for stop (split by |?)

In [3]:
start = time.time()
df = utils.load_data(dataFolder, state, policeDept, dtypeDict=dtypeDict, colNames=colNamesList)
df = utils.remove_empty_rows(df, 'subject_race')
print('loading time: %d'%(time.time()-start))

loading time: 237


# Preprocessing
violations

In [4]:
df = utils.remove_empty_rows(df, 'violation')

In [5]:
df['violation'] = [s.lower() for s in df['violation']]
df = df[df['violation'].str.contains('speed', regex=False)]
len(df)

16048469

In [6]:
df.drop('violation', 1, inplace=True)

date & time

In [7]:
df['year'] = pd.to_datetime(df['date']).dt.year
df['yearfrac'] = [pyasl.decimalYear(d) for d in pd.to_datetime(df['date'])]
df['yearfrac'] = df['yearfrac'] - df['year']

In [8]:
df['minute'] = pd.to_datetime(df['time']).dt.minute
df['hour'] = pd.to_datetime(df['time']).dt.hour
df['time'] = df['hour'] + df['minute'] / 60

In [9]:
scaler = MinMaxScaler()
scaler.fit(np.array(df['time']).reshape(-1,1))
df['time'] = scaler.transform(np.array(df['time']).reshape(-1,1))

In [10]:
df.drop(['hour', 'minute', 'date'], 1, inplace=True)

county name

In [11]:
county_df = pd.read_csv('2014-2018.csv')
county_df = county_df[county_df['State']=='Texas']

In [12]:
county_df = county_df.filter(items=['Metropolitan Status', 'County Name'])
county_df['Metropolitan Status'].unique()

array(['Micropolitan', 'Metropolitan', 'Non core'], dtype=object)

In [13]:
df = utils.remove_empty_rows(df, 'county_name')

In [14]:
df['county_name'] = [name[:-7] for name in df['county_name']]

In [15]:
df = df.join(county_df.set_index('County Name'), on='county_name')
df.drop('county_name', 1, inplace=True)
df.rename({'Metropolitan Status':'county_type'}, inplace=True)

In [16]:
for col in df.columns:
    df[col].fillna('na', inplace=True)

df.head()

Unnamed: 0,time,subject_race,subject_sex,citation_issued,warning_issued,contraband_found,contraband_drugs,contraband_weapons,search_conducted,search_vehicle,year,yearfrac,Metropolitan Status
1,0.0,white,male,True,False,na,na,na,False,False,2006,0.0,Non core
3,0.0,white,male,False,True,na,na,na,False,False,2006,0.0,Metropolitan
5,0.0,hispanic,male,False,True,na,na,na,False,False,2006,0.0,Non core
6,0.0,hispanic,male,False,True,na,na,na,False,False,2006,0.0,Non core
8,0.0,hispanic,male,True,False,na,na,na,False,False,2006,0.0,Non core


# One hot encode

In [17]:
y = np.array(df['citation_issued']).astype(int)

cont_df = df.filter(items=['time', 'yearfrac', 'year'])
cat_df = df.drop(['time', 'yearfrac', 'year', 'citation_issued'], 1)
X_cont = np.array(cont_df)
X_cat = np.array(cat_df)

In [18]:
start = time.time()
enc = OneHotEncoder()
enc.fit(X_cat)
X_cat = enc.transform(X_cat)
end = time.time()

enc.categories_

[array(['asian/pacific islander', 'black', 'hispanic', 'other', 'white'],
       dtype=object),
 array(['female', 'male', 'na'], dtype=object),
 array([False, True], dtype=object),
 array(['FALSE', 'TRUE', 'na'], dtype=object),
 array(['FALSE', 'TRUE', 'na'], dtype=object),
 array(['FALSE', 'TRUE', 'na'], dtype=object),
 array(['FALSE', 'TRUE', 'na'], dtype=object),
 array(['FALSE', 'na'], dtype=object),
 array(['Metropolitan', 'Micropolitan', 'Non core', 'na'], dtype=object)]

In [19]:
print('encoding time: %d'%(end-start))

encoding time: 52


In [20]:
import scipy
def delete_rows_csr(mat, indices):
    """
    Remove the rows denoted by ``indices`` form the CSR sparse matrix ``mat``.
    """
    if not isinstance(mat, scipy.sparse.csr_matrix):
        raise ValueError("works only for CSR format -- use .tocsr() first")
    indices = list(indices)
    mask = np.ones(mat.shape[0], dtype=bool)
    mask[indices] = False
    return mat[mask]


In [21]:
from scipy.sparse import csr_matrix, hstack
X_cat = delete_rows_csr(X_cat.copy().T.tocsr(), [4]).T.tocsr()
X = csr_matrix(hstack([X_cat,csr_matrix(X_cont)]))

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=0)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(13641196, 30) (13641196,) (2407271, 30) (2407271,)


# Logisitic Regression

In [23]:
start = time.time()
clf = LogisticRegressionCV(random_state=0)
clf.fit(X_train, y_train)
end = time.time()
print('training time: %d'%(end-start))

training time: 714


In [24]:
# training accuracy
y_train_pred = clf.predict(X_train)
np.mean(y_train_pred == y_train)

0.8744580753769684

In [25]:
# test accuracy
y_test_pred = clf.predict(X_test)
np.mean(y_test_pred == y_test)

0.8741716242167998

In [28]:
cols = []
for l in enc.categories_:
    cols.extend(l)
cols.extend(cont_df.columns)
cols.remove('white')

print(cat_df.columns)

print('white', clf.intercept_, np.exp(clf.intercept_))
for i in range(X.shape[1]):
    print(cols[i], clf.coef_.T[i], np.exp(clf.coef_.T[i]))

       'contraband_drugs', 'contraband_weapons', 'search_conducted',
       'search_vehicle', 'Metropolitan Status'],
      dtype='object')
white [0.00373567] [1.00374265]
asian/pacific islander [0.06004772] [1.06188721]
black [0.38474363] [1.46923761]
hispanic [0.57667698] [1.78011324]
other [0.00344933] [1.00345529]
female [-0.05817201] [0.94348765]
male [0.06177547] [1.06372348]
na [2.11144402e-05] [1.00002111]
False [3.26558646] [26.19546909]
True [-3.26196188] [0.03831316]
FALSE [0.09191531] [1.09627198]
TRUE [0.20706746] [1.23006555]
na [-0.29535818] [0.74426496]
FALSE [0.18382587] [1.20180653]
TRUE [0.1151569] [1.12204947]
na [-0.29535818] [0.74426496]
FALSE [0.29189887] [1.3389676]
TRUE [0.0070839] [1.00710905]
na [-0.29535818] [0.74426496]
FALSE [-0.10608813] [0.89934538]
TRUE [0.29898277] [1.34848639]
na [-0.18927005] [0.82756299]
FALSE [-0.04068585] [0.9601307]
na [0.04431044] [1.04530681]
Metropolitan [0.26483026] [1.30320976]
Micropolitan [-0.14580343] [0.86432759]
Non cor

# Logistic GAM

In [None]:
start = time.time()
gam = LogisticGAM().fit(X_train.todense(), y_train)
end = time.time()
print('gam train time: %d'%(end-start))

In [None]:
# training accuracy
y_train_pred = gam.predict(X_train.todense())
np.mean(y_train_pred == y_train)

In [None]:
# testing accuracy
y_test_pred = gam.predict(X_test.todense())
np.mean(y_test_pred == y_test)

In [None]:
print(cat_df.columns)

print('white', gam.coef_[0])
for i in range(X.shape[1]):
    print(cols[i], gam.coef_.T[i+1], np.exp(gam.coef_.T[i+1]))