In [1]:
import utils
import numpy as np
import pandas as pd
import datetime
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from pygam import LogisticGAM

%matplotlib notebook

# Pieces from which the input filename is located automatically
dataFolder, plotsFolder = 'data', 'plots'
state = 'TX'
policeDept = 'statewide'  # 'statewide' means state patrol

# Names of the columns to load
colNamesList = ['date', 'subject_race', 'citation_issued']

In [2]:
# Column-name -> dtype map (the one passed to utils.load_data).
# TODO: build this automatically. The attempt below currently throws a lot of
# warnings/errors, so the map is maintained by hand for now.
# df = pd.read_csv(csvFilepath, nrows=5, names=colNamesList)
# dtypeDict = df.dtypes.to_dict()
# dtypeDict

# All known columns, in the order they appear in the resulting dict.
_allColumns = [
    'raw_row_number', 'date', 'time', 'location', 'lat', 'lng',
    'county_name', 'district', 'precinct', 'region',
    'subject_race', 'subject_sex', 'officer_id_hash',
    'type', 'violation', 'citation_issued', 'warning_issued', 'outcome',
    'contraband_found', 'contraband_drugs', 'contraband_weapons',
    'search_conducted', 'search_vehicle', 'search_basis',
    'vehicle_color', 'vehicle_make', 'vehicle_model', 'vehicle_type',
    'vehicle_year',
    'raw_HA_RACE_SEX',
    'raw_HA_SEARCH_PC_boolean',
    'raw_HA_SEARCH_CONCENT_boolean',
    'raw_HA_INCIDTO_ARREST_boolean',
    'raw_HA_VEHICLE_INVENT_boolean',
]

# Only these columns get a non-'object' dtype; everything else is 'object'.
_nonObjectDtypes = {
    'raw_row_number': 'int64',
    'lat': 'float64',
    'lng': 'float64',
    'region': 'float64',
    'citation_issued': 'bool',
    'warning_issued': 'bool',
    'vehicle_year': 'float64',
}

dtypeDict = {col: _nonObjectDtypes.get(col, 'object') for col in _allColumns}

# Notes on the data:
# search_vehicle only has False and nan
# type is just vehicular
# violation column is of interest - reason for stop (split by |?)

In [3]:
# Load the data through utils.load_data (given the dtype map and the list of
# columns), then run the result through utils.remove_empty_rows for
# 'subject_race' -- presumably dropping stops with no recorded race; confirm
# against utils.
df = utils.remove_empty_rows(
    utils.load_data(
        dataFolder,
        state,
        policeDept,
        dtypeDict=dtypeDict,
        colNames=colNamesList,
    ),
    'subject_race',
)

In [4]:
# Replace the 'date' column by the calendar year of each stop and rename it
# to 'year', keeping its position among the columns.
# The guard makes the cell safe to re-run: once 'date' has been renamed, a
# second execution would otherwise fail with KeyError: 'date'.
if 'date' in df.columns:
    df = (
        df.assign(date=pd.to_datetime(df['date']).dt.year)
          .rename(columns={'date': 'year'})
    )

In [5]:
# Print the number of rows per year.
# A single grouped pass replaces re-filtering the whole frame once per year.
# sort=False keeps the years in order of first appearance, i.e. the same order
# df.year.unique() yields; rows whose year is missing are not counted
# (groupby's default).
for stop_year, n_stops in df.groupby('year', sort=False).size().items():
    print(stop_year, n_stops)

2006 2673511
2007 2405744
2008 2434970
2009 2370986
2010 2452498
2011 2511664
2012 2368995
2013 2076401
2014 1820773
2015 1683954
2016 1832113
2017 2197362


# Create Train/Test sets for one year

In [6]:
# Restrict the data to a single year
year = 2017
year_df = df[df['year'] == year]

# Target: citation_issued cast to 0/1 integers
y = year_df['citation_issued'].to_numpy().astype(int)

# One-hot encode subject_race. The category list is taken from the full
# frame (every year), not only from the selected year.
enc = OneHotEncoder(
    categories=[df['subject_race'].unique().tolist()]
)
X = enc.fit_transform(
    year_df['subject_race'].to_numpy().reshape(-1, 1)
)

# Hold out 15% of the rows for testing; the fixed random_state keeps the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=0,
)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(1867757, 5) (1867757,) (329605, 5) (329605,)


# Logistic GAM

In [9]:
# Fit the GAM on the first 90,000 training rows only (presumably to keep the
# fit tractable -- TODO confirm). The sparse matrix is sliced *before* being
# densified so only those rows are materialised, and toarray() yields a plain
# ndarray instead of the np.matrix that todense() returns.
gam = LogisticGAM().fit(X_train[:90000].toarray(), y_train[:90000])

In [10]:
# training accuracy, measured on the first 90,000 training rows
# Slice the sparse matrix first and densify only that slice; toarray() gives
# a plain ndarray rather than the np.matrix returned by todense().
y_train_pred = gam.predict(X_train[:90000].toarray())
np.mean(y_train_pred == y_train[:90000])

0.6337444444444444

In [11]:
# testing accuracy, measured on the whole held-out test set
# toarray() gives a plain ndarray rather than the np.matrix returned by
# todense().
y_test_pred = gam.predict(X_test.toarray())
np.mean(y_test_pred == y_test)

0.635724579420822