In [12]:
import os
import numpy as np
import glob
import pandas as pd
import matplotlib.pyplot as plt
import datetime
import seaborn as sns
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import train_test_split

%matplotlib notebook

# Automating the process of finding the filename
dataFolder = 'data'
plotsFolder = 'plots'
state = 'TX'
policeDept = 'statewide' # 'statewide' means state patrol

# Pattern searched: <dataFolder>/<state>/<state lowercased>_<policeDept>*.csv
csvPattern = os.path.join(dataFolder, state, '{}_{}*.csv'.format(state.lower(), policeDept))
# glob returns matches in arbitrary order; sorting makes the pick deterministic
# when more than one file fits the pattern.
csvMatches = sorted(glob.glob(csvPattern))
if not csvMatches:
    # Report the pattern that was searched instead of failing with a bare IndexError.
    raise FileNotFoundError('No CSV file matches {!r}'.format(csvPattern))
csvFilepath = csvMatches[0]

# Subset of CSV columns to load (passed as `usecols` when the file is read)
colNamesList = ['date', 'time', 'subject_race', 'citation_issued']

In [4]:
# columnName -> dtype mapping handed to pd.read_csv.
# Deriving it automatically (read the first 5 rows, then df.dtypes.to_dict())
# was tried, but that currently throws a lot of warnings/errors, so the
# mapping is spelled out explicitly instead.

# Column names, in the order the mapping lists them.
_csvColumns = [
    'raw_row_number', 'date', 'time', 'location', 'lat', 'lng',
    'county_name', 'district', 'precinct', 'region',
    'subject_race', 'subject_sex', 'officer_id_hash',
    'type', 'violation', 'citation_issued', 'warning_issued', 'outcome',
    'contraband_found', 'contraband_drugs', 'contraband_weapons',
    'search_conducted', 'search_vehicle', 'search_basis',
    'vehicle_color', 'vehicle_make', 'vehicle_model', 'vehicle_type', 'vehicle_year',
    'raw_HA_RACE_SEX', 'raw_HA_SEARCH_PC_boolean', 'raw_HA_SEARCH_CONCENT_boolean',
    'raw_HA_INCIDTO_ARREST_boolean', 'raw_HA_VEHICLE_INVENT_boolean',
]

# Columns that are NOT read as plain 'object'.
_nonObjectDtypes = {
    'raw_row_number': 'int64',
    'lat': 'float64',
    'lng': 'float64',
    'region': 'float64',
    'vehicle_year': 'float64',
    'citation_issued': 'bool',
    'warning_issued': 'bool',
}

# Everything defaults to 'object' unless overridden above.
dtypeDict = {col: _nonObjectDtypes.get(col, 'object') for col in _csvColumns}


        
# search_vehicle only has False and nan
# type is just vehicular
# violation column is of interest - reason for stop (split by |?)

In [36]:
# Read the CSV in pieces of 1,000,000 rows, loading only the columns in
# colNamesList. With `chunksize` set, read_csv returns an iterator of
# DataFrames rather than a single frame.
chunk = pd.read_csv(
    csvFilepath,
    usecols=colNamesList,
    dtype=dtypeDict,
    chunksize=1000000,
)
# Stitch the pieces back together into one DataFrame.
df = pd.concat(chunk)

In [37]:
# Keep only rows whose 'subject_race' is neither the literal 'unknown' nor missing (NaN)
df = df[df['subject_race'].ne('unknown') & df['subject_race'].notna()]

In [38]:
# Keep only the year from `date` and the hour from `time`, renaming the
# columns to `year` / `hour`.
# The `in df.columns` guards keep this cell re-runnable: once the columns have
# been renamed, an unguarded second run would fail with KeyError('date').
# `assign` builds a new frame instead of writing into `df`, which may be a
# filtered slice (in-place column assignment on one triggers
# SettingWithCopyWarning). Column order is preserved.
if 'date' in df.columns:
    df = df.assign(date=pd.to_datetime(df['date']).dt.year).rename(columns={'date': 'year'})
if 'time' in df.columns:
    df = df.assign(time=pd.to_datetime(df['time']).dt.hour).rename(columns={'time': 'hour'})

# map races to numbers
race_map = {'white': 0, 'hispanic': 1, 'black': 2, 'asian/pacific islander': 3, 'other': 4}
# Map only while the column still holds the text labels: mapping the integer
# codes a second time would match nothing and turn the whole column into NaN.
if not pd.api.types.is_numeric_dtype(df['subject_race']):
    race_codes = df['subject_race'].map(race_map)
    # Labels absent from race_map come back as NaN; report them here rather
    # than letting NaN flow silently into the model. Rows that were already
    # missing before the mapping are not treated as an error.
    unmapped = df.loc[race_codes.isna(), 'subject_race'].dropna().unique()
    if len(unmapped) > 0:
        raise ValueError('subject_race values not in race_map: {}'.format(list(unmapped)))
    df = df.assign(subject_race=race_codes)

# map citation issued to numbers (False -> 0, True -> 1); safe to repeat
df = df.assign(citation_issued=df['citation_issued'].astype(int))

In [57]:
# Preview the first rows of the prepared frame
df.head()

Unnamed: 0,year,hour,subject_race,citation_issued
0,2006,0,0,False
1,2006,0,0,True
2,2006,0,1,True
3,2006,0,0,False
4,2006,0,0,True


In [40]:
# Features: every column except the target. Target: kept as a one-column DataFrame.
# NOTE(review): subject_race enters X as a single integer code (see race_map), which
# imposes an ordering on a nominal category — consider one-hot encoding; confirm intent.
X = df.drop(columns='citation_issued')
y = df[['citation_issued']]

In [43]:
# Hold out 15% of the rows as a test set; the fixed random_state makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=0)

In [50]:
# Fit a cross-validated logistic regression on the training split.
# y_train is a one-column DataFrame, so its column is passed as the 1-D target.
# NOTE(review): the features are passed unscaled (year is ~2000, the others are
# small integers) — consider standardizing and confirm the solver converges.
clf = LogisticRegressionCV(random_state=0)
clf.fit(X_train, y_train['citation_issued'])

LogisticRegressionCV(random_state=0)

In [53]:
# Training accuracy: share of training rows whose predicted label equals the true label
y_train_pred = clf.predict(X_train)
# Sanity check: prediction count and label count, one per line
print(len(y_train_pred), len(y_train), sep='\n')
(y_train_pred == y_train['citation_issued']).mean()

22804625
22804625


0.6256219078366778

In [54]:
# Accuracy on the held-out test split
y_test_pred = clf.predict(X_test)
(y_test_pred == y_test['citation_issued']).mean()

0.6254439851841765

In [56]:
# Fitted coefficients: one value per column of X_train, in column order
clf.coef_

array([[-0.00024625, -0.00853047,  0.16960716]])