In [1]:
import utils
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from pygam import LogisticGAM

%matplotlib notebook

# Automating the process of finding the filename
dataFolder = 'data'
plotsFolder = 'plots'
state = 'TX'
policeDept = 'statewide'  # 'statewide' means state patrol

# Columns to load, grouped by topic.
# 'district', 'precinct' and 'region' are currently excluded.
colNamesList = [
    # when / where
    'date', 'time', 'location', 'lat', 'lng', 'county_name',
    # who
    'subject_race', 'subject_sex', 'officer_id_hash',
    # stop and its outcome
    'type', 'violation', 'citation_issued', 'warning_issued', 'outcome',
    # contraband / search
    'contraband_found', 'contraband_drugs', 'contraband_weapons',
    'search_conducted', 'search_vehicle', 'search_basis',
    # vehicle
    'vehicle_color', 'vehicle_make', 'vehicle_model', 'vehicle_type',
    'vehicle_year',
]

In [2]:
# Column name -> dtype mapping handed to the loader.
# TODO: build this automatically, e.g. from
#   pd.read_csv(csvFilepath, nrows=5, names=colNamesList).dtypes.to_dict()
# That approach currently raises many warnings/errors, so the mapping is
# written out by hand.
dtypeDict = {
    'raw_row_number': 'int64',
    # when / where
    'date': 'object',
    'time': 'object',
    'location': 'object',
    'lat': 'float64',
    'lng': 'float64',
    'county_name': 'object',
    'district': 'object',
    'precinct': 'object',
    'region': 'float64',
    # who
    'subject_race': 'object',
    'subject_sex': 'object',
    'officer_id_hash': 'object',
    # stop and its outcome
    'type': 'object',
    'violation': 'object',
    'citation_issued': 'bool',
    'warning_issued': 'bool',
    'outcome': 'object',
    # contraband / search
    'contraband_found': 'object',
    'contraband_drugs': 'object',
    'contraband_weapons': 'object',
    'search_conducted': 'object',
    'search_vehicle': 'object',
    'search_basis': 'object',
    # vehicle
    'vehicle_color': 'object',
    'vehicle_make': 'object',
    'vehicle_model': 'object',
    'vehicle_type': 'object',
    'vehicle_year': 'float64',
    # raw source columns
    'raw_HA_RACE_SEX': 'object',
    'raw_HA_SEARCH_PC_boolean': 'object',
    'raw_HA_SEARCH_CONCENT_boolean': 'object',
    'raw_HA_INCIDTO_ARREST_boolean': 'object',
    'raw_HA_VEHICLE_INVENT_boolean': 'object',
}

# Notes on the data:
# - search_vehicle only has False and nan
# - type is just vehicular
# - violation column is of interest - reason for stop (split by |?)

In [3]:
# Load the selected columns, then filter on 'subject_race'
# (presumably drops rows where it is empty -- see utils.remove_empty_rows).
df = utils.remove_empty_rows(
    utils.load_data(
        dataFolder,
        state,
        policeDept,
        dtypeDict=dtypeDict,
        colNames=colNamesList,
    ),
    'subject_race',
)

In [4]:
# Single feature (subject race) as an (n, 1) column; target is the
# citation flag cast to 0/1.
race_column = df['subject_race'].to_numpy().reshape(-1, 1)
y = df['citation_issued'].to_numpy().astype(int)

# One-hot encode race. The category order is the order of first appearance
# in the data, which also fixes the column order of X.
race_categories = df['subject_race'].unique().tolist()
enc = OneHotEncoder(categories=[race_categories])
X = enc.fit_transform(race_column)

In [6]:
# Hold out 15% of the rows for testing; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.15, random_state=0
)

# Drop the first one-hot column from both splits
# (presumably so that category serves as the reference level).
X_train, X_test = X_train[:, 1:], X_test[:, 1:]

print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(22804625, 4) (22804625,) (4024346, 4) (4024346,)


# Logistic Regression with built-in Cross Validation
l2 penalty and lbfgs solver

In [7]:
# Default settings apart from the seed; fit() returns the estimator itself,
# so the trailing name still echoes the fitted model as the cell output.
clf = LogisticRegressionCV(random_state=0).fit(X_train, y_train)
clf

LogisticRegressionCV(random_state=0)

In [8]:
# training accuracy: share of training rows whose prediction matches the label
y_train_pred = clf.predict(X_train)
(y_train_pred == y_train).mean()

0.6262521308725751

In [9]:
# test accuracy: share of held-out rows whose prediction matches the label
y_test_pred = clf.predict(X_test)
(y_test_pred == y_test).mean()

0.6260950226446732

In [10]:
# Fitted coefficients (log-odds scale), one per feature column used in fit()
clf.coef_

array([[0.3167457 , 0.27873825, 0.25599212, 0.05141742]])

In [11]:
# Fitted intercept (log-odds scale)
clf.intercept_

array([-0.64852582])

In [12]:
# Category order used by the encoder; the one-hot columns follow this order
enc.categories_

[array(['white', 'hispanic', 'black', 'asian/pacific islander', 'other'],
       dtype=object)]

In [13]:
# Exponentiate the log-odds coefficients to read them as odds ratios
np.exp(clf.coef_)

array([[1.37265347, 1.32146141, 1.29174255, 1.05276225]])

In [14]:
# Exponentiate the intercept: the odds when every feature column is zero
np.exp(clf.intercept_)

array([0.52281593])

# Logistic Regression with built-in Cross Validation
l2 penalty and sag solver

In [11]:
# Same model family as above, but optimised with the 'sag' solver (l2 penalty);
# fit() returns the estimator, so the trailing name echoes the fitted model.
clf2 = LogisticRegressionCV(
    solver='sag',
    penalty='l2',
    random_state=0,
).fit(X_train, y_train)
clf2

LogisticRegressionCV(random_state=0, solver='sag')

In [12]:
# training accuracy
y_train_pred = clf2.predict(X_train)
np.mean(y_train_pred == y_train)

0.6262521308725751

In [13]:
# test accuracy
y_test_pred = clf2.predict(X_test)
np.mean(y_test_pred == y_test)

0.6260950226446732

In [15]:
# Fitted coefficients of the sag-solver model (log-odds scale).
# NOTE(review): the saved output lists 5 coefficients although X_train was
# printed with 4 columns after the split -- looks like out-of-order
# execution; re-run from a fresh kernel to confirm.
clf2.coef_

array([[-0.21279648,  0.10758595,  0.07197385,  0.06741477, -0.03416888]])

# Logistic Regression with built-in Cross Validation
l1 penalty and saga solver

In [16]:
# l1-penalised variant; 'saga' is the solver chosen for it here.
# fit() returns the estimator, so the trailing name echoes the fitted model.
clf3 = LogisticRegressionCV(
    solver='saga',
    penalty='l1',
    random_state=0,
).fit(X_train, y_train)
clf3

LogisticRegressionCV(penalty='l1', random_state=0, solver='saga')

In [17]:
# training accuracy
y_train_pred = clf3.predict(X_train)
np.mean(y_train_pred == y_train)

0.6262521308725751

In [18]:
# test accuracy
y_test_pred = clf3.predict(X_test)
np.mean(y_test_pred == y_test)

0.6260950226446732

In [19]:
# Fitted coefficients of the l1/saga model (log-odds scale); the l1 penalty
# can shrink coefficients to exactly zero.
# NOTE(review): the saved output lists 5 coefficients although X_train was
# printed with 4 columns after the split -- looks like out-of-order
# execution; re-run from a fresh kernel to confirm.
clf3.coef_

array([[-0.28190316,  0.03056305,  0.        ,  0.        ,  0.        ]])

# Logistic Regression with GridSearchCV

In [None]:
# Hyper-parameter grid, split by penalty so that every combination is one
# LogisticRegression actually supports. Crossing all penalties with all
# solvers only adds fits that fail and score NaN:
#   - l2 is supported by all five solvers
#   - l1 is only supported by liblinear and saga
#   - elasticnet is only supported by saga and needs an explicit l1_ratio
C_values = [1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1, 1e2, 1e3, 1e4]
parameters = [
    {
        'penalty': ['l2'],
        'C': C_values,
        'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    },
    {
        'penalty': ['l1'],
        'C': C_values,
        'solver': ['liblinear', 'saga'],
    },
    {
        'penalty': ['elasticnet'],
        'C': C_values,
        'solver': ['saga'],
        # 0.5 is an even L1/L2 mix; add more values here to tune the ratio
        'l1_ratio': [0.5],
    },
]
logreg = LogisticRegression(max_iter=400, random_state=0)
clf1 = GridSearchCV(logreg, parameters)
clf1.fit(X_train, y_train)

In [None]:
# training accuracy
y_train_pred1 = clf1.predict(X_train)
np.mean(y_train_pred1 == y_train)

In [None]:
# test accuracy
y_test_pred1 = clf1.predict(X_test)
np.mean(y_test_pred1 == y_test)

In [None]:
# cv_results_ is a dict of parallel arrays; show it as a table with the
# best-ranked parameter combinations first instead of dumping the raw dict.
pd.DataFrame(clf1.cv_results_).sort_values('rank_test_score')

In [None]:
# GridSearchCV itself has no coef_ attribute; the coefficients live on the
# refitted best model.
clf1.best_estimator_.coef_

# Logistic GAM

In [None]:
# Densify the sparse training matrix before the grid search. toarray()
# yields a plain ndarray, whereas todense() yields the legacy np.matrix type.
gam = LogisticGAM().gridsearch(X_train.toarray(), y_train)

N/A% (0 of 11) |                         | Elapsed Time: 0:00:00 ETA:  --:--:--

In [None]:
# training accuracy
y_train_pred2 = gam.predict(X_train)
np.mean(y_train_pred2 == y_train)

In [None]:
# test accuracy
y_test_pred2 = gam.predict(X_test)
np.mean(y_test_pred2 == y_test)

In [14]:
# Inspect the training feature matrix (type, shape, stored elements).
# NOTE(review): the saved output shows 5 columns, while the split cell printed
# 4 -- the notebook appears to have been run out of order; confirm with
# Restart & Run All.
X_train

<22804625x5 sparse matrix of type '<class 'numpy.float64'>'
	with 22804625 stored elements in Compressed Sparse Row format>