In [293]:
import csv
import pandas as pd
import numpy as np

In [294]:
data = pd.read_csv('data.csv')

In [295]:
data.head()

Unnamed: 0,raw_row_number,date,time,location,lat,lng,beat,subject_age,subject_race,subject_sex,...,contraband_weapons,search_conducted,search_basis,reason_for_stop,use_of_force_description,raw_subject_sdrace,raw_subject_resultofencounter,raw_subject_searchconducted,raw_subject_typeofsearch,raw_subject_resultofsearch
0,17213,,01:33:00,E. 28th St. & Park BLVD,37.803084,-122.237247,,,asian/pacific islander,female,...,,False,,Traffic Violation,,A,"Citation,",No,,
1,17214,,14:48:00,1759 SEMINARY AV,37.767568,-122.19682,,,black,male,...,False,True,other,Probation/Parole,,B,"FI Report,",Yes,"Prob./Parole,",
2,1,2013-04-01,00:00:00,31st St And Mlk Jr Way,37.820599,-122.270734,,,white,male,...,False,True,other,Traffic Violation,,W,"Warning,",Yes,"Prob./Parole,",
3,2,2013-04-01,00:01:00,3000 Block Of San Pablo Ave,37.821246,-122.276488,,,black,male,...,,False,,Traffic Violation,,B,"FI Report,",No,,
4,19,2013-04-01,10:41:00,12th St/Broadway,37.802937,-122.271738,,,hispanic,female,...,,False,,Probable Cause,,H,"Citation,",No,,


In [296]:
set(data['subject_race'])

{'asian/pacific islander', 'black', 'hispanic', 'other', 'white'}

In [297]:
data.columns

Index(['raw_row_number', 'date', 'time', 'location', 'lat', 'lng', 'beat',
       'subject_age', 'subject_race', 'subject_sex', 'officer_assignment',
       'contraband_found', 'contraband_drugs', 'contraband_weapons',
       'search_conducted', 'search_basis', 'reason_for_stop',
       'use_of_force_description', 'raw_subject_sdrace',
       'raw_subject_resultofencounter', 'raw_subject_searchconducted',
       'raw_subject_typeofsearch', 'raw_subject_resultofsearch'],
      dtype='object')

In [298]:
data['action'] = data['citation_issued'] | data['arrest_made']

In [299]:
data[['subject_race', 'citation_issued', 'arrest_made', 'action']].head()

Unnamed: 0,subject_race,citation_issued,arrest_made,action
0,asian/pacific islander,True,False,True
1,black,False,False,False
2,white,False,False,False
3,black,False,False,False
4,hispanic,True,False,True


In [300]:
inputs = ['subject_race', 'subject_sex', 'lat', 'lng', 'subject_age', 'reason_for_stop']
label = ['action', 'arrest_made']

In [301]:
data['subject_age'].median()

29.0

In [302]:
data['subject_age'] = data['subject_age'].fillna(data['subject_age'].median())
old_data = data.copy()
data = data[inputs + label].dropna(axis=0)

In [303]:
#set(data['reason_for_stop'])

In [304]:
races = list(set(data['subject_race']))
sexes = list(set(data['subject_sex']))
reasons = list(set(data['reason_for_stop']))

In [305]:
data['clean_lng'] = data['lng'] - data['lng'].mean() 
data['clean_lat'] = data['lat'] - data['lat'].mean()
data['clean_age'] = data['subject_age'] - data['subject_age'].mean()

In [306]:
data['race'] = data.apply(lambda row: races.index(row['subject_race']), axis=1)
data['sex']= data.apply(lambda row: sexes.index(row['subject_sex']), axis=1)
data['reason']= data.apply(lambda row: reasons.index(row['reason_for_stop']), axis=1)

In [499]:
inputs = ['race', 'sex', 'clean_lat', 'clean_lng', 'clean_age', 'reason_for_stop']
#inputs = ['race', 'sex', 'clean_age']
label = ['action']

In [500]:
reasons = list(set(sum([x.replace(',', '').split('|') for x in reasons], [])))
reasons

['Other-Consensual',
 'Traffic Violation',
 'Probable Cause',
 'Consensual Encounter',
 'Reasonable Suspicion',
 'Probation/Parole']

## Load Data

In [501]:
import torch

In [502]:
inputs_npy = data[inputs].to_numpy()
labels_npy = data[label].to_numpy().astype('int')


In [503]:
new_inputs_npy = []
for inp in inputs_npy:
    race = inp[0]
    race_oh = np.zeros(5)
    race_oh[race] = 1
    middle = inp[1:-1]
    reason = np.zeros(6)
    for r in inp[-1].replace(',', '').split("|"):
        reason[reasons.index(r)] = 1
    new_inputs_npy.append(np.concatenate((race_oh, middle, reason)))
    

In [504]:
new_inputs_npy = np.array(new_inputs_npy).astype(np.float32)
new_inputs_npy[0]

array([ 0.        ,  0.        ,  0.        ,  0.        ,  1.        ,
        1.        ,  0.01568815, -0.00902605, -0.96774095,  0.        ,
        1.        ,  0.        ,  0.        ,  0.        ,  0.        ],
      dtype=float32)

In [505]:
data_npy = np.array(list(zip(new_inputs_npy, labels_npy)))

In [506]:
inds = np.arange(len(data))
np.random.shuffle(inds)
train_inds = inds[:int(len(data)*0.8)]
test_inds = inds[int(len(data)*0.8):]
train_data = [list(x) for x in data_npy[train_inds]][:50000]
test_data = [list(x) for x in data_npy[test_inds]]

In [507]:
train_loader = torch.utils.data.DataLoader(train_data, batch_size=128)
test_loader = torch.utils.data.DataLoader(test_data, batch_size=128)

In [508]:
#train_data

## Models

In [509]:
import torch
import torch.nn.functional as F

In [589]:
inp_size=15
mlp_model = torch.nn.Sequential(torch.nn.Linear(inp_size, 128),
                                  torch.nn.ReLU(),
                                  torch.nn.Linear(128, 128),
                                  torch.nn.ReLU(),
                                  torch.nn.Linear(128, 1),
                                  torch.nn.Sigmoid())
mlp_optim = torch.optim.Adam(mlp_model.parameters(), lr=5e-4)

In [610]:
mlp_optim = torch.optim.Adam(mlp_model.parameters(), lr=1e-4)

## Training

In [590]:
def train_model(model, optim, train_loader, test_loader, activation='sigmoid', n_epochs=20):
    train_losses = []
    val_losses = []
    for epoch in range(n_epochs):
        epoch_losses = []
        for x, y in train_loader:
            #print(x)
            y_prime = model(x.float())
           # print(y_prime.max(), y_prime.min())
            #y_prime = F.sigmoid(y_prime)
            #print(y_prime.max(), y_prime.min())
            loss = F.binary_cross_entropy(y_prime, y.float())
            #print("LOSS", loss)
            optim.zero_grad()
            loss.backward()
            optim.step()
            epoch_losses.append(loss.item())
        train_losses.append(np.mean(epoch_losses))
        val_losses.append(evaluate_model(model, test_loader))
        print("EPOCH {}: Train Loss: {}, Val Loss: {}".format(epoch, train_losses[-1], val_losses[-1]))    
    return train_losses, val_losses

In [591]:
def evaluate_model(model, test_loader):
    with torch.no_grad():
        losses = []
        for x, y in test_loader:
            y_prime = model(x.float())
            loss = F.binary_cross_entropy(y_prime, y.float())
            losses.append(loss.item())
    return np.mean(losses)
        

In [611]:
t_loss, v_loss = train_model(mlp_model, mlp_optim, train_loader, test_loader, n_epochs=20)

EPOCH 0: Train Loss: 0.6556823331376781, Val Loss: 0.6609923919422205
EPOCH 1: Train Loss: 0.6553857890541291, Val Loss: 0.6609695539520117
EPOCH 2: Train Loss: 0.6553088675069687, Val Loss: 0.660965267836192
EPOCH 3: Train Loss: 0.6552602750870883, Val Loss: 0.6609624247801932
EPOCH 4: Train Loss: 0.6552179626491673, Val Loss: 0.6609678425286946
EPOCH 5: Train Loss: 0.6551785037645599, Val Loss: 0.6609657438177812
EPOCH 6: Train Loss: 0.6551415873000689, Val Loss: 0.6609774297504334
EPOCH 7: Train Loss: 0.6551075318585271, Val Loss: 0.6609695779079455
EPOCH 8: Train Loss: 0.6550749505267424, Val Loss: 0.6609743576871151
EPOCH 9: Train Loss: 0.6550427179812165, Val Loss: 0.6609815447524404
EPOCH 10: Train Loss: 0.6550120003997822, Val Loss: 0.6609867468975378
EPOCH 11: Train Loss: 0.6549833552611758, Val Loss: 0.6609925630560332
EPOCH 12: Train Loss: 0.6549554868122501, Val Loss: 0.6609938575320267
EPOCH 13: Train Loss: 0.6549290718934725, Val Loss: 0.6610112227321241
EPOCH 14: Train L

In [612]:
mlp_model(torch.tensor([0., 1, 0, 0., 0, 0., 0., 0., 0., 0, 0, 0, 0, 0, 0]))

tensor([0.4027], grad_fn=<SigmoidBackward>)

In [613]:
def get_accuracy(inp, labels, model, threshold=0.5):
    with torch.no_grad():
        inp, labels = torch.from_numpy(inp), torch.from_numpy(labels)
        guess = model(inp.float())
        preds = (guess > threshold).numpy().astype(int)
        return preds

In [614]:
inp = np.array([list(x) for x in np.array(test_data)[:, 0]])
labels = np.array([list(x) for x in np.array(test_data)[:, 1]])

In [615]:
a = get_accuracy(inp, labels, mlp_model, threshold=0.5)

In [605]:
sum(a == labels)[0] / len(labels)

0.5967869073983709

In [616]:
sum(a == labels)[0] / len(labels)

0.5956608235426598

In [608]:
sum(a) / len(a)

array([0.62126046])

In [497]:
sum(a)

array([809])