In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import mean_absolute_error, accuracy_score, confusion_matrix
from sklearn.preprocessing import LabelBinarizer
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

In [2]:
#Read in Data
rec = pd.read_csv('data/recidivism_clean.csv')

In [3]:
#Drop these for predictions prior to prison release
rec = rec.drop(['Supervision_Risk_Score_First', 'Supervision_Level_First', 'Violations_ElectronicMonitoring', 'Violations_FailToReport', 
                'Violations_FailToReport', 'Violations_MoveWithoutPermission', 'Delinquency_Reports', 'Program_Attendances', 
                'Program_UnexcusedAbsences','Residence_Changes', 'DrugTests_THC_Positive', 'DrugTests_Cocaine_Positive', 
                'DrugTests_Meth_Positive', 'DrugTests_Other_Positive', 'Employment_Exempt', 'employed'], axis=1)

In [4]:
# Convert strings/objects to categories
rec[rec.select_dtypes(['object']).columns] = rec.select_dtypes(['object']).astype('category')

#rec_train was the data provided to train the model and rec_val represents the evaluation data
rec_train = rec[rec['Training_Sample'] == 1]
rec_val = rec[rec['Training_Sample'] == 0]

#Get Number of Observations for training and  validation set
n_train = rec_train.shape[0]
n_val = rec_val.shape[0]


In [5]:
#split the data used for cross validation as rec_train and data for final evaluation as rec_test
#Also drop the columns I do not want, and capturing the variable of interest in the y variable

X = rec_train.drop(['Recidivism_Arrest_Year1', 'Recidivism_Arrest_Year2', 'Recidivism_Arrest_Year3', 'Recidivism_Within_3years', 'Training_Sample', 'ID', 'Race'], axis=1)
y = rec_train['Recidivism_Within_3years']

X_test = rec_val.drop(['Recidivism_Arrest_Year1', 'Recidivism_Arrest_Year2', 'Recidivism_Arrest_Year3', 'Recidivism_Within_3years', 'Training_Sample', 'ID', 'Race'], axis=1)
y_test = rec_val['Recidivism_Within_3years']

In [6]:
#Define Bayesian Optimization Search Space

space = {
    'num_leaves': 5, #hp.quniform('num_leaves', 2, 100, 1),
    'bagging_fraction': hp.uniform('bagging_fraction', 0.75, 1),
    'feature_fraction': 0.9710810081829547, #hp.uniform('feature_fraction', 0.25, 1),
    'learning_rate': 0.1554172850388652, #hp.loguniform('learning_rate', -3, -1),    
    'reg_alpha': hp.uniform('reg_alpha', 0, 0.25),
    'reg_lambda': hp.uniform('reg_lambda', 0, 0.25),
    'max_depth': 8, # hp.quniform('max_depth', 1, 10, 1),
}

In [7]:
#Create function to run cross validation while searching for optimal hyper-parameters
def objective(params):
    
    params = {
        'num_leaves': int(params['num_leaves']),
        'max_depth': int(params['max_depth']),
        'bagging_fraction': float(params['bagging_fraction']),
        'feature_fraction': float(params['feature_fraction']),
        'learning_rate': float(params['learning_rate']),
        'reg_alpha': float(params['reg_alpha']),
        'reg_lambda': float(params['reg_lambda']), 
        'n_jobs' : -1,
        'boosting_type': 'gbdt'
    }

    model = lgb.LGBMClassifier(**params, random_state=1994)
    kf = KFold(n_splits=6, random_state=1994, shuffle=True)
    score = np.mean(cross_val_score(model, X, y, cv=kf, scoring='neg_brier_score'))
    
    return {'loss': -score, 'status': STATUS_OK}


In [8]:
#Use hyperopt fmin function to find optimal parameters
trials = Trials()
best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=200, trials=trials)

100%|██████████| 200/200 [02:04<00:00,  1.61trial/s, best loss: 0.20243617660114033]


In [9]:
print(best)

#print('max_depth', best['max_depth'] + depth_start)

{'bagging_fraction': 0.7990114523775765, 'reg_alpha': 0.06766451909430012, 'reg_lambda': 0.19624464361828178}


In [13]:
#Bring in best hyperparameters from tuning
params = {
    'num_leaves': int(best['num_leaves']),
    'bagging_fraction': best['bagging_fraction'],
    'feature_fraction': best['feature_fraction'],
    'learning_rate': best['learning_rate'],
    'reg_alpha': best['reg_alpha'],
    'reg_lambda': best['reg_lambda'],
    'n_jobs' : -1,
    'max_depth': best['max_depth']
    }

#Fit model
model = lgb.LGBMClassifier(**params, random_state=1994)
model.fit(X,y)



In [14]:
#Predict recidivism, print accuracy
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: ", accuracy)


Accuracy:  0.7051357733175915


In [15]:
#Predict probability of recidivism
y_pred_prob = model.predict_proba(X_test)

#Extract probability of recidivism as predict_proba returns an array that has both probability of not recidivating and probability of recidivating
y_pred_prob = y_pred_prob[:,1]
y_prob = [int(b) for b in y_test]

#Calculate brier score and print
brier_score = sum((y_pred_prob - y_prob)**2) / n_val
print("Brier Score:", brier_score)

Brier Score: 0.1931743389205207


In [16]:
# Calculate the confusion matrix
cm = confusion_matrix(y_test, y_pred)
# Print the confusion matrix
print("Confusion Matrix:")
print(cm)

Confusion Matrix:
[[4493  180]
 [1818  285]]


In [14]:
#Add predicted outcome to the validation set
rec_val['y_pred'] = y_pred

#Split dataframes by race
rec_val_black = rec_val[rec_val['Race'] == 'BLACK']
rec_val_white = rec_val[rec_val['Race'] == 'WHITE']

#Calculate False Positive Rates by race
n_white_val = rec_val_white.shape[0]
n_black_val = rec_val_black.shape[0]

fpr_black = len(rec_val_black[(rec_val_black['Recidivism_Arrest_Year1'] == False) & (rec_val_black['y_pred'] == True)]) / n_black_val
fpr_white = len(rec_val_white[(rec_val_white['Recidivism_Arrest_Year1'] == False) & (rec_val_white['y_pred'] == True)]) / n_white_val

fairness_penalty = 1 - abs(fpr_black - fpr_white)

print("FPR Black:", fpr_black)
print("FPR White:", fpr_white)

FPR Black: 0.4487581384133108
FPR White: 0.4419931532902244


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rec_val['y_pred'] = y_pred


In [15]:
score = (1 - brier_score) * fairness_penalty
score

0.7901415144213332