In [1]:
import pandas as pd
import numpy as np
import optuna
from catboost import CatBoostClassifier

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, roc_curve, auc
from sklearn.metrics import brier_score_loss
from sklearn.calibration import calibration_curve


from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

#import data
df = pd.read_csv('data/Recidivism_Data_cleaned.csv')
df_original = df.copy()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# We only want to predict recidivism within 3 years
#We are also looking at just Males
df.drop(['Recidivism_Arrest_Year1', 'Recidivism_Arrest_Year2', 'Recidivism_Arrest_Year3', 'Gender'], axis=1, inplace=True)

# Remove variables that are determined only after prison release
df.drop(['Supervision_Risk_Score_First', 'Supervision_Level_First',
         'Violations_ElectronicMonitoring', 'Violations_FailToReport','Violations_FailToReport', 'Violations_MoveWithoutPermission',
         'Delinquency_Reports', 'Program_Attendances', 'Program_UnexcusedAbsences','Residence_Changes', 
         'DrugTests_THC_Positive', 'DrugTests_Cocaine_Positive','DrugTests_Meth_Positive', 'DrugTests_Other_Positive', 
         'Percent_Days_Employed', 'Jobs_Per_Year', 'Employment_Exempt'], axis=1, inplace=True)

#Split Train and Test Data
train_df = df[df['Training_Sample'] == 1]
test_df = df[df['Training_Sample'] == 0]

X_train = train_df.drop(['Recidivism_Within_3years', 'Training_Sample'], axis=1)
y_train = train_df['Recidivism_Within_3years']

X_test = test_df.drop(['Recidivism_Within_3years', 'Training_Sample'], axis=1)
y_test = test_df['Recidivism_Within_3years']

In [3]:
#Cat Features if post parole features are removed
cat_features = ['Race', 'Residence_PUMA', 'Gang_Affiliated',
       'Education_Level', 'Prison_Offense',
       'Prior_Arrest_Episodes_Felony', 'Prior_Arrest_Episodes_Misd',
       'Prior_Arrest_Episodes_Violent', 'Prior_Arrest_Episodes_Property',
       'Prior_Arrest_Episodes_Drug',
       'Prior_Arrest_Episodes_PPViolationCharges',
       'Prior_Arrest_Episodes_DVCharges', 'Prior_Arrest_Episodes_GunCharges',
       'Prior_Conviction_Episodes_Felony', 'Prior_Conviction_Episodes_Misd',
       'Prior_Conviction_Episodes_Viol', 'Prior_Conviction_Episodes_Prop',
       'Prior_Conviction_Episodes_Drug',
       'Prior_Conviction_Episodes_PPViolationCharges',
       'Prior_Conviction_Episodes_DomesticViolenceCharges',
       'Prior_Conviction_Episodes_GunCharges', 'Prior_Revocations_Parole',
       'Prior_Revocations_Probation', 'Condition_MH_SA', 'Condition_Cog_Ed',
       'Condition_Other', 'Violations_Instruction', 'Required_DrugTests']


In [None]:
#Cat Features if post parole features are included
cat_features = ['Race', 'Residence_PUMA', 'Gang_Affiliated',
       'Education_Level', 'Prison_Offense',
       'Prior_Arrest_Episodes_Felony', 'Prior_Arrest_Episodes_Misd',
       'Prior_Arrest_Episodes_Violent', 'Prior_Arrest_Episodes_Property',
       'Prior_Arrest_Episodes_Drug',
       'Prior_Arrest_Episodes_PPViolationCharges',
       'Prior_Arrest_Episodes_DVCharges', 'Prior_Arrest_Episodes_GunCharges',
       'Prior_Conviction_Episodes_Felony', 'Prior_Conviction_Episodes_Misd',
       'Prior_Conviction_Episodes_Viol', 'Prior_Conviction_Episodes_Prop',
       'Prior_Conviction_Episodes_Drug',
       'Prior_Conviction_Episodes_PPViolationCharges',
       'Prior_Conviction_Episodes_DomesticViolenceCharges',
       'Prior_Conviction_Episodes_GunCharges', 'Prior_Revocations_Parole',
       'Prior_Revocations_Probation', 'Condition_MH_SA', 'Condition_Cog_Ed',
       'Condition_Other', 'Violations_Instruction', 'Required_DrugTests', 
       'DrugTests_THC_Positive', 'DrugTests_Cocaine_Positive','DrugTests_Meth_Positive', 'DrugTests_Other_Positive',
       'Employment_Exempt', 'Violations_ElectronicMonitoring', 'Violations_FailToReport','Violations_FailToReport', 
       'Violations_MoveWithoutPermission', 'Supervision_Level_First']


In [4]:
#Hyperparameter Tuning Function
def objective(trial):
    param = {
        'iterations': 2500,  #trial.suggest_int('iterations', 100, 10000),
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.1, log=True),
        'depth': trial.suggest_int('depth', 4, 10),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-3, 10.0),
        # Add more parameters here if needed
    }

    model = CatBoostClassifier(**param, loss_function='Logloss', verbose=False)
    model.fit(X_train, y_train, cat_features=cat_features)
    
    y_prob = model.predict_proba(X_test)[:, 1]
    roc_auc = roc_auc_score(y_test, y_prob)

    return roc_auc


In [5]:
# Tune hyperparameters
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=20)  # You can change the number of trials

best_params = study.best_params
print('Best parameters:', best_params)


[I 2023-11-29 22:05:04,866] A new study created in memory with name: no-name-546272ce-9055-4067-96bd-5328651d031c
[I 2023-11-29 22:06:17,271] Trial 0 finished with value: 0.7302239337806378 and parameters: {'learning_rate': 0.00468488764896517, 'depth': 4, 'l2_leaf_reg': 1.1515623336987726}. Best is trial 0 with value: 0.7302239337806378.
[I 2023-11-29 22:07:55,414] Trial 1 finished with value: 0.7313742206924949 and parameters: {'learning_rate': 0.024259542253144604, 'depth': 5, 'l2_leaf_reg': 0.29553855859279143}. Best is trial 1 with value: 0.7313742206924949.
[I 2023-11-29 22:10:16,247] Trial 2 finished with value: 0.7272090277374523 and parameters: {'learning_rate': 0.03358062177447614, 'depth': 6, 'l2_leaf_reg': 2.987557025210064}. Best is trial 1 with value: 0.7313742206924949.
[I 2023-11-29 22:15:03,215] Trial 3 finished with value: 0.7146691055494261 and parameters: {'learning_rate': 0.04847103154409602, 'depth': 10, 'l2_leaf_reg': 3.0960773236353196}. Best is trial 1 with val

In [None]:
#Train Model with Best Parameters with all features

model = CatBoostClassifier(
    iterations=2500, 
    learning_rate=0.046300532196226735, 
    depth=4,
    l2_leaf_reg=5.365114424519505,
    loss_function='Logloss',
    verbose=True
)

model.fit(X_train, y_train, cat_features=cat_features)

In [None]:
#Train Model with Best Parameters removing features that are determined after prison release

model = CatBoostClassifier(
    iterations=2500, 
    learning_rate=0.024259542253144604, 
    depth=5,
    l2_leaf_reg=0.29553855859279143,
    loss_function='Logloss',
    verbose=True
)

model.fit(X_train, y_train, cat_features=cat_features)

In [None]:
# Make predictions

y_prob = model.predict_proba(X_test)[:, 1]  # Probability estimates
y_pred = y_prob > 0.5 # Threshold predictions

# Evaluate the model
roc_auc = roc_auc_score(y_test, y_prob)
accuracy = accuracy_score(y_test, y_pred)
brier_score = brier_score_loss(y_test, y_prob)

print(f'ROC AUC Score: {roc_auc:.4f}')
print(f'Accuracy: {accuracy:.4f}')
print(f'Brier Score: {brier_score:.4f}')


In [None]:
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
roc_auc = roc_auc_score(y_test, y_prob)

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()

In [None]:
# Generate the confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)

# Plot the confusion matrix as a heatmap
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False,
            xticklabels=['Predicted Negative', 'Predicted Positive'],
            yticklabels=['Actual Negative', 'Actual Positive'])
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

In [None]:

# Generate the calibration curve
prob_true, prob_pred = calibration_curve(y_test, y_prob, n_bins=10, strategy='uniform')

# Create a calibration plot
plt.figure(figsize=(8, 6))
plt.plot(prob_pred, prob_true, marker='o', linestyle='-', label='Calibration Plot')
plt.plot([0, 1], [0, 1], linestyle='--', color='gray', label='Perfectly Calibrated')
plt.xlabel('Mean Predicted Probability')
plt.ylabel('Fraction of Positives')
plt.title('Calibration Plot')
plt.legend()
plt.show()