In [1]:
import pandas as pd
import numpy as np
import optuna
from catboost import CatBoostClassifier

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, roc_curve, auc

from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

#import data
df = pd.read_csv('data/Recidivism_Data_cleaned.csv')
df_original = df.copy()

In [2]:
# We only want to predict recidivism within 3 years
#We are also looking at just Males
df.drop(['Recidivism_Arrest_Year1', 'Recidivism_Arrest_Year2', 'Recidivism_Arrest_Year3', 'Gender'], axis=1, inplace=True)

# Remove variables thaat are determined only after prison release
#df.drop(['Supervision_Risk_Score_First', 'Supervision_Level_First',
#         'Violations_ElectronicMonitoring', 'Violations_FailToReport','Violations_FailToReport', 'Violations_MoveWithoutPermission',
#         'Delinquency_Reports', 'Program_Attendances', 'Program_UnexcusedAbsences','Residence_Changes', 
#         'DrugTests_THC_Positive', 'DrugTests_Cocaine_Positive','DrugTests_Meth_Positive', 'DrugTests_Other_Positive', 
#         'Percent_Days_Employed', 'Jobs_Per_Year', 'Employment_Exempt'], axis=1, inplace=True)

#Split Train and Test Data
train_df = df[df['Training_Sample'] == 1]
test_df = df[df['Training_Sample'] == 0]

X_train = train_df.drop(['Recidivism_Within_3years', 'Training_Sample'], axis=1)
y_train = train_df['Recidivism_Within_3years']

X_test = test_df.drop(['Recidivism_Within_3years', 'Training_Sample'], axis=1)
y_test = test_df['Recidivism_Within_3years']

In [None]:

cat_features = ['Race', 'Residence_PUMA', 'Gang_Affiliated',
       'Education_Level', 'Prison_Offense',
       'Prior_Arrest_Episodes_Felony', 'Prior_Arrest_Episodes_Misd',
       'Prior_Arrest_Episodes_Violent', 'Prior_Arrest_Episodes_Property',
       'Prior_Arrest_Episodes_Drug',
       'Prior_Arrest_Episodes_PPViolationCharges',
       'Prior_Arrest_Episodes_DVCharges', 'Prior_Arrest_Episodes_GunCharges',
       'Prior_Conviction_Episodes_Felony', 'Prior_Conviction_Episodes_Misd',
       'Prior_Conviction_Episodes_Viol', 'Prior_Conviction_Episodes_Prop',
       'Prior_Conviction_Episodes_Drug',
       'Prior_Conviction_Episodes_PPViolationCharges',
       'Prior_Conviction_Episodes_DomesticViolenceCharges',
       'Prior_Conviction_Episodes_GunCharges', 'Prior_Revocations_Parole',
       'Prior_Revocations_Probation', 'Condition_MH_SA', 'Condition_Cog_Ed',
       'Condition_Other', 'Violations_Instruction', 'Required_DrugTests']


In [14]:

cat_features = ['Race', 'Residence_PUMA', 'Gang_Affiliated',
       'Education_Level', 'Prison_Offense',
       'Prior_Arrest_Episodes_Felony', 'Prior_Arrest_Episodes_Misd',
       'Prior_Arrest_Episodes_Violent', 'Prior_Arrest_Episodes_Property',
       'Prior_Arrest_Episodes_Drug',
       'Prior_Arrest_Episodes_PPViolationCharges',
       'Prior_Arrest_Episodes_DVCharges', 'Prior_Arrest_Episodes_GunCharges',
       'Prior_Conviction_Episodes_Felony', 'Prior_Conviction_Episodes_Misd',
       'Prior_Conviction_Episodes_Viol', 'Prior_Conviction_Episodes_Prop',
       'Prior_Conviction_Episodes_Drug',
       'Prior_Conviction_Episodes_PPViolationCharges',
       'Prior_Conviction_Episodes_DomesticViolenceCharges',
       'Prior_Conviction_Episodes_GunCharges', 'Prior_Revocations_Parole',
       'Prior_Revocations_Probation', 'Condition_MH_SA', 'Condition_Cog_Ed',
       'Condition_Other', 'Violations_Instruction', 'Required_DrugTests', 
       'DrugTests_THC_Positive', 'DrugTests_Cocaine_Positive','DrugTests_Meth_Positive', 'DrugTests_Other_Positive',
       'Employment_Exempt', 'Violations_ElectronicMonitoring', 'Violations_FailToReport','Violations_FailToReport', 
       'Violations_MoveWithoutPermission', 'Supervision_Level_First']


In [15]:
X_train.dtypes

Race                                                  object
Age_at_Release                                       float64
Residence_PUMA                                         int64
Gang_Affiliated                                         bool
Supervision_Risk_Score_First                         float64
Supervision_Level_First                               object
Education_Level                                       object
Dependents                                             int64
Prison_Offense                                        object
Prison_Years                                           int64
Prior_Arrest_Episodes_Felony                           int64
Prior_Arrest_Episodes_Misd                             int64
Prior_Arrest_Episodes_Violent                          int64
Prior_Arrest_Episodes_Property                         int64
Prior_Arrest_Episodes_Drug                             int64
Prior_Arrest_Episodes_PPViolationCharges               int64
Prior_Arrest_Episodes_DV

In [16]:
def objective(trial):
    param = {
        'iterations': 5000,  #trial.suggest_int('iterations', 100, 10000),
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.1, log=True),
        'depth': trial.suggest_int('depth', 4, 10),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-3, 10.0),
        # Add more parameters here if needed
    }

    model = CatBoostClassifier(**param, loss_function='Logloss', verbose=False)
    model.fit(X_train, y_train, cat_features=cat_features)
    
    y_prob = model.predict_proba(X_test)[:, 1]
    roc_auc = roc_auc_score(y_test, y_prob)

    return roc_auc


In [17]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=10)  # You can change the number of trials

best_params = study.best_params
print('Best parameters:', best_params)


[I 2023-11-22 14:37:08,030] A new study created in memory with name: no-name-d7be21e0-1e12-4e1c-bf1d-343a1ea3deb4
[I 2023-11-22 14:39:55,998] Trial 0 finished with value: 0.7911286959367791 and parameters: {'learning_rate': 0.07654215605536062, 'depth': 10, 'l2_leaf_reg': 6.181635203891206}. Best is trial 0 with value: 0.7911286959367791.
[I 2023-11-22 14:50:57,550] Trial 1 finished with value: 0.7918618564719255 and parameters: {'learning_rate': 0.05680854808546975, 'depth': 10, 'l2_leaf_reg': 2.6254094489733815}. Best is trial 1 with value: 0.7918618564719255.
[I 2023-11-22 14:53:58,665] Trial 2 finished with value: 0.800732104741744 and parameters: {'learning_rate': 0.0046979384590038675, 'depth': 10, 'l2_leaf_reg': 9.111343245468456}. Best is trial 2 with value: 0.800732104741744.
[I 2023-11-22 14:54:44,214] Trial 3 finished with value: 0.8011644521400052 and parameters: {'learning_rate': 0.046300532196226735, 'depth': 4, 'l2_leaf_reg': 5.365114424519505}. Best is trial 3 with valu

KeyboardInterrupt: 

In [18]:
model = CatBoostClassifier(
    iterations=5000, 
    learning_rate=0.046300532196226735, 
    depth=4,
    l2_leaf_reg=5.365114424519505,
    loss_function='Logloss',
    verbose=True
)

model.fit(X_train, y_train, cat_features=cat_features)

0:	learn: 0.6836290	total: 7.39ms	remaining: 36.9s
1:	learn: 0.6744830	total: 15ms	remaining: 37.5s
2:	learn: 0.6664074	total: 22.9ms	remaining: 38.2s
3:	learn: 0.6593577	total: 29.9ms	remaining: 37.3s
4:	learn: 0.6527688	total: 37.9ms	remaining: 37.9s
5:	learn: 0.6464744	total: 44.9ms	remaining: 37.4s
6:	learn: 0.6409785	total: 51.8ms	remaining: 36.9s
7:	learn: 0.6359884	total: 59.4ms	remaining: 37.1s
8:	learn: 0.6312039	total: 67ms	remaining: 37.1s
9:	learn: 0.6264885	total: 73.9ms	remaining: 36.9s
10:	learn: 0.6225813	total: 81.4ms	remaining: 36.9s
11:	learn: 0.6192303	total: 89.3ms	remaining: 37.1s
12:	learn: 0.6158701	total: 98ms	remaining: 37.6s
13:	learn: 0.6125130	total: 107ms	remaining: 38s
14:	learn: 0.6093341	total: 114ms	remaining: 37.8s
15:	learn: 0.6069825	total: 121ms	remaining: 37.8s
16:	learn: 0.6043052	total: 131ms	remaining: 38.3s
17:	learn: 0.6017603	total: 138ms	remaining: 38.1s
18:	learn: 0.5991525	total: 145ms	remaining: 38.1s
19:	learn: 0.5969567	total: 154ms	re

<catboost.core.CatBoostClassifier at 0x15dc22c20>

In [19]:
# Make predictions

y_prob = model.predict_proba(X_test)[:, 1]  # Probability estimates
y_pred = y_prob > 0.5 # Threshold predictions

# Evaluate the model
roc_auc = roc_auc_score(y_test, y_prob)
accuracy = accuracy_score(y_test, y_pred)

print(f'ROC AUC Score: {roc_auc:.4f}')
print(f'Accuracy: {accuracy:.4f}')


ROC AUC Score: 0.8012
Accuracy: 0.7363
