In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import (classification_report, confusion_matrix, 
                            roc_auc_score, roc_curve, 
                            f1_score)
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Read the feature table from disk, then report its dimensions and the
# per-class row counts of the `pretrial_recidivism` target column.
df = pd.read_csv('FINAL_features.csv')
print(
    f"Dataset shape: {df.shape}",
    f"\nTarget variable - pretrial_recidivism:",
    df['pretrial_recidivism'].value_counts(),
    sep="\n",
)

Dataset shape: (434235, 58)

Target variable - pretrial_recidivism:
pretrial_recidivism
0.0    333144
1.0    101091
Name: count, dtype: int64


In [2]:
# Keep only the rows whose target value is present.
df = df.dropna(subset=['pretrial_recidivism'])
print(f"Dataset shape after removing missing targets: {df.shape}")

# Columns excluded from the feature matrix. The grouping comments reflect the
# notebook author's own categories; the list order is kept as-is because
# `drop_cols` is reused by a later cell.
drop_cols = [
    # Identifiers and date columns
    'id',
    'docketnumber',
    'defendantdisplayname',
    'name',
    'filingdate',
    'offensedate',
    'complaintdate',
    'offensedispositiondate',
    'disp_date',
    'arrest_date',
    'pretrial_start',
    'pretrial_end',
    'dob_y',
    'earliest_in_window_arrest',
    # Other recidivism outcomes (treated as target leakage)
    'misdemeanor_recidivism',
    'felony_recidivism',
    'other_recidivism',
    'offense_during_same_year_flag',
    # Post-outcome features
    'cost',
    'costadjustment',
    # Direct outcomes / aggregates of future behavior
    'n_in_window_arrests',
    'num_prior_cases',
    'casedisposition',
    # Demographic features
    'sex',
    'race_y',
    'ethnicity',
]

# Features: everything except the dropped columns and the target itself.
# Target: the recidivism flag cast to integer.
X = df.drop(columns=[*drop_cols, 'pretrial_recidivism'])
y = df['pretrial_recidivism'].astype(int)

# Partition the remaining columns by dtype: object columns are handled as
# categorical, int64/float64 columns as numerical.
categorical_cols = list(X.select_dtypes(include='object').columns)
numerical_cols = list(X.select_dtypes(include=['int64', 'float64']).columns)

print(
    f"\nCategorical features: {categorical_cols}",
    f"Numerical features: {numerical_cols}",
    sep="\n",
)

# Handle missing values
# Numerical columns are filled with their column median, categorical columns
# with the literal label 'Unknown'.
#
# The filled column is assigned back to X rather than calling
# `X[col].fillna(..., inplace=True)`: that form is a chained in-place update on
# a column selection, which pandas 2.x flags with a FutureWarning and which,
# with Copy-on-Write enabled, modifies a temporary and leaves X unfilled.
#
# NOTE(review): the medians are computed on the full X, before the train/test
# split performed later in this cell, so test rows contribute to the imputation
# values. Confirm whether that is acceptable for this analysis.
for col in numerical_cols:
    if X[col].isna().any():
        X[col] = X[col].fillna(X[col].median())

for col in categorical_cols:
    if X[col].isna().any():
        X[col] = X[col].fillna('Unknown')

# Integer-encode every categorical column with its own LabelEncoder.
# Values are cast to str before fitting, and the fitted encoders are kept in
# `label_encoders`, keyed by column name.
label_encoders = {col: LabelEncoder() for col in categorical_cols}
for col, le in label_encoders.items():
    X[col] = le.fit_transform(X[col].astype(str))

# Summarise the encoded feature matrix and the class counts of the target.
print(
    f"\nFinal feature matrix shape: {X.shape}",
    f"Target distribution:\n{y.value_counts()}",
    sep="\n",
)

# Hold out 20% of the rows as a test set, stratified on the target and with a
# fixed random_state.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

print(f"\nTrain set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")

# Ratio of negative to positive training labels, stored as scale_pos_weight.
n_negative = (y_train == 0).sum()
n_positive = (y_train == 1).sum()
scale_pos_weight = n_negative / n_positive
print(f"Class imbalance ratio: {scale_pos_weight:.2f}")



Dataset shape after removing missing targets: (434235, 58)

Categorical features: ['casestatus', 'casecategory', 'countyofoffense', 'county', 'charge', 'title', 'section', 'subsection', 'grade_x', 'citation', 'citationcomplaintnumber', 'age_group']
Numerical features: ['dv_flag', 'juvflag', 'conv_flag', 'age_at_offense', 'days_since_last_offense', 'drug_flag', 'violent_flag', 'property_flag', 'max_charge_severity', 'min_charge_severity', 'avg_charge_severity', 'std_charge_severity', 'range_charge_severity', 'num_charges_case', 'num_charges_individual', 'offense_year', 'prev_recid_flag', 'multi_charge_flag', 'offense_type_intensity']

Final feature matrix shape: (434235, 31)
Target distribution:
pretrial_recidivism
0    333144
1    101091
Name: count, dtype: int64

Train set size: 347388
Test set size: 86847
Class imbalance ratio: 3.30


In [3]:
# Build a copy of df without the excluded columns (the target column is kept)
# and display the names of the columns that remain.
df_drop = df.drop(labels=drop_cols, axis='columns')
df_drop.columns

Index(['casestatus', 'casecategory', 'countyofoffense', 'county', 'dv_flag',
       'juvflag', 'conv_flag', 'charge', 'title', 'section', 'subsection',
       'grade_x', 'citation', 'citationcomplaintnumber', 'pretrial_recidivism',
       'age_at_offense', 'age_group', 'days_since_last_offense', 'drug_flag',
       'violent_flag', 'property_flag', 'max_charge_severity',
       'min_charge_severity', 'avg_charge_severity', 'std_charge_severity',
       'range_charge_severity', 'num_charges_case', 'num_charges_individual',
       'offense_year', 'prev_recid_flag', 'multi_charge_flag',
       'offense_type_intensity'],
      dtype='object')

In [30]:
# Write df_drop to CSV without the row index. df_drop is df minus drop_cols
# (target column included); the imputation and label encoding earlier in the
# notebook were applied to X, not to this frame.
df_drop.to_csv(path_or_buf='modeling_features.csv', index=False)