In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import pearsonr, spearmanr, chi2_contingency
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read the engineered dataset from the working directory into `df`.
df = pd.read_csv('final_engineered.csv')

# Report the frame's (rows, columns) shape and the mean of the
# `pretrial_recidivism` column expressed as a percentage. Both lines go
# through a single print call; the trailing "\n" keeps the blank line after
# the summary.
print(
    f"Dataset shape: {df.shape}",
    f"Overall recidivism rate: {df['pretrial_recidivism'].mean() * 100:.2f}%\n",
    sep="\n",
)

Dataset shape: (434235, 60)
Overall recidivism rate: 23.28%



In [3]:
# Remove the `waived_or_dismissed_flag` column, then list the columns that
# remain. Because `df` is reassigned, re-running this cell used to raise
# KeyError on the already-dropped column; errors='ignore' makes the drop a
# no-op in that case so the cell is safe to execute more than once.
df = df.drop(columns=['waived_or_dismissed_flag'], errors='ignore')
df.columns

Index(['id', 'docketnumber', 'filingdate', 'offensedate', 'complaintdate',
       'offensedispositiondate', 'disp_date', 'arrest_date', 'casestatus',
       'casecategory', 'casedisposition', 'countyofoffense', 'county',
       'defendantdisplayname', 'name', 'sex', 'race_y', 'ethnicity', 'dob_y',
       'dv_flag', 'juvflag', 'conv_flag', 'cost', 'costadjustment', 'charge',
       'title', 'section', 'subsection', 'grade_x', 'citation',
       'citationcomplaintnumber', 'pretrial_recidivism',
       'misdemeanor_recidivism', 'felony_recidivism', 'other_recidivism',
       'n_in_window_arrests', 'earliest_in_window_arrest', 'pretrial_start',
       'pretrial_end', 'age_at_offense', 'age_group',
       'days_since_last_offense', 'drug_flag', 'violent_flag', 'property_flag',
       'max_charge_severity', 'min_charge_severity', 'avg_charge_severity',
       'std_charge_severity', 'range_charge_severity', 'num_charges_case',
       'num_charges_individual', 'num_prior_cases', 'offense_year'

In [4]:
# Remove the `severity_dismiss_interaction` column, then list the columns
# that remain. Because `df` is reassigned, re-running this cell used to raise
# KeyError on the already-dropped column; errors='ignore' makes the drop a
# no-op in that case so the cell is safe to execute more than once.
df = df.drop(columns=['severity_dismiss_interaction'], errors='ignore')
df.columns

Index(['id', 'docketnumber', 'filingdate', 'offensedate', 'complaintdate',
       'offensedispositiondate', 'disp_date', 'arrest_date', 'casestatus',
       'casecategory', 'casedisposition', 'countyofoffense', 'county',
       'defendantdisplayname', 'name', 'sex', 'race_y', 'ethnicity', 'dob_y',
       'dv_flag', 'juvflag', 'conv_flag', 'cost', 'costadjustment', 'charge',
       'title', 'section', 'subsection', 'grade_x', 'citation',
       'citationcomplaintnumber', 'pretrial_recidivism',
       'misdemeanor_recidivism', 'felony_recidivism', 'other_recidivism',
       'n_in_window_arrests', 'earliest_in_window_arrest', 'pretrial_start',
       'pretrial_end', 'age_at_offense', 'age_group',
       'days_since_last_offense', 'drug_flag', 'violent_flag', 'property_flag',
       'max_charge_severity', 'min_charge_severity', 'avg_charge_severity',
       'std_charge_severity', 'range_charge_severity', 'num_charges_case',
       'num_charges_individual', 'num_prior_cases', 'offense_year'

In [5]:
# Write the current contents of `df` to FINAL_features.csv in the working
# directory. index=False leaves the row index out of the file.
df.to_csv(path_or_buf='FINAL_features.csv', index=False)

In [5]:
# Preview the first five rows of the frame (the default count, made explicit).
df.head(n=5)

Unnamed: 0,id,docketnumber,filingdate,offensedate,complaintdate,offensedispositiondate,disp_date,arrest_date,casestatus,casecategory,...,std_charge_severity,range_charge_severity,num_charges_case,num_charges_individual,num_prior_cases,offense_year,offense_during_same_year_flag,prev_recid_flag,multi_charge_flag,offense_type_intensity
0,99578730,MJ-28304-CR-0000053-2017,2017-03-07 15:29:00,2017-03-06,2017-03-07 15:29:00,2018-05-02 13:30:00,2018-05-02,2017-03-07,Closed,Court Case,...,0.0,0,1,1,0,2017,0,0,0,0
1,99578731,MJ-43303-CR-0000164-2015,2015-05-26 11:34:00,2014-12-01,2015-05-26 11:34:00,2015-06-30 10:00:00,2015-06-30,2014-12-01,Closed,Court Case,...,0.0,0,4,4,0,2014,0,0,1,0
2,99578733,MJ-14202-CR-0000239-2017,2017-06-09 22:00:00,2017-06-09,2017-06-09 22:00:00,2017-08-15 09:30:00,2017-08-15,2017-06-09,Closed,Court Case,...,1.0,2,3,3,0,2017,0,0,1,1
3,99578734,MJ-09305-CR-0000251-2017,2017-06-22 11:10:00,2017-05-27,2017-06-22 11:10:00,2017-09-22 08:15:00,2018-10-23,2017-05-27,Closed,Court Case,...,0.5,1,2,2,0,2017,0,0,1,0
4,99578734,MJ-09305-CR-0000084-2018,2018-03-05 13:36:00,2018-02-26,2018-03-05 13:36:00,2018-05-04 08:56:00,2018-08-28,2018-02-26,Closed,Court Case,...,0.5,1,2,4,1,2018,0,1,1,0
