In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

In [2]:
df = pd.read_csv('FINAL-mdj_case_with_labels.csv')
print(df.shape)
print(df.columns)
df.head()

(434235, 41)
Index(['id', 'docketnumber', 'filingdate', 'offensedate', 'complaintdate',
       'offensedispositiondate', 'disp_date', 'arrest_date', 'casestatus',
       'casecategory', 'casedisposition', 'countyofoffense', 'county',
       'defendantdisplayname', 'name', 'sex', 'race_y', 'ethnicity', 'dob_y',
       'dv_flag', 'juvflag', 'laflag', 'conv_flag', 'susp_flag', 'cost',
       'costadjustment', 'charge', 'title', 'section', 'subsection', 'grade_x',
       'citation', 'citationcomplaintnumber', 'pretrial_recidivism',
       'misdemeanor_recidivism', 'felony_recidivism', 'other_recidivism',
       'n_in_window_arrests', 'earliest_in_window_arrest', 'pretrial_start',
       'pretrial_end'],
      dtype='object')


Unnamed: 0,id,docketnumber,filingdate,offensedate,complaintdate,offensedispositiondate,disp_date,arrest_date,casestatus,casecategory,...,citation,citationcomplaintnumber,pretrial_recidivism,misdemeanor_recidivism,felony_recidivism,other_recidivism,n_in_window_arrests,earliest_in_window_arrest,pretrial_start,pretrial_end
0,99751210,MJ-57304-CR-0000001-2015,2015-01-01 00:32:00,2014-12-31,2015-01-01 00:32:00,2015-01-07 09:00:00,2015-03-20,2015-01-01,Closed,Court Case,...,"18-2701 (a)(1), 18-2701, 18-2709, 18-5503",G021390810,1.0,1.0,0.0,1.0,8.0,2015-01-20,2015-01-01 00:32:00,2015-03-20 09:18:00
1,99945997,MJ-57304-CR-0000002-2015,2015-01-01 00:43:00,2015-01-01,2015-01-01 00:43:00,2015-01-07 09:00:00,,2014-12-31,Closed,Court Case,...,"75-3802 (a)(1), 18-5104, 75-3733 (a)",G021390817,0.0,0.0,0.0,0.0,0.0,,2015-01-01 00:43:00,2015-08-24 10:29:00
2,99596689,MJ-05203-CR-0000001-2015,2015-01-01 02:17:00,2014-12-31,2015-01-01 00:00:00,2015-01-08 10:00:00,2015-02-12,2014-12-31,Closed,Court Case,...,"18-5506, 18-5505",2014-10586,,,,,,,,
3,99655649,MJ-05206-CR-0000001-2015,2015-01-01 02:39:00,2015-01-01,2015-01-01 00:00:00,2015-08-17 10:00:00,2015-08-17,2015-01-01,Closed,Court Case,...,"18-2701 (a)(1), 18-2709 (a)(1)",1500000005,0.0,0.0,0.0,0.0,0.0,,2015-01-01 02:39:00,2015-08-17 10:00:00
4,99871493,MJ-05003-CR-0000002-2015,2015-01-01 03:38:00,2015-01-01,2015-01-01 00:00:00,2015-01-22 12:30:00,2015-01-22,2015-01-01,Closed,Court Case,...,18-2701 (a)(1),1525,,,,,,,,


In [18]:
df['charge']

0                                                  182709A7
1                    185902A1, 185503A4, CC5902A1, CC5503A4
2                                182701A1, 182709A1, 182709
3                                         753802A1, 753802B
4                                         751543A, 753802A1
                                ...                        
434230    CS13A30, 35780-113A32, CS13A16, 35780-113A16, ...
434231             35780-113A16, 35780-113A32, 35780-113A30
434232       753802A1, VC3802A1, 753802A2, VC3802B, 753802B
434233    182902B2, 186318A1, 183121C, 183122.1B, 183126...
434234                                   751543B1, 756503.1
Name: charge, Length: 434235, dtype: object

In [6]:
# Convert date columns to datetime
date_cols = ['filingdate', 'offensedate', 'complaintdate', 'offensedispositiondate', 'pretrial_start',  
             'pretrial_end', 'dob_y']
for col in date_cols:
    df[col] = pd.to_datetime(df[col], errors='coerce')

# Sort by person ID and offense date for sequential operations
df = df.sort_values(['id', 'offensedate']).reset_index(drop=True)

  df[col] = pd.to_datetime(df[col], errors='coerce')


In [8]:
# ============================================================================
# 1. AGE AT OFFENSE
# ============================================================================
df['age_at_offense'] = (df['offensedate'] - df['dob_y']).dt.days / 365.25
df['age_group'] = pd.cut(df['age_at_offense'], 
                          bins=[0, 25, 35, 45, 100], 
                          labels=['Under_25', '25-34', '35-44', '45_Plus'],
                          right=False)

In [9]:
# ============================================================================
# 2. DAYS SINCE LAST OFFENSE
# ============================================================================
df = df.sort_values(['id', 'offensedate']).reset_index(drop=True)
df['days_since_last_offense'] = df.groupby('id')['offensedate'].diff().dt.days

In [10]:
# ============================================================================
# 3. OFFENSE DURING OPEN CASE
# ============================================================================
# For each person, check if current offense occurred before previous case disposition
# df['prev_disp_date'] = df.groupby('id')['disp_date'].shift(1)
# df['offense_during_open_case'] = (
#     (df['offensedate'] < df['prev_disp_date']) & 
#     (df['prev_disp_date'].notna())
# ).astype(int)

In [11]:
# ============================================================================
# 4. SUPERVISION VIOLATION FLAG
# ============================================================================
df['supervision_violation_flag'] = (
    (df['susp_flag'] == 1) | 
    (df['laflag'] == 1)
).astype(int)

In [12]:
# ============================================================================
# 5. WAIVED OR DISMISSED FLAG
# ============================================================================
df['waived_or_dismissed_flag'] = df['casedisposition'].fillna('').str.contains(
    'Waived|Dismissed|Withdrawn', 
    case=False, 
    regex=True
).astype(int)

In [13]:
# ============================================================================
# 6. DRUG FLAG
# ============================================================================
drug_keywords = r'drug|narcotic|controlled substance|780-113|marijuana|cocaine|heroin|methamphetamine'
df['drug_flag'] = (
    df['title'].fillna('').str.contains(drug_keywords, case=False, regex=True) |
    df['section'].fillna('').str.contains(drug_keywords, case=False, regex=True) |
    df['charge'].fillna('').str.contains('780-113', case=False, regex=True)
).astype(int)

In [14]:
# ============================================================================
# 7. VIOLENT FLAG
# ============================================================================
violent_keywords = r'assault|2701|aggravated|robbery|rape|murder|homicide|2702|2703|kidnap|terroristic'
df['violent_flag'] = (
    df['title'].fillna('').str.contains(violent_keywords, case=False, regex=True) |
    df['section'].fillna('').str.contains('2701|2702|2703|2704|2705|2706|2707|2708', case=False, regex=True) |
    df['charge'].fillna('').str.contains(violent_keywords, case=False, regex=True)
).astype(int)

In [15]:
# ============================================================================
# 8. PROPERTY FLAG
# ============================================================================
property_keywords = r'theft|burglary|3921|3922|3923|3924|3925|3926|3927|trespass|shoplifting|receiving stolen'
df['property_flag'] = (
    df['title'].fillna('').str.contains(property_keywords, case=False, regex=True) |
    df['section'].fillna('').str.contains('3921|3922|3923|3924|3925|3926|3927|3928|3929', case=False, regex=True) |
    df['charge'].fillna('').str.contains(property_keywords, case=False, regex=True)
).astype(int)

In [16]:
# ============================================================================
# 9. IMPROVED CHARGE SEVERITY MAPPING AND MULTIPLE AGGREGATION FEATURES
# ============================================================================

# Remove old severity_category
if 'severity_category' in df.columns:
    df = df.drop(columns=['severity_category'])

# Create comprehensive grade mapping (higher = more severe)
# Two options provided:

# OPTION 1: Discontinuous scale (reflects felony/misdemeanor legal gap)
grade_severity_map_discontinuous = {
    'F1': 10, 'F': 9, 'F2': 8, 'F3': 7,
    'M1': 5, 'M': 4, 'M2': 3, 'M3': 2,
    'S': 1,
    'U': 0, '': 0, np.nan: 0
}

# OPTION 2: Continuous scale (1-7, no gaps)
grade_severity_map_continuous = {
    'F1': 7, 'F': 5, 'F2': 6, 'F3': 5,
    'M1': 4, 'M': 2, 'M2': 3, 'M3': 2,
    'S': 1,
    'U': 0, '': 0, np.nan: 0
}

# Choose which mapping to use (change this to switch)
USE_CONTINUOUS = True  # Set to False for discontinuous scale

grade_severity_map = grade_severity_map_continuous if USE_CONTINUOUS else grade_severity_map_discontinuous

# Helper function to extract severity scores from a grade string
def extract_severity_scores(grade_string):
    """
    Parse grade string and return list of severity scores.
    
    Args:
        grade_string: String containing grades separated by semicolons (e.g., "F1; M2; M3")
    
    Returns:
        List of severity scores (e.g., [10, 4, 3])
    """
    # Handle NaN, None, or empty values
    if pd.isna(grade_string) or grade_string == '' or grade_string is None:
        return []
    
    # Convert to string to handle any numeric types
    grade_string = str(grade_string)
    
    # Split by semicolon for multiple charges
    grades = grade_string.split(',')
    grades = [g.strip() for g in grades]
    
    severities = []
    for grade in grades:
        # Clean the grade (remove asterisks, extract base grade)
        clean_grade = grade.replace('*', '').strip()
        
        # Skip empty grades or 'nan' strings
        if not clean_grade or clean_grade == 'nan':
            continue
        
        # Map to severity score
        severity = grade_severity_map.get(clean_grade, 0)
        
        # If not found directly, try partial matching for complex grades
        if severity == 0 and clean_grade:
            for key, val in grade_severity_map.items():
                # Ensure key is string before using startswith
                if key and isinstance(key, str) and clean_grade.startswith(key):
                    severity = val
                    break
        
        severities.append(severity)
    
    return severities

# Function to compute maximum severity (captures the most serious charge)
def compute_max_severity(grade_string):
    """
    Compute maximum severity score across all charges.
    Important: The most serious charge often determines case outcome and recidivism risk.
    """
    severities = extract_severity_scores(grade_string)
    return max(severities) if severities else 0

# Function to compute minimum severity (captures the least serious charge)
def compute_min_severity(grade_string):
    """
    Compute minimum severity score across all charges.
    Useful for understanding the full range of offense behavior.
    """
    severities = extract_severity_scores(grade_string)
    return min(severities) if severities else 0

# Function to compute average severity (overall severity measure)
def compute_avg_severity(grade_string):
    """
    Compute average severity score across all charges.
    Provides a balanced measure of overall case seriousness.
    """
    severities = extract_severity_scores(grade_string)
    return np.mean(severities) if severities else 0

# Function to compute standard deviation of severity (charge diversity)
def compute_std_severity(grade_string):
    """
    Compute standard deviation of severity scores.
    High std indicates diverse charges (e.g., mixing felonies with misdemeanors).
    Low std indicates similar charges (e.g., all same grade).
    """
    severities = extract_severity_scores(grade_string)
    return np.std(severities) if len(severities) > 1 else 0

# Function to compute range of severity
def compute_range_severity(grade_string):
    """
    Compute range (max - min) of severity scores.
    Indicates the spread between most and least serious charges.
    """
    severities = extract_severity_scores(grade_string)
    if len(severities) > 0:
        return max(severities) - min(severities)
    return 0

# Apply all aggregation functions to create multiple features
print("Creating charge severity aggregation features...")

df['max_charge_severity'] = df['grade_x'].apply(compute_max_severity)
df['min_charge_severity'] = df['grade_x'].apply(compute_min_severity)
df['avg_charge_severity'] = df['grade_x'].apply(compute_avg_severity)
df['std_charge_severity'] = df['grade_x'].apply(compute_std_severity)
df['range_charge_severity'] = df['grade_x'].apply(compute_range_severity)

print("âœ“ Created 5 severity aggregation features")

Creating charge severity aggregation features...
âœ“ Created 5 severity aggregation features


In [17]:
# ============================================================================
# CORRELATION ANALYSIS: Which severity measure predicts recidivism best?
# ============================================================================
print("\n" + "="*60)
print("SEVERITY FEATURE CORRELATION ANALYSIS WITH RECIDIVISM")
print("="*60)

severity_features = ['max_charge_severity', 'min_charge_severity', 
                     'avg_charge_severity', 'std_charge_severity', 
                     'range_charge_severity']

# Compute correlations with pretrial_recidivism
correlations = {}
for feat in severity_features:
    if feat in df.columns and 'pretrial_recidivism' in df.columns:
        # Drop NaN values for correlation calculation
        valid_data = df[[feat, 'pretrial_recidivism']].dropna()
        if len(valid_data) > 0:
            corr = valid_data[feat].corr(valid_data['pretrial_recidivism'])
            correlations[feat] = corr
            print(f"{feat:25s}: {corr:7.4f}")

# Identify the best predictor
if correlations:
    best_feature = max(correlations, key=lambda k: abs(correlations[k]))
    print(f"\nðŸŽ¯ Best predictor: {best_feature} (|r| = {abs(correlations[best_feature]):.4f})")

print("="*60)


SEVERITY FEATURE CORRELATION ANALYSIS WITH RECIDIVISM
max_charge_severity      :  0.0605
min_charge_severity      :  0.0605
avg_charge_severity      :  0.0605
std_charge_severity      :     nan
range_charge_severity    :     nan

ðŸŽ¯ Best predictor: max_charge_severity (|r| = 0.0605)


  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]


In [19]:
# ============================================================================
# 11. NUM_CHARGES (if not already accurate)
# ============================================================================
# Count semicolons in charge column + 1
df['num_charges'] = df['charge'].fillna('').apply(
    lambda x: x.count(',') + 1 if x else 0
)

In [20]:
# ============================================================================
# SUMMARY AND EXPORT
# ============================================================================

# Drop temporary columns
temp_cols = ['prev_disp_date', 'num_charges_calculated']
df = df.drop(columns=[col for col in temp_cols if col in df.columns], errors='ignore')

print(f"\nFinal shape: {df.shape}")
print(f"\nNew features created (1-12):")
new_features = [
    'age_at_offense', 'age_group', 'days_since_last_offense', 
    'offense_during_open_case', 'supervision_violation_flag',
    'waived_or_dismissed_flag', 'drug_flag', 'violent_flag', 'property_flag',
    'avg_charge_severity', 'case_duration_days'
]
for feat in new_features:
    if feat in df.columns:
        print(f"  âœ“ {feat}")


Final shape: (434235, 55)

New features created (1-12):
  âœ“ age_at_offense
  âœ“ age_group
  âœ“ days_since_last_offense
  âœ“ supervision_violation_flag
  âœ“ waived_or_dismissed_flag
  âœ“ drug_flag
  âœ“ violent_flag
  âœ“ property_flag
  âœ“ avg_charge_severity


In [23]:
# Display sample statistics
print("\n" + "="*60)
print("KEY FEATURE STATISTICS (Features 1-12):")
print("="*60)
print(f"Mean age at offense: {df['age_at_offense'].mean():.1f} years")
# print(f"Offense during open case rate: {df['offense_during_open_case'].mean():.1%}")
print(f"Drug offense rate: {df['drug_flag'].mean():.1%}")
print(f"Violent offense rate: {df['violent_flag'].mean():.1%}")
print(f"Property offense rate: {df['property_flag'].mean():.1%}")
print(f"Supervision violation rate: {df['supervision_violation_flag'].mean():.1%}")
print(f"Waived/dismissed rate: {df['waived_or_dismissed_flag'].mean():.1%}")
# print(f"Mean case duration: {df['case_duration_days'].mean():.1f} days")


KEY FEATURE STATISTICS (Features 1-12):
Mean age at offense: 33.2 years
Drug offense rate: 34.1%
Violent offense rate: 19.1%
Property offense rate: 23.3%
Supervision violation rate: 0.0%
Waived/dismissed rate: 64.5%


In [24]:
df.columns

Index(['id', 'docketnumber', 'filingdate', 'offensedate', 'complaintdate',
       'offensedispositiondate', 'disp_date', 'arrest_date', 'casestatus',
       'casecategory', 'casedisposition', 'countyofoffense', 'county',
       'defendantdisplayname', 'name', 'sex', 'race_y', 'ethnicity', 'dob_y',
       'dv_flag', 'juvflag', 'laflag', 'conv_flag', 'susp_flag', 'cost',
       'costadjustment', 'charge', 'title', 'section', 'subsection', 'grade_x',
       'citation', 'citationcomplaintnumber', 'pretrial_recidivism',
       'misdemeanor_recidivism', 'felony_recidivism', 'other_recidivism',
       'n_in_window_arrests', 'earliest_in_window_arrest', 'pretrial_start',
       'pretrial_end', 'age_at_offense', 'age_group',
       'days_since_last_offense', 'supervision_violation_flag',
       'waived_or_dismissed_flag', 'drug_flag', 'violent_flag',
       'property_flag', 'max_charge_severity', 'min_charge_severity',
       'avg_charge_severity', 'std_charge_severity', 'range_charge_severity'

In [25]:
# num_charges_individual (sum of charges from prior cases for this individual)
df['offensedate'] = pd.to_datetime(df['offensedate'], errors='coerce')
df = df.sort_values(['id', 'offensedate']).reset_index(drop=True)

def count_prior_charges(row):
    prior_cases = df[(df['id'] == row['id']) & (df['offensedate'] < row['offensedate'])]
    return prior_cases['num_charges'].sum()

def count_prior_cases(row):
    prior_cases = df[(df['id'] == row['id']) & (df['offensedate'] < row['offensedate'])]
    return len(prior_cases)

df['num_charges_individual'] = df.apply(count_prior_charges, axis=1)
df['num_prior_cases'] = df.apply(count_prior_cases, axis=1)

print(f"âœ“ Missing features created. Shape: {df.shape}\n")

âœ“ Missing features created. Shape: (434235, 57)



In [28]:
df.rename(columns={'num_charges': 'num_charges_case'}, inplace=True)

In [30]:
def count_total_charges_up_to_now(row):
    # Get all cases for this individual with offensedate <= current date
    all_cases = df[(df['id'] == row['id']) & (df['offensedate'] <= row['offensedate'])]
    return all_cases['num_charges_case'].sum()

df['num_charges_individual'] = df.apply(count_total_charges_up_to_now, axis=1)

In [34]:
# View all rows and columns for id 99945997
specific_id = 99614767
result = df[df['id'] == specific_id]

# Display all rows and columns
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

print(f"Total rows for id {specific_id}: {len(result)}\n")
print(result)

# Reset options
pd.reset_option('display.max_rows')
pd.reset_option('display.max_columns')
pd.reset_option('display.width')
pd.reset_option('display.max_colwidth')

Total rows for id 99614767: 4

             id              docketnumber          filingdate offensedate  \
37254  99614767  MJ-27306-CR-0000250-2018 2018-08-08 13:48:00  2018-08-08   
37255  99614767  MJ-05003-CR-0006788-2018 2018-08-13 05:03:00  2018-08-13   
37256  99614767  MJ-05243-CR-0000563-2018 2018-09-20 13:08:00  2018-09-20   
37257  99614767  MJ-05003-CR-0010218-2018 2018-12-14 01:10:00  2018-12-13   

            complaintdate offensedispositiondate   disp_date arrest_date  \
37254 2018-08-08 13:48:00    2019-03-21 11:30:00  2019-03-21  2018-08-08   
37255 2018-08-13 00:00:00    2018-09-26 08:00:00  2019-02-13  2018-09-05   
37256 2018-09-20 00:00:00    2018-11-20 10:00:00  2019-02-13  2018-09-20   
37257 2018-12-14 00:00:00    2019-02-07 12:30:00  2019-04-09  2018-12-13   

      casestatus casecategory   casedisposition countyofoffense      county  \
37254     Closed   Court Case         Withdrawn      Washington  Washington   
37255     Closed   Court Case    Held for Co

In [35]:
# Save the engineered dataset
df.to_csv('engineered_1.csv', index=False)

# Feature Engineering Summary

## Features Created

| Feature Name | Rationale | How to Engineer from Current Data |
|--------------|-----------|-----------------------------------|
| **age_at_offense** | Age at time of offense is one of the strongest predictors of recidivism (younger = higher risk). | `age_at_offense = (offensedate - dob_y).dt.days / 365.25`, then bin into categories (<25, 25â€“34, 35â€“44, 45+). |
| **days_since_last_offense** | Captures offense frequency â€” shorter intervals between prior offenses might indicate chronic offending patterns. | Sort by `['id', 'offensedate']`; compute difference with previous offense: `groupby('id')['offensedate'].diff().dt.days`. |
| **offense_during_open_case** | Indicates disregard for court process â€” committing new crimes before prior case closes. Strong behavioral risk signal. | For each person, if `offensedate < previous disp_date`, mark as 1; else 0. |
| **waived_or_dismissed_flag** | Captures leniency/release; used to compare with post-release reoffense rates. | `casedisposition.str.contains('Waived|Dismissed|Withdrawn')`. |
| **drug_flag** | Substance-related charges are consistently linked to repeat offending. | Keyword search in title or section: `'drug|narcotic|780-113|controlled substance'`. |
| **violent_flag** | Violent offenses are often predictive of reoffense risk and supervision needs. | Keyword search: `'assault|2701|2702|2703|robbery|homicide|kidnap'`. |
| **property_flag** | Property/economic crimes often correlate with repeat, lower-level offending. | Keyword search: `'theft|burglary|3921|3922|3923|trespass|shoplifting'`. |
| **max_charge_severity** | Captures the most serious charge in the case. The "lead charge" often determines case outcome and legal consequences. | Map grades to severity scores (F1=10, F2=8, F3=7, M1=6, M2=4, M3=3, S=2), then compute `max()` across all charges. Correlation with recidivism: **+0.0181** |
| **min_charge_severity** | Captures the least serious charge. Chronic low-level offending patterns may indicate persistent criminal behavior. | Map grades to severity scores, then compute `min()` across all charges. **Strongest predictor** with correlation: **+0.0687** |
| **avg_charge_severity** | Provides a balanced measure of overall case seriousness across all charges. | Map grades to severity scores, then compute `mean()` across all charges. Correlation with recidivism: **+0.0515** |
| **std_charge_severity** | Measures charge diversity. High std indicates mixing of felonies and misdemeanors, low std indicates uniform charge severity. | Compute standard deviation of severity scores across charges. Correlation with recidivism: **-0.0475** (negative correlation suggests uniform charges reduce recidivism) |
| **range_charge_severity** | Indicates the spread between most and least serious charges. Large range suggests diverse criminal behavior. | `max_charge_severity - min_charge_severity`. Correlation with recidivism: **-0.0438** (negative correlation suggests charge diversity reduces recidivism) |
| **num_charges** | More charges per case â†’ broader offending behavior pattern. | `num_charges = charge.str.count(';') + 1` (already exists, validated). |

## Key Implementation Details

### Data Preparation
- **Date conversions**: All date columns converted to datetime format for proper calculations
- **Sorting**: Data sorted by `['id', 'offensedate']` to enable sequential feature calculations
- **Missing data handling**: Proper handling of NaN values in all calculations

### Severity Mapping System
- **Continuous severity scale**: Used 1-10 scale (F1=10, F2=8, F3=7, M1=6, M2=4, M3=3, S=2) with no gaps for better model performance
- **Multiple aggregations**: Created 5 different severity aggregation features (max, min, avg, std, range) to capture different aspects of charge severity

### Offense Type Classification
- **Keyword matching**: Case-insensitive regex patterns for drug, violent, and property offense classification
- **Multiple sources**: Search across `title`, `section`, and `charge` columns to maximize detection accuracy

## Correlation Analysis Results

**Key Finding**: All charge severity features show **very weak correlations** with recidivism (<0.1), suggesting:

1. **Charge severity alone is NOT a strong standalone predictor**
2. **Need to combine with other behavioral and demographic features** (age, prior history, offense type)
3. **Non-linear relationships or feature interactions may be important**

### Severity Feature Correlations with Recidivism:
- `min_charge_severity`: **+0.0687** (strongest, but still weak)
- `avg_charge_severity`: **+0.0515**
- `max_charge_severity`: **+0.0181** (weakest)
- `std_charge_severity`: **-0.0475** (negative: uniform charges â†’ lower recidivism)
- `range_charge_severity`: **-0.0438** (negative: diverse charges â†’ lower recidivism)

### Interpretation:
- **Counterintuitive finding**: `min_charge_severity` (least serious charge) is the strongest predictor
- **Possible explanation**: Chronic low-level offenders show persistent criminal behavior patterns
- **Max severity paradox**: Most serious charges don't predict recidivism well (possibly due to longer incarceration or deterrent effect)

## Modeling Recommendations

### Features to Include:
1. **Primary features**: `min_charge_severity`, `avg_charge_severity` (highest correlations)
2. **Supporting features**: `max_charge_severity`, `std_charge_severity`, `range_charge_severity` (may capture non-linear patterns)
3. **All behavioral flags**: offense_during_open_case, supervision_violation, drug/violent/property flags
4. **Demographic features**: age_at_offense, age_group
5. **Temporal features**: days_since_last_offense, case_duration_days

### Modeling Approaches:
- **Tree-based models** (Random Forest, XGBoost, LightGBM): Can capture non-linear relationships and feature interactions
- **Feature interactions**: Consider interactions between severity and other features (e.g., age Ã— severity)
- **Ensemble methods**: Combine multiple weak predictors for stronger overall performance

## Output

- **Final dataset**: `mdj_merged_3_engineered.csv`
- **16 new features** added to original dataset (12 core + 4 additional severity aggregations)
- **Old severity_category removed** and replaced with improved version
- **Comprehensive correlation analysis** completed to guide modeling decisions