In [1]:
################################################################################
# PHASE 4: BASELINE MODELING
################################################################################

###############################
#        IMPORTS & SETUP
###############################
import pandas as pd
import numpy as np
from pathlib import Path

# Preprocessing & Splits
from sklearn.model_selection import train_test_split

# Models for Classification & Regression
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import (
    accuracy_score, classification_report, confusion_matrix,
    mean_squared_error, r2_score
)

###############################
#   PATHS / DATA LOAD
###############################
PROJECT_ROOT = Path.cwd()
if not (PROJECT_ROOT / "data").exists() and (PROJECT_ROOT.parent / "data").exists():
    PROJECT_ROOT = PROJECT_ROOT.parent
DATA_DIR = PROJECT_ROOT / "data"

print("=== SUB-PHASE 4.1: DATA LOAD & TARGET PREP ===")

feature_file = DATA_DIR / "cleaned_data_phase3.csv"
original_file = DATA_DIR / "cleaned_data_phase3_unencoded_DISPOSIT.csv"

# Load original (targets + unencoded columns)
original_df = pd.read_csv(original_file, low_memory=False)
print(f"Original target dataset loaded: {original_df.shape} shape")

# Load encoded features if available; otherwise build them here as a fallback
if feature_file.exists():
    encoded_features_df = pd.read_csv(feature_file, low_memory=False)
    print(f"Features dataset loaded: {encoded_features_df.shape} shape")
else:
    print(f"WARNING: missing {feature_file}. Building one-hot encoded features as fallback...")

    # Keep a feature-only frame (drop targets)
    feature_df = original_df.copy()
    for c in ["DISPOSIT", "SENTTOT", "SENTTOT_RAW"]:
        if c in feature_df.columns:
            feature_df.drop(columns=[c], inplace=True)

    # Treat object/category columns as categorical
    cat_cols = [c for c in feature_df.columns if feature_df[c].dtype == "object" or str(feature_df[c].dtype) == "category"]
    encoded_features_df = pd.get_dummies(feature_df, columns=cat_cols, drop_first=True)
    print(f"Fallback encoded features shape: {encoded_features_df.shape}")

# Merge to form final_df
# If `SENTTOT_RAW` exists (added in Phase 3), keep it for interpretable units.
merge_cols = ["DISPOSIT", "SENTTOT"]
if "SENTTOT_RAW" in original_df.columns:
    merge_cols.append("SENTTOT_RAW")

final_df = encoded_features_df.join(original_df[merge_cols])
print(f"Final merged dataset: {final_df.shape} shape")

print("\nQuick checks on merged data:")
print(final_df.head(3))

=== SUB-PHASE 4.1: DATA LOAD & TARGET PREP ===
Original target dataset loaded: (76314, 19) shape
Features dataset loaded: (76314, 433) shape
Final merged dataset: (76314, 436) shape

Quick checks on merged data:
        AGE  CRIMHIST    SENTYR  NUMDEPEN  NEWRACE_Asian or Pacific Islander  \
0 -0.218985  0.400021 -1.803579  0.263778                              False   
1 -0.036088  0.400021 -1.803579  0.263778                              False   
2  1.244187  0.400021 -1.803579 -0.330798                              False   

   NEWRACE_Black  NEWRACE_Hispanic  NEWRACE_Other  NEWRACE_Unknown  \
0          False              True          False            False   
1          False              True          False            False   
2          False              True          False            False   

   NEWRACE_White  ...  SENTMON_July  SENTMON_June  SENTMON_March  SENTMON_May  \
0          False  ...         False         False          False        False   
1          False  ...   

In [2]:
###############################
#   SUB-PHASE 4.2: ENCODE TARGETS 
#         & DATA SPLITS
###############################
print("\n=== SUB-PHASE 4.2: ENCODE TARGETS & DATA SPLITS ===")

# --- 4.2.1: DISPOSIT as Classification Target ---
print("\n--- Classification Target: DISPOSIT ---")

def binary_disposit(disposit: str) -> int:
    """Map multi-class disposition into a binary proxy:
    1 = went to trial, 0 = did not go to trial.
    """
    if disposit in ["Jury trial", "Trial by judge or bench trial", "Guilty plea and trial (>1count)"]:
        return 1
    if disposit in ["Guilty plea", "Nolo contendere"]:
        return 0
    return 0

final_df["DISPOSIT_BINARY"] = final_df["DISPOSIT"].apply(binary_disposit)
print("\nBinary DISPOSIT distribution:")
print(final_df["DISPOSIT_BINARY"].value_counts())

# Define X/y
X_classif = final_df.drop(columns=["DISPOSIT", "SENTTOT", "DISPOSIT_BINARY"])
y_classif = final_df["DISPOSIT_BINARY"]

print("\nFeature matrix shape:", X_classif.shape)
print("Binary target vector shape:", y_classif.shape)

# Train/test split with stratification (severe class imbalance)
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(
    X_classif,
    y_classif,
    test_size=0.2,
    random_state=42,
    stratify=y_classif,
)

print("\nTrain set shape:", X_train_c.shape, "y_train:", y_train_c.shape)
print("Test set shape:", X_test_c.shape, "y_test:", y_test_c.shape)

print("\nBinary DISPOSIT Distribution in Train set:")
print(y_train_c.value_counts(normalize=True))

print("\nBinary DISPOSIT Distribution in Test set:")
print(y_test_c.value_counts(normalize=True))


=== SUB-PHASE 4.2: ENCODE TARGETS & DATA SPLITS ===

--- Classification Target: DISPOSIT ---

Binary DISPOSIT distribution:
DISPOSIT_BINARY
0    74486
1     1828
Name: count, dtype: int64

Feature matrix shape: (76314, 434)
Binary target vector shape: (76314,)

Train set shape: (61051, 434) y_train: (61051,)
Test set shape: (15263, 434) y_test: (15263,)

Binary DISPOSIT Distribution in Train set:
DISPOSIT_BINARY
0    0.976053
1    0.023947
Name: proportion, dtype: float64

Binary DISPOSIT Distribution in Test set:
DISPOSIT_BINARY
0    0.97602
1    0.02398
Name: proportion, dtype: float64


In [3]:
# --- 4.2.2: Sentence Length as Regression Target ---
# Prefer raw units if available.
if "SENTTOT_RAW" in final_df.columns:
    target_col = "SENTTOT_RAW"
    print("\n--- Regression Target: SENTTOT_RAW (interpretable units) ---")
else:
    target_col = "SENTTOT"
    print("\n--- Regression Target: SENTTOT (standardized) ---")

final_df[target_col] = pd.to_numeric(final_df[target_col], errors="coerce")

# Prepare features/target
X_regress = final_df.drop(columns=["DISPOSIT", "DISPOSIT_BINARY", "SENTTOT"] + (["SENTTOT_RAW"] if "SENTTOT_RAW" in final_df.columns else []))
y_regress = final_df[target_col]

X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(
    X_regress,
    y_regress,
    test_size=0.2,
    random_state=42,
)

print("\nRegression splits:")
print("X_train_r:", X_train_r.shape, "| y_train_r:", y_train_r.shape)
print("X_test_r:", X_test_r.shape, "| y_test_r:", y_test_r.shape)

print(f"\n{target_col} Stats in Train:")
print(y_train_r.describe())
print(f"\n{target_col} Stats in Test:")
print(y_test_r.describe())


--- Regression Target: SENTTOT_RAW (interpretable units) ---

Regression splits:
X_train_r: (61051, 433) | y_train_r: (61051,)
X_test_r: (15263, 433) | y_test_r: (15263,)

SENTTOT_RAW Stats in Train:
count    61051.000000
mean       208.120914
std        543.784218
min          0.030000
25%          6.000000
50%         24.000000
75%         78.000000
max       2000.000000
Name: SENTTOT_RAW, dtype: float64

SENTTOT_RAW Stats in Test:
count    15263.000000
mean       206.918344
std        540.800285
min          0.030000
25%          6.000000
50%         24.000000
75%         80.000000
max       2000.000000
Name: SENTTOT_RAW, dtype: float64


In [4]:
###############################
#  SUB-PHASE 4.3: BASELINE 
#      CLASSIFICATION
###############################
print("\n=== SUB-PHASE 4.3: BASELINE CLASSIFICATION (DISPOSIT) ===")

# 1. Baseline Model with Class Weights & Multinomial
model_clf = LogisticRegression(
    max_iter=1000,
    random_state=42,
    class_weight='balanced',  # Addresses class imbalance
    solver='lbfgs'
)
model_clf.fit(X_train_c, y_train_c)

# 2. Predictions & Evaluation
y_pred_c_train = model_clf.predict(X_train_c)
y_pred_c_test  = model_clf.predict(X_test_c)

# 3. Metrics
from sklearn.metrics import accuracy_score, classification_report

train_acc_c = accuracy_score(y_train_c, y_pred_c_train)
test_acc_c  = accuracy_score(y_test_c, y_pred_c_test)

print(f"Train Accuracy (DISPOSIT): {train_acc_c:.3f}")
print(f"Test Accuracy  (DISPOSIT): {test_acc_c:.3f}")

print("\nClassification Report (Test):")
print(classification_report(
    y_test_c, y_pred_c_test,
    zero_division=0  # to avoid warnings, sets metrics to 0 instead of "undefined"
))

print("\nConfusion Matrix (Test Set):")
print(confusion_matrix(y_test_c, y_pred_c_test))


=== SUB-PHASE 4.3: BASELINE CLASSIFICATION (DISPOSIT) ===
Train Accuracy (DISPOSIT): 0.696
Test Accuracy  (DISPOSIT): 0.688

Classification Report (Test):
              precision    recall  f1-score   support

           0       0.99      0.69      0.81     14897
           1       0.06      0.78      0.11       366

    accuracy                           0.69     15263
   macro avg       0.52      0.73      0.46     15263
weighted avg       0.97      0.69      0.79     15263


Confusion Matrix (Test Set):
[[10213  4684]
 [   81   285]]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [5]:
###############################
#   SUB-PHASE 4.4: BASELINE
#      REGRESSION
###############################
print("\n=== SUB-PHASE 4.4: BASELINE REGRESSION (SENTTOT) ===")

# 1. Baseline Model: Linear Regression
model_reg = LinearRegression()
model_reg.fit(X_train_r, y_train_r)

# 2. Predictions & Evaluation
y_pred_r_train = model_reg.predict(X_train_r)
y_pred_r_test = model_reg.predict(X_test_r)

# 3. Metrics: MSE, R2
mse_train_r = mean_squared_error(y_train_r, y_pred_r_train)
mse_test_r  = mean_squared_error(y_test_r,  y_pred_r_test)

r2_train_r = r2_score(y_train_r, y_pred_r_train)
r2_test_r  = r2_score(y_test_r,  y_pred_r_test)

print(f"Train MSE (SENTTOT): {mse_train_r:.3f}, R2: {r2_train_r:.3f}")
print(f"Test  MSE (SENTTOT): {mse_test_r:.3f}, R2: {r2_test_r:.3f}")



=== SUB-PHASE 4.4: BASELINE REGRESSION (SENTTOT) ===
Train MSE (SENTTOT): 229689.860, R2: 0.223
Test  MSE (SENTTOT): 233805.901, R2: 0.201


In [6]:
################################################################################
# Fairness & Group-Level Metrics for Baseline Classification
################################################################################
import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix

# 1. Identify or Reconstruct Your Protected Attribute
# Example: If 'NEWRACE_Black' is a one-hot column in X_test_c:
protected_col = 'NEWRACE_Black'  # Adjust for your actual column

# 2. Group-Level Accuracy
def group_accuracy(X, y_true, y_pred, group_col, group_val):
    idx = X[group_col] == group_val
    # careful to align indices:
    # if X_test_c was reindexed after train_test_split, ensure alignment
    return accuracy_score(y_true[idx], y_pred[idx])

acc_protected_1 = group_accuracy(X_test_c, y_test_c, y_pred_c_test, protected_col, 1)
acc_protected_0 = group_accuracy(X_test_c, y_test_c, y_pred_c_test, protected_col, 0)

print(f"\nGroup-Level Accuracy for {protected_col} = 1: {acc_protected_1:.3f}")
print(f"Group-Level Accuracy for {protected_col} = 0: {acc_protected_0:.3f}")

# 3. Demographic Parity (Positive Prediction Rates)
# This is relevant if you define "positive" = predicted class is 1
# For a multi-class scenario, you might pick one class (e.g., "Guilty" if encoded as 1).
def demographic_parity(X, y_pred, group_col, positive_label=1):
    groups = [0,1]  # if your one-hot col is 0 or 1
    for g in groups:
        idx = X[group_col] == g
        # Probability model assigned label=positive_label in that group
        pos_rate = np.mean((y_pred[idx] == positive_label))
        print(f"Group: {g}, Positive Rate for label={positive_label}: {pos_rate:.3f}")

print("\nDemographic Parity for 'DISPOSIT' = 1 (assuming label 1 is positive):")
demographic_parity(X_test_c, y_pred_c_test, protected_col, positive_label=1)

# 4. TPR/FPR for each group (Binary scenario)
# If your DISPOSIT is truly multi-class, you'd have to define a "positive" class vs. "all else."
from sklearn.metrics import confusion_matrix

def compute_tpr_fpr(y_true, y_pred, positive_label=1):
    # For simplicity, treat everything that isn't 'positive_label' as 0
    # This is only valid if your classification is effectively binary
    y_true_bin = (y_true == positive_label).astype(int)
    y_pred_bin = (y_pred == positive_label).astype(int)
    
    cm = confusion_matrix(y_true_bin, y_pred_bin, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()
    
    tpr = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    fpr = fp / (fp + tn) if (fp + tn) > 0 else 0.0
    return tpr, fpr

# Example usage:
tpr_1, fpr_1 = compute_tpr_fpr(
    y_test_c[X_test_c[protected_col] == 1], 
    y_pred_c_test[X_test_c[protected_col] == 1],
    positive_label=1
)
tpr_0, fpr_0 = compute_tpr_fpr(
    y_test_c[X_test_c[protected_col] == 0], 
    y_pred_c_test[X_test_c[protected_col] == 0],
    positive_label=1
)

print(f"\nProtected Group (1) TPR: {tpr_1:.3f}, FPR: {fpr_1:.3f}")
print(f"Non-Protected Group (0) TPR: {tpr_0:.3f}, FPR: {fpr_0:.3f}")



Group-Level Accuracy for NEWRACE_Black = 1: 0.341
Group-Level Accuracy for NEWRACE_Black = 0: 0.777

Demographic Parity for 'DISPOSIT' = 1 (assuming label 1 is positive):
Group: 0, Positive Rate for label=1: 0.230
Group: 1, Positive Rate for label=1: 0.699

Protected Group (1) TPR: 0.918, FPR: 0.688
Non-Protected Group (0) TPR: 0.685, FPR: 0.221
