In [21]:
################################################################################
# PHASE 4: BASELINE MODELING
################################################################################

###############################
#        IMPORTS & SETUP
###############################
import pandas as pd
import numpy as np

# Preprocessing & Splits
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Models for Classification & Regression
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import (
    accuracy_score, classification_report, confusion_matrix,
    mean_squared_error, r2_score
)

###############################
#   SUB-PHASE 4.1: DATA LOAD
#          & TARGET PREP
###############################
print("=== SUB-PHASE 4.1: DATA LOAD & TARGET PREP ===")

# 1. Load Feature-Engineered Dataset (PHASE 3 Output)
feature_file = "../data/cleaned_data_phase3.csv"
encoded_features_df = pd.read_csv(feature_file)
print(f"Features dataset loaded: {encoded_features_df.shape} shape")

# 2. Load Original Columns (DISPOSIT, SENTTOT)
original_file = "../data/cleaned_data_phase3_unencoded_DISPOSIT.csv"
original_df = pd.read_csv(original_file)
print(f"Original target dataset loaded: {original_df.shape} shape")

# 3. Merge to Form final_df
#    Ensures we have both feature-engineered columns + original DISPOSIT & SENTTOT
final_df = encoded_features_df.join(original_df[['DISPOSIT', 'SENTTOT']])
print(f"Final merged dataset: {final_df.shape} shape")

# 4. Quick Checks
print("\nQuick checks on merged data:")
print(final_df.head(3))

=== SUB-PHASE 4.1: DATA LOAD & TARGET PREP ===
Features dataset loaded: (76314, 433) shape
Original target dataset loaded: (76314, 18) shape
Final merged dataset: (76314, 435) shape

Quick checks on merged data:
        AGE  CRIMHIST    SENTYR  NUMDEPEN  NEWRACE_Asian or Pacific Islander  \
0 -0.218985  0.400021 -1.803579  0.263778                              False   
1 -0.036088  0.400021 -1.803579  0.263778                              False   
2  1.244187  0.400021 -1.803579 -0.330798                              False   

   NEWRACE_Black  NEWRACE_Hispanic  NEWRACE_Other  NEWRACE_Unknown  \
0          False              True          False            False   
1          False              True          False            False   
2          False              True          False            False   

   NEWRACE_White  ...  SENTMON_January  SENTMON_July  SENTMON_June  \
0          False  ...            False         False         False   
1          False  ...            False        

In [22]:
###############################
#   SUB-PHASE 4.2: ENCODE TARGETS 
#         & DATA SPLITS
###############################
print("\n=== SUB-PHASE 4.2: ENCODE TARGETS & DATA SPLITS ===")

# --- 4.2.1: DISPOSIT as Classification Target ---
print("\n--- Classification Target: DISPOSIT ---")

# Step 1: Create Binary DISPOSIT
def binary_disposit(disposit):
    if disposit in ['Jury trial', 'Trial by judge or bench trial', 'Guilty plea and trial (>1count)']:
        return 1  # Went to Trial
    elif disposit in ['Guilty plea', 'Nolo contendere']:
        return 0  # Did Not Go to Trial
    else:
        return 0  # Default to 0 if unexpected category

final_df['DISPOSIT_BINARY'] = final_df['DISPOSIT'].apply(binary_disposit)
print("\nBinary DISPOSIT distribution:")
print(final_df['DISPOSIT_BINARY'].value_counts())

# Step 2: Define Feature Matrix X and Binary Target y
X_classif = final_df.drop(columns=['DISPOSIT', 'SENTTOT', 'DISPOSIT_BINARY'])
y_classif = final_df['DISPOSIT_BINARY']

print("\nFeature matrix shape:", X.shape)
print("Binary target vector shape:", y.shape)

# Step 3: Perform Train/Test Split with Stratification
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(
    X_classif, y_classif,
    test_size=0.2,
    random_state=42,
    stratify=y_classif  # Maintains the distribution of classes
)

print("\nTrain set shape:", X_train_c.shape, "y_train:", y_train_c.shape)
print("Test set shape:", X_test_c.shape, "y_test:", y_test_c.shape)

# Step 4: Verify the Split
print("\nBinary DISPOSIT Distribution in Train set:")
print(y_train_c.value_counts(normalize=True))

print("\nBinary DISPOSIT Distribution in Test set:")
print(y_test_c.value_counts(normalize=True))


=== SUB-PHASE 4.2: ENCODE TARGETS & DATA SPLITS ===

--- Classification Target: DISPOSIT ---

Binary DISPOSIT distribution:
DISPOSIT_BINARY
0    74486
1     1828
Name: count, dtype: int64

Feature matrix shape: (76314, 433)
Binary target vector shape: (76314, 2)

Train set shape: (61051, 433) y_train: (61051,)
Test set shape: (15263, 433) y_test: (15263,)

Binary DISPOSIT Distribution in Train set:
DISPOSIT_BINARY
0    0.976053
1    0.023947
Name: proportion, dtype: float64

Binary DISPOSIT Distribution in Test set:
DISPOSIT_BINARY
0    0.97602
1    0.02398
Name: proportion, dtype: float64


In [24]:
# --- 4.2.2: SENTTOT as Regression Target ---
print("\n--- Regression Target: SENTTOT ---")

# a) No label encoding for numeric SENTTOT. We keep it as float/int
#    If SENTTOT was read as object, ensure it's converted:
final_df['SENTTOT'] = pd.to_numeric(final_df['SENTTOT'], errors='coerce')

# b) Prepare X_regress & y_regress
#    We exclude the columns used as classification target (DISPOSIT, DISPOSIT_ENCODED)
#    to avoid confusion, but you can keep them if you want them as features.
X_regress = final_df.drop(columns=['DISPOSIT', 'DISPOSIT_BINARY', 'SENTTOT'])
y_regress = final_df['SENTTOT']

# c) Train/Test Split for Regression
#    No stratify needed for numeric target
X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(
    X_regress,
    y_regress,
    test_size=0.2,
    random_state=42
)

print("\nRegression splits:")
print("X_train_r:", X_train_r.shape, "| y_train_r:", y_train_r.shape)
print("X_test_r:", X_test_r.shape,   "| y_test_r:", y_test_r.shape)

# d) Basic Stats for the Regression Target
print("\nSENTTOT Stats in Train:")
print(y_train_r.describe())
print("\nSENTTOT Stats in Test:")
print(y_test_r.describe())


--- Regression Target: SENTTOT ---

Regression splits:
X_train_r: (61051, 433) | y_train_r: (61051,)
X_test_r: (15263, 433) | y_test_r: (15263,)

SENTTOT Stats in Train:
count    61051.000000
mean         0.000443
std          1.001109
min         -0.382653
25%         -0.371663
50%         -0.338525
75%         -0.239110
max          3.299299
Name: SENTTOT, dtype: float64

SENTTOT Stats in Test:
count    15263.000000
mean        -0.001771
std          0.995616
min         -0.382653
25%         -0.371663
50%         -0.338525
75%         -0.235428
max          3.299299
Name: SENTTOT, dtype: float64


In [25]:
###############################
#  SUB-PHASE 4.3: BASELINE 
#      CLASSIFICATION
###############################
print("\n=== SUB-PHASE 4.3: BASELINE CLASSIFICATION (DISPOSIT) ===")

# 1. Baseline Model with Class Weights & Multinomial
model_clf = LogisticRegression(
    max_iter=1000,
    random_state=42,
    class_weight='balanced',  # Addresses class imbalance
    solver='lbfgs'
)
model_clf.fit(X_train_c, y_train_c)

# 2. Predictions & Evaluation
y_pred_c_train = model_clf.predict(X_train_c)
y_pred_c_test  = model_clf.predict(X_test_c)

# 3. Metrics
from sklearn.metrics import accuracy_score, classification_report

train_acc_c = accuracy_score(y_train_c, y_pred_c_train)
test_acc_c  = accuracy_score(y_test_c, y_pred_c_test)

print(f"Train Accuracy (DISPOSIT): {train_acc_c:.3f}")
print(f"Test Accuracy  (DISPOSIT): {test_acc_c:.3f}")

print("\nClassification Report (Test):")
print(classification_report(
    y_test_c, y_pred_c_test,
    zero_division=0  # to avoid warnings, sets metrics to 0 instead of "undefined"
))

print("\nConfusion Matrix (Test Set):")
print(confusion_matrix(y_test_c, y_pred_c_test))


=== SUB-PHASE 4.3: BASELINE CLASSIFICATION (DISPOSIT) ===
Train Accuracy (DISPOSIT): 0.694
Test Accuracy  (DISPOSIT): 0.686

Classification Report (Test):
              precision    recall  f1-score   support

           0       0.99      0.68      0.81     14897
           1       0.05      0.74      0.10       366

    accuracy                           0.69     15263
   macro avg       0.52      0.71      0.46     15263
weighted avg       0.97      0.69      0.79     15263


Confusion Matrix (Test Set):
[[10196  4701]
 [   95   271]]


In [26]:
###############################
#   SUB-PHASE 4.4: BASELINE
#      REGRESSION
###############################
print("\n=== SUB-PHASE 4.4: BASELINE REGRESSION (SENTTOT) ===")

# 1. Baseline Model: Linear Regression
model_reg = LinearRegression()
model_reg.fit(X_train_r, y_train_r)

# 2. Predictions & Evaluation
y_pred_r_train = model_reg.predict(X_train_r)
y_pred_r_test = model_reg.predict(X_test_r)

# 3. Metrics: MSE, R2
mse_train_r = mean_squared_error(y_train_r, y_pred_r_train)
mse_test_r  = mean_squared_error(y_test_r,  y_pred_r_test)

r2_train_r = r2_score(y_train_r, y_pred_r_train)
r2_test_r  = r2_score(y_test_r,  y_pred_r_test)

print(f"Train MSE (SENTTOT): {mse_train_r:.3f}, R2: {r2_train_r:.3f}")
print(f"Test  MSE (SENTTOT): {mse_test_r:.3f}, R2: {r2_test_r:.3f}")



=== SUB-PHASE 4.4: BASELINE REGRESSION (SENTTOT) ===
Train MSE (SENTTOT): 0.778, R2: 0.223
Test  MSE (SENTTOT): 0.792, R2: 0.201


In [29]:
################################################################################
# Fairness & Group-Level Metrics for Baseline Classification
################################################################################
import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix

# 1. Identify or Reconstruct Your Protected Attribute
# Example: If 'NEWRACE_Black' is a one-hot column in X_test_c:
protected_col = 'NEWRACE_Black'  # Adjust for your actual column

# 2. Group-Level Accuracy
def group_accuracy(X, y_true, y_pred, group_col, group_val):
    idx = X[group_col] == group_val
    # careful to align indices:
    # if X_test_c was reindexed after train_test_split, ensure alignment
    return accuracy_score(y_true[idx], y_pred[idx])

acc_protected_1 = group_accuracy(X_test_c, y_test_c, y_pred_c_test, protected_col, 1)
acc_protected_0 = group_accuracy(X_test_c, y_test_c, y_pred_c_test, protected_col, 0)

print(f"\nGroup-Level Accuracy for {protected_col} = 1: {acc_protected_1:.3f}")
print(f"Group-Level Accuracy for {protected_col} = 0: {acc_protected_0:.3f}")

# 3. Demographic Parity (Positive Prediction Rates)
# This is relevant if you define "positive" = predicted class is 1
# For a multi-class scenario, you might pick one class (e.g., "Guilty" if encoded as 1).
def demographic_parity(X, y_pred, group_col, positive_label=1):
    groups = [0,1]  # if your one-hot col is 0 or 1
    for g in groups:
        idx = X[group_col] == g
        # Probability model assigned label=positive_label in that group
        pos_rate = np.mean((y_pred[idx] == positive_label))
        print(f"Group: {g}, Positive Rate for label={positive_label}: {pos_rate:.3f}")

print("\nDemographic Parity for 'DISPOSIT' = 1 (assuming label 1 is positive):")
demographic_parity(X_test_c, y_pred_c_test, protected_col, positive_label=1)

# 4. TPR/FPR for each group (Binary scenario)
# If your DISPOSIT is truly multi-class, you'd have to define a "positive" class vs. "all else."
from sklearn.metrics import confusion_matrix

def compute_tpr_fpr(y_true, y_pred, positive_label=1):
    # For simplicity, treat everything that isn't 'positive_label' as 0
    # This is only valid if your classification is effectively binary
    y_true_bin = (y_true == positive_label).astype(int)
    y_pred_bin = (y_pred == positive_label).astype(int)
    
    cm = confusion_matrix(y_true_bin, y_pred_bin, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()
    
    tpr = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    fpr = fp / (fp + tn) if (fp + tn) > 0 else 0.0
    return tpr, fpr

# Example usage:
tpr_1, fpr_1 = compute_tpr_fpr(
    y_test_c[X_test_c[protected_col] == 1], 
    y_pred_c_test[X_test_c[protected_col] == 1],
    positive_label=1
)
tpr_0, fpr_0 = compute_tpr_fpr(
    y_test_c[X_test_c[protected_col] == 0], 
    y_pred_c_test[X_test_c[protected_col] == 0],
    positive_label=1
)

print(f"\nProtected Group (1) TPR: {tpr_1:.3f}, FPR: {fpr_1:.3f}")
print(f"Non-Protected Group (0) TPR: {tpr_0:.3f}, FPR: {fpr_0:.3f}")



Group-Level Accuracy for NEWRACE_Black = 1: 0.326
Group-Level Accuracy for NEWRACE_Black = 0: 0.778

Demographic Parity for 'DISPOSIT' = 1 (assuming label 1 is positive):
Group: 0, Positive Rate for label=1: 0.226
Group: 1, Positive Rate for label=1: 0.713

Protected Group (1) TPR: 0.912, FPR: 0.703
Non-Protected Group (0) TPR: 0.626, FPR: 0.219
