In [None]:
###############################
# 1. Import Required Libraries
###############################
import os
import numpy as np
import pandas as pd
import torch as torch

# Transformers, NLP
from transformers import (BertTokenizer, BertForSequenceClassification, 
                          Trainer, TrainingArguments, BertModel)

# Sklearn & Model-Related
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import (accuracy_score, precision_score, 
                             recall_score, f1_score, classification_report)
from xgboost import XGBClassifier

In [None]:
###############################
# 2. Load & Prepare Data (Guarded)
###############################
# Advanced scaffold: both input files are optional. A missing file leaves the
# corresponding frame as None and prints a SKIP notice instead of raising.

from pathlib import Path


def _read_csv_if_present(csv_path, shape_label):
    """Load `csv_path` into a DataFrame, or return None when the file is absent.

    Prints `shape_label` followed by the frame's shape on success, and a
    "SKIP: missing <path>" notice otherwise.
    """
    if not csv_path.exists():
        print(f"SKIP: missing {csv_path}")
        return None
    frame = pd.read_csv(csv_path, low_memory=False)
    print(shape_label, frame.shape)
    return frame


# Use ./data when it exists; otherwise fall back to ../data if that exists.
launch_dir = Path.cwd()
has_local_data = (launch_dir / "data").exists()
has_parent_data = (launch_dir.parent / "data").exists()
PROJECT_ROOT = launch_dir.parent if (has_parent_data and not has_local_data) else launch_dir
DATA_DIR = PROJECT_ROOT / "data"

structured_path = DATA_DIR / "final_cleaned_data_phase4.csv"  # optional artifact
text_path = DATA_DIR / "legal_text_data.csv"  # optional artifact

print("=== Loading Structured Data (optional) ===")
structured_df = _read_csv_if_present(structured_path, "Structured Data Shape:")

print("\n=== Loading Textual Data (optional) ===")
text_df = _read_csv_if_present(text_path, "Text Data Shape:")

# Inner-join the two sources on CASE_ID, but only when both frames were
# loaded and both expose that key. Otherwise `merged_df` stays None, which is
# the signal for the downstream advanced sections to skip themselves.
merged_df = None
both_loaded = structured_df is not None and text_df is not None
if both_loaded:
    shares_key = all("CASE_ID" in frame.columns for frame in (structured_df, text_df))
    if shares_key:
        merged_df = structured_df.merge(text_df, on="CASE_ID", how="inner")
        print("\nMerged Data Shape:", merged_df.shape)
    else:
        print("SKIP merge: missing CASE_ID in one/both inputs")


In [None]:
###############################
# 3. Fine-Tuning BERT on Text (Guarded)
###############################
# Runs only if `merged_df` exists and contains expected columns.
# NOTE: This is an optional/expensive stage (GPU recommended).
#
# When fine-tuning runs, this cell defines `tokenizer`, `model`, `trainer`
# and writes the fine-tuned model *and* the tokenizer files to BERT_DIR.

BERT_DIR = Path("./bert_finetuned")

if merged_df is None:
    print("SKIP BERT fine-tuning: merged_df is None (missing inputs)")
else:
    required_cols = {"TEXT_COL", "BIAS_LABEL"}
    missing = required_cols - set(merged_df.columns)

    if missing:
        print("SKIP BERT fine-tuning: merged_df missing required columns:", missing)
    else:
        print("\n=== Sub-Phase 5.1: Fine-Tune BERT for Text Classification ===")

        tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
        merged_df["BIAS_LABEL"] = merged_df["BIAS_LABEL"].astype(int)

        train_text, val_text, y_train_text, y_val_text = train_test_split(
            merged_df["TEXT_COL"].astype(str).values,
            merged_df["BIAS_LABEL"].values,
            test_size=0.2,
            random_state=42,
            stratify=merged_df["BIAS_LABEL"],
        )

        # Size the classification head from the data instead of assuming a
        # binary task: class ids must lie in [0, num_labels) for the loss.
        label_values = merged_df["BIAS_LABEL"].values
        if label_values.min() < 0:
            raise ValueError("BIAS_LABEL must contain non-negative integer class ids")
        num_labels = max(2, int(label_values.max()) + 1)

        def tokenize_function(texts):
            """Tokenize a list of strings into padded/truncated PyTorch tensors."""
            return tokenizer(
                texts,
                padding=True,
                truncation=True,
                max_length=256,
                return_tensors="pt",
            )

        train_encodings = tokenize_function(train_text.tolist())
        val_encodings = tokenize_function(val_text.tolist())

        class BiasTextDataset(torch.utils.data.Dataset):
            """Pairs pre-computed tokenizer encodings with integer labels."""

            def __init__(self, encodings, labels):
                self.encodings = encodings
                self.labels = labels

            def __getitem__(self, idx):
                """Return one example as a dict of tensors, including `labels`."""
                item = {k: v[idx] for k, v in self.encodings.items()}
                item["labels"] = torch.tensor(int(self.labels[idx]))
                return item

            def __len__(self):
                return len(self.labels)

        train_dataset = BiasTextDataset(train_encodings, y_train_text)
        val_dataset = BiasTextDataset(val_encodings, y_val_text)

        model = BertForSequenceClassification.from_pretrained(
            "bert-base-uncased",
            num_labels=num_labels,
        )

        # `evaluation_strategy` was renamed to `eval_strategy` in newer
        # transformers releases and the old keyword was later removed. Pick
        # whichever name the installed version declares so the cell runs on both.
        eval_strategy_key = (
            "eval_strategy"
            if "eval_strategy" in getattr(TrainingArguments, "__dataclass_fields__", {})
            else "evaluation_strategy"
        )

        training_args = TrainingArguments(
            output_dir="./bert_finetune_output",
            num_train_epochs=1,
            per_device_train_batch_size=16,
            per_device_eval_batch_size=16,
            logging_dir="./bert_logs",
            logging_steps=50,
            save_steps=200,
            save_total_limit=1,
            report_to="none",
            **{eval_strategy_key: "epoch"},
        )

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
        )

        print("\n=== Fine-Tuning BERT ===")
        trainer.train()
        trainer.save_model(str(BERT_DIR))
        # The Trainer was not given the tokenizer, so save_model() writes only
        # the model. Persist the tokenizer too so BERT_DIR is self-contained
        # and can be reloaded in a fresh kernel.
        tokenizer.save_pretrained(str(BERT_DIR))
        print(f"Fine-tuned BERT saved at: {BERT_DIR.resolve()}")


In [None]:
###############################
# 4. Extract BERT Embeddings (Guarded)
###############################
# Produces, for the modeling cell:
#   X_combined      - [CLS] embeddings followed by the numeric structured columns
#   structured_cols - names of those structured columns, in X_combined order
#   y_combined      - target vector (BIAS_LABEL preferred, else DISPOSIT_ENCODED)

EMBED_MAX_LENGTH = 256  # token truncation length for embedding extraction
EMBED_BATCH_SIZE = 32   # texts per forward pass
TARGET_CANDIDATES = ("BIAS_LABEL", "DISPOSIT_ENCODED")  # in order of preference

# Reset the outputs first so that a skipped run never leaves stale arrays from
# an earlier execution for the modeling cell to pick up.
X_combined = None
y_combined = None

if merged_df is None:
    print("SKIP embedding extraction: merged_df is None")
elif not BERT_DIR.exists():
    print(f"SKIP embedding extraction: missing fine-tuned model dir: {BERT_DIR}")
elif "TEXT_COL" not in merged_df.columns:
    print("SKIP embedding extraction: merged_df missing TEXT_COL")
else:
    print("\n=== Extract BERT Embeddings for Full Dataset ===")

    # Load the tokenizer here instead of relying on a `tokenizer` variable left
    # behind by the fine-tuning cell: BERT_DIR may exist from an earlier session
    # while fine-tuning was skipped in this kernel. Fall back to the base
    # vocabulary when no tokenizer files were saved next to the model.
    tokenizer_source = str(BERT_DIR) if (BERT_DIR / "vocab.txt").exists() else "bert-base-uncased"
    embed_tokenizer = BertTokenizer.from_pretrained(tokenizer_source)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    base_bert = BertModel.from_pretrained(str(BERT_DIR)).to(device)
    base_bert.eval()

    def get_cls_embeddings(texts, batch_size=EMBED_BATCH_SIZE) -> np.ndarray:
        """Return an (n_texts, hidden_size) array of [CLS] embeddings.

        Texts are encoded in batches of `batch_size` with gradients disabled.
        An empty input yields an array with zero rows.
        """
        chunks = []
        for start in range(0, len(texts), batch_size):
            batch = list(texts[start:start + batch_size])
            encoding = embed_tokenizer(
                batch,
                padding=True,
                truncation=True,
                max_length=EMBED_MAX_LENGTH,
                return_tensors="pt",
            ).to(device)
            with torch.no_grad():
                outputs = base_bert(**encoding)
            # Position 0 of the last hidden state is the [CLS] token.
            chunks.append(outputs.last_hidden_state[:, 0, :].cpu().numpy())
        if not chunks:
            return np.empty((0, base_bert.config.hidden_size), dtype=np.float32)
        return np.concatenate(chunks, axis=0)

    def get_cls_embedding(text: str) -> np.ndarray:
        """Return the 1-D [CLS] embedding of a single text."""
        return get_cls_embeddings([text])[0]

    all_texts = merged_df["TEXT_COL"].astype(str).values
    embed_array = get_cls_embeddings(all_texts.tolist())
    print("BERT Embeddings Shape:", embed_array.shape)

    # Choose the target first so it can be kept out of the feature matrix;
    # otherwise the DISPOSIT_ENCODED fallback would leak the label into X.
    target_col = next((col for col in TARGET_CANDIDATES if col in merged_df.columns), None)
    non_feature_cols = {"TEXT_COL", "BIAS_LABEL", target_col}

    candidate_cols = [
        col
        for col in merged_df.columns
        if col not in non_feature_cols and "CASE_ID" not in col
    ]
    # Keep only numeric/bool columns: a single string column would turn the
    # whole matrix into an object array that the downstream model cannot use.
    numeric_cols = set(
        merged_df[candidate_cols].select_dtypes(include=["number", "bool"]).columns
    )
    structured_cols = [col for col in candidate_cols if col in numeric_cols]
    dropped_cols = [col for col in candidate_cols if col not in numeric_cols]
    if dropped_cols:
        print(f"Dropping {len(dropped_cols)} non-numeric structured column(s): {dropped_cols}")

    structured_features = merged_df[structured_cols].to_numpy(dtype=float, na_value=np.nan)

    X_combined = np.concatenate([embed_array, structured_features], axis=1)
    print("Combined Feature Shape:", X_combined.shape)

    # Prefer BIAS_LABEL if present, else try DISPOSIT_ENCODED
    if target_col is None:
        y_combined = None
        print("SKIP downstream modeling: no target column found (BIAS_LABEL or DISPOSIT_ENCODED)")
    else:
        y_combined = merged_df[target_col].values


In [None]:
###############################
# 5. Advanced Modeling + Fairness (Guarded)
###############################
# Trains an XGBoost classifier on X_combined / y_combined and, when the
# protected feature can be located, reports accuracy per protected group.


def structured_feature_index(feature_name, structured_names, n_total_features):
    """Locate a structured feature's column inside the combined feature matrix.

    The combined matrix is laid out as [embedding columns | structured columns],
    so the embedding width is `n_total_features - len(structured_names)`.
    Deriving it this way avoids hard-coding a particular hidden size.

    Returns the column index, or None when `feature_name` is not present.
    """
    names = list(structured_names)
    if feature_name not in names:
        return None
    embedding_width = n_total_features - len(names)
    return embedding_width + names.index(feature_name)


def group_accuracy(y_true, y_pred, X_array, idx_protected, group_val):
    """Accuracy restricted to rows whose protected column equals `group_val`.

    Returns NaN when no row belongs to the group, so a split that happens to
    contain no members of a group reports "nan" instead of scoring empty arrays.
    """
    mask = np.asarray(X_array)[:, idx_protected] == group_val
    if not mask.any():
        return float("nan")
    return accuracy_score(y_true[mask], y_pred[mask])


# Read both names through globals() so the guard itself can never raise
# NameError; a value of None is treated the same as "not produced".
if globals().get("X_combined") is None or globals().get("y_combined") is None:
    print("SKIP advanced modeling: missing X_combined/y_combined")
else:
    print("\n=== Training Advanced Model (e.g., XGBoost) on Combined Features ===")

    X_train_comb, X_test_comb, y_train_comb, y_test_comb = train_test_split(
        X_combined,
        y_combined,
        test_size=0.2,
        random_state=42,
        stratify=y_combined,
    )

    xgb_model = XGBClassifier(random_state=42)
    xgb_model.fit(X_train_comb, y_train_comb)

    y_pred_comb_test = xgb_model.predict(X_test_comb)
    acc_test = accuracy_score(y_test_comb, y_pred_comb_test)
    print(f"Test Accuracy on Combined Model: {acc_test:.3f}")
    print("Classification Report (Test):")
    print(classification_report(y_test_comb, y_pred_comb_test))

    # --- Minimal group metric example (only if we can locate a protected feature) ---
    print("\n=== Example Fairness Slice (if available) ===")

    protected_feature_name = "NEWRACE_Black"
    protected_col_idx = None

    # We can only do this if structured_cols was built in the embedding cell.
    if "structured_cols" in globals():
        protected_col_idx = structured_feature_index(
            protected_feature_name, structured_cols, X_combined.shape[1]
        )

    if protected_col_idx is None:
        print(f"SKIP group metrics: protected feature '{protected_feature_name}' not found in structured_cols")
    else:
        # Indicator value 0 is reported as "Privileged", 1 as "Unprivileged".
        acc_priv = group_accuracy(y_test_comb, y_pred_comb_test, X_test_comb, protected_col_idx, 0)
        acc_unpriv = group_accuracy(y_test_comb, y_pred_comb_test, X_test_comb, protected_col_idx, 1)
        print("Accuracy (Unprivileged):", acc_unpriv)
        print("Accuracy (Privileged):  ", acc_priv)

    print("\n=== Phase 5 Completed ===\n")
