In [1]:
import pandas as pd
from pathlib import Path

# -----------------------------------------------------------------------------
# Robust project-root + stable intermediate/output paths
# -----------------------------------------------------------------------------
PROJECT_ROOT = Path.cwd()
if not (PROJECT_ROOT / "data").exists() and (PROJECT_ROOT.parent / "data").exists():
    PROJECT_ROOT = PROJECT_ROOT.parent
DATA_DIR = PROJECT_ROOT / "data"
INTERMEDIATE_DIR = DATA_DIR / "intermediate"
INTERMEDIATE_DIR.mkdir(parents=True, exist_ok=True)

phase2_path = DATA_DIR / "cleaned_data_phase2.csv"
ussc_df = pd.read_csv(phase2_path, low_memory=False)

# Step 1: Confirm Data Integrity
print("Dataset shape:", ussc_df.shape)

# Step 2: Remove Duplicates
before_dups = ussc_df.shape[0]
ussc_df.drop_duplicates(inplace=True)
after_dups = ussc_df.shape[0]
print(f"Dropped {before_dups - after_dups} duplicate rows")

# Step 3: Check Data Types
print("\nData Types:")
print(ussc_df.dtypes)

# Step 4: Validate Unique Values in Key Categorical Features
categorical_features = [
    "NEWRACE",
    "MONSEX",
    "EDUCATN",
    "CITIZEN",
    "ZONE",
    "SENTMON",
    "DISPOSIT",
]
for feature in categorical_features:
    if feature in ussc_df.columns:
        print(f"\nUnique values in {feature}:")
        print(ussc_df[feature].unique())

# Step 5: Check for Missing Data
missing_counts = ussc_df.isna().sum().sort_values(ascending=False)
print("\nMissing values (top 10):")
print(missing_counts.head(10))

# Step 6: Save intermediate dataset
output_file = INTERMEDIATE_DIR / "cleaned_data_after_subphase_3_1.csv"
ussc_df.to_csv(output_file, index=False)
print(f"\nIntermediate dataset saved to: {output_file}")

Dataset shape: (76538, 16)
Dropped 224 duplicate rows

Data Types:
AGE         float64
NEWRACE      object
MONSEX       object
EDUCATN      object
DISTRICT     object
CIRCDIST     object
CRIMHIST     object
SENTYR      float64
CITIZEN      object
CITWHERE     object
NUMDEPEN     object
CRIMLIV      object
SENTMON      object
ZONE         object
DISPOSIT     object
SENTTOT     float64
dtype: object

Unique values in NEWRACE:
['Hispanic' 'White' 'Black' 'Other' 'American Indian or Alaskan Native'
 'Asian or Pacific Islander' nan]

Unique values in MONSEX:
['Male' 'Female' nan]

Unique values in EDUCATN:
['Six years of school completed' 'High school graduate'
 'Some trade or vocational school' 'Nine years of school completed'
 'Some college' 'College graduate' 'Eleven years of school completed'
 'G.E.D. (general education diploma)' nan 'One year of school completed'
 'Trade or vocational degree' 'Middle school / junior high'
 'Ten years of school completed' 'Four years of school completed

In [2]:
# Load the intermediate dataset from Sub-Phase 3.1
file_path = INTERMEDIATE_DIR / "cleaned_data_after_subphase_3_1.csv"
ussc_df = pd.read_csv(file_path, low_memory=False)

# Step 1: Check for Missing Values Again
missing_counts = ussc_df.isna().sum()
print("\nMissing Values per Feature:")
print(missing_counts)

# Step 2: Preprocessing Specific Features
# Convert "No dependents" in NUMDEPEN to 0 and ensure numeric type
if "NUMDEPEN" in ussc_df.columns:
    ussc_df["NUMDEPEN"] = ussc_df["NUMDEPEN"].replace("No dependents", 0).astype(float)

# Convert CRIMHIST to binary values: 1 for "Yes, there is a criminal history", 0 otherwise
if "CRIMHIST" in ussc_df.columns:
    ussc_df["CRIMHIST"] = ussc_df["CRIMHIST"].apply(
        lambda x: 1 if x == "Yes, there is a criminal history" else 0
    )

# Cap extreme outliers in SENTTOT (documented heuristic)
if "SENTTOT" in ussc_df.columns:
    ussc_df["SENTTOT"] = ussc_df["SENTTOT"].apply(lambda x: x if x < 2000 else 2000)
    print(ussc_df["SENTTOT"].describe())

print("\nData Types:")
print(ussc_df.dtypes)

# Step 3: Handling Missing Values
numerical_features = [c for c in ["AGE", "NUMDEPEN", "CRIMHIST", "SENTYR", "SENTTOT"] if c in ussc_df.columns]
for feature in numerical_features:
    median_value = ussc_df[feature].median()
    ussc_df.fillna({feature: median_value}, inplace=True)
    print(f"Filled missing values in {feature} with median: {median_value}")

categorical_features = [
    c for c in [
        "NEWRACE",
        "MONSEX",
        "EDUCATN",
        "CITIZEN",
        "CITWHERE",
        "ZONE",
        "DISTRICT",
        "CIRCDIST",
        "CRIMLIV",
        "SENTMON",
    ]
    if c in ussc_df.columns
]
for feature in categorical_features:
    ussc_df.fillna({feature: "Unknown"}, inplace=True)

missing_counts_after = ussc_df.isna().sum()
print("\nMissing Values After Handling:")
print(missing_counts_after[missing_counts_after > 0])

# Step 4: Save the Dataset After Handling Missing Values
output_file = INTERMEDIATE_DIR / "cleaned_data_after_subphase_3_2.csv"
ussc_df.to_csv(output_file, index=False)
print(f"\nDataset after handling missing values saved to: {output_file}")


Missing Values per Feature:
AGE           44
NEWRACE      794
MONSEX       104
EDUCATN     8296
DISTRICT       0
CIRCDIST       0
CRIMHIST    1968
SENTYR         0
CITIZEN      415
CITWHERE    1352
NUMDEPEN    8317
CRIMLIV     4994
SENTMON        0
ZONE        1145
DISPOSIT       0
SENTTOT     6336
dtype: int64
count    76314.000000
mean       207.880397
std        543.185412
min          0.030000
25%          6.000000
50%         24.000000
75%         80.000000
max       2000.000000
Name: SENTTOT, dtype: float64

Data Types:
AGE         float64
NEWRACE      object
MONSEX       object
EDUCATN      object
DISTRICT     object
CIRCDIST     object
CRIMHIST      int64
SENTYR      float64
CITIZEN      object
CITWHERE     object
NUMDEPEN    float64
CRIMLIV      object
SENTMON      object
ZONE         object
DISPOSIT     object
SENTTOT     float64
dtype: object
Filled missing values in AGE with median: 35.0
Filled missing values in NUMDEPEN with median: 1.0
Filled missing values in CRIMHIST w

In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# --------------------------------------------------
# 1. Load and Inspect the Dataset
# --------------------------------------------------
file_path = INTERMEDIATE_DIR / "cleaned_data_after_subphase_3_2.csv"
ussc_df = pd.read_csv(file_path, low_memory=False)

print("Initial checks on AGE:")
print("Missing in AGE:", ussc_df["AGE"].isna().sum())
print("AGE describe:\n", ussc_df["AGE"].describe())

# --------------------------------------------------
# 2. Bin AGE Before Scaling
# --------------------------------------------------
age_bins = [0, 25, 35, 45, np.inf]
age_labels = ["<25", "25-35", "35-45", "45+"]

ussc_df["AGE_BIN"] = pd.cut(
    ussc_df["AGE"],
    bins=age_bins,
    labels=age_labels,
    right=False,
    include_lowest=True,
)
print("Created 'AGE_BIN' feature. Unique bins:\n", ussc_df["AGE_BIN"].value_counts(dropna=False))

# --------------------------------------------------
# 3. Preserve raw targets before scaling
# --------------------------------------------------
# Keep a raw, interpretable copy for later modeling
if "SENTTOT" in ussc_df.columns:
    ussc_df["SENTTOT_RAW"] = ussc_df["SENTTOT"]

# --------------------------------------------------
# 4. Standardize numeric features (keep raw copies above)
# --------------------------------------------------
numerical_features = [c for c in ["AGE", "NUMDEPEN", "CRIMHIST", "SENTYR", "SENTTOT"] if c in ussc_df.columns]
scaler = StandardScaler()
ussc_df[numerical_features] = scaler.fit_transform(ussc_df[numerical_features])
print("Standardized numerical features:", numerical_features)

# --------------------------------------------------
# 5. Create Interaction Features
# --------------------------------------------------
if "NEWRACE" in ussc_df.columns and "CITIZEN" in ussc_df.columns:
    ussc_df["RACE_CITIZEN"] = ussc_df["NEWRACE"].astype(str) + "_" + ussc_df["CITIZEN"].astype(str)
    print("Created 'RACE_CITIZEN' feature.")

print("\nDescribe DISPOSIT")
if "DISPOSIT" in ussc_df.columns:
    print(ussc_df["DISPOSIT"].value_counts(dropna=False))

# --------------------------------------------------
# 6. Write phase3 dataset that retains targets
# --------------------------------------------------
phase3_unencoded_path = DATA_DIR / "cleaned_data_phase3_unencoded_DISPOSIT.csv"
ussc_df.to_csv(phase3_unencoded_path, index=False)
print(f"Wrote: {phase3_unencoded_path} | shape={ussc_df.shape}")

# --------------------------------------------------
# 7. Build one-hot feature matrix for modeling (drop targets)
# --------------------------------------------------
feature_df = ussc_df.copy()

# If present, we drop target columns for the feature matrix.
for c in ["DISPOSIT", "SENTTOT", "SENTTOT_RAW"]:
    if c in feature_df.columns:
        feature_df.drop(columns=[c], inplace=True)

categorical_features = [
    c
    for c in [
        "NEWRACE",
        "MONSEX",
        "EDUCATN",
        "CITIZEN",
        "CITWHERE",
        "ZONE",
        "DISTRICT",
        "CIRCDIST",
        "CRIMLIV",
        "AGE_BIN",
        "RACE_CITIZEN",
        "SENTMON",
    ]
    if c in feature_df.columns
]

encoded_df = pd.get_dummies(feature_df, columns=categorical_features, drop_first=True)
print("\nApplied one-hot encoding to categorical features.")
print("Encoded features shape:", encoded_df.shape)

# --------------------------------------------------
# 8. Save the transformed dataset (required by Notebook 4)
# --------------------------------------------------
phase3_features_path = DATA_DIR / "cleaned_data_phase3.csv"
encoded_df.to_csv(phase3_features_path, index=False)
print(f"Wrote: {phase3_features_path} | shape={encoded_df.shape}")



Initial checks on AGE:
Missing in AGE: 0
AGE describe:
 count    76314.000000
mean        36.394633
std         10.935215
min         16.000000
25%         28.000000
50%         35.000000
75%         43.000000
max         86.000000
Name: AGE, dtype: float64
Created 'AGE_BIN' feature. Unique bins:
 AGE_BIN
25-35    27526
35-45    22707
45+      16320
<25       9761
Name: count, dtype: int64
Standardized numerical features: ['AGE', 'NUMDEPEN', 'CRIMHIST', 'SENTYR', 'SENTTOT']
Created 'RACE_CITIZEN' feature.

Describe DISPOSIT
DISPOSIT
Guilty plea                        74416
Jury trial                          1742
Nolo contendere                       70
Trial by judge or bench trial         63
Guilty plea and trial (>1count)       23
Name: count, dtype: int64
Wrote: /Users/araj/Documents/Code/Machine Learning/Bias Detection in Judicial Text /data/cleaned_data_phase3_unencoded_DISPOSIT.csv | shape=(76314, 19)

Applied one-hot encoding to categorical features.
Encoded features shape: (76