In [None]:
# =============================
# GOOGLE COLAB VERSION – CODE A
# Create engineered CSV
# =============================
# ---------------------------------------------------------
# Input: the original cleaned modeling dataset (loaded in step 2 below).
#    This dataset contains:
#        - pretrial_recidivism (original target)
#        - grade_x (charge grades)
#        - county (for mapping urban/rural)
# ---------------------------------------------------------

# Dependencies for this cell, grouped ahead of any side-effecting calls.
import pandas as pd
from google.colab import drive

# 1. Mount Google Drive so the CSV under /content/drive is reachable.
drive.mount('/content/drive')

# 2. Read the existing cleaned modeling features into `df`.
input_path = "/content/drive/MyDrive/no_summaries_cleaned_modeling_features.csv"
df = pd.read_csv(input_path)

# Report (rows, columns) of the frame as loaded, before any columns are added.
print("Original shape:", df.shape)




Mounted at /content/drive
Original shape: (431920, 33)


In [None]:


# ==========================
# Helper functions for feature construction
# ==========================
# WHY?
#   - grade_x contains values like "F1, M3" or "M2" or "F", etc.
#   - We need to detect whether a charge contains:
#         → any felony (F*, e.g., F1, F2, F3)
#         → any misdemeanor (M*, e.g., M1, M2)
#   - These help us define felony_recidivism and misdemeanor_recidivism.


def has_felony(grade):
    """Return 1 if any comma-separated grade token starts with 'F', else 0.

    Parameters
    ----------
    grade : object
        A grade value such as "F1, M3" or "M2". Missing values (as detected
        by ``pd.isna``) yield 0. Any other value is converted with ``str()``
        before being split on commas.

    Returns
    -------
    int
        1 when at least one whitespace-stripped token begins with an
        uppercase 'F'; 0 otherwise. The match is case-sensitive.
    """
    if pd.isna(grade):
        return 0
    for piece in str(grade).split(','):
        # Tokens may carry padding around the comma, e.g. "F1, M3".
        if piece.strip().startswith('F'):
            return 1
    return 0

def has_misdemeanor(grade):
    """Return 1 if any comma-separated grade token starts with 'M', else 0.

    Parameters
    ----------
    grade : object
        A grade value such as "F1, M3" or "M2". Missing values (as detected
        by ``pd.isna``) yield 0. Any other value is converted with ``str()``
        before being split on commas.

    Returns
    -------
    int
        1 when at least one whitespace-stripped token begins with an
        uppercase 'M'; 0 otherwise. The match is case-sensitive.
    """
    if pd.isna(grade):
        return 0
    for piece in str(grade).split(','):
        # Tokens may carry padding around the comma, e.g. "F1, M3".
        if piece.strip().startswith('M'):
            return 1
    return 0


In [None]:

# ==========================
# Create felony & misdemeanor recidivism labels
# ==========================
# WHY?
#   - pretrial_recidivism = 1 means some recidivism happened.
#   - We split it into felony vs misdemeanor recidivism:
#
#     felony_recidivism:
#         - 1 if:
#             (a) pretrial_recidivism = 1 AND
#             (b) grade_x contains any felony ("F", "F1", "F2"...)
#
#     misdemeanor_recidivism:
#         - 1 if:
#             (a) pretrial_recidivism = 1 AND
#             (b) grade_x contains NO felony AND
#             (c) contains a misdemeanor ("M1", "M2"...)
#
#   - This separation is needed because felony vs misdemeanor recidivism
#     behave differently and must be modeled independently.
# ==========================

# Build the two labels column-wise. Each grade_x value is parsed once per
# helper via Series.map, instead of constructing a row object for every
# record with df.apply(axis=1) and calling has_felony twice per row.
recidivated = df["pretrial_recidivism"].eq(1)
felony_in_grade = df["grade_x"].map(has_felony).astype(bool)
misdemeanor_in_grade = df["grade_x"].map(has_misdemeanor).astype(bool)

# felony_recidivism: recidivism occurred AND grade_x contains any felony token.
df["felony_recidivism"] = (recidivated & felony_in_grade).astype(int)

# misdemeanor_recidivism: recidivism occurred, grade_x contains a misdemeanor
# token, and no felony token. A row can therefore never be 1 in both columns.
# Rows with pretrial_recidivism == 1 whose grade_x is missing or has neither
# an F* nor an M* token are 0 in both columns.
df["misdemeanor_recidivism"] = (
    recidivated & misdemeanor_in_grade & ~felony_in_grade
).astype(int)


In [None]:
# ==========================
# Urban vs Rural Classification
# ==========================
# WHY?
#   - The project requires evaluating fairness between urban and rural groups.
#   - We classify a county as urban if it contains one of the major PA cities.
#   - These cities represent population centers and have unique legal patterns:
#         Pittsburgh → Allegheny County
#         Philadelphia → Philadelphia County
#         Harrisburg → Dauphin County
#         Scranton → Lackawanna County
#         Erie → Erie County
#         State College → Centre County
#         Allentown → Lehigh County
#
#   - is_urban = 1 if county is one of the above
#   - urban_rural = readable label for reporting and fairness tables
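# According to the CDC NCHS urban-rural classification
# (https://www.cdc.gov/nchs/data-analysis-tools/urban-rural.html?CDC_AAref_Val=https://www.cdc.gov/nchs/data_access/urban_rural.html),
# counties fall into six categories:
#   1. Large Central Metro
#   2. Large Fringe Metro
#   3. Medium Metro
#   4. Small Metro
#   5. Micropolitan
#   6. Non-core
# Categories 1–3 are unequivocally URBAN.
# Categories 5–6 are unequivocally RURAL.
# NOTE(review): category 4 (Small Metro) is not assigned to either group here,
# and the county list below is chosen by major city rather than derived from
# these codes — confirm it matches the intended urban definition.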
# ==========================

# Counties treated as urban, keyed by the major city each one contains.
# Any county value not in this list (including a missing county) is
# labelled rural, because isin() is False for it.
urban_counties = [
    "Allegheny",     # city: Pittsburgh
    "Philadelphia",  # city: Philadelphia
    "Dauphin",       # city: Harrisburg
    "Lackawanna",    # city: Scranton
    "Erie",          # city: Erie
    "Centre",        # city: State College
    "Lehigh",        # city: Allentown
]

# Compute the membership mask once and derive both columns from it:
# is_urban is the 0/1 flag, urban_rural the readable label for reporting.
county_is_urban = df["county"].isin(urban_counties)
df["is_urban"] = county_is_urban.astype(int)
df["urban_rural"] = county_is_urban.map({True: "urban", False: "rural"})

print("\nUrban/rural counts:")
print(df["urban_rural"].value_counts())


Urban/rural counts:
urban_rural
rural    296307
urban    135613
Name: count, dtype: int64


In [None]:

# ==========================
# Save new engineered CSV
# ==========================
# Destination on the mounted Drive for the engineered dataset.
output_path = "/content/drive/MyDrive/modeling_features_with_recid_and_region.csv"

# index=False keeps the DataFrame's row index out of the written file.
df.to_csv(path_or_buf=output_path, index=False)

# Confirm where the file went and the final (rows, columns) after the
# columns added above.
print("\nSaved new CSV →", output_path)
print("New shape:", df.shape)


Saved new CSV → /content/drive/MyDrive/modeling_features_with_recid_and_region.csv
New shape: (431920, 37)
