# 02 — Target Data Preprocessing

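This notebook demonstrates how I constructed the **Target Index** for each state from 2015 to 2025 by merging
County Health Rankings (CHR) target metrics with FBI Hate Crime data. The code cells below are wrapped in triple quotes, so they are shown for illustration only and do not execute; the final cell runs the actual processing script.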

In [None]:
"""import pandas as pd, numpy as np, os, json
from pathlib import Path
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt

DATA_DIR = "../data"
YEARS = range(2015, 2026)

# --- load annual CHR files and stack ---
frames = []
for y in YEARS:
    f = f"{DATA_DIR}/analytic_data{y}.csv"
    if os.path.exists(f):
        df_y = pd.read_csv(f)
        df_y["year"] = y
        frames.append(df_y)
chr_all = pd.concat(frames, ignore_index=True)

# --- hate-crime file ---
hate = pd.read_csv(f"{DATA_DIR}/hate_crime.csv")
hate.head()

"""

In [None]:
"""# --- Normalize target variables ---
target_cols = [
    # Health outcomes
    "v001_rawvalue",   # Premature death 
    "v127_rawvalue",   # Premature age-adjusted mortality 
    "v128_rawvalue",   # Child mortality 
    "v129_rawvalue",   # Infant mortality 
    "v135_rawvalue",   # Injury deaths 
    "v161_rawvalue",   # Suicides 
    "v015_rawvalue",   # Homicides 
    "v039_rawvalue",   # Motor vehicle crash deaths 
    "v148_rawvalue",   # Firearm fatalities 
    
    # Environmental / infrastructure factors
    "v125_rawvalue",   # Air pollution (PM2.5) 
    "v124_rawvalue",   # Drinking water violations 
    "v179_rawvalue",   # Access to parks 
    "v182_rawvalue",   # Adverse climate events 
    "v166_rawvalue",   # Broadband access 
    "v181_rawvalue",   # Library access 
    "v156_rawvalue",   # Traffic volume 
    
    # Behavioral / social outcomes
    "v134_rawvalue",   # Alcohol-impaired driving deaths 
    "v139_rawvalue",   # Food insecurity 
    "v083_rawvalue",   # Limited access to healthy foods 
    "v133_rawvalue",   # Food environment index 
    "v155_rawvalue"    # Flu vaccination 
]

scaler = MinMaxScaler()
# Aggregate to state level 
chr_state = chr_all.groupby("state", as_index=False).mean(numeric_only=True)

# Normalize selected columns
for col in target_cols:
    if col in chr_state:
        chr_state[f"{col}_norm"] = scaler.fit_transform(chr_state[[col]])


# --- Normalize hate crime then merge ---
hate_state = hate.groupby("state_abbr", as_index=False)["incident_id"].count()
hate_state["hate_crime_norm"] = scaler.fit_transform(hate_state[["incident_id"]])
hate_state = hate_state.rename(columns={"state_abbr": "state"})
merged = chr_state.merge(hate_state[["state","hate_crime_norm"]], left_on="state", right_on="state", how="left")

# Example output
merged[["state"] + [f"{col}_norm" for col in target_cols] + ["hate_crime_norm"]].head()

"""

In [None]:
"""# --- Compute weighted composite score ---
total = 22 # total weights (21 + 1 for hate crime)
merged["target_index"] = 0

for col in target_cols:
    if f"{col}_norm" in merged:
        merged["target_index"] += merged[f"{col}_norm"]*(1/total)

merged["target_index"] += merged["hate_crime_norm"]*(1/total)

# Example output
merged[["state", "target_index"]].head()"""

In [None]:
"""# --- Classify into quintiles ---
merged["Percentile_Class"] = pd.qcut(merged["target_index"], q=5, labels=False) + 1
merged["Percentile_Continuous"] = merged["target_index"]

# Example output
merged[["state", "target_index", "Percentile_Class", "Percentile_Continuous"]].head()"""

In [None]:
"""# --- Visualize target distribution ---
plt.hist(merged["target_index"], bins=10)
plt.title("Distribution of Target Index (State-Level)")
plt.xlabel("Target Index"); plt.ylabel("Frequency")
plt.show()"""

In [None]:
"""#  Save Paths Example
Path("../results/norm_targets").mkdir(parents=True, exist_ok=True)
merged.to_csv("../results/norm_targets/final_target_index_all_years.csv", index=False)
merged.to_json("../results/norm_targets/final_target_index_all_years.json", orient="records", indent=2)
merged.head()
"""

In [None]:
# Process Target Data
# Changes to the parent directory and runs src/process_target_data.py there in a single shell command.
# Allow some time for script to run as it processes multiple years (10) of data
# NOTE(review): the 2015-2025 range used elsewhere in this notebook spans 11 years, not 10 -- confirm the count.
# Takes at least 1 minute to complete

!cd .. && python src/process_target_data.py