# 03 — Predictor Data Preprocessing

This notebook constructs the **Predictor Index** for each state from 2015 to 2025, using the
metrics specified by County Health Rankings (CHR).

In [None]:
# Disabled reference code: everything below is a single triple-quoted string
# literal, so none of the statements inside it are executed; evaluating the
# cell only yields the string itself.
# NOTE(review): the opening delimiter is followed by a fourth `"`, so the
# string's text begins with a stray quote character -- confirm whether that
# is intentional.
# The text sketches stacking the annual `analytic_data{year}.csv` files found
# under DATA_DIR into one frame, tagging each with a "year" column and
# silently skipping years whose file does not exist.
# Presumably superseded by `src/process_predictor_data.py`, which a later cell
# runs -- TODO confirm.
""""import pandas as pd, os
from pathlib import Path
from sklearn.preprocessing import MinMaxScaler

DATA_DIR = "../data"
YEARS = range(2015, 2026)

# --- load annual CHR files and stack ---
frames = []
for y in YEARS:
    f = f"{DATA_DIR}/analytic_data{y}.csv"
    if os.path.exists(f):
        df_y = pd.read_csv(f)
        df_y["year"] = y
        frames.append(df_y)
chr_all = pd.concat(frames, ignore_index=True)

# Example output
chr_all.head()
"""

In [None]:
# Disabled reference code: everything below is a single triple-quoted string
# literal, so none of the statements inside it are executed.
# The text lists the CHR `*_rawvalue` predictor columns by category, averages
# the numeric columns per "state", and min-max scales every listed column that
# is present into a matching `<col>_norm` column.
# NOTE(review): the sketch groups by "state" only, so it would average across
# all stacked years rather than produce one row per state and year -- confirm
# against the intended 2015-2025 index.
# NOTE(review): the log printed by `src/process_predictor_data.py` mentions
# `*_denominator` columns, a weighted mean and a `reverse` flag, none of which
# appear in this sketch -- presumably the script is the authoritative version.
"""# --- Normalize predictor variables ---
predictor_cols = [
    # --- Health behaviors ---
    "v009_rawvalue",  # Adult smoking 
    "v011_rawvalue",  # Adult obesity 
    "v049_rawvalue",  # Excessive drinking 
    "v070_rawvalue",  # Physical inactivity 
    "v045_rawvalue",  # Sexually transmitted infections 
    "v014_rawvalue",  # Teen births 
    "v138_rawvalue",  # Drug overdose deaths 
    "v060_rawvalue",  # Diabetes prevalence 
    "v061_rawvalue",  # HIV prevalence 
    "v132_rawvalue",  # Access to exercise opportunities 
    "v183_rawvalue",  # Feelings of loneliness 
    "v143_rawvalue",  # Insufficient sleep 

    # --- Clinical care ---
    "v004_rawvalue",  # Ratio of population to primary care physicians 
    "v062_rawvalue",  # Ratio of population to mental health providers 
    "v088_rawvalue",  # Ratio of population to dentists 
    "v005_rawvalue",  # Preventable hospital stays 
    "v085_rawvalue",  # Uninsured 
    "v131_rawvalue",  # Ratio of population to other primary care providers 

    # --- Social & economic factors ---
    "v024_rawvalue",  # Children in poverty 
    "v044_rawvalue",  # Income inequality 
    "v069_rawvalue",  # Some college education 
    "v168_rawvalue",  # High school completion 
    "v023_rawvalue",  # Unemployment 
    "v140_rawvalue",  # Social associations 
    "v171_rawvalue",  # Child care cost burden 
    "v151_rawvalue",  # Gender pay gap 
    "v063_rawvalue",  # Median household income 
    "v170_rawvalue",  # Living wage 
    "v172_rawvalue",  # Child care centers 
    "v141_rawvalue",  # Residential segregation (Black/White) 
    "v149_rawvalue",  # Disconnected youth 
    "v184_rawvalue",  # Lack of social and emotional support 
    "v177_rawvalue",  # Voter turnout 

    # --- Physical environment & housing ---
    "v136_rawvalue",  # Severe housing problems 
    "v153_rawvalue",  # Home ownership 
    "v154_rawvalue",  # Severe housing cost burden 
    "v067_rawvalue",  # Driving alone to work 
    "v137_rawvalue",  # Long commute 

    # --- Education & community context ---
    "v167_rawvalue",  # School segregation 
    "v169_rawvalue",  # School funding adequacy 

    # --- Health outcomes (as predictive indicators) ---
    "v036_rawvalue",  # Poor physical health days 
    "v042_rawvalue",  # Poor mental health days 
    "v144_rawvalue",  # Frequent physical distress 
    "v145_rawvalue",  # Frequent mental distress 
    "v147_rawvalue"   # Life expectancy 
]

scaler = MinMaxScaler()
# Aggregate to state level 
chr_state = chr_all.groupby("state", as_index=False).mean(numeric_only=True)

# Normalize selected columns
for col in predictor_cols:
    if col in chr_state:
        chr_state[f"{col}_norm"] = scaler.fit_transform(chr_state[[col]])

# Example columns after normalization
chr_state[[col for col in chr_state.columns if "norm" in col]].head()

"""

In [None]:
# Disabled reference code: everything below is a single triple-quoted string
# literal, so nothing inside it is executed and no file is written by this cell.
# The text sketches creating `../results/norm_predictors` (including parents)
# and writing `chr_state` there as `final_predictor_index_all_years.csv`
# without the index column.
"""# Save Results
Path("../results/norm_predictors").mkdir(parents=True, exist_ok=True)
chr_state.to_csv("../results/norm_predictors/final_predictor_index_all_years.csv", index=False)
chr_state.head()"""

In [6]:
# Process Predictor Data
# Runs the project script through an IPython shell escape; `cd ..` first moves
# one directory up so that the relative path `src/process_predictor_data.py`
# resolves (assumes the notebook sits one level below the project root --
# TODO confirm).
# Allow some time for the script to run, as it processes multiple years of data
# (the notebook's stated range 2015-2025 is 11 years inclusive, not 10 --
# TODO confirm the script's actual year range).
# Takes at least 1 minute to complete.
# The script prints one "Processed <var> (<denominator>) — reverse=<bool>" line
# per variable and a "... not found in <year>, skipping." line for variables
# missing from a given year's file.

!cd .. && python src/process_predictor_data.py

Processed v009_rawvalue (v009_denominator) — reverse=True

  .apply(lambda g: pd.Series({var: safe_weighted_mean(g)}))
  .apply(lambda g: pd.Series({var: safe_weighted_mean(g)}))
  .apply(lambda g: pd.Series({var: safe_weighted_mean(g)}))
  .apply(lambda g: pd.Series({var: safe_weighted_mean(g)}))
  .apply(lambda g: pd.Series({var: safe_weighted_mean(g)}))
  .apply(lambda g: pd.Series({var: safe_weighted_mean(g)}))
  .apply(lambda g: pd.Series({var: safe_weighted_mean(g)}))
  .apply(lambda g: pd.Series({var: safe_weighted_mean(g)}))
  .apply(lambda g: pd.Series({var: safe_weighted_mean(g)}))
  .apply(lambda g: pd.Series({var: safe_weighted_mean(g)}))
  .apply(lambda g: pd.Series({var: safe_weighted_mean(g)}))
  .apply(lambda g: pd.Series({var: safe_weighted_mean(g)}))
  .apply(lambda g: pd.Series({var: safe_weighted_mean(g)}))
  .apply(lambda g: pd.Series({var: safe_weighted_mean(g)}))
  .apply(lambda g: pd.Series({var: safe_weighted_mean(g)}))
  .apply(lambda g: pd.Series({var: safe_weighted_mean(g)}))
  .apply(lambda g: pd.Series({var: safe_


Processed v011_rawvalue (v011_denominator) — reverse=True
Processed v049_rawvalue (v049_denominator) — reverse=True
Processed v070_rawvalue (v070_denominator) — reverse=True
Processed v045_rawvalue (v045_denominator) — reverse=True
Processed v014_rawvalue (v014_denominator) — reverse=True
Processed v138_rawvalue (v138_denominator) — reverse=True
Processed v060_rawvalue (v060_denominator) — reverse=True
Processed v061_rawvalue (v061_denominator) — reverse=True
Processed v132_rawvalue (v132_denominator) — reverse=False
v183_rawvalue not found in 2015, skipping.
v143_rawvalue not found in 2015, skipping.
Processed v004_rawvalue (v004_denominator) — reverse=True
Processed v062_rawvalue (v062_denominator) — reverse=True
Processed v088_rawvalue (v088_denominator) — reverse=True
Processed v005_rawvalue (v005_denominator) — reverse=True
Processed v085_rawvalue (v085_denominator) — reverse=True
Processed v131_rawvalue (v131_denominator) — reverse=True
Processed v024_rawvalue (v051_rawvalue (po