# 03 — Predictor Data Preprocessing

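This notebook builds the state-level **Predictor Index** from the County Health Rankings (CHR)
annual analytic data files for 2015–2025. A selected set of CHR measures (the `*_rawvalue`
columns listed below) is averaged within each state and then min–max normalized to the
[0, 1] range.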

In [None]:
import pandas as pd, os
from pathlib import Path
from sklearn.preprocessing import MinMaxScaler

DATA_DIR = "../data"
YEARS = range(2015, 2026)  # 2015 through 2025 inclusive

# --- load annual CHR files and stack ---
# Each yearly file is read, tagged with the year it was loaded for, and
# collected so that all years can be stacked into a single long frame.
frames = []
missing_years = []
for y in YEARS:
    f = Path(DATA_DIR) / f"analytic_data{y}.csv"
    if not f.exists():
        # Remember the gap instead of skipping silently, so a partial
        # download is visible in the notebook output.
        missing_years.append(y)
        continue
    df_y = pd.read_csv(f)
    df_y["year"] = y
    frames.append(df_y)

# pd.concat([]) fails with an opaque "No objects to concatenate" error;
# fail early with a message that names the directory that was searched.
if not frames:
    raise FileNotFoundError(
        f"No analytic_data<year>.csv files found in {DATA_DIR!r} "
        f"for years {YEARS.start}-{YEARS.stop - 1}"
    )
if missing_years:
    print(f"Warning: no CHR file found for year(s): {missing_years}")

chr_all = pd.concat(frames, ignore_index=True)

# Example output
chr_all.head()

Unnamed: 0,statecode,countycode,fipscode,state,county,year,county_ranked,v001_rawvalue,v001_numerator,v001_denominator,...,v184_rawvalue,v184_numerator,v184_denominator,v184_cilow,v184_cihigh,v180_rawvalue,v180_numerator,v180_denominator,v180_cilow,v180_cihigh
0,0,0,0.0,US,United States,2015,,6621.61,3293135.0,877624573.0,...,,,,,,,,,,
1,1,0,1000.0,AL,Alabama,2015,,9507.9,72385.0,13527993.0,...,,,,,,,,,,
2,1,1,1001.0,AL,Autauga County,2015,1.0,8405.21,754.0,157345.0,...,,,,,,,,,,
3,1,3,1003.0,AL,Baldwin County,2015,1.0,7456.89,2460.0,519994.0,...,,,,,,,,,,
4,1,5,1005.0,AL,Barbour County,2015,1.0,8900.51,414.0,76718.0,...,,,,,,,,,,


In [5]:
# --- Normalize predictor variables ---
# CHR measure columns (raw values) that feed the Predictor Index.
predictor_cols = [
    # --- Health behaviors ---
    "v009_rawvalue",  # Adult smoking
    "v011_rawvalue",  # Adult obesity
    "v049_rawvalue",  # Excessive drinking
    "v070_rawvalue",  # Physical inactivity
    "v045_rawvalue",  # Sexually transmitted infections
    "v014_rawvalue",  # Teen births
    "v138_rawvalue",  # Drug overdose deaths
    "v060_rawvalue",  # Diabetes prevalence
    "v061_rawvalue",  # HIV prevalence
    "v132_rawvalue",  # Access to exercise opportunities
    "v183_rawvalue",  # Feelings of loneliness
    "v143_rawvalue",  # Insufficient sleep

    # --- Clinical care ---
    "v004_rawvalue",  # Ratio of population to primary care physicians
    "v062_rawvalue",  # Ratio of population to mental health providers
    "v088_rawvalue",  # Ratio of population to dentists
    "v005_rawvalue",  # Preventable hospital stays
    "v085_rawvalue",  # Uninsured
    "v131_rawvalue",  # Ratio of population to other primary care providers

    # --- Social & economic factors ---
    "v024_rawvalue",  # Children in poverty
    "v044_rawvalue",  # Income inequality
    "v069_rawvalue",  # Some college education
    "v168_rawvalue",  # High school completion
    "v023_rawvalue",  # Unemployment
    "v140_rawvalue",  # Social associations
    "v171_rawvalue",  # Child care cost burden
    "v151_rawvalue",  # Gender pay gap
    "v063_rawvalue",  # Median household income
    "v170_rawvalue",  # Living wage
    "v172_rawvalue",  # Child care centers
    "v141_rawvalue",  # Residential segregation (Black/White)
    "v149_rawvalue",  # Disconnected youth
    "v184_rawvalue",  # Lack of social and emotional support
    "v177_rawvalue",  # Voter turnout

    # --- Physical environment & housing ---
    "v136_rawvalue",  # Severe housing problems
    "v153_rawvalue",  # Home ownership
    "v154_rawvalue",  # Severe housing cost burden
    "v067_rawvalue",  # Driving alone to work
    "v137_rawvalue",  # Long commute

    # --- Education & community context ---
    "v167_rawvalue",  # School segregation
    "v169_rawvalue",  # School funding adequacy

    # --- Health outcomes (as predictive indicators) ---
    "v036_rawvalue",  # Poor physical health days
    "v042_rawvalue",  # Poor mental health days
    "v144_rawvalue",  # Frequent physical distress
    "v145_rawvalue",  # Frequent mental distress
    "v147_rawvalue"   # Life expectancy
]

scaler = MinMaxScaler()

# Aggregate to state level (unweighted mean of every numeric column).
# The CHR files carry a national summary row (state == "US"); it is not a
# state, so it is dropped before grouping. Otherwise it would appear as its
# own "state" in the index and take part in the min/max used for scaling.
# NOTE(review): state-level summary rows (countycode == 0) are still averaged
# together with the county rows, and all years are collapsed into one row per
# state -- confirm that both are intended.
chr_state = (
    chr_all[chr_all["state"] != "US"]
    .groupby("state", as_index=False)
    .mean(numeric_only=True)
)

# Normalize selected columns
# Predictors that are absent from the loaded data are reported, not silently
# skipped.
present_cols = [col for col in predictor_cols if col in chr_state.columns]
missing_cols = [col for col in predictor_cols if col not in chr_state.columns]
if missing_cols:
    print(f"Predictor columns not found in CHR data (skipped): {missing_cols}")

if present_cols:
    # MinMaxScaler scales each column independently, so one fit over all
    # predictors gives the same values as fitting column by column. Attaching
    # the result with a single concat avoids inserting dozens of columns one
    # at a time into a very wide frame.
    # Columns that are entirely NaN after aggregation remain NaN here
    # (presumably the source of scikit-learn's All-NaN RuntimeWarning).
    norm_values = scaler.fit_transform(chr_state[present_cols])
    norm_df = pd.DataFrame(
        norm_values,
        columns=[f"{col}_norm" for col in present_cols],
        index=chr_state.index,
    )
    chr_state = pd.concat([chr_state, norm_df], axis=1)

# Example columns after normalization
chr_state[[col for col in chr_state.columns if col.endswith("_norm")]].head()

  chr_state = chr_all.groupby("state", as_index=False).mean(numeric_only=True)
  return xp.asarray(numpy.nanmin(X, axis=axis))
  return xp.asarray(numpy.nanmax(X, axis=axis))


Unnamed: 0,v009_rawvalue_norm,v011_rawvalue_norm,v049_rawvalue_norm,v070_rawvalue_norm,v045_rawvalue_norm,v014_rawvalue_norm,v138_rawvalue_norm,v060_rawvalue_norm,v061_rawvalue_norm,v132_rawvalue_norm,...,v154_rawvalue_norm,v067_rawvalue_norm,v137_rawvalue_norm,v167_rawvalue_norm,v169_rawvalue_norm,v036_rawvalue_norm,v042_rawvalue_norm,v144_rawvalue_norm,v145_rawvalue_norm,v147_rawvalue_norm
0,0.861763,0.588859,0.667242,0.338385,0.74735,0.784166,0.311243,0.209601,0.031029,0.425407,...,0.123663,0.293068,0.0,0.41004,0.981134,0.504175,0.302878,0.543923,0.350022,0.573783
1,0.767341,0.909752,0.160381,0.852264,0.408042,0.766239,0.203551,0.998134,0.08554,0.047386,...,0.344497,1.0,0.760264,0.311239,0.163702,0.817712,0.830655,0.866412,0.817463,0.086276
2,0.86712,0.842243,0.199071,0.961964,0.339118,0.989474,0.194254,0.821155,0.05737,0.110659,...,0.252799,0.936918,0.5979,0.204428,0.269387,0.89607,0.816587,0.914394,0.808258,0.12809
3,0.502329,0.449884,0.385618,0.411659,0.285132,0.770596,0.373927,0.532552,0.055537,0.406771,...,0.395089,0.841564,0.475294,0.622887,0.345003,0.641306,0.575968,0.715868,0.562543,0.471731
4,0.193479,0.210349,0.570017,0.160174,0.202441,0.349939,0.27802,0.279248,0.077277,0.675782,...,0.984549,0.766161,0.640983,0.311636,0.454391,0.439382,0.452977,0.433447,0.331327,0.763474


In [4]:
# Optional: write the state-level table (aggregated values plus *_norm columns) to CSV.
out_csv = Path("../results/norm_predictors/final_predictor_index_all_years.csv")
# Create the target directory (and any missing parents) before writing.
out_csv.parent.mkdir(parents=True, exist_ok=True)
chr_state.to_csv(out_csv, index=False)
chr_state.head()

Unnamed: 0,state,statecode,countycode,fipscode,year,county_ranked,v001_rawvalue,v001_numerator,v001_denominator,v001_cilow,...,v154_rawvalue_norm,v067_rawvalue_norm,v137_rawvalue_norm,v167_rawvalue_norm,v169_rawvalue_norm,v036_rawvalue_norm,v042_rawvalue_norm,v144_rawvalue_norm,v145_rawvalue_norm,v147_rawvalue_norm
0,AK,2.0,143.658537,2143.658537,2020.082317,0.856574,10131.202337,669.036364,168503.0,7738.561492,...,0.123663,0.293068,0.0,0.41004,0.981134,0.504175,0.302878,0.543923,0.350022,0.573783
1,AL,1.0,66.014706,1066.014706,2020.0,1.0,11265.332867,2459.836898,401996.4,9879.954439,...,0.344497,1.0,0.760264,0.311239,0.163702,0.817712,0.830655,0.866412,0.817463,0.086276
2,AR,5.0,74.013158,5074.013158,2020.0,1.0,10825.418725,1286.784689,219864.6,9089.289642,...,0.252799,0.936918,0.5979,0.204428,0.269387,0.89607,0.816587,0.914394,0.808258,0.12809
3,AZ,4.0,13.0,4013.0,2020.0,1.0,9825.515272,10400.659091,2430718.0,9023.678484,...,0.395089,0.841564,0.475294,0.622887,0.345003,0.641306,0.575968,0.715868,0.562543,0.471731
4,CA,6.0,57.016949,6057.016949,2020.0,0.992337,7005.848958,12174.243411,3753684.0,6242.990473,...,0.984549,0.766161,0.640983,0.311636,0.454391,0.439382,0.452977,0.433447,0.331327,0.763474
