In [14]:
import pandas as pd
import numpy as np

# If using parquet:
df = pd.read_parquet("/Users/acsoteldo/Desktop/datasets/Data Project 12 Dec 2025/data/MASTER_dataset CLEANED.parquet")

# Drop geometry for non-spatial ML work (keep geoid/county_name)
if "geometry" in df.columns:
    df = df.drop(columns=["geometry"])

# Basic type cleanup
df["nfhl_in_floodplain"] = df["nfhl_in_floodplain"].astype("boolean")

num_cols = [c for c in df.columns if c not in ["geoid", "county_name", "eal_per_capita_risk_tier", "flood_hurricane_dominance"]]
for c in num_cols:
    df[c] = pd.to_numeric(df[c], errors="ignore")

df.shape, df.head(2)

  df[c] = pd.to_numeric(df[c], errors="ignore")


((1526, 83),
          geoid county_name  acs_population  acs_median_hh_income  \
 0  12011060303     Broward            6883                 38319   
 1  12086013600  Miami-Dade            5857                 44688   
 
    svi_overall_pctile  svi_ses_pctile  svi_household_comp_pctile  \
 0              0.9742          0.9430                     0.8325   
 1              0.8396          0.9022                     0.6404   
 
    svi_minority_lang_pctile  svi_housing_transport_pctile  svi_overall_score  \
 0                    0.9215                        0.9621            11.8481   
 1                    0.9805                        0.5734            10.0686   
 
    ...  nri_top3_eal_per_capita_usd  approx_tract_income_pool_usd  \
 0  ...                   134.396870                     263749677   
 1  ...                   165.360753                     261737616   
 
    nri_top3_eal_as_share_of_income_pool  priority_tract_flag  \
 0                              0.003507       

In [15]:
# Descriptive analysis
# County summaries
county_summary = df.groupby("county_name").agg(
    n_tracts=("geoid", "count"),
    median_income=("acs_median_hh_income", "median"),
    median_svi=("svi_overall_pctile", "median"),
    pct_floodplain=("nfhl_in_floodplain", lambda x: x.mean(skipna=True)),
    median_eal_pc=("nri_top3_eal_per_capita_usd", "median"),
).sort_values("median_eal_pc", ascending=False)

county_summary

Unnamed: 0_level_0,n_tracts,median_income,median_svi,pct_floodplain,median_eal_pc
county_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Monroe,29,82198.0,0.4669,0.965517,1116.212946
Palm Beach,373,75307.0,0.4398,0.560322,272.374933
Broward,417,70119.0,0.5982,0.973621,272.068364
Miami-Dade,707,65713.0,0.7132,0.845827,202.809497


In [16]:
# Correlation analysis (risk vs vulnerability vs income)
corr_cols = [
    "nri_top3_eal_per_capita_usd_capped",
    "svi_overall_pctile",
    "acs_median_hh_income",
    "acs_population",
    "flood_hurricane_share_of_total_eal"
]
corr = df[corr_cols].corr(method="spearman") # safer for skew + non-linear relationships
corr

Unnamed: 0,nri_top3_eal_per_capita_usd_capped,svi_overall_pctile,acs_median_hh_income,acs_population,flood_hurricane_share_of_total_eal
nri_top3_eal_per_capita_usd_capped,1.0,-0.505803,0.46949,-0.189798,0.787461
svi_overall_pctile,-0.505803,1.0,-0.832671,0.154533,-0.459187
acs_median_hh_income,0.46949,-0.832671,1.0,0.053536,0.387866
acs_population,-0.189798,0.154533,0.053536,1.0,-0.09153
flood_hurricane_share_of_total_eal,0.787461,-0.459187,0.387866,-0.09153,1.0


In [17]:
target = "nri_top3_eal_per_capita_usd_capped"
corr[target].sort_values(ascending=False)

nri_top3_eal_per_capita_usd_capped    1.000000
flood_hurricane_share_of_total_eal    0.787461
acs_median_hh_income                  0.469490
acs_population                       -0.189798
svi_overall_pctile                   -0.505803
Name: nri_top3_eal_per_capita_usd_capped, dtype: float64

In [18]:
# Regression analysis: quantify how SVI + income relates to per-capita loss.
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

model_df = df.dropna(subset=[
    "nri_top3_eal_per_capita_log",
    "svi_overall_pctile",
    "acs_median_hh_income"
]).copy()

X = model_df[["svi_overall_pctile", "acs_median_hh_income"]]
y = model_df["nri_top3_eal_per_capita_log"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=200
)

lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_test)

print("R2:", r2_score(y_test, y_pred))

coef_df = pd.DataFrame({
    "feature": X.columns,
    "coefficient": lr.coef_
})
coef_df

R2: 0.14686913990990058


Unnamed: 0,feature,coefficient
0,svi_overall_pctile,-0.9951279
1,acs_median_hh_income,-5.097237e-10


In [19]:
# Feature importance analysis
# Use a model to predict capped per-capita EAL, then interpret.
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import permutation_importance

features = [
    "svi_overall_pctile",
    "acs_median_hh_income",
    "acs_population",
    "svi_unemployed_pct",
    "svi_no_vehicle_pct",
    "svi_housing_cost_burden_pct",
    "svi_limited_english_pct",
    "svi_minority_pct",
]
features = [c for c in features if c in df.columns]

work = df.dropna(subset=features + ["nri_top3_eal_per_capita_usd_capped"]).copy()

X = work[features]
y = work["nri_top3_eal_per_capita_usd_capped"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=200)

rf = RandomForestRegressor(n_estimators=400, random_state=200, n_jobs=-1)
rf.fit(X_train, y_train)

perm = permutation_importance(rf, X_test, y_test, n_repeats=10, random_state=200)
imp = pd.Series(perm.importances_mean, index=features).sort_values(ascending=False)
imp


svi_minority_pct               0.269286
acs_median_hh_income           0.267928
svi_housing_cost_burden_pct    0.227612
svi_overall_pctile             0.173580
acs_population                 0.127403
svi_no_vehicle_pct             0.018347
svi_unemployed_pct             0.014740
svi_limited_english_pct        0.009459
dtype: float64

In [20]:
# Segmentation analysis (cluster tracts into “risk profiles”)
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

seg_features = [
    "svi_overall_pctile",
    "acs_median_hh_income",
    "nri_top3_eal_per_capita_usd_capped",
    "flood_hurricane_share_of_total_eal",
]
seg_features = [c for c in seg_features if c in df.columns]

seg = df.dropna(subset=seg_features).copy()
X = StandardScaler().fit_transform(seg[seg_features])

kmeans = KMeans(n_clusters=5, random_state=200, n_init="auto")
seg["cluster"] = kmeans.fit_predict(X)

# Cluster profiles
cluster_profile = seg.groupby("cluster")[seg_features].median().sort_values("nri_top3_eal_per_capita_usd_capped", ascending=False)
cluster_profile

Unnamed: 0_level_0,svi_overall_pctile,acs_median_hh_income,nri_top3_eal_per_capita_usd_capped,flood_hurricane_share_of_total_eal
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.398,95086.0,1143.382546,0.966135
2,0.2566,103750.0,352.815864,0.917946
3,0.7421,60771.5,250.375168,0.891212
4,0.727,-666666666.0,199.525325,0.822551
1,0.8292,55859.0,156.308791,0.826729


In [21]:
# Geospatial / Spatial analysis
import geopandas as gpd

gdf = gpd.read_file("/Users/acsoteldo/Desktop/datasets/Data Project 12 Dec 2025/data/MASTER_dataset CLEANED.geojson")
gdf["nri_top3_eal_per_capita_log"] = pd.to_numeric(gdf["nri_top3_eal_per_capita_log"], errors="coerce")

# Example: subset priority tracts
priority = gdf[gdf["priority_tract_flag"] == True]
len(priority)

31