In [1]:
import pandas as pd

# Step 1a: Load the core metro master table from disk.
master = pd.read_csv("metro_master_dataset.csv")

# Step 1b: Report dimensions as (rows, columns), then the column labels.
print("Shape:", master.shape)
print("\nColumns:\n", list(master.columns))

# Optional: preview the first five records.
print("\nSample rows:", master.head(), sep="\n")


Shape: (377, 35)

Columns:
 ['CBSA', 'CBSA_NAME', 'growth_1yr_pct', 'growth_3yr_pct', 'growth_5yr_pct', 'CAGR_pct', 'vol_full', 'vol_12m', 'mom_12m_pct', 'mom_3m_pct', 'inflow_returns', 'inflow_population', 'outflow_returns', 'outflow_population', 'net_returns', 'net_population', 'inflow_per_1k', 'outflow_per_1k', 'net_per_1k', 'nri_population', 'RISK_SCORE', 'EAL_SCORE', 'SOVI_SCORE', 'RESL_SCORE', 'acs_total_pop', 'acs_median_age', 'pct_white', 'pct_black', 'pct_hispanic', 'pct_asian', 'pct_foreign_born', 'acs_total_pop_s', 'acs_median_age_s', 'median_household_income', 'mean_household_income']

Sample rows:
    CBSA                    CBSA_NAME  growth_1yr_pct  growth_3yr_pct  \
0  10180                  Abilene, TX        0.338824        3.540083   
1  10420                    Akron, OH        3.890175       15.278193   
2  10500                   Albany, GA       -2.891997        7.252488   
3  10540                   Albany, OR        1.080352        3.961822   
4  10580  Albany-

In [2]:
# Load the cleaned metro unemployment-rate workbook.
unemp = pd.read_excel("metro-annual-unemployment-rates_Cleaned.xlsx")

# Report dimensions and column labels.
print("Shape:", unemp.shape)
print("\nColumns:", list(unemp.columns))

# Preview the first five records.
print("\nSample rows:", unemp.head(), sep="\n")


Shape: (387, 3)

Columns: ['CBSA', 'CBSA_NAME', 'Unemployment_rate_2024']

Sample rows:
    CBSA                    CBSA_NAME  Unemployment_rate_2024
0  10180                  Abilene, TX                     3.5
1  10420                    Akron, OH                     4.4
2  10500                   Albany, GA                     4.2
3  10540                   Albany, OR                     4.6
4  10580  Albany-Schenectady-Troy, NY                     3.3


In [3]:
# Load the cleaned metro GDP workbook.
gdp = pd.read_excel("GDP_Metros_Cleaned.xlsx")

# Report dimensions and column labels.
print("Shape:", gdp.shape)
print("\nColumns:", list(gdp.columns))

# Preview the first five records.
print("\nSample rows:", gdp.head(), sep="\n")


Shape: (384, 3)

Columns: ['CBSA', 'CBSA_NAME', 'GDP_2023']

Sample rows:
    CBSA                     CBSA_NAME  GDP_2023
0  10180                  Abilene, TX   10306623
1  10420                    Akron, OH   47285487
2  10500                   Albany, GA    7782093
3  10540           Albany-Lebanon, OR    6441520
4  10580  Albany-Schenectady-Troy, NY   84910398


In [4]:
# Load the cleaned metro personal-income workbook.
income = pd.read_excel("Income_per_person_metros_Cleaned.xlsx")

# Report dimensions and column labels.
print("Shape:", income.shape)
print("\nColumns:", list(income.columns))

# Preview the first five records.
print("\nSample rows:", income.head(), sep="\n")


Shape: (384, 5)

Columns: ['CBSA', 'CBSA_NAME', 'Per capita personal income (dollars) ', 'Personal income (thousands of dollars)', 'Population ']

Sample rows:
    CBSA                     CBSA_NAME  Per capita personal income (dollars)   \
0  10180                  Abilene, TX                                   56034   
1  10420                    Akron, OH                                   62904   
2  10500                   Albany, GA                                   48546   
3  10540           Albany-Lebanon, OR                                   57016   
4  10580  Albany-Schenectady-Troy, NY                                   71972   

   Personal income (thousands of dollars)  Population   
0                                10175211       181591  
1                                43932157       698398  
2                                 7063798       145508  
3                                 7497375       131496  
4                                65111784       904682  


In [5]:
# Load the cleaned metro RPP workbook.
rpp = pd.read_excel("RPP_Metro_Cleaned.xlsx")

# Report dimensions and column labels.
print("Shape:", rpp.shape)
print("\nColumns:", list(rpp.columns))

# Preview the first five records.
print("\nSample rows:", rpp.head(), sep="\n")


Shape: (384, 3)

Columns: ['CBSA', 'CBSA_NAME', 'RPP_2023']

Sample rows:
    CBSA                    CBSA_NAME  RPP_2023
0  10180                  Abilene, TX    89.849
1  10420                    Akron, OH    92.827
2  10500                   Albany, GA    86.405
3  10540           Albany-Lebanon, OR   104.609
4  10580  Albany-Schenectady-Troy, NY    97.583


In [6]:
# Function to clean CBSA + CBSA_NAME
def clean_keys(df):
    """Return a copy of ``df`` whose join keys are whitespace-stripped strings.

    The ``CBSA`` and ``CBSA_NAME`` columns are cast to ``str`` and stripped of
    leading/trailing whitespace so that tables loaded from different files
    compare equal on those keys.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing ``CBSA`` and ``CBSA_NAME`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame; the frame passed in is left unmodified, so re-running a
        cell never changes a table that an earlier cell already displayed.

    Raises
    ------
    KeyError
        If either key column is absent.

    Notes
    -----
    ``astype(str)`` renders missing values as the literal string ``"nan"``.
    """
    cleaned = df.copy()
    for key in ("CBSA", "CBSA_NAME"):
        cleaned[key] = cleaned[key].astype(str).str.strip()
    return cleaned

# Standardise the join keys on every loaded table, in load order.
master, unemp, gdp, income, rpp = (
    clean_keys(frame) for frame in (master, unemp, gdp, income, rpp)
)

print("Step 6 done — keys standardized")


Step 6 done — keys standardized


In [7]:
# Step 7: Compare CBSA coverage across the five tables.

# Distinct key count per table (dropna=False keeps a missing key in the count).
print("Master:", master["CBSA"].nunique(dropna=False))
print("Unemp :", unemp["CBSA"].nunique(dropna=False))
print("GDP   :", gdp["CBSA"].nunique(dropna=False))
print("Income:", income["CBSA"].nunique(dropna=False))
print("RPP   :", rpp["CBSA"].nunique(dropna=False))

# Codes the unemployment table has that master lacks.
missing_in_master = set(unemp["CBSA"]) - set(master["CBSA"])
print("\nCBSA values in unemployment but NOT in master:", missing_in_master)

# Codes master has that the unemployment table lacks.
missing_in_unemp = set(master["CBSA"]) - set(unemp["CBSA"])
print("\nCBSA values in master but NOT in unemployment:", missing_in_unemp)


Master: 377
Unemp : 387
GDP   : 384
Income: 384
RPP   : 384

CBSA values in unemployment but NOT in master: {'28880', '39150', '28450', '47930', '48680', '11200', '19430', '31140', '43640', '30500'}

CBSA values in master but NOT in unemployment: set()


In [8]:
# Step 8: Left-join unemployment onto master, matching on both code and name.
master_merged = pd.merge(
    master,
    unemp,
    how="left",
    on=["CBSA", "CBSA_NAME"],
)

print("Shape:", master_merged.shape)
print(master_merged.loc[:, ["CBSA", "CBSA_NAME", "Unemployment_rate_2024"]].head())
# Rows of master that found no unemployment match.
print(
    "\nMissing unemployment values:",
    master_merged["Unemployment_rate_2024"].isna().sum(),
)


Shape: (377, 36)
    CBSA                    CBSA_NAME  Unemployment_rate_2024
0  10180                  Abilene, TX                     3.5
1  10420                    Akron, OH                     4.4
2  10500                   Albany, GA                     4.2
3  10540                   Albany, OR                     4.6
4  10580  Albany-Schenectady-Troy, NY                     3.3

Missing unemployment values: 1


In [9]:
# Isolate the rows whose unemployment rate is still missing after the join.
missing_unemp = master_merged.loc[master_merged["Unemployment_rate_2024"].isna()]
missing_unemp


Unnamed: 0,CBSA,CBSA_NAME,growth_1yr_pct,growth_3yr_pct,growth_5yr_pct,CAGR_pct,vol_full,vol_12m,mom_12m_pct,mom_3m_pct,...,pct_white,pct_black,pct_hispanic,pct_asian,pct_foreign_born,acs_total_pop_s,acs_median_age_s,median_household_income,mean_household_income,Unemployment_rate_2024
295,41180,"St. Louis, MO-IL",2.283003,10.37558,36.865131,3.32863,38470.23173,1508.572838,2.022109,0.537223,...,72.1,0,N,N,1.2,2809527,19.1,81679,107369,


In [10]:
# Step 9: Redo the unemployment join from master, keyed on CBSA alone.
master_merged = pd.merge(
    master,
    unemp.drop(columns="CBSA_NAME"),   # keep only master's CBSA_NAME column
    how="left",
    on="CBSA",
)

print("Shape:", master_merged.shape)
print(
    "\nMissing unemployment values:",
    master_merged["Unemployment_rate_2024"].isna().sum(),
)

master_merged.head()


Shape: (377, 36)

Missing unemployment values: 0


Unnamed: 0,CBSA,CBSA_NAME,growth_1yr_pct,growth_3yr_pct,growth_5yr_pct,CAGR_pct,vol_full,vol_12m,mom_12m_pct,mom_3m_pct,...,pct_white,pct_black,pct_hispanic,pct_asian,pct_foreign_born,acs_total_pop_s,acs_median_age_s,median_household_income,mean_household_income,Unemployment_rate_2024
0,10180,"Abilene, TX",0.338824,3.540083,29.371194,,33098.22631,382.035294,0.443692,0.363973,...,66.7,0.0,N,N,1.8,181969,16.0,63390,87293,3.5
1,10420,"Akron, OH",3.890175,15.278193,43.416794,2.95466,33137.09487,2653.070516,3.676926,0.63848,...,76.0,0.0,N,N,1.2,702209,20.0,71364,92087,4.4
2,10500,"Albany, GA",-2.891997,7.252488,43.69831,2.363436,20913.12624,1680.055703,-2.655553,-0.397589,...,38.6,0.1,N,N,N,146051,18.4,56851,67526,4.2
3,10540,"Albany, OR",1.080352,3.961822,40.293406,,93203.21622,1398.753407,0.838018,0.091698,...,82.7,0.0,N,N,N,132474,20.2,80422,92282,4.6
4,10580,"Albany-Schenectady-Troy, NY",3.668004,14.66085,42.072721,4.360028,57818.59124,3704.534956,3.358848,0.788891,...,75.3,0.1,N,N,2.6,913485,19.6,86637,116748,3.3


In [11]:
# Step 10: Left-join GDP onto the running table, keyed on CBSA alone.
master_merged = pd.merge(
    master_merged,
    gdp.drop(columns="CBSA_NAME"),   # keep only the existing CBSA_NAME column
    how="left",
    on="CBSA",
)

print("Shape:", master_merged.shape)
print("\nMissing GDP values:", master_merged["GDP_2023"].isna().sum())

master_merged.loc[:, ["CBSA", "CBSA_NAME", "GDP_2023"]].head()


Shape: (377, 37)

Missing GDP values: 9


Unnamed: 0,CBSA,CBSA_NAME,GDP_2023
0,10180,"Abilene, TX",10306623.0
1,10420,"Akron, OH",47285487.0
2,10500,"Albany, GA",7782093.0
3,10540,"Albany, OR",6441520.0
4,10580,"Albany-Schenectady-Troy, NY",84910398.0


In [12]:
# Identify the metros (code and name) that have no GDP value after the join.
missing_gdp = master_merged.loc[
    master_merged["GDP_2023"].isna(),
    ["CBSA", "CBSA_NAME"],
]
missing_gdp


Unnamed: 0,CBSA,CBSA_NAME
44,14580,"Bozeman, MT"
70,17410,"Cleveland, OH"
98,20580,"Eagle Pass, TX"
148,25740,"Helena, MT"
225,33500,"Minot, ND"
257,37140,"Paducah, KY-IL"
265,38240,"Pinehurst-Southern Pines, NC"
303,41780,"Sandusky, OH"
340,45900,"Traverse City, MI"


In [13]:
# CBSA codes present in the master table that have no row in the GDP table.
missing_in_gdp = set(master["CBSA"]) - set(gdp["CBSA"])
print("CBSA values in master but NOT in GDP:", missing_in_gdp)
# Number of such codes.
print("Count:", len(missing_in_gdp))


CBSA values in master but NOT in GDP: {'41780', '20580', '45900', '14580', '37140', '25740', '38240', '33500', '17410'}
Count: 9


In [14]:
# Step 11: Left-join the income table onto the running table, keyed on CBSA.
master_merged = pd.merge(
    master_merged,
    income.drop(columns="CBSA_NAME"),   # keep only the existing CBSA_NAME column
    how="left",
    on="CBSA",
)

print("Shape:", master_merged.shape)
# The source column label carries a trailing space; it is used verbatim.
print(
    "\nMissing per-capita income:",
    master_merged["Per capita personal income (dollars) "].isna().sum(),
)

master_merged.loc[:, ["CBSA", "CBSA_NAME", "Per capita personal income (dollars) "]].head()


Shape: (377, 40)

Missing per-capita income: 9


Unnamed: 0,CBSA,CBSA_NAME,Per capita personal income (dollars)
0,10180,"Abilene, TX",56034.0
1,10420,"Akron, OH",62904.0
2,10500,"Albany, GA",48546.0
3,10540,"Albany, OR",57016.0
4,10580,"Albany-Schenectady-Troy, NY",71972.0


In [15]:
# Step 12: Left-join RPP onto the running table, keyed on CBSA alone.
master_merged = pd.merge(
    master_merged,
    rpp.drop(columns="CBSA_NAME"),   # keep only the existing CBSA_NAME column
    how="left",
    on="CBSA",
)

print("Shape:", master_merged.shape)
print("\nMissing RPP values:", master_merged["RPP_2023"].isna().sum())

master_merged.loc[:, ["CBSA", "CBSA_NAME", "RPP_2023"]].head()


Shape: (377, 41)

Missing RPP values: 9


Unnamed: 0,CBSA,CBSA_NAME,RPP_2023
0,10180,"Abilene, TX",89.849
1,10420,"Akron, OH",92.827
2,10500,"Albany, GA",86.405
3,10540,"Albany, OR",104.609
4,10580,"Albany-Schenectady-Troy, NY",97.583


In [16]:
# Overall dimensions of the combined table.
print("Total rows:", len(master_merged))
print("Total columns:", len(master_merged.columns))

# Rank every column by its missing-value count and show the twelve highest.
(
    master_merged
    .isna()
    .sum()
    .sort_values(ascending=False)
    .head(12)
)


Total rows: 377
Total columns: 41


CAGR_pct                                  135
GDP_2023                                    9
Personal income (thousands of dollars)      9
Population                                  9
Per capita personal income (dollars)        9
RPP_2023                                    9
SOVI_SCORE                                  4
EAL_SCORE                                   4
nri_population                              4
RISK_SCORE                                  4
RESL_SCORE                                  4
growth_5yr_pct                              1
dtype: int64

In [17]:
# Step 13A — Save the final combined dataset
# The path is held in one place so the confirmation message always names the
# file that was actually written (previously it reported a different name).
FINAL_DATASET_PATH = "FinalMasterData.csv"
master_merged.to_csv(FINAL_DATASET_PATH, index=False)

print(f"Saved as {FINAL_DATASET_PATH}")


Saved as final_metro_master_dataset.csv


In [18]:
# STEP 15A — Define feature lists

# Raw economic indicator columns.
# NOTE: two labels from the income workbook carry a trailing space
# ("Per capita personal income (dollars) " and "Population "); they must be
# spelled exactly as loaded or indexing with this list raises KeyError.
economic_cols = [
    "Unemployment_rate_2024",
    "GDP_2023",
    "Per capita personal income (dollars) ",
    "Personal income (thousands of dollars)",
    "Population ",
    "median_household_income",
    "mean_household_income"
]

# Growth and momentum columns.
growth_cols = [
    "growth_1yr_pct", "growth_3yr_pct", "growth_5yr_pct",
    "CAGR_pct",
    "mom_12m_pct", "mom_3m_pct",
]

# Volatility columns.
volatility_cols = ["vol_full", "vol_12m"]

# Migration columns: inflow, outflow and net, as counts and per-1k rates.
migration_cols = [
    "inflow_returns", "inflow_population",
    "outflow_returns", "outflow_population",
    "net_returns", "net_population",
    "inflow_per_1k", "outflow_per_1k", "net_per_1k",
]

# Risk score columns.
risk_cols = ["RISK_SCORE", "EAL_SCORE", "SOVI_SCORE"]

# Resilience score column.
resilience_cols = ["RESL_SCORE"]

# Affordability column.
affordability_cols = ["RPP_2023"]


In [19]:
# STEP 16 — Normalization helpers

def normalize(series):
    """Min-max scale ``series`` onto the [0, 1] interval.

    Parameters
    ----------
    series : pandas.Series
        Numeric values to scale.

    Returns
    -------
    pandas.Series
        ``(series - min) / (max - min)``. Missing entries stay missing, and a
        series with no valid values comes back all-NaN.

    Notes
    -----
    When every valid value is identical (including a single-element series)
    the range is zero; rather than dividing by zero and returning NaN for
    every row, such a series maps to 0.0.
    """
    lowest = series.min()
    shifted = series - lowest
    spread = series.max() - lowest
    if spread == 0:
        # Constant input: every valid entry of ``shifted`` is already zero.
        return shifted.astype(float)
    return shifted / spread

def normalize_inverted(series):
    """Inverted min-max scaling: the smallest value maps to 1, the largest to 0.

    Parameters
    ----------
    series : pandas.Series
        Numeric values where lower is better.

    Returns
    -------
    pandas.Series
        ``1 - (series - min) / (max - min)``. Missing entries stay missing,
        and a series with no valid values comes back all-NaN.

    Notes
    -----
    When every valid value is identical (including a single-element series)
    the range is zero; rather than dividing by zero and returning NaN for
    every row, such a series maps to 1.0 (one minus a scaled value of zero).
    """
    lowest = series.min()
    shifted = series - lowest
    spread = series.max() - lowest
    if spread == 0:
        # Constant input: every valid entry of ``shifted`` is already zero.
        return 1 - shifted.astype(float)
    return 1 - shifted / spread


In [20]:
# STEP 17A — Unemployment: a lower rate is better, so use the inverted scaler.

master_merged["Unemployment_rate_2024_norm"] = normalize_inverted(master_merged["Unemployment_rate_2024"])

# Raw and scaled values side by side for the first few metros.
print(
    master_merged.loc[
        :,
        ["CBSA", "CBSA_NAME", "Unemployment_rate_2024", "Unemployment_rate_2024_norm"],
    ].head()
)
# Range check on the scaled column.
print(
    "\nMin/Max normalized unemployment:",
    master_merged["Unemployment_rate_2024_norm"].min(),
    master_merged["Unemployment_rate_2024_norm"].max(),
)


    CBSA                    CBSA_NAME  Unemployment_rate_2024  \
0  10180                  Abilene, TX                     3.5   
1  10420                    Akron, OH                     4.4   
2  10500                   Albany, GA                     4.2   
3  10540                   Albany, OR                     4.6   
4  10580  Albany-Schenectady-Troy, NY                     3.3   

   Unemployment_rate_2024_norm  
0                     0.892216  
1                     0.838323  
2                     0.850299  
3                     0.826347  
4                     0.904192  

Min/Max normalized unemployment: 0.0 1.0


In [21]:
# STEP 17B — GDP: higher is better, so use the standard scaler.

master_merged["GDP_2023_norm"] = normalize(master_merged["GDP_2023"])

# Raw and scaled values side by side for the first ten metros.
print(master_merged.loc[:, ["CBSA", "CBSA_NAME", "GDP_2023", "GDP_2023_norm"]].head(10))

# Missing-value count and range check on the scaled column.
print("\nMissing GDP normalized:", master_merged["GDP_2023_norm"].isna().sum())
print(
    "Min/Max normalized GDP:",
    master_merged["GDP_2023_norm"].min(),
    master_merged["GDP_2023_norm"].max(),
)


    CBSA                          CBSA_NAME    GDP_2023  GDP_2023_norm
0  10180                        Abilene, TX  10306623.0       0.002929
1  10420                          Akron, OH  47285487.0       0.019040
2  10500                         Albany, GA   7782093.0       0.001829
3  10540                         Albany, OR   6441520.0       0.001245
4  10580        Albany-Schenectady-Troy, NY  84910398.0       0.035432
5  10740                    Albuquerque, NM  59382941.0       0.024311
6  10780                     Alexandria, LA   7753574.0       0.001817
7  10900  Allentown-Bethlehem-Easton, PA-NJ  59890990.0       0.024532
8  11020                        Altoona, PA   7681160.0       0.001785
9  11100                       Amarillo, TX  18913682.0       0.006679

Missing GDP normalized: 9
Min/Max normalized GDP: 0.0 1.0


In [22]:
# STEP 17C — Income variables: higher is better, so use the standard scaler.

# (source column, scaled column) pairs; two source labels carry a trailing space.
for raw_col, norm_col in [
    ("Per capita personal income (dollars) ", "PerCapitaIncome_norm"),
    ("Personal income (thousands of dollars)", "PersonalIncome_norm"),
    ("Population ", "Population_norm"),
]:
    master_merged[norm_col] = normalize(master_merged[raw_col])

# Raw and scaled values side by side for the first ten metros.
print(
    master_merged.loc[
        :,
        [
            "CBSA", "CBSA_NAME",
            "Per capita personal income (dollars) ", "PerCapitaIncome_norm",
            "Personal income (thousands of dollars)", "PersonalIncome_norm",
            "Population ", "Population_norm",
        ],
    ].head(10)
)

# Missing-value counts for each scaled column.
print("\nMissing Per Capita Income:", master_merged["PerCapitaIncome_norm"].isna().sum())
print("Missing Personal Income:", master_merged["PersonalIncome_norm"].isna().sum())
print("Missing Population:", master_merged["Population_norm"].isna().sum())


    CBSA                          CBSA_NAME  \
0  10180                        Abilene, TX   
1  10420                          Akron, OH   
2  10500                         Albany, GA   
3  10540                         Albany, OR   
4  10580        Albany-Schenectady-Troy, NY   
5  10740                    Albuquerque, NM   
6  10780                     Alexandria, LA   
7  10900  Allentown-Bethlehem-Easton, PA-NJ   
8  11020                        Altoona, PA   
9  11100                       Amarillo, TX   

   Per capita personal income (dollars)   PerCapitaIncome_norm  \
0                                56034.0              0.190572   
1                                62904.0              0.251014   
2                                48546.0              0.124693   
3                                57016.0              0.199212   
4                                71972.0              0.330794   
5                                57278.0              0.201517   
6                   

In [23]:
# STEP 17D — Household income: higher is better, so use the standard scaler.

for raw_col, norm_col in [
    ("median_household_income", "MedianHHIncome_norm"),
    ("mean_household_income", "MeanHHIncome_norm"),
]:
    master_merged[norm_col] = normalize(master_merged[raw_col])

# Raw and scaled values side by side for the first ten metros.
print(
    master_merged.loc[
        :,
        [
            "CBSA", "CBSA_NAME",
            "median_household_income", "MedianHHIncome_norm",
            "mean_household_income", "MeanHHIncome_norm",
        ],
    ].head(10)
)

# Missing-value counts.
print("\nMissing MedianHHIncome_norm:", master_merged["MedianHHIncome_norm"].isna().sum())
print("Missing MeanHHIncome_norm:", master_merged["MeanHHIncome_norm"].isna().sum())

# Range checks.
print(
    "\nMin/Max MedianHHIncome_norm:",
    master_merged["MedianHHIncome_norm"].min(),
    master_merged["MedianHHIncome_norm"].max(),
)

print(
    "Min/Max MeanHHIncome_norm:",
    master_merged["MeanHHIncome_norm"].min(),
    master_merged["MeanHHIncome_norm"].max(),
)


    CBSA                          CBSA_NAME  median_household_income  \
0  10180                        Abilene, TX                    63390   
1  10420                          Akron, OH                    71364   
2  10500                         Albany, GA                    56851   
3  10540                         Albany, OR                    80422   
4  10580        Albany-Schenectady-Troy, NY                    86637   
5  10740                    Albuquerque, NM                    76097   
6  10780                     Alexandria, LA                    55909   
7  10900  Allentown-Bethlehem-Easton, PA-NJ                    83974   
8  11020                        Altoona, PA                    66994   
9  11100                       Amarillo, TX                    69595   

   MedianHHIncome_norm  mean_household_income  MeanHHIncome_norm  
0             0.106306                  87293           0.235150  
1             0.176578                  92087           0.270670  
2     

In [24]:
# STEP 18A — Growth and momentum: higher is better, so use the standard scaler.

# Maps each raw column to the name of its scaled counterpart.
growth_features = {
    "growth_1yr_pct": "growth_1yr_norm",
    "growth_3yr_pct": "growth_3yr_norm",
    "growth_5yr_pct": "growth_5yr_norm",
    "CAGR_pct": "CAGR_norm",
    "mom_12m_pct": "mom12_norm",
    "mom_3m_pct": "mom3_norm",
}

for col, new_col in growth_features.items():
    master_merged[new_col] = normalize(master_merged[col])

# First ten rows of the scaled columns, in mapping order.
print(master_merged[["CBSA", "CBSA_NAME", *growth_features.values()]].head(10))

# Missing-value count for each scaled column.
for col in growth_features.values():
    print(col, "missing:", master_merged[col].isna().sum())


    CBSA                          CBSA_NAME  growth_1yr_norm  growth_3yr_norm  \
0  10180                        Abilene, TX         0.599264         0.507499   
1  10420                          Akron, OH         0.764010         0.749483   
2  10500                         Albany, GA         0.449387         0.584031   
3  10540                         Albany, OR         0.633663         0.516194   
4  10580        Albany-Schenectady-Troy, NY         0.753703         0.736756   
5  10740                    Albuquerque, NM         0.662677         0.636421   
6  10780                     Alexandria, LA         0.504372         0.373509   
7  10900  Allentown-Bethlehem-Easton, PA-NJ         0.735541         0.743048   
8  11020                        Altoona, PA         0.863283         0.619753   
9  11100                       Amarillo, TX         0.609186         0.543826   

   growth_5yr_norm  CAGR_norm  mom12_norm  mom3_norm  
0         0.526504        NaN    0.604857   0.633354 

In [25]:
# STEP 18B — Volatility: lower is better, so use the inverted scaler.

for raw_col, norm_col in [
    ("vol_full", "vol_full_norm"),
    ("vol_12m", "vol12_norm"),
]:
    master_merged[norm_col] = normalize_inverted(master_merged[raw_col])

# Raw and scaled values side by side for the first ten metros.
print(
    master_merged.loc[
        :,
        [
            "CBSA", "CBSA_NAME",
            "vol_full", "vol_full_norm",
            "vol_12m", "vol12_norm",
        ],
    ].head(10)
)

# Missing-value counts.
print("\nMissing vol_full_norm:", master_merged["vol_full_norm"].isna().sum())
print("Missing vol12_norm:", master_merged["vol12_norm"].isna().sum())

# Range checks.
print(
    "\nMin/Max vol_full_norm:",
    master_merged["vol_full_norm"].min(),
    master_merged["vol_full_norm"].max(),
)

print(
    "Min/Max vol12_norm:",
    master_merged["vol12_norm"].min(),
    master_merged["vol12_norm"].max(),
)


    CBSA                          CBSA_NAME     vol_full  vol_full_norm  \
0  10180                        Abilene, TX  33098.22631       0.933732   
1  10420                          Akron, OH  33137.09487       0.933623   
2  10500                         Albany, GA  20913.12624       0.967877   
3  10540                         Albany, OR  93203.21622       0.765307   
4  10580        Albany-Schenectady-Troy, NY  57818.59124       0.864461   
5  10740                    Albuquerque, NM  58399.99907       0.862832   
6  10780                     Alexandria, LA  27947.53632       0.948165   
7  10900  Allentown-Bethlehem-Easton, PA-NJ  56953.88091       0.866884   
8  11020                        Altoona, PA  22186.51616       0.964308   
9  11100                       Amarillo, TX  32793.35486       0.934586   

       vol_12m  vol12_norm  
0   382.035294    0.998148  
1  2653.070516    0.920374  
2  1680.055703    0.953696  
3  1398.753407    0.963330  
4  3704.534956    0.884365  


In [26]:
# STEP 19A — Inflow and net migration: higher is better, so use the standard scaler.

# Maps each raw column to the name of its scaled counterpart.
positive_migration = {
    "inflow_returns": "inflow_returns_norm",
    "inflow_population": "inflow_pop_norm",
    "net_returns": "net_returns_norm",
    "net_population": "net_pop_norm",
    "inflow_per_1k": "inflow_per_1k_norm",
    "net_per_1k": "net_per_1k_norm",
}

for col, new_col in positive_migration.items():
    master_merged[new_col] = normalize(master_merged[col])

# First ten rows of the scaled columns, in mapping order.
print(master_merged[["CBSA", "CBSA_NAME", *positive_migration.values()]].head(10))

# Missing-value count for each scaled column.
print("\nMissing counts (positive migration):")
for col in positive_migration.values():
    print(col, ":", master_merged[col].isna().sum())


    CBSA                          CBSA_NAME  inflow_returns_norm  \
0  10180                        Abilene, TX             0.006086   
1  10420                          Akron, OH             0.033744   
2  10500                         Albany, GA             0.004139   
3  10540                         Albany, OR             0.004145   
4  10580        Albany-Schenectady-Troy, NY             0.045757   
5  10740                    Albuquerque, NM             0.043825   
6  10780                     Alexandria, LA             0.003721   
7  10900  Allentown-Bethlehem-Easton, PA-NJ             0.043577   
8  11020                        Altoona, PA             0.003296   
9  11100                       Amarillo, TX             0.010937   

   inflow_pop_norm  net_returns_norm  net_pop_norm  inflow_per_1k_norm  \
0         0.006536          0.782091      0.793062            0.431459   
1         0.033201          0.778537      0.791084            0.758916   
2         0.004128          0

In [27]:
# STEP 19B — Outflow migration: lower is better, so use the inverted scaler.

# Maps each raw column to the name of its scaled counterpart.
negative_migration = {
    "outflow_returns": "outflow_returns_norm",
    "outflow_population": "outflow_pop_norm",
    "outflow_per_1k": "outflow_per_1k_norm",
}

for col, new_col in negative_migration.items():
    master_merged[new_col] = normalize_inverted(master_merged[col])

# Each raw column followed by its scaled counterpart, first ten rows.
print(
    master_merged[
        ["CBSA", "CBSA_NAME"]
        + [name for pair in negative_migration.items() for name in pair]
    ].head(10)
)

# Missing-value count for each scaled column.
print("\nMissing counts (negative migration):")
for col in negative_migration.values():
    print(col, ":", master_merged[col].isna().sum())


    CBSA                          CBSA_NAME  outflow_returns  \
0  10180                        Abilene, TX            81970   
1  10420                          Akron, OH           353435   
2  10500                         Albany, GA            65646   
3  10540                         Albany, OR            62767   
4  10580        Albany-Schenectady-Troy, NY           474183   
5  10740                    Albuquerque, NM           449420   
6  10780                     Alexandria, LA            63692   
7  10900  Allentown-Bethlehem-Easton, PA-NJ           449840   
8  11020                        Altoona, PA            57286   
9  11100                       Amarillo, TX           131352   

   outflow_returns_norm  outflow_population  outflow_pop_norm  outflow_per_1k  \
0              0.994469              167963          0.994078      488.024148   
1              0.967878              644589          0.968696      548.310629   
2              0.996067              131167         

In [28]:
# STEP 20A — Risk scores: lower is better, so use the inverted scaler.

# Maps each raw risk column to the name of its scaled counterpart.
risk_features = {
    "RISK_SCORE": "RISK_SCORE_norm",
    "EAL_SCORE": "EAL_SCORE_norm",
    "SOVI_SCORE": "SOVI_SCORE_norm",
}

for col, new_col in risk_features.items():
    master_merged[new_col] = normalize_inverted(master_merged[col])


# STEP 20B — Resilience score: higher is better, so use the standard scaler.

master_merged["RESL_SCORE_norm"] = normalize(master_merged["RESL_SCORE"])


# Each raw column followed by its scaled counterpart, first ten rows.
print(
    master_merged[
        ["CBSA", "CBSA_NAME"]
        + [name for pair in risk_features.items() for name in pair]
        + ["RESL_SCORE", "RESL_SCORE_norm"]
    ].head(10)
)

# Missing-value count for each scaled column.
print("\nMissing counts:")
print("RISK_SCORE_norm:", master_merged["RISK_SCORE_norm"].isna().sum())
print("EAL_SCORE_norm:", master_merged["EAL_SCORE_norm"].isna().sum())
print("SOVI_SCORE_norm:", master_merged["SOVI_SCORE_norm"].isna().sum())
print("RESL_SCORE_norm:", master_merged["RESL_SCORE_norm"].isna().sum())


    CBSA                          CBSA_NAME  RISK_SCORE  RISK_SCORE_norm  \
0  10180                        Abilene, TX   75.678767         0.283181   
1  10420                          Akron, OH   77.972278         0.256417   
2  10500                         Albany, GA   65.136687         0.406198   
3  10540                         Albany, OR   91.886732         0.094048   
4  10580        Albany-Schenectady-Troy, NY   67.552093         0.378012   
5  10740                    Albuquerque, NM   85.480028         0.168808   
6  10780                     Alexandria, LA   76.148483         0.277699   
7  10900  Allentown-Bethlehem-Easton, PA-NJ   78.526592         0.249949   
8  11020                        Altoona, PA   52.433980         0.554427   
9  11100                       Amarillo, TX   88.730851         0.130874   

   EAL_SCORE  EAL_SCORE_norm  SOVI_SCORE  SOVI_SCORE_norm  RESL_SCORE  \
0  74.318054        0.296311   71.660296         0.298505   56.588647   
1  78.957047     

In [29]:
# STEP 21 — Affordability: a lower RPP is better, so use the inverted scaler.

master_merged["RPP_norm"] = normalize_inverted(master_merged["RPP_2023"])

# Raw and scaled values side by side for the first ten metros.
print(master_merged.loc[:, ["CBSA", "CBSA_NAME", "RPP_2023", "RPP_norm"]].head(10))

# Missing-value count and range check on the scaled column.
print("\nMissing RPP_norm:", master_merged["RPP_norm"].isna().sum())

print(
    "Min/Max RPP_norm:",
    master_merged["RPP_norm"].min(),
    master_merged["RPP_norm"].max(),
)


    CBSA                          CBSA_NAME  RPP_2023  RPP_norm
0  10180                        Abilene, TX    89.849  0.819547
1  10420                          Akron, OH    92.827  0.733537
2  10500                         Albany, GA    86.405  0.919016
3  10540                         Albany, OR   104.609  0.393253
4  10580        Albany-Schenectady-Troy, NY    97.583  0.596176
5  10740                    Albuquerque, NM    92.958  0.729754
6  10780                     Alexandria, LA    86.218  0.924417
7  10900  Allentown-Bethlehem-Easton, PA-NJ    98.441  0.571396
8  11020                        Altoona, PA    88.250  0.865729
9  11100                       Amarillo, TX    90.812  0.791734

Missing RPP_norm: 9
Min/Max RPP_norm: 0.0 1.0


In [30]:
# STEP 22A — Economic Strength Score

# Scaled economic columns that feed the score.
economic_norm_cols = [
    "Unemployment_rate_2024_norm",
    "GDP_2023_norm",
    "PerCapitaIncome_norm",
    "PersonalIncome_norm",
    "Population_norm",
    "MedianHHIncome_norm",
    "MeanHHIncome_norm",
]

# Row-wise mean of the listed columns.
master_merged["ES_score"] = master_merged.loc[:, economic_norm_cols].mean(axis="columns")

print(master_merged.loc[:, ["CBSA", "CBSA_NAME", "ES_score"]].head(10))
print("\nEconomic score — missing values:", master_merged["ES_score"].isna().sum())
print(
    "Min/Max ES_score:",
    master_merged["ES_score"].min(),
    master_merged["ES_score"].max(),
)


    CBSA                          CBSA_NAME  ES_score
0  10180                        Abilene, TX  0.205347
1  10420                          Akron, OH  0.230184
2  10500                         Albany, GA  0.160122
3  10540                         Albany, OR  0.223073
4  10580        Albany-Schenectady-Troy, NY  0.301869
5  10740                    Albuquerque, NM  0.239716
6  10780                     Alexandria, LA  0.178619
7  10900  Allentown-Bethlehem-Easton, PA-NJ  0.268457
8  11020                        Altoona, PA  0.211539
9  11100                       Amarillo, TX  0.222220

Economic score — missing values: 0
Min/Max ES_score: 0.04793102485432098 0.7493656987858762


In [31]:
# STEP 22B — Housing Growth + Stability Score

# Scaled growth, momentum and volatility columns that feed the score.
housing_norm_cols = [
    "growth_1yr_norm",
    "growth_3yr_norm",
    "growth_5yr_norm",
    "mom12_norm",
    "mom3_norm",
    "vol_full_norm",
    "vol12_norm",
]

# Row-wise mean of the listed columns.
master_merged["HG_score"] = master_merged.loc[:, housing_norm_cols].mean(axis="columns")

print(master_merged.loc[:, ["CBSA", "CBSA_NAME", "HG_score"]].head(10))
print("\nHousing Growth score — missing:", master_merged["HG_score"].isna().sum())
print(
    "Min/Max HG_score:",
    master_merged["HG_score"].min(),
    master_merged["HG_score"].max(),
)


    CBSA                          CBSA_NAME  HG_score
0  10180                        Abilene, TX  0.686194
1  10420                          Akron, OH  0.787593
2  10500                         Albany, GA  0.655651
3  10540                         Albany, OR  0.677130
4  10580        Albany-Schenectady-Troy, NY  0.768971
5  10740                    Albuquerque, NM  0.727221
6  10780                     Alexandria, LA  0.531900
7  10900  Allentown-Bethlehem-Easton, PA-NJ  0.779491
8  11020                        Altoona, PA  0.754930
9  11100                       Amarillo, TX  0.702883

Housing Growth score — missing: 0
Min/Max HG_score: 0.2684764161368588 0.9229980152440014


In [32]:
# STEP 22C — Migration Score

# Scaled inflow, net and (inverted) outflow columns that feed the score.
migration_norm_cols = [
    "inflow_returns_norm",
    "inflow_pop_norm",
    "net_returns_norm",
    "net_pop_norm",
    "inflow_per_1k_norm",
    "net_per_1k_norm",
    "outflow_returns_norm",
    "outflow_pop_norm",
    "outflow_per_1k_norm",
]

# Row-wise mean of the listed columns.
master_merged["MG_score"] = master_merged.loc[:, migration_norm_cols].mean(axis="columns")

print(master_merged.loc[:, ["CBSA", "CBSA_NAME", "MG_score"]].head(10))
print("\nMigration score — missing:", master_merged["MG_score"].isna().sum())
print(
    "Min/Max MG_score:",
    master_merged["MG_score"].min(),
    master_merged["MG_score"].max(),
)


    CBSA                          CBSA_NAME  MG_score
0  10180                        Abilene, TX  0.557955
1  10420                          Akron, OH  0.547309
2  10500                         Albany, GA  0.542476
3  10540                         Albany, OR  0.568700
4  10580        Albany-Schenectady-Troy, NY  0.543041
5  10740                    Albuquerque, NM  0.552861
6  10780                     Alexandria, LA  0.524934
7  10900  Allentown-Bethlehem-Easton, PA-NJ  0.546999
8  11020                        Altoona, PA  0.541818
9  11100                       Amarillo, TX  0.547324

Migration score — missing: 0
Min/Max MG_score: 0.35215128302164866 0.6243221835849613


In [33]:
# STEP 22D — Climate Score
# CL_score is the unweighted row-wise mean of the four normalized climate
# columns. The mean skips NaN inputs, so a row ends up with a missing
# CL_score only when all four inputs are missing.

climate_norm_cols = [
    f"{prefix}_SCORE_norm" for prefix in ("RISK", "EAL", "SOVI", "RESL")
]

master_merged["CL_score"] = master_merged.loc[:, climate_norm_cols].mean(axis="columns")

# Quick sanity check: first rows, missing count and observed range.
cl_values = master_merged["CL_score"]
print(master_merged.head(10)[["CBSA", "CBSA_NAME", "CL_score"]])
print("\nClimate score — missing:", cl_values.isna().sum())
print("Min/Max CL_score:", cl_values.min(), cl_values.max())


    CBSA                          CBSA_NAME  CL_score
0  10180                        Abilene, TX  0.360357
1  10420                          Akron, OH  0.477750
2  10500                         Albany, GA  0.396002
3  10540                         Albany, OR  0.303717
4  10580        Albany-Schenectady-Troy, NY  0.588429
5  10740                    Albuquerque, NM  0.271818
6  10780                     Alexandria, LA  0.343121
7  10900  Allentown-Bethlehem-Easton, PA-NJ  0.455504
8  11020                        Altoona, PA  0.669369
9  11100                       Amarillo, TX  0.282418

Climate score — missing: 4
Min/Max CL_score: 0.01201306143440995 0.8573852602022922


In [34]:
# STEP 22E — Affordability Score (lower cost = better)
# AF_score reuses RPP_norm as-is; no inversion happens in this cell, so
# RPP_norm is expected to already score cheaper metros higher.

master_merged["AF_score"] = master_merged["RPP_norm"]

# Show the raw RPP next to its normalized value and the resulting score.
af_values = master_merged["AF_score"]
print(master_merged.head(10)[["CBSA", "CBSA_NAME", "RPP_2023", "RPP_norm", "AF_score"]])

print("\nAffordability score — missing:", af_values.isna().sum())
print("Min/Max AF_score:", af_values.min(), af_values.max())


    CBSA                          CBSA_NAME  RPP_2023  RPP_norm  AF_score
0  10180                        Abilene, TX    89.849  0.819547  0.819547
1  10420                          Akron, OH    92.827  0.733537  0.733537
2  10500                         Albany, GA    86.405  0.919016  0.919016
3  10540                         Albany, OR   104.609  0.393253  0.393253
4  10580        Albany-Schenectady-Troy, NY    97.583  0.596176  0.596176
5  10740                    Albuquerque, NM    92.958  0.729754  0.729754
6  10780                     Alexandria, LA    86.218  0.924417  0.924417
7  10900  Allentown-Bethlehem-Easton, PA-NJ    98.441  0.571396  0.571396
8  11020                        Altoona, PA    88.250  0.865729  0.865729
9  11100                       Amarillo, TX    90.812  0.791734  0.791734

Affordability score — missing: 9
Min/Max AF_score: 0.0 1.0


In [35]:
# STEP 23B — Final Composite City Score (Weighted)
#
# Final_City_Score is the weighted average of the five dimension scores.
# CL_score and AF_score can contain missing values, and a plain
# `a*w1 + b*w2 + ...` sum turns any missing dimension into a NaN final score,
# silently dropping those metros from the ranking. Instead, the weights are
# renormalized over the dimensions that are present for each row — the same
# "average what is available" convention the `.mean(axis=1)` dimension scores
# follow. Rows with all five dimensions get the plain weighted sum (up to
# floating-point rounding); a row stays NaN only if every dimension is missing.

# Dimension weights; they sum to 1.0.
dimension_weights = pd.Series({
    "ES_score": 0.25,  # economic
    "HG_score": 0.25,  # housing growth + stability
    "MG_score": 0.20,  # migration
    "CL_score": 0.15,  # climate
    "AF_score": 0.15,  # affordability
})

dimension_scores = master_merged[dimension_weights.index.tolist()]

# Weighted sum of the available dimensions; min_count=1 keeps the result NaN
# (rather than 0) for a row that has no dimension score at all.
weighted_sum = dimension_scores.mul(dimension_weights, axis="columns").sum(axis=1, min_count=1)

# Total weight of the dimensions actually present in each row.
available_weight = (
    dimension_scores.notna().astype(float)
    .mul(dimension_weights, axis="columns")
    .sum(axis=1)
)

# NaN / 0 stays NaN, so all-missing rows need no special casing here.
master_merged["Final_City_Score"] = weighted_sum / available_weight

# Preview top rows
print(master_merged[[
    "CBSA", "CBSA_NAME",
    "ES_score", "HG_score", "MG_score", "CL_score", "AF_score",
    "Final_City_Score"
]].head(10))

print("\nMissing Final_City_Score:", master_merged["Final_City_Score"].isna().sum())
print("Min/Max Final_City_Score:",
      master_merged["Final_City_Score"].min(),
      master_merged["Final_City_Score"].max())

# Make the renormalization visible: rows scored without all five dimensions.
partially_scored = dimension_scores.isna().any(axis=1) & master_merged["Final_City_Score"].notna()
print("Rows scored on partial dimensions:", partially_scored.sum())


    CBSA                          CBSA_NAME  ES_score  HG_score  MG_score  \
0  10180                        Abilene, TX  0.205347  0.686194  0.557955   
1  10420                          Akron, OH  0.230184  0.787593  0.547309   
2  10500                         Albany, GA  0.160122  0.655651  0.542476   
3  10540                         Albany, OR  0.223073  0.677130  0.568700   
4  10580        Albany-Schenectady-Troy, NY  0.301869  0.768971  0.543041   
5  10740                    Albuquerque, NM  0.239716  0.727221  0.552861   
6  10780                     Alexandria, LA  0.178619  0.531900  0.524934   
7  10900  Allentown-Bethlehem-Easton, PA-NJ  0.268457  0.779491  0.546999   
8  11020                        Altoona, PA  0.211539  0.754930  0.541818   
9  11100                       Amarillo, TX  0.222220  0.702883  0.547324   

   CL_score  AF_score  Final_City_Score  
0  0.360357  0.819547          0.511462  
1  0.477750  0.733537          0.545599  
2  0.396002  0.919016     

In [36]:
# STEP 23C — Export final dataset with scores to Excel
# Writes every column of master_merged (inputs, normalized values and scores)
# to a workbook in the current working directory, without the row index.

output_path = "FinalCityScores.xlsx"
master_merged.to_excel(excel_writer=output_path, index=False)
print("Export complete:", output_path)


Export complete: FinalCityScores.xlsx


In [37]:
# STEP — Export only dimension scores + final score
# A slimmer workbook: metro identifiers, the five dimension scores and the
# composite score, written without the row index.

score_cols = [
    "CBSA", "CBSA_NAME",                                         # identifiers
    "ES_score", "HG_score", "MG_score", "CL_score", "AF_score",  # dimensions
    "Final_City_Score",                                          # composite
]

score_df = master_merged.loc[:, score_cols]

output_path_scores = "CityScores_Only.xlsx"
score_df.to_excel(excel_writer=output_path_scores, index=False)
print("Export complete:", output_path_scores)


Export complete: CityScores_Only.xlsx
