# Processing & Pruning Our Data

## Import our dependencies

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import os
from sklearn.impute import SimpleImputer
import json
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
import seaborn as sns
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import cross_val_predict

## Load our datasets

In [2]:
atlas_fifteen_path = '../data/raw/Raw_Atlas_2015/Raw_Atlas_Data_2015.xlsx'
fifteen_xl = pd.ExcelFile(atlas_fifteen_path)
atlas_ten_path = '../data/raw/Raw_Atlas_2010/Raw_Atlas_Data_2010.xlsx'
ten_xl = pd.ExcelFile(atlas_ten_path)

atlas_nineteen = pd.read_csv('../data/raw/Raw_Atlas_2019/Raw_Atlas_Data_2019.csv')
atlas_fifteen = fifteen_xl.parse('Food Access Research Atlas')
atlas_ten = ten_xl.parse('Food Access Research Atlas')

## Processing 2019 data for our present-day classification model

The 100+ columns in our table are often redundant and useless for training our model.

Let's keep the fundamentals, along with some general data buffer zones for identifying relative isolation + outliers.

In [90]:
keep_cols = [
    'CensusTract', 'Urban', 'PovertyRate', 'MedianFamilyIncome',
    'TractLOWI', 'TractKids', 'TractSeniors', 'TractHUNV', 'TractSNAP', 
]

buffer_cols = [
    'lapop1share', 'lakids1share', 'laseniors1share',
    'lahunv1share', 'lalowi1share'
    #We keep the 10/20 values for PovertyRate linear regression later on
]

In [91]:
pd.set_option('display.max_columns', None)
atlas_nineteen.head()

Unnamed: 0,CensusTract,State,County,Urban,Pop2010,OHU2010,GroupQuartersFlag,NUMGQTRS,PCTGQTRS,LILATracts_1And10,LILATracts_halfAnd10,LILATracts_1And20,LILATracts_Vehicle,HUNVFlag,LowIncomeTracts,PovertyRate,MedianFamilyIncome,LA1and10,LAhalfand10,LA1and20,LATracts_half,LATracts1,LATracts10,LATracts20,LATractsVehicle_20,LAPOP1_10,LAPOP05_10,LAPOP1_20,LALOWI1_10,LALOWI05_10,LALOWI1_20,lapophalf,lapophalfshare,lalowihalf,lalowihalfshare,lakidshalf,lakidshalfshare,laseniorshalf,laseniorshalfshare,lawhitehalf,lawhitehalfshare,lablackhalf,lablackhalfshare,laasianhalf,laasianhalfshare,lanhopihalf,lanhopihalfshare,laaianhalf,laaianhalfshare,laomultirhalf,laomultirhalfshare,lahisphalf,lahisphalfshare,lahunvhalf,lahunvhalfshare,lasnaphalf,lasnaphalfshare,lapop1,lapop1share,lalowi1,lalowi1share,lakids1,lakids1share,laseniors1,laseniors1share,lawhite1,lawhite1share,lablack1,lablack1share,laasian1,laasian1share,lanhopi1,lanhopi1share,laaian1,laaian1share,laomultir1,laomultir1share,lahisp1,lahisp1share,lahunv1,lahunv1share,lasnap1,lasnap1share,lapop10,lapop10share,lalowi10,lalowi10share,lakids10,lakids10share,laseniors10,laseniors10share,lawhite10,lawhite10share,lablack10,lablack10share,laasian10,laasian10share,lanhopi10,lanhopi10share,laaian10,laaian10share,laomultir10,laomultir10share,lahisp10,lahisp10share,lahunv10,lahunv10share,lasnap10,lasnap10share,lapop20,lapop20share,lalowi20,lalowi20share,lakids20,lakids20share,laseniors20,laseniors20share,lawhite20,lawhite20share,lablack20,lablack20share,laasian20,laasian20share,lanhopi20,lanhopi20share,laaian20,laaian20share,laomultir20,laomultir20share,lahisp20,lahisp20share,lahunv20,lahunv20share,lasnap20,lasnap20share,TractLOWI,TractKids,TractSeniors,TractWhite,TractBlack,TractAsian,TractNHOPI,TractAIAN,TractOMultir,TractHispanic,TractHUNV,TractSNAP
0,1001020100,Alabama,Autauga County,1,1912,693,0,0.0,0.0,0,0,0,0,0,0,11.3,81250.0,1,1,1,1,1,0,0,0,1896.0,1912.0,1896.0,461.0,467.0,461.0,1912.0,100.0,467.0,24.42,507.0,26.52,221.0,11.56,1622.0,84.83,217.0,11.35,14.0,0.73,0.0,0.0,14.0,0.73,45.0,2.35,44.0,2.3,5.0,0.79,92.0,13.33,1896.0,99.19,461.0,24.11,504.0,26.33,219.0,11.44,1611.0,84.26,214.0,11.17,14.0,0.72,0.0,0.0,14.0,0.73,44.0,2.31,43.0,2.27,5.0,0.79,92.0,13.22,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,455.0,507.0,221.0,1622.0,217.0,14.0,0.0,14.0,45.0,44.0,6.0,102.0
1,1001020200,Alabama,Autauga County,1,2170,743,0,181.0,8.34,1,1,1,0,0,1,17.9,49000.0,1,1,1,1,1,0,0,0,1261.0,2170.0,1261.0,604.0,962.0,604.0,2170.0,100.0,962.0,44.34,606.0,27.93,214.0,9.86,888.0,40.92,1217.0,56.08,5.0,0.23,0.0,0.0,5.0,0.23,55.0,2.53,75.0,3.46,93.0,12.47,161.0,21.7,1261.0,58.11,604.0,27.83,406.0,18.69,127.0,5.83,357.0,16.43,854.0,39.36,4.0,0.18,0.0,0.0,4.0,0.2,42.0,1.93,33.0,1.52,67.0,9.0,96.0,12.95,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,802.0,606.0,214.0,888.0,1217.0,5.0,0.0,5.0,55.0,75.0,89.0,156.0
2,1001020300,Alabama,Autauga County,1,3373,1256,0,0.0,0.0,0,0,0,0,0,0,15.0,62609.0,1,1,1,1,1,0,0,0,1552.0,2857.0,1552.0,478.0,971.0,478.0,2857.0,84.7,971.0,28.79,771.0,22.86,358.0,10.6,2177.0,64.53,554.0,16.43,10.0,0.3,1.0,0.03,10.0,0.3,105.0,3.1,78.0,2.3,39.0,3.09,139.0,11.05,1552.0,46.0,478.0,14.18,416.0,12.34,201.0,5.96,1242.0,36.81,255.0,7.56,8.0,0.24,0.0,0.0,2.0,0.06,45.0,1.33,36.0,1.08,0.0,0.0,74.0,5.87,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1306.0,894.0,439.0,2576.0,647.0,17.0,5.0,11.0,117.0,87.0,99.0,172.0
3,1001020400,Alabama,Autauga County,1,4386,1722,0,0.0,0.0,0,0,0,0,0,0,2.8,70607.0,1,1,1,1,1,0,0,0,1363.0,3651.0,1363.0,343.0,893.0,343.0,3651.0,83.24,893.0,20.36,847.0,19.3,767.0,17.48,3395.0,77.41,170.0,3.88,15.0,0.34,3.0,0.06,8.0,0.18,60.0,1.38,61.0,1.4,19.0,1.13,84.0,4.88,1363.0,31.09,343.0,7.83,346.0,7.89,237.0,5.39,1233.0,28.12,81.0,1.85,7.0,0.16,2.0,0.05,4.0,0.08,37.0,0.84,30.0,0.68,8.0,0.46,30.0,1.76,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,922.0,1015.0,904.0,4086.0,193.0,18.0,4.0,11.0,74.0,85.0,21.0,98.0
4,1001020500,Alabama,Autauga County,1,10766,4082,0,181.0,1.68,0,0,0,0,1,0,15.2,96334.0,1,1,1,1,1,0,0,1,2643.0,7778.0,2643.0,586.0,1719.0,586.0,7778.0,72.25,1719.0,15.97,2309.0,21.45,840.0,7.8,6299.0,58.51,1001.0,9.29,209.0,1.94,5.0,0.05,38.0,0.35,227.0,2.11,277.0,2.57,164.0,4.01,235.0,5.76,2643.0,24.55,586.0,5.45,715.0,6.64,362.0,3.36,2168.0,20.14,343.0,3.19,47.0,0.44,1.0,0.01,14.0,0.13,70.0,0.65,86.0,0.8,55.0,1.35,83.0,2.04,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2242.0,3162.0,1126.0,8666.0,1437.0,296.0,9.0,48.0,310.0,355.0,230.0,339.0


### Calculating & fixing missing data values within our set

In [92]:
processed_atlas_nineteen = atlas_nineteen[keep_cols + buffer_cols + ['Pop2010'] + ['lasnap1share']].copy()
nan_count = processed_atlas_nineteen.isna().sum()
print(nan_count)

CensusTract               0
Urban                     0
PovertyRate               3
MedianFamilyIncome      748
TractLOWI                 4
TractKids                 4
TractSeniors              4
TractHUNV                 4
TractSNAP                 4
lapop1share           19989
lakids1share          19989
laseniors1share       19989
lahunv1share          19966
lalowi1share          19989
Pop2010                   0
lasnap1share          19966
dtype: int64


In [93]:
print((processed_atlas_nineteen['Pop2010'] == 0).sum())
print((len(processed_atlas_nineteen)))

0
72531


In [94]:
processed_atlas_nineteen = processed_atlas_nineteen[processed_atlas_nineteen['Pop2010'] != 0].reset_index(drop=True)

We can confidently drop this harmful data, given that these unpopulated tracts cannot be considered on any scale of being a food desert!

In [95]:
imputer = SimpleImputer(strategy='median')
processed_atlas_nineteen[keep_cols] = imputer.fit_transform(processed_atlas_nineteen[keep_cols])

In [96]:
rural_missing = processed_atlas_nineteen[processed_atlas_nineteen['Urban'] == 0][buffer_cols].isna().mean()
urban_missing = processed_atlas_nineteen[processed_atlas_nineteen['Urban'] == 1][buffer_cols].isna().mean()

print("Rural missing fraction:\n", rural_missing)
print("Urban missing fraction:\n", urban_missing)

Rural missing fraction:
 lapop1share        0.000346
lakids1share       0.000346
laseniors1share    0.000346
lahunv1share       0.000346
lalowi1share       0.000346
dtype: float64
Urban missing fraction:
 lapop1share        0.362214
lakids1share       0.362214
laseniors1share    0.362214
lahunv1share       0.361797
lalowi1share       0.362214
dtype: float64


When observing the data, we can see that our buffer zones typically return NaN when in an urban environment.

Realistically, there seems to be no reason to NOT impute our missing urban values.
Urban environments realistically do not come across notable volatilities that would return an entirely new result if not tracked properly in our database.
As a result, median imputation will be used for these values as well, especially considering that we can use the respective ~ 60% (~ 48k) reference values.

In [97]:
processed_atlas_nineteen[buffer_cols] = imputer.fit_transform(processed_atlas_nineteen[buffer_cols])
nan_count = processed_atlas_nineteen.isna().sum()
print(nan_count)

CensusTract               0
Urban                     0
PovertyRate               0
MedianFamilyIncome        0
TractLOWI                 0
TractKids                 0
TractSeniors              0
TractHUNV                 0
TractSNAP                 0
lapop1share               0
lakids1share              0
laseniors1share           0
lahunv1share              0
lalowi1share              0
Pop2010                   0
lasnap1share          19966
dtype: int64


### Additional features for our 2019 model

#### Simple ratios:

In [98]:
processed_atlas_nineteen['LOWIRatio'] = processed_atlas_nineteen['TractLOWI'] / processed_atlas_nineteen['Pop2010'] # Low income percentage
processed_atlas_nineteen['SNAPRatio'] = processed_atlas_nineteen['TractSNAP'] / processed_atlas_nineteen['Pop2010'] # Percentage of residents receiving SNAP
processed_atlas_nineteen['HUNVRatio'] = processed_atlas_nineteen['TractHUNV'] / processed_atlas_nineteen['Pop2010'] # Percentage of residents without a vehicle (important for food deserts!)
processed_atlas_nineteen['FoodInsecurityIndex'] = (
    processed_atlas_nineteen['LOWIRatio'] +
    processed_atlas_nineteen['SNAPRatio'] +
    processed_atlas_nineteen['HUNVRatio']
)

#### More complex features:

In [99]:
processed_atlas_nineteen['SNAPDisparity'] = processed_atlas_nineteen['SNAPRatio'] - processed_atlas_nineteen['lasnap1share']

Our calculated SNAP disparity is how much higher/lower the tract's SNAP is relative to its surrounding area.

A high positive disparity would indicate isolated food insecurity, while a negative disparity would show a priveleged pocket inside of a worse-off area.

In [100]:
processed_atlas_nineteen['LOWIWeighted'] = processed_atlas_nineteen['TractLOWI'] * processed_atlas_nineteen['PovertyRate']

Our weighted low income emphasizes tracts with both a high number of low-income people and a high poverty rate.

This helps create more granular priorization for our model. 

### Scaling our 2019 data before saving to be processed later

In [101]:
scaler = MinMaxScaler()
scaled_processed_atlas_nineteen = scaler.fit_transform(processed_atlas_nineteen)

I chose to use sklearn's MinMaxScaler() module over the StandardScaler() module for this project.

Given our present-day forecasting model will be using a ReLU for the hidden layer activation function, MinMax scaling shines in this aspect.

Our K-Means clustering would work fine with either choice. Given that the feature values in this dataset (proportions, demographic shares, poverty rates) are naturally bounded and positive, MinMaxScaler could also be seen as a natural fit.

## Processing our 2015 & 2010 data

### Running through our 2015 data (same steps as 2019)

In [102]:
processed_atlas_fifteen = atlas_fifteen[keep_cols + buffer_cols + ['POP2010']].copy()
# The 2015 Atlas uses the naming convention 'POP2010' opposed to the 2019 variation of 'Pop2010'
processed_atlas_fifteen.tail()

Unnamed: 0,CensusTract,Urban,PovertyRate,MedianFamilyIncome,TractLOWI,TractKids,TractSeniors,TractHUNV,TractSNAP,lapop1share,lakids1share,laseniors1share,lahunv1share,lalowi1share,POP2010
72859,56043000200,0,15.2,67917,1013,884,593,56,116,0.815997,0.215208,0.141838,0.027829,0.248265,3326
72860,56043000301,1,14.5,52474,1214,674,399,114,124,0.0,0.0,0.0,0.0,0.0,2665
72861,56043000302,1,12.7,66250,742,614,516,82,97,0.061109,0.011692,0.013193,0.00527,0.023703,2542
72862,56045951100,0,9.2,81500,671,655,499,108,50,0.688747,0.119033,0.096883,0.060575,0.135425,3314
72863,56045951300,1,15.0,68966,1390,918,650,95,168,0.353439,0.087572,0.050493,0.019168,0.132466,3894


In [103]:
processed_atlas_fifteen[keep_cols + buffer_cols] = imputer.fit_transform(processed_atlas_fifteen[keep_cols + buffer_cols])
nan_count = processed_atlas_fifteen.isna().sum()
print(nan_count)

CensusTract           0
Urban                 0
PovertyRate           0
MedianFamilyIncome    0
TractLOWI             0
TractKids             0
TractSeniors          0
TractHUNV             0
TractSNAP             0
lapop1share           0
lakids1share          0
laseniors1share       0
lahunv1share          0
lalowi1share          0
POP2010               0
dtype: int64


In [104]:
processed_atlas_fifteen['LOWIRatio'] = processed_atlas_fifteen['TractLOWI'] / processed_atlas_fifteen['POP2010'] # Low income percentage
processed_atlas_fifteen['SNAPRatio'] = processed_atlas_fifteen['TractSNAP'] / processed_atlas_fifteen['POP2010'] # Percentage of residents receiving SNAP
processed_atlas_fifteen['HUNVRatio'] = processed_atlas_fifteen['TractHUNV'] / processed_atlas_fifteen['POP2010'] # Percentage of residents without a vehicle (important for food deserts!)
processed_atlas_fifteen['FoodInsecurityIndex'] = (
    processed_atlas_fifteen['LOWIRatio'] +
    processed_atlas_fifteen['SNAPRatio'] +
    processed_atlas_fifteen['HUNVRatio']
)
processed_atlas_fifteen['LOWIWeighted'] = processed_atlas_fifteen['TractLOWI'] * processed_atlas_fifteen['PovertyRate']
nan_count = processed_atlas_fifteen.isna().sum()
print(nan_count)
# We don't include SNAP disparity in our 2015 dataset (unlike in the 2019 variation) given it's inaccessibility within the 2010 dataset
# (Which it will need to work in tandem with for our random forest food desert prediction model)

CensusTract              0
Urban                    0
PovertyRate              0
MedianFamilyIncome       0
TractLOWI                0
TractKids                0
TractSeniors             0
TractHUNV                0
TractSNAP                0
lapop1share              0
lakids1share             0
laseniors1share          0
lahunv1share             0
lalowi1share             0
POP2010                  0
LOWIRatio              333
SNAPRatio              333
HUNVRatio              333
FoodInsecurityIndex    333
LOWIWeighted             0
dtype: int64


In [105]:
print((processed_atlas_fifteen['POP2010'] == 0).sum())
processed_atlas_fifteen = processed_atlas_fifteen[processed_atlas_fifteen['POP2010'] != 0].reset_index(drop=True) # Let's drop this data

333


Weirdly enough, some of our census tracts seem to have a population of 0!

While this may seem confusing initially, it is important to remember that our atlas data may contain non-residential or industrial census tracts, along with even group quarters-only areas (military bases, prisons, or college dorms). 

Given we can't run through this data, let's drop it.

### Running through our 2010 data

In [106]:
pd.set_option('display.max_columns', None)
atlas_ten.head()

Unnamed: 0,CensusTract,State,County,LILATracts_1And10,LILATracts_halfAnd10,LILATracts_1And20,LILATracts_Vehicle,Urban,Rural,LA1and10,LAhalfand10,LA1and20,LATracts_half,LATracts1,LATracts10,LATracts20,LATractsVehicle_20,HUNVFlag,GroupQuartersFlag,OHU2010,NUMGQTRS,PCTGQTRS,LowIncomeTracts,POP2010,UATYP10,lapophalf,lapophalfshare,lalowihalf,lalowihalfshare,lakidshalf,lakidshalfshare,laseniorshalf,laseniorshalfshare,lahunvhalf,lahunvhalfshare,lapop1,lapop1share,lalowi1,lalowi1share,lakids1,lakids1share,laseniors1,laseniors1share,lahunv1,lahunv1share,lapop10,lapop10share,lalowi10,lalowi10share,lakids10,lakids10share,laseniors10,laseniors10share,lahunv10,lahunv10share,lapop20,lapop20share,lalowi20,lalowi20share,lakids20,lakids20share,laseniors20,laseniors20share,lahunv20,lahunv20share
0,1001020100,AL,Autauga,0,0,0,0,1,0,1,1,1,1,1,0,0,0,0,0,693,0.0,0.0,0,1912,U,1732.225468,0.905976,306.546737,0.160328,466.426429,0.919973,198.82822,0.899675,44.2121,0.063798,1357.48094,0.70998,245.277225,0.128283,363.638381,0.717235,162.497246,0.735282,31.579173,0.045569,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1001020200,AL,Autauga,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,743,181.0,0.08341,0,2170,U,1410.374828,0.649942,484.905037,0.223459,448.163512,0.739544,139.30539,0.65096,86.423433,0.116317,483.429683,0.222779,170.838823,0.078728,174.770469,0.2884,50.976822,0.238209,34.39859,0.046297,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1001020300,AL,Autauga,0,0,0,0,1,0,1,1,1,1,1,0,0,0,0,0,1256,0.0,0.0,0,3373,U,2764.604126,0.819628,773.419284,0.229297,744.891575,0.833212,346.203097,0.788618,54.188593,0.043144,1417.874893,0.42036,380.78629,0.112892,377.128132,0.421844,190.00148,0.432805,19.156371,0.015252,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1001020400,AL,Autauga,0,0,0,0,1,0,1,1,1,1,1,0,0,0,0,0,1722,0.0,0.0,0,4386,U,4272.112205,0.974034,874.067405,0.199286,980.143479,0.965659,892.805993,0.987617,16.964191,0.009851,1909.275364,0.435311,311.160977,0.070944,470.411544,0.46346,374.051202,0.413773,3.926144,0.00228,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1001020500,AL,Autauga,0,0,0,0,1,0,1,1,1,1,1,0,0,1,1,0,4082,181.0,0.016812,0,10766,U,7798.99399,0.72441,1131.052984,0.105058,2314.376847,0.731934,843.200608,0.748846,177.089308,0.043383,2753.648392,0.255773,373.426978,0.034686,745.740558,0.235845,373.826962,0.331996,57.430973,0.014069,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


This sucks! 
It seems like our 2010 atlas is missing SNAP, Seniors, Kids, and Income breakdowns at the tract level. 

All of these are present in both our 2015 & 2019 models.
    
To recap our model necessities, our random forest forecasing model need to predict the 'Food Desert' status of a tract given fed labels supplied by our autoencoder/k-means clustering model.

For the purposes of this project, we will need to find a reasonable workaround to somehow impute these features while not skewing data at the same time.
Sadly, I don't think some basic imputing will do the trick here.

In [113]:
keep_cols_2010 = [
    'CensusTract',
    'Urban',
    'LowIncomeTracts',
    'HUNVFlag',
    'POP2010'
]
buffer_cols_2010 = [
    'lapop1share',
    'lalowi1share',
    'lakids1share',
    'laseniors1share',
    'lahunv1share',
]

These are basic features which are shared between all of our datasets.
Asides from these, the similarities end.

In [108]:
pruned_atlas_2010 = atlas_ten[keep_cols_2010 + buffer_cols_2010].copy()

#### Feature engineering our missing columns

##### PovertyRate

In [109]:
import requests
from dotenv import load_dotenv
import os

load_dotenv()

API_KEY = os.getenv("CENSUS_API_KEY")

In [110]:
def fetch_census_tract_data(state_index: str, API_KEY: str):
    BASE_URL = "https://api.census.gov/data/2010/acs/acs5"
    # B17001_001E: Population for whom poverty status is determined
    # B17001_002E: Population below poverty level
    # B19113_001E: Median Family Income
    FIELDS = ["B17001_001E", "B17001_002E", "B19113_001E"]  
    GEOGRAPHY = "tract:*"
    
    params = {
        "get": ",".join(FIELDS),
        "for": GEOGRAPHY,
        "in": f"state:{state_index}",
        "key": API_KEY
    }

    response = requests.get(BASE_URL, params=params)

    if response.status_code != 200:
        print(f"API request failed with status code {response.status_code} for state {state_index}.")
        return None

    try:
        data = response.json()
    except requests.exceptions.JSONDecodeError as e:
        print(f"JSON parsing failed for state {state_index}: {e}")
        return None

    if not data or len(data) < 2:
        print(f"No data returned or unexpected response format for state {state_index}.")
        return None

    columns = data[0]
    values = data[1:]
    df = pd.DataFrame(values, columns=columns)
    
    geo_cols = ['state', 'county', 'tract']
    if all(col in df.columns for col in geo_cols):
        df["CensusTract"] = df["state"] + df["county"] + df["tract"]
        df = df.set_index("CensusTract")
    else:
        print(f"Warning: Missing geo columns in response for state {state_index}.")
        return None

    df[FIELDS] = df[FIELDS].apply(pd.to_numeric, errors='coerce')

    df['PovertyRate'] = 0.0
    valid_mask = (df["B17001_001E"] > 0) & (df["B17001_001E"].notna())
    df.loc[valid_mask, 'PovertyRate'] = (df.loc[valid_mask, "B17001_002E"] / df.loc[valid_mask, "B17001_001E"]) * 100
    df['PovertyRate'] = df['PovertyRate'].round(2)

    df.rename(columns={
        "B19113_001E": "MedianFamilyIncome"
    }, inplace=True)

    result = df[["PovertyRate", "MedianFamilyIncome"]]

    return result


In [111]:
state_abbrev_to_fips = {
    'AL': '01', 'AK': '02', 'AZ': '04', 'AR': '05', 'CA': '06',
    'CO': '08', 'CT': '09', 'DE': '10', 'DC': '11', 'FL': '12',
    'GA': '13', 'HI': '15', 'ID': '16', 'IL': '17', 'IN': '18',
    'IA': '19', 'KS': '20', 'KY': '21', 'LA': '22', 'ME': '23',
    'MD': '24', 'MA': '25', 'MI': '26', 'MN': '27', 'MS': '28',
    'MO': '29', 'MT': '30', 'NE': '31', 'NV': '32', 'NH': '33',
    'NJ': '34', 'NM': '35', 'NY': '36', 'NC': '37', 'ND': '38',
    'OH': '39', 'OK': '40', 'OR': '41', 'PA': '42', 'RI': '44',
    'SC': '45', 'SD': '46', 'TN': '47', 'TX': '48', 'UT': '49',
    'VT': '50', 'VA': '51', 'WA': '53', 'WV': '54', 'WI': '55',
    'WY': '56'
}

In [112]:
tract_dfs = []
for abbrev in atlas_ten["State"].unique():
    state_fips = state_abbrev_to_fips.get(abbrev)
    if not state_fips:
        print(f"Unknown state abbreviation: {abbrev}")
        continue

    print(f"Fetching for state: {abbrev} (FIPS: {state_fips})")

    try:
        tract_df = fetch_census_tract_data(state_fips, API_KEY)
        tract_dfs.append(tract_df)
    except Exception as e:
        print(f"Failed for state {abbrev}: {e}")

Fetching for state: AL (FIPS: 01)
Fetching for state: AK (FIPS: 02)
Fetching for state: AZ (FIPS: 04)
Fetching for state: AR (FIPS: 05)
Fetching for state: CA (FIPS: 06)
Fetching for state: CO (FIPS: 08)
Fetching for state: CT (FIPS: 09)
Fetching for state: DE (FIPS: 10)
Fetching for state: DC (FIPS: 11)
Fetching for state: FL (FIPS: 12)
Fetching for state: GA (FIPS: 13)
Fetching for state: HI (FIPS: 15)
Fetching for state: ID (FIPS: 16)
Fetching for state: IL (FIPS: 17)
Fetching for state: IN (FIPS: 18)
Fetching for state: IA (FIPS: 19)
Fetching for state: KS (FIPS: 20)
Fetching for state: KY (FIPS: 21)
Fetching for state: LA (FIPS: 22)
Fetching for state: ME (FIPS: 23)
Fetching for state: MD (FIPS: 24)
Fetching for state: MA (FIPS: 25)
Fetching for state: MI (FIPS: 26)
Fetching for state: MN (FIPS: 27)
Fetching for state: MS (FIPS: 28)
Fetching for state: MO (FIPS: 29)
Fetching for state: MT (FIPS: 30)
Fetching for state: NE (FIPS: 31)
Fetching for state: NV (FIPS: 32)
Fetching for s

In [114]:
fetched_tract_data_2010 = pd.concat(tract_dfs).sort_index()
fetched_tract_data_2010.head()

Unnamed: 0_level_0,PovertyRate,MedianFamilyIncome
CensusTract,Unnamed: 1_level_1,Unnamed: 2_level_1
1001020100,10.5,76806
1001020200,14.76,49191
1001020300,8.04,53342
1001020400,6.32,67292
1001020500,5.96,80176


##### TractLOWI

The logic used in estimating TractLOWI is that by averaging the general amount of low income people across various ranges of buffers, we can create a good idea of what this statistic would come out to.

##### Fetching API Census data for TRACTSNAP

In [115]:
def fetch_tract_demographics(state_index: str, API_KEY: str):
    BASE_URL = "https://api.census.gov/data/2010/acs/acs5"

    FIELDS = [
        # SNAP
        "B22003_001E", "B22003_002E",  # total households, SNAP households
        # LOWI (income-to-poverty ratio)
        "C17002_001E", "C17002_002E", "C17002_003E", "C17002_004E",
        "C17002_005E", "C17002_006E", "C17002_007E",
        # Total population
        "B01001_001E",
        # Kids (male & female, age 0–17)
        "B01001_003E", "B01001_004E", "B01001_005E", "B01001_006E",
        "B01001_027E", "B01001_028E", "B01001_029E", "B01001_030E",
        # Seniors (male & female, 65+)
        "B01001_020E", "B01001_021E", "B01001_044E", "B01001_045E",
        # No Vehicle
        "B08201_001E", "B08201_002E"
    ]

    params = {
        "get": ",".join(FIELDS),
        "for": "tract:*",
        "in": f"state:{state_index}",
        "key": API_KEY
    }

    try:
        response = requests.get(BASE_URL, params=params)
    except requests.exceptions.RequestException as e:
        print(f"Request exception for state {state_index}: {e}")
        return None

    if response.status_code != 200:
        print(f"API request failed with status code {response.status_code} for state {state_index}.")
        try:
            error_info = response.json()
            print(f"API error message for state {state_index}: {error_info}")
        except Exception:
            print(f"API response text for state {state_index}: {response.text}")
        return None

    try:
        data = response.json()
    except requests.exceptions.JSONDecodeError as e:
        print(f"JSON parsing failed for state {state_index}: {e}")
        print(f"Response content was: {response.text}")
        return None

    if not data or len(data) < 2:
        print(f"No data returned or unexpected response format for state {state_index}. Data: {data}")
        return None

    df = pd.DataFrame(data[1:], columns=data[0])

    required_geo = ["state", "county", "tract"]
    if not all(col in df.columns for col in required_geo):
        print(f"Missing required geography columns for state {state_index}. Columns found: {df.columns.tolist()}")
        return None

    df["CensusTract"] = df["state"] + df["county"] + df["tract"]
    df = df.set_index("CensusTract")

    df = df.apply(pd.to_numeric, errors='coerce')

    df["Raw_SNAP_Households_WithBenefits"] = df["B22003_002E"]
    df["Raw_LOWI_BelowPoverty_Sum"] = df[["C17002_002E", "C17002_003E", "C17002_004E",
                                           "C17002_005E", "C17002_006E", "C17002_007E"]].sum(axis=1)

    kids_cols = ["B01001_003E", "B01001_004E", "B01001_005E", "B01001_006E",
                 "B01001_027E", "B01001_028E", "B01001_029E", "B01001_030E"]
    df["Raw_Kids_Count"] = df[kids_cols].sum(axis=1)

    senior_cols = ["B01001_020E", "B01001_021E", "B01001_044E", "B01001_045E"]
    df["Raw_Seniors_Count"] = df[senior_cols].sum(axis=1)
    df["Raw_HUNV_Households_NoVehicle"] = df["B08201_002E"]

    return df[[
        "Raw_SNAP_Households_WithBenefits", "Raw_LOWI_BelowPoverty_Sum",
        "Raw_Kids_Count", "Raw_Seniors_Count", "Raw_HUNV_Households_NoVehicle"
    ]]


In [116]:
demographic_dfs = []

for abbrev in atlas_ten["State"].unique():
    state_fips = state_abbrev_to_fips.get(abbrev)
    if not state_fips:
        print(f"Unknown state abbreviation: {abbrev}")
        continue

    print(f"Fetching for state: {abbrev} (FIPS: {state_fips})")

    try:
        demographic_df = fetch_tract_demographics(state_fips, API_KEY)
        demographic_dfs.append(demographic_df)
    except Exception as e:
        print(f"Failed for state {abbrev}: {e}")

Fetching for state: AL (FIPS: 01)
Fetching for state: AK (FIPS: 02)
Fetching for state: AZ (FIPS: 04)
Fetching for state: AR (FIPS: 05)
Fetching for state: CA (FIPS: 06)
Fetching for state: CO (FIPS: 08)
Fetching for state: CT (FIPS: 09)
Fetching for state: DE (FIPS: 10)
Fetching for state: DC (FIPS: 11)
Fetching for state: FL (FIPS: 12)
Fetching for state: GA (FIPS: 13)
Fetching for state: HI (FIPS: 15)
Fetching for state: ID (FIPS: 16)
Fetching for state: IL (FIPS: 17)
Fetching for state: IN (FIPS: 18)
Fetching for state: IA (FIPS: 19)
Fetching for state: KS (FIPS: 20)
Fetching for state: KY (FIPS: 21)
Fetching for state: LA (FIPS: 22)
Fetching for state: ME (FIPS: 23)
Fetching for state: MD (FIPS: 24)
Fetching for state: MA (FIPS: 25)
Fetching for state: MI (FIPS: 26)
Fetching for state: MN (FIPS: 27)
Fetching for state: MS (FIPS: 28)
Fetching for state: MO (FIPS: 29)
Fetching for state: MT (FIPS: 30)
Fetching for state: NE (FIPS: 31)
Fetching for state: NV (FIPS: 32)
Fetching for s

In [117]:
fetched_demographic_data = pd.concat(demographic_dfs).sort_index()
fetched_demographic_data.head()

Unnamed: 0_level_0,Raw_SNAP_Households_WithBenefits,Raw_LOWI_BelowPoverty_Sum,Raw_Kids_Count,Raw_Seniors_Count,Raw_HUNV_Households_NoVehicle
CensusTract,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1001020100,40,320,445,111,50
1001020200,65,682,597,89,115
1001020300,111,1035,998,147,101
1001020400,116,961,1195,213,19
1001020500,140,1427,3012,197,223


In [118]:
all_demographic_data_2010 = pd.concat(demographic_dfs, ignore_index=False)
all_demographic_data_2010.reset_index(inplace=True)
all_demographic_data_2010.columns = ['CensusTract', 'TractSNAP', 'TractLOWI', 'TractKids', 'TractSeniors', 'TractHUNV']
all_demographic_data_2010.head()

Unnamed: 0,CensusTract,TractSNAP,TractLOWI,TractKids,TractSeniors,TractHUNV
0,1001020100,40,320,445,111,50
1,1001020200,65,682,597,89,115
2,1001020300,111,1035,998,147,101
3,1001020400,116,961,1195,213,19
4,1001020500,140,1427,3012,197,223


In [119]:
if fetched_tract_data_2010.index.name == "CensusTract":
    fetched_tract_data_2010 = fetched_tract_data_2010.reset_index()
all_fetched_2010 = pd.merge(all_demographic_data_2010, fetched_tract_data_2010, on="CensusTract", how="left")

print(f'Fetched tract data (1st set): {fetched_tract_data_2010.head()}')
print(f'Fetched demographic data (2nd set): {all_demographic_data_2010.head()}')

all_fetched_2010.head()

Fetched tract data (1st set):    CensusTract  PovertyRate  MedianFamilyIncome
0  01001020100        10.50               76806
1  01001020200        14.76               49191
2  01001020300         8.04               53342
3  01001020400         6.32               67292
4  01001020500         5.96               80176
Fetched demographic data (2nd set):    CensusTract  TractSNAP  TractLOWI  TractKids  TractSeniors  TractHUNV
0  01001020100         40        320        445           111         50
1  01001020200         65        682        597            89        115
2  01001020300        111       1035        998           147        101
3  01001020400        116        961       1195           213         19
4  01001020500        140       1427       3012           197        223


Unnamed: 0,CensusTract,TractSNAP,TractLOWI,TractKids,TractSeniors,TractHUNV,PovertyRate,MedianFamilyIncome
0,1001020100,40,320,445,111,50,10.5,76806
1,1001020200,65,682,597,89,115,14.76,49191
2,1001020300,111,1035,998,147,101,8.04,53342
3,1001020400,116,961,1195,213,19,6.32,67292
4,1001020500,140,1427,3012,197,223,5.96,80176


In [136]:
pruned_atlas_2010['CensusTract'] = pruned_atlas_2010['CensusTract'].astype(str)
pruned_atlas_2010['CensusTract'] = pruned_atlas_2010['CensusTract'].str.zfill(11)
cols_to_drop = ['TractSNAP', 'TractLOWI', 'TractKids', 'TractSeniors', 'TractHUNV', 'PovertyRate', 'MedianFamilyIncome']
pruned_atlas_2010 = pruned_atlas_2010.drop(columns=cols_to_drop, errors='ignore')

processed_atlas_2010 = pd.merge(
    pruned_atlas_2010, all_fetched_2010, on='CensusTract', how='left'
)
processed_atlas_2010['LOWIRatio'] = processed_atlas_2010['TractLOWI'] / processed_atlas_2010['POP2010'] # Low income percentage
processed_atlas_2010['SNAPRatio'] = processed_atlas_2010['TractSNAP'] / processed_atlas_2010['POP2010'] # Percentage of residents receiving SNAP
processed_atlas_2010['HUNVRatio'] = processed_atlas_2010['TractHUNV'] / processed_atlas_2010['POP2010'] # Percentage of residents without a vehicle (important for food deserts!)
processed_atlas_2010['FoodInsecurityIndex'] = (
    processed_atlas_2010['LOWIRatio'] +
    processed_atlas_2010['SNAPRatio'] +
    processed_atlas_2010['HUNVRatio']
)
processed_atlas_2010['LOWIWeighted'] = processed_atlas_2010['TractLOWI'] * processed_atlas_2010['PovertyRate']
processed_atlas_2010 = processed_atlas_2010[processed_atlas_2010['POP2010'] != 0].reset_index(drop=True)
processed_atlas_2010.head()

Unnamed: 0,CensusTract,Urban,LowIncomeTracts,HUNVFlag,POP2010,lapop1share,lalowi1share,lakids1share,laseniors1share,lahunv1share,lalowi10share,lalowi20share,TractSNAP,TractLOWI,TractKids,TractSeniors,TractHUNV,PovertyRate,MedianFamilyIncome,LOWIRatio,SNAPRatio,HUNVRatio,FoodInsecurityIndex,LOWIWeighted
0,1001020100,1,0,0,1912,0.70998,0.128283,0.717235,0.735282,0.045569,0.0,0.0,40,320,445,111,50,10.5,76806,0.167364,0.020921,0.026151,0.214435,3360.0
1,1001020200,1,0,0,2170,0.222779,0.078728,0.2884,0.238209,0.046297,0.0,0.0,65,682,597,89,115,14.76,49191,0.314286,0.029954,0.052995,0.397235,10066.32
2,1001020300,1,0,0,3373,0.42036,0.112892,0.421844,0.432805,0.015252,0.0,0.0,111,1035,998,147,101,8.04,53342,0.306849,0.032908,0.029944,0.369701,8321.4
3,1001020400,1,0,0,4386,0.435311,0.070944,0.46346,0.413773,0.00228,0.0,0.0,116,961,1195,213,19,6.32,67292,0.219106,0.026448,0.004332,0.249886,6073.52
4,1001020500,1,0,1,10766,0.255773,0.034686,0.235845,0.331996,0.014069,0.0,0.0,140,1427,3012,197,223,5.96,80176,0.132547,0.013004,0.020713,0.166264,8504.92


In [137]:
numeric_df = processed_atlas_2010.select_dtypes(include=[np.number])

# Check for infinite values in numeric columns
print(np.isinf(numeric_df).sum())

Urban                  0
LowIncomeTracts        0
HUNVFlag               0
POP2010                0
lapop1share            0
lalowi1share           0
lakids1share           0
laseniors1share        0
lahunv1share           0
lalowi10share          0
lalowi20share          0
TractSNAP              0
TractLOWI              0
TractKids              0
TractSeniors           0
TractHUNV              0
PovertyRate            0
MedianFamilyIncome     0
LOWIRatio              0
SNAPRatio              0
HUNVRatio              0
FoodInsecurityIndex    0
LOWIWeighted           0
dtype: int64


In [144]:
scaled_processed_atlas_fifteen = scaler.fit_transform(processed_atlas_fifteen)
scaled_processed_atlas_2010 = scaler.fit_transform(processed_atlas_2010)

In [145]:
unaligned_atlas_nineteen = pd.DataFrame(scaled_processed_atlas_nineteen, columns=[processed_atlas_nineteen.columns])
unaligned_atlas_fifteen = pd.DataFrame(scaled_processed_atlas_fifteen, columns=[processed_atlas_fifteen.columns])
unaligned_atlas_2010 = pd.DataFrame(scaled_processed_atlas_2010, columns=processed_atlas_2010.columns)
unaligned_atlas_2010.head()

Unnamed: 0,CensusTract,Urban,LowIncomeTracts,HUNVFlag,POP2010,lapop1share,lalowi1share,lakids1share,laseniors1share,lahunv1share,lalowi10share,lalowi20share,TractSNAP,TractLOWI,TractKids,TractSeniors,TractHUNV,PovertyRate,MedianFamilyIncome,LOWIRatio,SNAPRatio,HUNVRatio,FoodInsecurityIndex,LOWIWeighted
0,0.0,1.0,0.0,0.0,0.051027,0.70998,0.128283,0.717235,0.735282,0.045569,0.0,0.0,0.021368,0.026418,0.048041,0.02401,0.008186,0.105,0.99974,0.004782,0.01569,0.009895,0.006127,0.005762
1,1.816698e-09,1.0,0.0,0.0,0.057916,0.222779,0.078728,0.2884,0.238209,0.046297,0.0,0.0,0.034722,0.056303,0.06445,0.019252,0.018828,0.1476,0.999699,0.00898,0.022465,0.020052,0.01135,0.017263
2,3.633395e-09,1.0,0.0,0.0,0.090038,0.42036,0.112892,0.421844,0.432805,0.015252,0.0,0.0,0.059295,0.085445,0.10774,0.031798,0.016536,0.0804,0.999705,0.008767,0.024681,0.01133,0.010563,0.014271
3,5.450093e-09,1.0,0.0,0.0,0.117086,0.435311,0.070944,0.46346,0.413773,0.00228,0.0,0.0,0.061966,0.079336,0.129008,0.046074,0.003111,0.0632,0.999726,0.00626,0.019836,0.001639,0.00714,0.010416
4,7.266791e-09,1.0,0.0,1.0,0.287442,0.255773,0.034686,0.235845,0.331995,0.014069,0.0,0.0,0.074786,0.117807,0.325165,0.042613,0.036509,0.0596,0.999745,0.003787,0.009753,0.007837,0.00475,0.014585


In [153]:
print(len(unaligned_atlas_2010))
print(len(unaligned_atlas_fifteen))
print(len(unaligned_atlas_nineteen))

72531
72531
72531


Given our length MIGHT be different (in the sense certain tracts might misalign/not exist), let's loop through our dataframes and remove any census tracts whose index doesn't exist in the other dataset.

In [162]:
def align_dataframes_by_index(*dfs):
    common_index = dfs[0].index
    for df in dfs[1:]:
        common_index = common_index.intersection(df.index)
    aligned_dfs = tuple(df.loc[common_index].copy() for df in dfs)
    return aligned_dfs

In [163]:
final_atlas_ten, final_atlas_fifteen, final_atlas_nineteen = align_dataframes_by_index(unaligned_atlas_2010, unaligned_atlas_fifteen, unaligned_atlas_nineteen)

In [164]:
print(len(final_atlas_ten))
print(len(final_atlas_fifteen))
print(len(final_atlas_nineteen))
print(final_atlas_ten.index.equals(final_atlas_fifteen.index))

72531
72531
72531
True


Alignment complete (for certain, I know the length was the same, but it's better to be safe than sorry)!

In [161]:
print(f'2019 atlas data: {final_atlas_nineteen.head()}')
print(f'2015 atlas data: {final_atlas_fifteen.head()}')
print(f'2010 atlas data: {final_atlas_ten.head()}')

2019 atlas data:     CensusTract Urban PovertyRate MedianFamilyIncome TractLOWI TractKids  \
0  0.000000e+00   1.0       0.113           0.318183  0.036220  0.042803   
1  1.816698e-09   1.0       0.179           0.187881  0.063843  0.051161   
2  3.633395e-09   1.0       0.150           0.242867  0.103964  0.075475   
3  5.450093e-09   1.0       0.028           0.275182  0.073396  0.085690   
4  7.266791e-09   1.0       0.152           0.379128  0.178475  0.266948   

  TractSeniors TractHUNV TractSNAP lapop1share lakids1share laseniors1share  \
0     0.012796  0.000990  0.046897      0.9919     0.289978          0.1144   
1     0.012391  0.014689  0.071724      0.5811     0.205837          0.0583   
2     0.025418  0.016339  0.079080      0.4600     0.135903          0.0596   
3     0.052342  0.003466  0.045057      0.3109     0.086894          0.0539   
4     0.065196  0.037960  0.155862      0.2455     0.073128          0.0336   

  lahunv1share lalowi1share   Pop2010 lasnap1share 

##### WOOHOO
Feature engineering finished!

Look through these statistics if you want, but the scaled similarities almost guarantee a completely fine dataset for our Random forest model.

In [166]:
final_atlas_ten.to_csv('../data/processed/processed_ten.csv')
final_atlas_fifteen.to_csv('../data/processed/processed_atlas_fifteen.csv')
final_atlas_nineteen.to_csv('../data/processed/processed_atlas_nineteen.csv')