# Processing & Pruning Our Data

## Import our dependencies

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import os
from sklearn.impute import SimpleImputer
import json
import seaborn as sns

## Load our datasets

In [2]:
atlas_2015_path = '../data/raw/Raw_Atlas_2015/Raw_Atlas_Data_2015.xlsx'
atlas_2015_xl = pd.ExcelFile(atlas_2015_path)
atlas_2010_path = '../data/raw/Raw_Atlas_2010/Raw_Atlas_Data_2010.xlsx'
atlas_2010_xl = pd.ExcelFile(atlas_2010_path)

atlas_2019 = pd.read_csv('../data/raw/Raw_Atlas_2019/Raw_Atlas_Data_2019.csv')
atlas_2015 = atlas_2015_xl.parse('Food Access Research Atlas')
atlas_2010 = atlas_2010_xl.parse('Food Access Research Atlas')

## Processing 2019 data for our present-day classification model

The 100+ columns in our table are often redundant and useless for training our model.

Let's keep the fundamentals, along with some general data buffer zones for identifying relative isolation + outliers.

In [56]:
keep_cols_2019 = [
    'Urban', 'PovertyRate', 'MedianFamilyIncome',
    'TractLOWI', 'TractKids', 'TractSeniors', 'TractHUNV', 'TractSNAP', 
]

buffer_cols_2019 = [
    'lapop1share', 'lakids1share', 'laseniors1share',
    'lahunv1share', 'lalowi1share'
]

#### Quick definitions for our data:

##### Keep_cols
CensusTract â€“ Census tract number

Urban â€“ Flag for urban tract

PovertyRate â€“ Share of the tract population living with income at or below the Federal poverty thresholds for family size

MedianFamilyIncome â€“ Median family income in the tract

TractLOWI â€“ Total count of low-income individuals in the tract

TractKids â€“ Total count of children (age 0â€“17) in the tract

TractSeniors â€“ Total count of seniors (age 65+) in the tract

TractHUNV â€“ Total count of housing units without a vehicle in the tract

TractSNAP â€“ Total count of housing units receiving SNAP benefits in the tract

##### Buffer_cols
lapop1share â€“ Share of tract population beyond 1 mile from a supermarket

lakids1share â€“ Share of children (age 0â€“17) beyond 1 mile from a supermarket

laseniors1share â€“ Share of seniors (age 65+) beyond 1 mile from a supermarket

lahunv1share â€“ Share of housing units without a vehicle that are beyond 1 mile from a supermarket

lalowi1share â€“ Share of low-income individuals beyond 1 mile from a supermarket

In [57]:
scale_cols = [
    "MedianFamilyIncome", "TractLOWI", "TractKids", "TractSeniors",
    "TractHUNV", "TractSNAP", "LOWIWeighted"
]

### Calculating & fixing missing data values within our set

In [58]:
processed_atlas_2019 = atlas_2019[keep_cols_2019 + buffer_cols_2019 + ['Pop2010']].copy()
processed_atlas_2019['CensusTract'] = atlas_2019['CensusTract']
nan_count_2019 = processed_atlas_2019.isna().sum()
print(nan_count_2019)

Urban                     0
PovertyRate               3
MedianFamilyIncome      748
TractLOWI                 4
TractKids                 4
TractSeniors              4
TractHUNV                 4
TractSNAP                 4
lapop1share           19989
lakids1share          19989
laseniors1share       19989
lahunv1share          19966
lalowi1share          19989
Pop2010                   0
CensusTract               0
dtype: int64


In [59]:
processed_atlas_2019 = processed_atlas_2019[processed_atlas_2019['Pop2010'] != 0].reset_index(drop=True)
processed_atlas_2019.head()

Unnamed: 0,Urban,PovertyRate,MedianFamilyIncome,TractLOWI,TractKids,TractSeniors,TractHUNV,TractSNAP,lapop1share,lakids1share,laseniors1share,lahunv1share,lalowi1share,Pop2010,CensusTract
0,1,11.3,81250.0,455.0,507.0,221.0,6.0,102.0,99.19,26.33,11.44,0.79,24.11,1912,1001020100
1,1,17.9,49000.0,802.0,606.0,214.0,89.0,156.0,58.11,18.69,5.83,9.0,27.83,2170,1001020200
2,1,15.0,62609.0,1306.0,894.0,439.0,99.0,172.0,46.0,12.34,5.96,0.0,14.18,3373,1001020300
3,1,2.8,70607.0,922.0,1015.0,904.0,21.0,98.0,31.09,7.89,5.39,0.46,7.83,4386,1001020400
4,1,15.2,96334.0,2242.0,3162.0,1126.0,230.0,339.0,24.55,6.64,3.36,1.35,5.45,10766,1001020500


We can confidently drop data where population = 0, given that these unpopulated tracts cannot be considered on any scale of being a food desert!

In [60]:
imputer = SimpleImputer(strategy='median')
processed_atlas_2019[keep_cols_2019] = imputer.fit_transform(processed_atlas_2019[keep_cols_2019])

In [61]:
rural_missing_ratio_2019 = processed_atlas_2019[processed_atlas_2019['Urban'] == 0][buffer_cols_2019].isna().mean()
urban_missing_ratio_2019 = processed_atlas_2019[processed_atlas_2019['Urban'] == 1][buffer_cols_2019].isna().mean()

print("Rural missing fraction:\n", rural_missing_ratio_2019)
print("Urban missing fraction:\n", urban_missing_ratio_2019)

Rural missing fraction:
 lapop1share        0.000346
lakids1share       0.000346
laseniors1share    0.000346
lahunv1share       0.000346
lalowi1share       0.000346
dtype: float64
Urban missing fraction:
 lapop1share        0.362214
lakids1share       0.362214
laseniors1share    0.362214
lahunv1share       0.361797
lalowi1share       0.362214
dtype: float64


When observing the data, we can see that our buffer zones typically return NaN when in an urban environment.

Realistically, there seems to be no reason to NOT impute our missing urban values.
Urban environments realistically do not come across notable volatilities that would return an entirely new result if not tracked properly in our database.
As a result, median imputation will be used for these values as well, especially considering that we can use the respective ~ 60% (~ 48k) stats as reference values.

In [62]:
processed_atlas_2019[buffer_cols_2019] = imputer.fit_transform(processed_atlas_2019[buffer_cols_2019])
nan_count_2019 = processed_atlas_2019.isna().sum()
print(nan_count_2019)

Urban                 0
PovertyRate           0
MedianFamilyIncome    0
TractLOWI             0
TractKids             0
TractSeniors          0
TractHUNV             0
TractSNAP             0
lapop1share           0
lakids1share          0
laseniors1share       0
lahunv1share          0
lalowi1share          0
Pop2010               0
CensusTract           0
dtype: int64


All cleaned up - no NaN values!

### Additional features for our 2019 model

In [63]:
processed_atlas_2019['LOWIRatio'] = processed_atlas_2019['TractLOWI'] / processed_atlas_2019['Pop2010']
processed_atlas_2019['SNAPRatio'] = processed_atlas_2019['TractSNAP'] / processed_atlas_2019['Pop2010']
processed_atlas_2019['HUNVRatio'] = processed_atlas_2019['TractHUNV'] / processed_atlas_2019['Pop2010']
processed_atlas_2019['FoodInsecurityIndex'] = (
    processed_atlas_2019['LOWIRatio'] +
    processed_atlas_2019['SNAPRatio'] +
    processed_atlas_2019['HUNVRatio']
)
processed_atlas_2019['LOWIWeighted'] = processed_atlas_2019['TractLOWI'] * processed_atlas_2019['PovertyRate']

#### What do these features mean?
LOWIRatio â€“ The proportion of the tract population that is low-income, calculated as TractLOWI divided by Pop2010

SNAPRatio â€“ The proportion of the tract population living in households receiving SNAP benefits, calculated as TractSNAP divided by Pop2010

HUNVRatio â€“ The proportion of the tract population living in households without a vehicle, calculated as TractHUNV divided by Pop2010

FoodInsecurityIndex â€“ A composite index indicating overall food insecurity, calculated as the sum of LOWIRatio, SNAPRatio, and HUNVRatio

LOWIWeighted â€“ A weighted measure of low-income population, calculated as TractLOWI multiplied by PovertyRate to account for both count and depth of poverty

Our weighted low income emphasizes tracts with both a high number of low-income people and a high poverty rate, creating more granular prioritization for our model 

(Overcomplicated termðŸ˜…, but means to break down priorities into very detailed or specific parts to focus on what matters most at a fine level)

### Scaling our 2019 data before saving to be processed later

In [64]:
scaler = MinMaxScaler()
scaled_processed_atlas_2019 = processed_atlas_2019.copy()
scale_cols_2019 = scale_cols + ['Pop2010']
scaled_values = scaler.fit_transform(scaled_processed_atlas_2019[scale_cols_2019])
scaled_processed_atlas_2019[scale_cols_2019] = scaled_values
scaled_processed_atlas_2019.head()

Unnamed: 0,Urban,PovertyRate,MedianFamilyIncome,TractLOWI,TractKids,TractSeniors,TractHUNV,TractSNAP,lapop1share,lakids1share,laseniors1share,lahunv1share,lalowi1share,Pop2010,CensusTract,LOWIRatio,SNAPRatio,HUNVRatio,FoodInsecurityIndex,LOWIWeighted
0,1.0,11.3,0.318183,0.03622,0.042803,0.012796,0.00099,0.046897,99.19,26.33,11.44,0.79,24.11,0.051027,1001020100,0.237971,0.053347,0.003138,0.294456,0.006762
1,1.0,17.9,0.187881,0.063843,0.051161,0.012391,0.014689,0.071724,58.11,18.69,5.83,9.0,27.83,0.057916,1001020200,0.369585,0.071889,0.041014,0.482488,0.018882
2,1.0,15.0,0.242867,0.103964,0.075475,0.025418,0.016339,0.07908,46.0,12.34,5.96,0.0,14.18,0.090038,1001020300,0.387192,0.050993,0.029351,0.467536,0.025766
3,1.0,2.8,0.275182,0.073396,0.08569,0.052342,0.003466,0.045057,31.09,7.89,5.39,0.46,7.83,0.117086,1001020400,0.210214,0.022344,0.004788,0.237346,0.003395
4,1.0,15.2,0.379128,0.178475,0.266948,0.065196,0.03796,0.155862,24.55,6.64,3.36,1.35,5.45,0.287442,1001020500,0.208248,0.031488,0.021364,0.2611,0.044822


I chose to use sklearn's MinMaxScaler() module over the StandardScaler() module for this project.

Given our present-day forecasting model will be using a ReLU for the hidden layer activation function, MinMax scaling shines in this aspect.

Our K-Means clustering would work fine with either choice. Given that the feature values in this dataset (proportions, demographic shares, poverty rates) are naturally bounded and positive, MinMaxScaler could also be seen as a "more natural" fit.

## Processing our 2015 & 2010 data

### Running through our 2015 data (via the same process as done w/ 2019)

In [65]:
processed_atlas_2015 = atlas_2015[keep_cols_2019 + buffer_cols_2019 + ['POP2010']].copy()
processed_atlas_2015.head()

Unnamed: 0,Urban,PovertyRate,MedianFamilyIncome,TractLOWI,TractKids,TractSeniors,TractHUNV,TractSNAP,lapop1share,lakids1share,laseniors1share,lahunv1share,lalowi1share,POP2010
0,1,10.0,74750,448,507,221,26,112,0.70998,0.190187,0.084988,0.014102,0.168454,1912
1,1,18.2,51875,763,606,214,87,202,0.222779,0.080539,0.023492,0.029123,0.066885,2170
2,1,19.1,52905,1578,894,439,108,120,0.42036,0.111808,0.05633,0.010594,0.206533,3373
3,1,3.3,68079,1241,1015,904,19,82,0.310868,0.07888,0.053928,0.0051,0.093385,4386
4,1,8.5,77819,2692,3162,1126,198,488,0.245504,0.066371,0.033594,0.01094,0.057874,10766


The 2015 Atlas uses the naming convention 'POP2010' opposed to the 2019 variation of 'Pop2010,' which is also why we add 'Pop2010' in our 2019 processes, we need to keep the little naming differences separate, but use the similarities both sets have. 


In [66]:
processed_atlas_2015[keep_cols_2019 + buffer_cols_2019] = imputer.fit_transform(processed_atlas_2015[keep_cols_2019 + buffer_cols_2019])
processed_atlas_2015['CensusTract'] = atlas_2015['CensusTract']
nan_count_2015 = processed_atlas_2015.isna().sum()
processed_atlas_2015.head()

Unnamed: 0,Urban,PovertyRate,MedianFamilyIncome,TractLOWI,TractKids,TractSeniors,TractHUNV,TractSNAP,lapop1share,lakids1share,laseniors1share,lahunv1share,lalowi1share,POP2010,CensusTract
0,1.0,10.0,74750.0,448.0,507.0,221.0,26.0,112.0,0.70998,0.190187,0.084988,0.014102,0.168454,1912,1001020100
1,1.0,18.2,51875.0,763.0,606.0,214.0,87.0,202.0,0.222779,0.080539,0.023492,0.029123,0.066885,2170,1001020200
2,1.0,19.1,52905.0,1578.0,894.0,439.0,108.0,120.0,0.42036,0.111808,0.05633,0.010594,0.206533,3373,1001020300
3,1.0,3.3,68079.0,1241.0,1015.0,904.0,19.0,82.0,0.310868,0.07888,0.053928,0.0051,0.093385,4386,1001020400
4,1.0,8.5,77819.0,2692.0,3162.0,1126.0,198.0,488.0,0.245504,0.066371,0.033594,0.01094,0.057874,10766,1001020500


Just like with our 2019 set, we'll impute some of our missing features.

In [67]:
processed_atlas_2015['LOWIRatio'] = processed_atlas_2015['TractLOWI'] / processed_atlas_2015['POP2010'] # Low income percentage
processed_atlas_2015['SNAPRatio'] = processed_atlas_2015['TractSNAP'] / processed_atlas_2015['POP2010'] # Percentage of residents receiving SNAP
processed_atlas_2015['HUNVRatio'] = processed_atlas_2015['TractHUNV'] / processed_atlas_2015['POP2010'] # Percentage of residents without a vehicle (important for food deserts!)
processed_atlas_2015['FoodInsecurityIndex'] = (
    processed_atlas_2015['LOWIRatio'] +
    processed_atlas_2015['SNAPRatio'] +
    processed_atlas_2015['HUNVRatio']
)
processed_atlas_2015['LOWIWeighted'] = processed_atlas_2015['TractLOWI'] * processed_atlas_2015['PovertyRate']
nan_count_2015 = processed_atlas_2015.isna().sum()
print(nan_count_2015)

Urban                    0
PovertyRate              0
MedianFamilyIncome       0
TractLOWI                0
TractKids                0
TractSeniors             0
TractHUNV                0
TractSNAP                0
lapop1share              0
lakids1share             0
laseniors1share          0
lahunv1share             0
lalowi1share             0
POP2010                  0
CensusTract              0
LOWIRatio              333
SNAPRatio              333
HUNVRatio              333
FoodInsecurityIndex    333
LOWIWeighted             0
dtype: int64


Basic feature engineering as seen in the 2019 set. 

In [68]:
print((processed_atlas_2015['POP2010'] == 0).sum())
processed_atlas_2015 = processed_atlas_2015[processed_atlas_2015['POP2010'] != 0].reset_index(drop=True) # Let's drop this data

333


Weirdly enough, some of our census tracts seem to have a population of 0 (We briefly touched upon this in our 2019 processing)!

While this may seem confusing initially, it is important to remember that our atlas data may contain non-residential or industrial census tracts, along with even group quarters-only areas (military bases, prisons, or college dorms). 

Given we can't run through this data, let's drop it.

In [69]:
print(nan_count_2015)

Urban                    0
PovertyRate              0
MedianFamilyIncome       0
TractLOWI                0
TractKids                0
TractSeniors             0
TractHUNV                0
TractSNAP                0
lapop1share              0
lakids1share             0
laseniors1share          0
lahunv1share             0
lalowi1share             0
POP2010                  0
CensusTract              0
LOWIRatio              333
SNAPRatio              333
HUNVRatio              333
FoodInsecurityIndex    333
LOWIWeighted             0
dtype: int64


You might've noticed some of our calculated ratios also returned a NaN value.

Given that these features were created relative to population, the reason why 333 of them returned NaN is due to a division by 0 error.
By not considering the tracts with a population of 0, we in turn trash NaN errors occurring within our ratios.

##### TL;DR
Populations of 0 and NaNs in our ratio values come bundled together due to divison by 0. 

### Running through our 2010 data

In [70]:
pd.set_option('display.max_columns', None)
atlas_2010.head()

Unnamed: 0,CensusTract,State,County,LILATracts_1And10,LILATracts_halfAnd10,LILATracts_1And20,LILATracts_Vehicle,Urban,Rural,LA1and10,LAhalfand10,LA1and20,LATracts_half,LATracts1,LATracts10,LATracts20,LATractsVehicle_20,HUNVFlag,GroupQuartersFlag,OHU2010,NUMGQTRS,PCTGQTRS,LowIncomeTracts,POP2010,UATYP10,lapophalf,lapophalfshare,lalowihalf,lalowihalfshare,lakidshalf,lakidshalfshare,laseniorshalf,laseniorshalfshare,lahunvhalf,lahunvhalfshare,lapop1,lapop1share,lalowi1,lalowi1share,lakids1,lakids1share,laseniors1,laseniors1share,lahunv1,lahunv1share,lapop10,lapop10share,lalowi10,lalowi10share,lakids10,lakids10share,laseniors10,laseniors10share,lahunv10,lahunv10share,lapop20,lapop20share,lalowi20,lalowi20share,lakids20,lakids20share,laseniors20,laseniors20share,lahunv20,lahunv20share
0,1001020100,AL,Autauga,0,0,0,0,1,0,1,1,1,1,1,0,0,0,0,0,693,0.0,0.0,0,1912,U,1732.225468,0.905976,306.546737,0.160328,466.426429,0.919973,198.82822,0.899675,44.2121,0.063798,1357.48094,0.70998,245.277225,0.128283,363.638381,0.717235,162.497246,0.735282,31.579173,0.045569,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1001020200,AL,Autauga,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,743,181.0,0.08341,0,2170,U,1410.374828,0.649942,484.905037,0.223459,448.163512,0.739544,139.30539,0.65096,86.423433,0.116317,483.429683,0.222779,170.838823,0.078728,174.770469,0.2884,50.976822,0.238209,34.39859,0.046297,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1001020300,AL,Autauga,0,0,0,0,1,0,1,1,1,1,1,0,0,0,0,0,1256,0.0,0.0,0,3373,U,2764.604126,0.819628,773.419284,0.229297,744.891575,0.833212,346.203097,0.788618,54.188593,0.043144,1417.874893,0.42036,380.78629,0.112892,377.128132,0.421844,190.00148,0.432805,19.156371,0.015252,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1001020400,AL,Autauga,0,0,0,0,1,0,1,1,1,1,1,0,0,0,0,0,1722,0.0,0.0,0,4386,U,4272.112205,0.974034,874.067405,0.199286,980.143479,0.965659,892.805993,0.987617,16.964191,0.009851,1909.275364,0.435311,311.160977,0.070944,470.411544,0.46346,374.051202,0.413773,3.926144,0.00228,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1001020500,AL,Autauga,0,0,0,0,1,0,1,1,1,1,1,0,0,1,1,0,4082,181.0,0.016812,0,10766,U,7798.99399,0.72441,1131.052984,0.105058,2314.376847,0.731934,843.200608,0.748846,177.089308,0.043383,2753.648392,0.255773,373.426978,0.034686,745.740558,0.235845,373.826962,0.331996,57.430973,0.014069,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


This sucks! 
It seems like our 2010 atlas is missing SNAP, Seniors, Kids, and Income breakdowns at the tract level. 

All of these are present in both our 2015 & 2019 models.
    
To recap our model necessities, our random forest forecasing model need to predict the 'Food Desert' status of a tract given fed labels supplied by our autoencoder/k-means clustering model.

For the purposes of this project, we will need to find a reasonable workaround to somehow fetch these features while not skewing data at the same time.

### A brief detour
The biggest time sink in this whole project was me trying to conceive of a way to impute missing data.

I threw a lot of ideas at this problem: linear regression, ridge regression, and even my own weighted sum formulas using location data.

But honestly, it turned out to be a waste of time. Thereâ€™s just way too much natural volatility to make predictions based on regression between two past datasetsðŸ˜­. (I used 2019 and 2015 as reference points, and it failed miserably, especially for tracts with ANY kind of extreme stats.)

I also tried location-based imputation, but that flopped too. For a bunch (a majority, sadly) of tract measurements beyond 1 mile, Iâ€™d just get zeros. And when I did limit it to a 1-mile radius, the results were still very unpredictable and all over the place. 

Definitely a tough lesson learned!

In [71]:
keep_cols_2010 = [
    'CensusTract',
    'Urban',
    'LowIncomeTracts',
    'HUNVFlag',
    'POP2010'
]
buffer_cols_2010 = [
    'lapop1share',
    'lalowi1share',
    'lakids1share',
    'laseniors1share',
    'lahunv1share',
]

These are basic features which are shared between all of our datasets.
Asides from these, the similarities end.

In [72]:
pruned_atlas_2010 = atlas_2010[keep_cols_2010 + buffer_cols_2010].copy()

#### Feature engineering our missing columns
(A.K.A just using the Census API to fill in food atlas gaps from our imported CSVs)

##### PovertyRate + MedianFamilyIncome

In [20]:
import requests
from dotenv import load_dotenv
import os

load_dotenv()

API_KEY = os.getenv("CENSUS_API_KEY")

In [21]:
def fetch_census_tract_data_first(state_index: str, API_KEY: str):
    BASE_URL = "https://api.census.gov/data/2010/acs/acs5"
    # B17001_001E: Population for whom poverty status is determined
    # B17001_002E: Population below poverty level
    # B19113_001E: Median Family Income
    FIELDS = ["B17001_001E", "B17001_002E", "B19113_001E"]  
    GEOGRAPHY = "tract:*"
    
    params = {
        "get": ",".join(FIELDS),
        "for": GEOGRAPHY,
        "in": f"state:{state_index}",
        "key": API_KEY
    }

    response = requests.get(BASE_URL, params=params)

    if response.status_code != 200:
        print(f"API request failed with status code {response.status_code} for state {state_index}.")
        return None

    try:
        data = response.json()
    except requests.exceptions.JSONDecodeError as e:
        print(f"JSON parsing failed for state {state_index}: {e}")
        return None

    if not data or len(data) < 2:
        print(f"No data returned or unexpected response format for state {state_index}.")
        return None

    columns = data[0]
    values = data[1:]
    df = pd.DataFrame(values, columns=columns)
    
    geo_cols = ['state', 'county', 'tract']
    if all(col in df.columns for col in geo_cols):
        df["CensusTract"] = df["state"] + df["county"] + df["tract"]
        df = df.set_index("CensusTract")
    else:
        print(f"Warning: Missing geo columns in response for state {state_index}.")
        return None

    df[FIELDS] = df[FIELDS].apply(pd.to_numeric, errors='coerce')

    df['PovertyRate'] = 0.0
    valid_mask = (df["B17001_001E"] > 0) & (df["B17001_001E"].notna())
    df.loc[valid_mask, 'PovertyRate'] = (df.loc[valid_mask, "B17001_002E"] / df.loc[valid_mask, "B17001_001E"]) * 100
    df['PovertyRate'] = df['PovertyRate'].round(2)

    df.rename(columns={
        "B19113_001E": "MedianFamilyIncome"
    }, inplace=True)

    result = df[["PovertyRate", "MedianFamilyIncome"]]

    return result


In [22]:
state_abbrev_to_fips = {
    'AL': '01', 'AK': '02', 'AZ': '04', 'AR': '05', 'CA': '06',
    'CO': '08', 'CT': '09', 'DE': '10', 'DC': '11', 'FL': '12',
    'GA': '13', 'HI': '15', 'ID': '16', 'IL': '17', 'IN': '18',
    'IA': '19', 'KS': '20', 'KY': '21', 'LA': '22', 'ME': '23',
    'MD': '24', 'MA': '25', 'MI': '26', 'MN': '27', 'MS': '28',
    'MO': '29', 'MT': '30', 'NE': '31', 'NV': '32', 'NH': '33',
    'NJ': '34', 'NM': '35', 'NY': '36', 'NC': '37', 'ND': '38',
    'OH': '39', 'OK': '40', 'OR': '41', 'PA': '42', 'RI': '44',
    'SC': '45', 'SD': '46', 'TN': '47', 'TX': '48', 'UT': '49',
    'VT': '50', 'VA': '51', 'WA': '53', 'WV': '54', 'WI': '55',
    'WY': '56'
}

In [23]:
tract_dfs_2010 = []
for abbrev in atlas_2010["State"].unique():
    state_fips = state_abbrev_to_fips.get(abbrev)
    if not state_fips:
        print(f"Unknown state abbreviation: {abbrev}")
        continue

    print(f"Fetching for state: {abbrev} (FIPS: {state_fips})")

    try:
        tract_df_2010 = fetch_census_tract_data_first(state_fips, API_KEY)
        tract_dfs_2010.append(tract_df_2010)
    except Exception as e:
        print(f"Failed for state {abbrev}: {e}")

Fetching for state: AL (FIPS: 01)
Fetching for state: AK (FIPS: 02)
Fetching for state: AZ (FIPS: 04)
Fetching for state: AR (FIPS: 05)
Fetching for state: CA (FIPS: 06)
Fetching for state: CO (FIPS: 08)
Fetching for state: CT (FIPS: 09)
Fetching for state: DE (FIPS: 10)
Fetching for state: DC (FIPS: 11)
Fetching for state: FL (FIPS: 12)
Fetching for state: GA (FIPS: 13)
Fetching for state: HI (FIPS: 15)
Fetching for state: ID (FIPS: 16)
Fetching for state: IL (FIPS: 17)
Fetching for state: IN (FIPS: 18)
Fetching for state: IA (FIPS: 19)
Fetching for state: KS (FIPS: 20)
Fetching for state: KY (FIPS: 21)
Fetching for state: LA (FIPS: 22)
Fetching for state: ME (FIPS: 23)
Fetching for state: MD (FIPS: 24)
Fetching for state: MA (FIPS: 25)
Fetching for state: MI (FIPS: 26)
Fetching for state: MN (FIPS: 27)
Fetching for state: MS (FIPS: 28)
Fetching for state: MO (FIPS: 29)
Fetching for state: MT (FIPS: 30)
Fetching for state: NE (FIPS: 31)
Fetching for state: NV (FIPS: 32)
Fetching for s

In [24]:
fetched_tract_data_2010_first = pd.concat(tract_dfs_2010).sort_index()
fetched_tract_data_2010_first.head()

Unnamed: 0_level_0,PovertyRate,MedianFamilyIncome
CensusTract,Unnamed: 1_level_1,Unnamed: 2_level_1
1001020100,10.5,76806
1001020200,14.76,49191
1001020300,8.04,53342
1001020400,6.32,67292
1001020500,5.96,80176


##### PovertyRate, MedianFamilyIncome, 

In [25]:
def fetch_tract_demographics_second(state_index: str, API_KEY: str):
    BASE_URL = "https://api.census.gov/data/2010/acs/acs5"

    FIELDS = [
        # SNAP
        "B22003_001E", "B22003_002E",  # total households, SNAP households
        # LOWI (income-to-poverty ratio)
        "C17002_001E", "C17002_002E", "C17002_003E", "C17002_004E",
        "C17002_005E", "C17002_006E", "C17002_007E",
        # Total population
        "B01001_001E",
        # Kids (male & female, age 0â€“17)
        "B01001_003E", "B01001_004E", "B01001_005E", "B01001_006E",
        "B01001_027E", "B01001_028E", "B01001_029E", "B01001_030E",
        # Seniors (male & female, 65+)
        "B01001_020E", "B01001_021E", "B01001_044E", "B01001_045E",
        # No Vehicle
        "B08201_001E", "B08201_002E"
    ]

    params = {
        "get": ",".join(FIELDS),
        "for": "tract:*",
        "in": f"state:{state_index}",
        "key": API_KEY
    }

    try:
        response = requests.get(BASE_URL, params=params)
    except requests.exceptions.RequestException as e:
        print(f"Request exception for state {state_index}: {e}")
        return None

    if response.status_code != 200:
        print(f"API request failed with status code {response.status_code} for state {state_index}.")
        try:
            error_info = response.json()
            print(f"API error message for state {state_index}: {error_info}")
        except Exception:
            print(f"API response text for state {state_index}: {response.text}")
        return None

    try:
        data = response.json()
    except requests.exceptions.JSONDecodeError as e:
        print(f"JSON parsing failed for state {state_index}: {e}")
        print(f"Response content was: {response.text}")
        return None

    if not data or len(data) < 2:
        print(f"No data returned or unexpected response format for state {state_index}. Data: {data}")
        return None

    df = pd.DataFrame(data[1:], columns=data[0])

    required_geo = ["state", "county", "tract"]
    if not all(col in df.columns for col in required_geo):
        print(f"Missing required geography columns for state {state_index}. Columns found: {df.columns.tolist()}")
        return None

    df["CensusTract"] = df["state"] + df["county"] + df["tract"]
    df = df.set_index("CensusTract")

    df = df.apply(pd.to_numeric, errors='coerce')

    df["Raw_SNAP_Households_WithBenefits"] = df["B22003_002E"]
    df["Raw_LOWI_BelowPoverty_Sum"] = df[["C17002_002E", "C17002_003E", "C17002_004E",
                                           "C17002_005E", "C17002_006E", "C17002_007E"]].sum(axis=1)

    kids_cols = ["B01001_003E", "B01001_004E", "B01001_005E", "B01001_006E",
                 "B01001_027E", "B01001_028E", "B01001_029E", "B01001_030E"]
    df["Raw_Kids_Count"] = df[kids_cols].sum(axis=1)

    senior_cols = ["B01001_020E", "B01001_021E", "B01001_044E", "B01001_045E"]
    df["Raw_Seniors_Count"] = df[senior_cols].sum(axis=1)
    df["Raw_HUNV_Households_NoVehicle"] = df["B08201_002E"]

    return df[[
        "Raw_SNAP_Households_WithBenefits", "Raw_LOWI_BelowPoverty_Sum",
        "Raw_Kids_Count", "Raw_Seniors_Count", "Raw_HUNV_Households_NoVehicle"
    ]]


In [26]:
demographic_dfs_2010 = []

for abbrev in atlas_2010["State"].unique():
    state_fips = state_abbrev_to_fips.get(abbrev)
    if not state_fips:
        print(f"Unknown state abbreviation: {abbrev}")
        continue

    print(f"Fetching for state: {abbrev} (FIPS: {state_fips})")

    try:
        demographic_df_2010 = fetch_tract_demographics_second(state_fips, API_KEY)
        demographic_dfs_2010.append(demographic_df_2010)
    except Exception as e:
        print(f"Failed for state {abbrev}: {e}")

Fetching for state: AL (FIPS: 01)
Fetching for state: AK (FIPS: 02)
Fetching for state: AZ (FIPS: 04)
Fetching for state: AR (FIPS: 05)
Fetching for state: CA (FIPS: 06)
Fetching for state: CO (FIPS: 08)
Fetching for state: CT (FIPS: 09)
Fetching for state: DE (FIPS: 10)
Fetching for state: DC (FIPS: 11)
Fetching for state: FL (FIPS: 12)
Fetching for state: GA (FIPS: 13)
Fetching for state: HI (FIPS: 15)
Fetching for state: ID (FIPS: 16)
Fetching for state: IL (FIPS: 17)
Fetching for state: IN (FIPS: 18)
Fetching for state: IA (FIPS: 19)
Fetching for state: KS (FIPS: 20)
Fetching for state: KY (FIPS: 21)
Fetching for state: LA (FIPS: 22)
Fetching for state: ME (FIPS: 23)
Fetching for state: MD (FIPS: 24)
Fetching for state: MA (FIPS: 25)
Fetching for state: MI (FIPS: 26)
Fetching for state: MN (FIPS: 27)
Fetching for state: MS (FIPS: 28)
Fetching for state: MO (FIPS: 29)
Fetching for state: MT (FIPS: 30)
Fetching for state: NE (FIPS: 31)
Fetching for state: NV (FIPS: 32)
Fetching for s

In [73]:
fetched_demographic_data_2010_second = pd.concat(demographic_dfs_2010).sort_index()
fetched_demographic_data_2010_second.head()

Unnamed: 0_level_0,Raw_SNAP_Households_WithBenefits,Raw_LOWI_BelowPoverty_Sum,Raw_Kids_Count,Raw_Seniors_Count,Raw_HUNV_Households_NoVehicle
CensusTract,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1001020100,40,320,445,111,50
1001020200,65,682,597,89,115
1001020300,111,1035,998,147,101
1001020400,116,961,1195,213,19
1001020500,140,1427,3012,197,223


In [74]:
all_demographic_data_2010_second = pd.concat(demographic_dfs_2010, ignore_index=False)
all_demographic_data_2010_second.reset_index(inplace=True)
all_demographic_data_2010_second.columns = ['CensusTract', 'TractSNAP', 'TractLOWI', 'TractKids', 'TractSeniors', 'TractHUNV']
all_demographic_data_2010_second.head()

Unnamed: 0,CensusTract,TractSNAP,TractLOWI,TractKids,TractSeniors,TractHUNV
0,1001020100,40,320,445,111,50
1,1001020200,65,682,597,89,115
2,1001020300,111,1035,998,147,101
3,1001020400,116,961,1195,213,19
4,1001020500,140,1427,3012,197,223


In [75]:
if fetched_tract_data_2010_first.index.name == "CensusTract":
    fetched_tract_data_2010_first = fetched_tract_data_2010_first.reset_index()
all_fetched_2010 = pd.merge(all_demographic_data_2010_second, fetched_tract_data_2010_first, on="CensusTract", how="left")

print(f'Fetched tract data (1st set): {fetched_tract_data_2010_first.head()}')
print(f'Fetched demographic data (2nd set): {all_demographic_data_2010_second.head()}')

all_fetched_2010.head()

Fetched tract data (1st set):    CensusTract  PovertyRate  MedianFamilyIncome
0  01001020100        10.50               76806
1  01001020200        14.76               49191
2  01001020300         8.04               53342
3  01001020400         6.32               67292
4  01001020500         5.96               80176
Fetched demographic data (2nd set):    CensusTract  TractSNAP  TractLOWI  TractKids  TractSeniors  TractHUNV
0  01001020100         40        320        445           111         50
1  01001020200         65        682        597            89        115
2  01001020300        111       1035        998           147        101
3  01001020400        116        961       1195           213         19
4  01001020500        140       1427       3012           197        223


Unnamed: 0,CensusTract,TractSNAP,TractLOWI,TractKids,TractSeniors,TractHUNV,PovertyRate,MedianFamilyIncome
0,1001020100,40,320,445,111,50,10.5,76806
1,1001020200,65,682,597,89,115,14.76,49191
2,1001020300,111,1035,998,147,101,8.04,53342
3,1001020400,116,961,1195,213,19,6.32,67292
4,1001020500,140,1427,3012,197,223,5.96,80176


In [76]:
nan_count_2010 = processed_atlas_2010.isna().sum()
print(nan_count_2010)

CensusTract            0
Urban                  0
LowIncomeTracts        0
HUNVFlag               0
POP2010                0
lapop1share            0
lalowi1share           0
lakids1share           0
laseniors1share        0
lahunv1share           0
TractSNAP              0
TractLOWI              0
TractKids              0
TractSeniors           0
TractHUNV              0
PovertyRate            0
MedianFamilyIncome     0
LOWIRatio              0
SNAPRatio              0
HUNVRatio              0
FoodInsecurityIndex    0
LOWIWeighted           0
dtype: int64


No NaNs in our 2010 dataframe!

### What does the above code do?

We've outlined that there are many features missing in our 2010 dataset that are present in both the 2019 and 2015 variants. 

Rather than making educated guesses on their values (which I've yapped on earlier in this notebook), we simply use the Census API to fill in the gaps.

We use two different fetching functions as to not overload the API, which has a limit on the requests made every second.

After our two different functions have run, we merge both of their returned datasets into the 'all_fetched_2010' dataframe.

In [87]:
pruned_atlas_2010['CensusTract'] = pruned_atlas_2010['CensusTract'].astype(str)
pruned_atlas_2010['CensusTract'] = pruned_atlas_2010['CensusTract'].str.zfill(11)
cols_to_drop = ['TractSNAP', 'TractLOWI', 'TractKids', 'TractSeniors', 'TractHUNV', 'PovertyRate', 'MedianFamilyIncome']
pruned_atlas_2010 = pruned_atlas_2010.drop(columns=cols_to_drop, errors='ignore')

processed_atlas_2010 = pd.merge(
    pruned_atlas_2010, all_fetched_2010, on='CensusTract', how='left'
)
processed_atlas_2010['LOWIRatio'] = processed_atlas_2010['TractLOWI'] / processed_atlas_2010['POP2010'] # Low income percentage
processed_atlas_2010['SNAPRatio'] = processed_atlas_2010['TractSNAP'] / processed_atlas_2010['POP2010'] # Percentage of residents receiving SNAP
processed_atlas_2010['HUNVRatio'] = processed_atlas_2010['TractHUNV'] / processed_atlas_2010['POP2010'] # Percentage of residents without a vehicle (important for food deserts!)
processed_atlas_2010['FoodInsecurityIndex'] = (
    processed_atlas_2010['LOWIRatio'] +
    processed_atlas_2010['SNAPRatio'] +
    processed_atlas_2010['HUNVRatio']
)
processed_atlas_2010['LOWIWeighted'] = processed_atlas_2010['TractLOWI'] * processed_atlas_2010['PovertyRate']
processed_atlas_2010 = processed_atlas_2010[processed_atlas_2010['POP2010'] != 0].reset_index(drop=True)
processed_atlas_2010["CensusTract"] = processed_atlas_2010["CensusTract"].astype(str).str.lstrip("0")
processed_atlas_2010.head()

Unnamed: 0,CensusTract,Urban,LowIncomeTracts,HUNVFlag,POP2010,lapop1share,lalowi1share,lakids1share,laseniors1share,lahunv1share,TractSNAP,TractLOWI,TractKids,TractSeniors,TractHUNV,PovertyRate,MedianFamilyIncome,LOWIRatio,SNAPRatio,HUNVRatio,FoodInsecurityIndex,LOWIWeighted
0,1001020100,1,0,0,1912,0.70998,0.128283,0.717235,0.735282,0.045569,40,320,445,111,50,10.5,76806,0.167364,0.020921,0.026151,0.214435,3360.0
1,1001020200,1,0,0,2170,0.222779,0.078728,0.2884,0.238209,0.046297,65,682,597,89,115,14.76,49191,0.314286,0.029954,0.052995,0.397235,10066.32
2,1001020300,1,0,0,3373,0.42036,0.112892,0.421844,0.432805,0.015252,111,1035,998,147,101,8.04,53342,0.306849,0.032908,0.029944,0.369701,8321.4
3,1001020400,1,0,0,4386,0.435311,0.070944,0.46346,0.413773,0.00228,116,961,1195,213,19,6.32,67292,0.219106,0.026448,0.004332,0.249886,6073.52
4,1001020500,1,0,1,10766,0.255773,0.034686,0.235845,0.331996,0.014069,140,1427,3012,197,223,5.96,80176,0.132547,0.013004,0.020713,0.166264,8504.92


As seen with our 2019 & 2015 dataframes, we feature engineer some basic ratios/factors into our dataframe.

Prior to doing this, in this cell, we merge our fetched census 2010 data with our base atlas data 2010.

In [88]:
numeric_df_inf_check_2019 = processed_atlas_2019.select_dtypes(include=[np.number])
numeric_df_inf_check_2015 = processed_atlas_2015.select_dtypes(include=[np.number])
numeric_df_inf_check_2010 = processed_atlas_2010.select_dtypes(include=[np.number])

# Check for infinite values in numeric columns
print(f'2019 infs: {np.isinf(numeric_df_inf_check_2019).sum()}')
print(f'2015 infs: {np.isinf(numeric_df_inf_check_2015).sum()}')
print(f'2010 infs: {np.isinf(numeric_df_inf_check_2010).sum()}')

2019 infs: Urban                  0
PovertyRate            0
MedianFamilyIncome     0
TractLOWI              0
TractKids              0
TractSeniors           0
TractHUNV              0
TractSNAP              0
lapop1share            0
lakids1share           0
laseniors1share        0
lahunv1share           0
lalowi1share           0
Pop2010                0
CensusTract            0
LOWIRatio              0
SNAPRatio              0
HUNVRatio              0
FoodInsecurityIndex    0
LOWIWeighted           0
dtype: int64
2015 infs: Urban                  0
PovertyRate            0
MedianFamilyIncome     0
TractLOWI              0
TractKids              0
TractSeniors           0
TractHUNV              0
TractSNAP              0
lapop1share            0
lakids1share           0
laseniors1share        0
lahunv1share           0
lalowi1share           0
POP2010                0
CensusTract            0
LOWIRatio              0
SNAPRatio              0
HUNVRatio              0
FoodInsecurityI

Yet again, with any dataset, it's always good to check for/impute NaN/inf values.

Better to be safe than sorry!

In [89]:
scaled_processed_atlas_2015 = processed_atlas_2015.copy()
scale_cols_2015 = scale_cols + ['POP2010']
scaled_values_2015 = scaler.fit_transform(scaled_processed_atlas_2015[scale_cols_2015])
scaled_processed_atlas_2015[scale_cols_2015] = scaled_values
scaled_processed_atlas_2010 = processed_atlas_2010.copy()
scaled_values_2010 = scaler.fit_transform(scaled_processed_atlas_2010[scale_cols_2015])
scaled_processed_atlas_2010[scale_cols_2015] = scaled_values
print(f'Scaled 2010 data: {scaled_processed_atlas_2010.head()}')
print(f'Scaled 2015 data: {scaled_processed_atlas_2015.head()}')

Scaled 2010 data:   CensusTract  Urban  LowIncomeTracts  HUNVFlag   POP2010  lapop1share  \
0  1001020100      1                0         0  0.051027     0.709980   
1  1001020200      1                0         0  0.057916     0.222779   
2  1001020300      1                0         0  0.090038     0.420360   
3  1001020400      1                0         0  0.117086     0.435311   
4  1001020500      1                0         1  0.287442     0.255773   

   lalowi1share  lakids1share  laseniors1share  lahunv1share  TractSNAP  \
0      0.128283      0.717235         0.735282      0.045569   0.046897   
1      0.078728      0.288400         0.238209      0.046297   0.071724   
2      0.112892      0.421844         0.432805      0.015252   0.079080   
3      0.070944      0.463460         0.413773      0.002280   0.045057   
4      0.034686      0.235845         0.331996      0.014069   0.155862   

   TractLOWI  TractKids  TractSeniors  TractHUNV  PovertyRate  \
0   0.036220   0.0428

Just as with our 2019 dataset, we scale our 2015 & 2010 datasets off sklearn's MinMax() module. 


#### Another brief detour
Initially, I utilized a much more primitive approach towards scaling data.
Rather than defining an explicit list of features to be scaled, I scaled the entire datasets of 2019, 2015, and 2010.

Weirdly enough, I only saw problems arise in the 2010 set from this approach, where most notably, the MedianFamilyIncome was scaled to values extremely close to 1 consistently. 

For example, that first tract, which currently has '0.362...' as its MedianFamilyIncome, had a prior value of '0.997...' when applying almost brute-force like scaling.

Why do you think this happened?

##### My theory
The 2010 dataset contained a mix of features with vastly different value ranges, including raw counts (e.g., population numbers), percentages, ratios (already normalized between 0 and 1), and large-scale numeric values like income.

When the scaler was applied indiscriminately to all columns together, features with smaller value ranges (ratios and shares close to 0â€“1) heavily influenced the scaling parameters.

As a result, features like MedianFamilyIncome, which had much larger original values, were compressed into a narrow range near 1 because the scaler adjusted to fit all columns simultaneously without considering their differing scales and meanings.

In [90]:
unaligned_atlas_2019 = pd.DataFrame(scaled_processed_atlas_2019, columns=processed_atlas_2019.columns)
unaligned_atlas_2015 = pd.DataFrame(scaled_processed_atlas_2015, columns=processed_atlas_2015.columns)
unaligned_atlas_2010 = pd.DataFrame(scaled_processed_atlas_2010, columns=processed_atlas_2010.columns)
unaligned_atlas_2010.head()

Unnamed: 0,CensusTract,Urban,LowIncomeTracts,HUNVFlag,POP2010,lapop1share,lalowi1share,lakids1share,laseniors1share,lahunv1share,TractSNAP,TractLOWI,TractKids,TractSeniors,TractHUNV,PovertyRate,MedianFamilyIncome,LOWIRatio,SNAPRatio,HUNVRatio,FoodInsecurityIndex,LOWIWeighted
0,1001020100,1,0,0,0.051027,0.70998,0.128283,0.717235,0.735282,0.045569,0.046897,0.03622,0.042803,0.012796,0.00099,10.5,0.318183,0.167364,0.020921,0.026151,0.214435,0.006762
1,1001020200,1,0,0,0.057916,0.222779,0.078728,0.2884,0.238209,0.046297,0.071724,0.063843,0.051161,0.012391,0.014689,14.76,0.187881,0.314286,0.029954,0.052995,0.397235,0.018882
2,1001020300,1,0,0,0.090038,0.42036,0.112892,0.421844,0.432805,0.015252,0.07908,0.103964,0.075475,0.025418,0.016339,8.04,0.242867,0.306849,0.032908,0.029944,0.369701,0.025766
3,1001020400,1,0,0,0.117086,0.435311,0.070944,0.46346,0.413773,0.00228,0.045057,0.073396,0.08569,0.052342,0.003466,6.32,0.275182,0.219106,0.026448,0.004332,0.249886,0.003395
4,1001020500,1,0,1,0.287442,0.255773,0.034686,0.235845,0.331996,0.014069,0.155862,0.178475,0.266948,0.065196,0.03796,5.96,0.379128,0.132547,0.013004,0.020713,0.166264,0.044822


In [91]:
print(len(unaligned_atlas_2010))
print(len(unaligned_atlas_2015))
print(len(unaligned_atlas_2019))

72531
72531
72531


Given our length MIGHT be different (in the sense certain tracts might misalign/not exist), let's loop through our dataframes and remove any census tracts whose index doesn't exist in the other dataset.

In [92]:
def align_dataframes_by_index(*dfs):
    common_index = dfs[0].index
    for df in dfs[1:]:
        common_index = common_index.intersection(df.index)
    aligned_dfs = tuple(df.loc[common_index].copy() for df in dfs)
    return aligned_dfs

In [93]:
final_atlas_2010, final_atlas_2015, final_atlas_2019 = align_dataframes_by_index(unaligned_atlas_2010, unaligned_atlas_2015, unaligned_atlas_2019)

In [96]:
cols_to_drop_2010 = ['LowIncomeTracts', 'HUNVFlag']
final_atlas_2010 = final_atlas_2010.drop(columns=[col for col in cols_to_drop if col in final_atlas_2010.columns])
final_atlas_2019.rename(columns={"Pop2010": "POP2010"}, inplace=True)
print(len(final_atlas_2010))
print(len(final_atlas_2015))
print(len(final_atlas_2019))


print(final_atlas_2010.index.equals(final_atlas_2015.index))
print(final_atlas_2010.index.equals(final_atlas_2019.index))
print(final_atlas_2015.index.equals(final_atlas_2019.index))


72531
72531
72531
True
True
True


Alignment complete 
(for certain now, I know the length was the same, but it's better to be safe than sorry with data, the most integral part of any machine learning project)!

In [97]:
print(f'2019 atlas data: {final_atlas_2019.head()}')
print(f'2015 atlas data: {final_atlas_2015.head()}')
print(f'2010 atlas data: {final_atlas_2010.head()}')

2019 atlas data:    Urban  PovertyRate  MedianFamilyIncome  TractLOWI  TractKids  TractSeniors  \
0    1.0         11.3            0.318183   0.036220   0.042803      0.012796   
1    1.0         17.9            0.187881   0.063843   0.051161      0.012391   
2    1.0         15.0            0.242867   0.103964   0.075475      0.025418   
3    1.0          2.8            0.275182   0.073396   0.085690      0.052342   
4    1.0         15.2            0.379128   0.178475   0.266948      0.065196   

   TractHUNV  TractSNAP  lapop1share  lakids1share  laseniors1share  \
0   0.000990   0.046897        99.19         26.33            11.44   
1   0.014689   0.071724        58.11         18.69             5.83   
2   0.016339   0.079080        46.00         12.34             5.96   
3   0.003466   0.045057        31.09          7.89             5.39   
4   0.037960   0.155862        24.55          6.64             3.36   

   lahunv1share  lalowi1share   POP2010  CensusTract  LOWIRatio  SNAP

### WOOHOO
Feature engineering finished!

Look through these statistics if you want, but the scaled similarities almost guarantee a completely fine dataset for our Random forest model (And Kmeans/AE for our unsupervised classification 2019 model).

In [74]:
final_atlas_2010.to_csv('../data/processed/processed_2010.csv')
final_atlas_2015.to_csv('../data/processed/processed_atlas_2015.csv')
final_atlas_2019.to_csv('../data/processed/processed_atlas_2019.csv')