We use the [US Bible Belt states](https://en.wikipedia.org/wiki/Bible_Belt) from [Folktables](https://github.com/socialfoundations/folktables)' ACSIncome prediction task to showcase the code of Lazzari, Alvarez, and Ruggieri (2022).

In [1]:
import os
import pandas as pd
import numpy as np

In [2]:
from folktables import ACSDataSource, ACSIncome

# all US states
states = sorted(['AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'FL', 'GA', 
                 'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MD', 
                 'MA', 'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH', 'NJ', 
                 'NM', 'NY', 'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 
                 'SD', 'TN', 'TX', 'UT', 'VT', 'VA', 'WA', 'WV', 'WI', 'WY'])

# state codes from folktables:
_STATE_CODES = {'AL': '01', 'AK': '02', 'AZ': '04', 'AR': '05', 'CA': '06',
                'CO': '08', 'CT': '09', 'DE': '10', 'FL': '12', 'GA': '13',
                'HI': '15', 'ID': '16', 'IL': '17', 'IN': '18', 'IA': '19',
                'KS': '20', 'KY': '21', 'LA': '22', 'ME': '23', 'MD': '24',
                'MA': '25', 'MI': '26', 'MN': '27', 'MS': '28', 'MO': '29',
                'MT': '30', 'NE': '31', 'NV': '32', 'NH': '33', 'NJ': '34',
                'NM': '35', 'NY': '36', 'NC': '37', 'ND': '38', 'OH': '39',
                'OK': '40', 'OR': '41', 'PA': '42', 'RI': '44', 'SC': '45',
                'SD': '46', 'TN': '47', 'TX': '48', 'UT': '49', 'VT': '50',
                'VA': '51', 'WA': '53', 'WV': '54', 'WI': '55', 'WY': '56',
                'PR': '72'}

# # state codes
# state_codes = pd.read_csv(os.path.join('..', 'data', 'state_codes.csv'))

# bible belt states
bible_belt = ['AL','AK', 'GA', 'LA', 'MS', 'OK', 'TN', 'NC', 'SC']

# data folder
data_source = ACSDataSource(survey_year='2017', horizon='1-Year', survey='person', root_dir=os.path.join('..', 'data'))

In [3]:
for state in bible_belt:
    print(state)
    
    data = data_source.get_data(states=[state], download=True)
    features, labels, _ = ACSIncome.df_to_numpy(data)
    
    df = pd.DataFrame(features, columns=ACSIncome.features)
    df['Y'] = labels
    df['STATE'] = state
    
    print("{} features for {} individuals".format(df.shape[1]-1, df.shape[0]))
    
    df.to_csv(os.path.join('..', 'data', 'lar_{}_adult.csv'.format(state.lower())), sep='|', index=False)
    
    del data, features, labels, df
    
print('DONE')

AL
Downloading data for 2017 1-Year person survey for AL...
11 features for 22074 individuals
AK
Downloading data for 2017 1-Year person survey for AK...
11 features for 3512 individuals
GA
Downloading data for 2017 1-Year person survey for GA...
11 features for 50238 individuals
LA
Downloading data for 2017 1-Year person survey for LA...
11 features for 20882 individuals
MS
Downloading data for 2017 1-Year person survey for MS...
11 features for 13003 individuals
OK
Downloading data for 2017 1-Year person survey for OK...
11 features for 17724 individuals
TN
Downloading data for 2017 1-Year person survey for TN...
11 features for 33240 individuals
NC
Downloading data for 2017 1-Year person survey for NC...
11 features for 50893 individuals
SC
Downloading data for 2017 1-Year person survey for SC...
11 features for 24327 individuals
DONE


In [None]:
# let's run the pipeline for Florida
fl_data = data_source.get_data(states=['FL'], download=True)
fl_features, fl_labels, _ = ACSIncome.df_to_numpy(fl_data)

df = pd.DataFrame(fl_features, columns=ACSIncome.features)
df['Y'] = fl_labels

print("{} features for {} individuals".format(df.shape[1]-1, df.shape[0]))
df.head(5)

# save for run_experiments
df.to_csv(os.path.join('..', 'data', 'lar_fl_adult.csv'), sep='|', index=False)