In [2]:
import math
from pathlib import Path

import folktables
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from folktables import ACSDataSource, ACSIncome

# Folktables
Load and preprocess data for the Income and Income-Reg datasets using the Folktables library (https://github.com/zykls/folktables)

To download the classification data, run everything except the "Download regression data" section. To download the regression data, run everything except the "Download classification data" section.

To download data for a different state, replace "WI" with that state's two-letter abbreviation.

## Download classification data
The ACSIncome task is pre-defined for classification (i.e., predicting above or below a $50,000 threshold).

In [53]:
# Fetch the 2018 1-Year ACS person-level records for a single state.
# To pull another state (e.g. "MD", "LA", "GA" or "OR"), swap the abbreviation
# passed in `states`.
data_source = ACSDataSource(
    survey_year='2018',
    horizon='1-Year',
    survey='person',
)
acs_data = data_source.get_data(states=["WI"], download=True)

# ACSIncome is the predefined classification task; unpack its features,
# labels and group membership.
features, label, group = ACSIncome.df_to_numpy(acs_data)

## Download regression data
The regression task predicts the exact income (`PINCP`) instead of a thresholded label.

In [3]:
def adult_filter(data):
    """Keep only the rows that resemble the records in the Adult dataset.

    The Adult documentation describes its extraction (by Barry Becker, from
    the 1994 Census database) with the conditions
    ((AAGE>16) && (AGI>100) && (AFNLWGT>1)&& (HRSWK>0)).

    Here a row is kept when all of the following hold:
      * AGEP  > 16
      * PINCP > 100
      * WKHP  > 0
      * PWGTP >= 1

    Rows with a missing value in any of these columns fail the comparison and
    are dropped. The input frame is not modified; the original index labels
    of the surviving rows are preserved.
    """
    keep = (
        (data['AGEP'] > 16)
        & (data['PINCP'] > 100)
        & (data['WKHP'] > 0)
        & (data['PWGTP'] >= 1)
    )
    return data[keep]

# Income task with a continuous target: PINCP is passed through as-is because
# no target_transform is supplied. Rows are pre-filtered with adult_filter and
# RAC1P serves as the group attribute.
ACSIncomeCont = folktables.BasicProblem(
    features=[
        'AGEP',
        'COW',
        'SCHL',
        'MAR',
        'OCCP',
        'POBP',
        'RELP',
        'WKHP',
        'SEX',
        'RAC1P',
    ],
    target='PINCP',
    # Thresholding is intentionally left disabled for the regression target:
    # target_transform=lambda x: x > 50000,
    group='RAC1P',
    preprocess=adult_filter,
    # NOTE(review): the second positional parameter of np.nan_to_num is `copy`,
    # not the NaN fill value, so this appears to replace NaN with 0.0 rather
    # than -1. Confirm the intended fill before changing it; the occupation
    # recoding later in this notebook floors OCCP / 1000, so a -1 fill would
    # introduce a new category there.
    postprocess=lambda x: np.nan_to_num(x, -1),
)

In [4]:
# Fetch the 2018 1-Year ACS person-level records for a single state; change
# the abbreviation in `states` to pull a different one.
data_source = ACSDataSource(
    survey_year='2018',
    horizon='1-Year',
    survey='person',
)
acs_data = data_source.get_data(states=["WI"], download=True)

# Use the continuous-income task defined above instead of the predefined
# ACSIncome classification task.
features, label, group = ACSIncomeCont.df_to_numpy(acs_data)

Downloading data for 2018 1-Year person survey for WI...


## Preprocessing

Cast the data into a pandas dataframe

In [54]:
# Wrap the feature array in a DataFrame. No column names are given, so the
# recoding cells below address the columns by integer label (0-9).
features_pd = pd.DataFrame(features)

Notes on the data (**ACSIncome**)

0. Age
1. COW - class of worker (change to one-hot)
2. SCHL - education attainment (keep as categories)
3. MAR - marital status, change to one-hot
4. OCCP - occupation (so many, think about whether to just remove this)
5. POBP - place of birth (state/country, maybe encode as us/abroad?)
6. RELP - relationship to head of household
7. WKHP - hours worked per week (binary)
8. SEX
9. RAC1P - race (one-hot)

In [56]:
def _occupation_group(occp):
    """Reduce an OCCP code to its thousands bucket, folding 7xxx into 6xxx (trades)."""
    major = math.floor(occp / 1000)
    return 6 if major in (6, 7) else major


def _birthplace_group(pobp):
    """Map a POBP place-of-birth code to a coarse region id.

    0 = US state or Canada, 1 = US territories, 2 = Europe, 3 = Asia,
    4 = rest of the Americas, 5 = Africa, 6 = Australia and everywhere else.
    """
    if pobp < 57:  # US state
        return 0
    if pobp < 79:  # US territories
        return 1
    if pobp < 170:  # Europe
        return 2
    if pobp < 255:  # Asia
        return 3
    if pobp == 301:  # Canada, grouped with the US
        return 0
    if pobp < 400:  # rest of the Americas
        return 4
    # This cutoff must be applied to the place-of-birth code itself. It was
    # previously compared against column 6 (RELP), whose codes are always far
    # below 470, so the branch always fired and group 6 was unreachable.
    if pobp < 470:  # Africa
        return 5
    # NOTE(review): the exported column names list only six birth_* entries,
    # so data containing this group needs a seventh name added there.
    return 6  # Australia and everywhere else


# RELP code -> household-role group; codes not listed fall back to 0.
_ROLE_BY_RELP = {
    0: 0, 1: 0, 13: 0,                # head of household or partner
    2: 1, 3: 1, 4: 1, 7: 1, 14: 1,    # child, grandchild, foster child
    5: 2, 6: 2, 8: 2, 9: 2, 10: 2,    # other family
    11: 3, 12: 3, 15: 3,              # roommate, boarder, etc.
    16: 4,                            # institutionalized
    17: 5,                            # group quarters
}


def _household_role_group(relp):
    """Map a RELP relationship code to a household-role group (0 when unlisted)."""
    return _ROLE_BY_RELP.get(relp, 0)


# Recode one column at a time instead of assigning into a Series row by row;
# each result is a fresh Series with a default 0..n-1 index, matching the
# positional index of features_pd.
occ = pd.Series([_occupation_group(code) for code in features_pd[4]])
pob = pd.Series([_birthplace_group(code) for code in features_pd[5]])
rel = pd.Series([_household_role_group(code) for code in features_pd[6]])

In [57]:
# Class of worker (column 1) -> employer type. Codes not listed map to 0.
work_by_cow = {
    1: 1,                # for-profit
    2: 2,                # non-profit
    3: 3, 4: 3, 5: 3,    # government
    6: 4, 7: 4, 8: 4,    # self-employed
}

# Race (column 9). Codes 3, 4, 5 and 7 (Alaska, Hawaii, Native American) are
# merged into group 3; the remaining listed codes keep their own value.
# Codes not listed map to 0.
race_by_rac1p = {
    1: 1,                      # white
    2: 2,                      # black
    3: 3, 4: 3, 5: 3, 7: 3,    # grouped indigenous categories
    6: 6,                      # asian
    8: 8,                      # other
    9: 9,                      # two or more
}

work = pd.Series([work_by_cow.get(code, 0) for code in features_pd[1]])
race = pd.Series([race_by_rac1p.get(code, 0) for code in features_pd[9]])

In [58]:
# Age (column 0): 0 = under 35, 1 = 35 to 54, 2 = 55 and over.
ages = pd.Series([
    0 if age < 35 else (1 if age < 55 else 2)
    for age in features_pd[0]
])

# Education (column 2): 0 = no high school diploma, 1 = no college diploma,
# 2 = college. Codes of 25 or more fall back to 0.
# NOTE(review): in the ACS PUMS dictionary SCHL 15 appears to be
# "12th grade - no diploma", which the `< 15` cutoff places in group 1 rather
# than group 0 -- confirm whether that is intended.
ed = pd.Series([
    0 if schl < 15 else (1 if schl < 21 else (2 if schl < 25 else 0))
    for schl in features_pd[2]
])

# Hours worked per week (column 7): 1 when 40 or more, otherwise 0.
hours = pd.Series([1 if wkhp >= 40 else 0 for wkhp in features_pd[7]])
        

In [59]:
# Sex (column 8): 1 when the code is 2 (female), otherwise 0.
# NOTE(review): this flag appears to be exported under the name 'sex_male'
# even though 1 marks female here -- confirm which side is intended.
sex = pd.Series([1 if code == 2 else 0 for code in features_pd[8]])

# Marital status (column 3). Codes not listed map to 0.
mar_by_code = {
    1: 1,                # married
    2: 2, 3: 2, 4: 2,    # widowed, divorced, separated
    5: 3,                # never married
}
mar = pd.Series([mar_by_code.get(code, 0) for code in features_pd[3]])

In [61]:
# Work on a copy so features_pd keeps the raw codes, then overwrite each
# column with its recoded counterpart. The list position is the column label.
features_mod = features_pd.copy()
recoded_columns = [ages, work, ed, mar, occ, pob, rel, hours, sex, race]
for column_label, recoded in enumerate(recoded_columns):
    features_mod[column_label] = recoded

In [62]:
# Output column names, in the order the encoded frame is expected to have:
# the two untouched flag columns first, then one group of indicator columns
# per encoded feature, and finally the label.
cols = [
    # flags that are not one-hot encoded
    'hours_worked_over40',
    'sex_male',
    # age
    'age_under35',
    'age_35to55',
    'age_over55',
    # class of worker
    'work_forprofit',
    'work_nonprofit',
    'work_gov',
    'work_selfemploy',
    # education
    'education_noHS',
    'education_HS',
    'education_college',
    # marital status
    'rel_married',
    'rel_divorced',
    'rel_single',
    # occupation
    'occ_business',
    'occ_science',
    'occ_social',
    'occ_protective',
    'occ_sales',
    'occ_admin',
    'occ_trades',
    'occ_crafts',
    'occ_logistics',
    # place of birth
    'birth_usstate',
    'birth_usterr',
    'birth_europe',
    'birth_asia',
    'birth_americas',
    'birth_africa',
    # household role
    'role_hoh',
    'role_child',
    'role_family',
    'role_housemate',
    'role_institution',
    'role_group',
    # race
    'race_white',
    'race_black',
    'race_indigenous',
    'race_asian',
    'race_other',
    'race_multi',
    # target
    'label',
]

In [63]:
# One-hot encode the multi-valued columns; columns 7 (hours) and 8 (sex) are
# already 0/1 flags and are left as they are.
# The dtype is pinned because pandas 2.0 and later default the indicator
# columns to bool, which would be written to the CSV as True/False instead of
# the 0/1 values produced by earlier versions.
one_hot = pd.get_dummies(features_mod, columns=[0, 1, 2, 3, 4, 5, 6, 9], dtype=np.uint8)

In [64]:
# Append the target as a new 'label' column, cast to integers (a boolean
# label becomes 0/1). `label` is assigned positionally, so it is assumed to
# have one entry per row of one_hot, in the same order.
one_hot['label'] = label.astype('int')

In [65]:
# Start from an empty frame; the renamed columns are copied into it one at a
# time in the next cell.
one_hot_copy = pd.DataFrame()

In [66]:
# Copy the encoded columns into one_hot_copy under their readable names.
# The mapping is purely positional, so refuse to continue when the counts
# differ: a category that is missing from (or extra in) the downloaded data
# would otherwise shift every later name onto the wrong column.
if len(one_hot.columns) != len(cols):
    raise ValueError(
        f"expected {len(cols)} columns to rename but one_hot has "
        f"{len(one_hot.columns)}: {list(one_hot.columns)}"
    )
for new_name, old_name in zip(cols, one_hot.columns):
    one_hot_copy[new_name] = one_hot[old_name]

In [67]:
# Write the processed dataset. to_csv does not create missing directories, so
# make sure the destination folder exists first.
output_path = Path("raw_data/income.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
one_hot_copy.to_csv(output_path, index=False)