In [1]:
from folktables import ACSDataSource, ACSEmployment, BasicProblem, adult_filter
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix

In [2]:
STATE = "MA"

data_source = ACSDataSource(survey_year='2018', 
                            horizon='1-Year', 
                            survey='person')

acs_data = data_source.get_data(states=[STATE], download=True)

In [3]:
possible_features=['AGEP', 'SCHL', 'MAR', 'RELP', 'DIS', 'ESP', 'CIT', 'MIG', 'MIL', 'ANC', 'NATIVITY', 'DEAR', 'DEYE', 'DREM', 'SEX', 'RAC1P', 'ESR']
acs_data[possible_features].head()

Unnamed: 0,AGEP,SCHL,MAR,RELP,DIS,ESP,CIT,MIG,MIL,ANC,NATIVITY,DEAR,DEYE,DREM,SEX,RAC1P,ESR
0,77,19.0,3,16,2,,1,3.0,4.0,1,1,2,2,2.0,2,1,6.0
1,18,18.0,5,17,2,,1,1.0,4.0,2,1,2,2,2.0,2,9,1.0
2,28,21.0,5,17,2,,1,1.0,4.0,2,1,2,2,2.0,1,1,1.0
3,22,19.0,5,17,2,,1,1.0,4.0,1,1,2,2,2.0,1,1,6.0
4,50,1.0,5,17,1,,1,1.0,4.0,1,1,2,1,1.0,2,1,6.0


In [4]:
features_to_use = [f for f in possible_features if f not in ["ESR", "RAC1P"]]

EmploymentProblem = BasicProblem(
    features=features_to_use,
    target='ESR',
    target_transform=lambda x: x == 1,
    group='RAC1P',
    preprocess=lambda x: x,
    postprocess=lambda x: np.nan_to_num(x, -1),
)

features, label, group = EmploymentProblem.df_to_numpy(acs_data)

In [5]:
X_train, X_test, y_train, y_test, group_train, group_test = train_test_split(
    features, label, group, test_size=0.2, random_state=0)

In [6]:
model = make_pipeline(StandardScaler(), LogisticRegression())
model.fit(X_train, y_train)
y_hat = model.predict(X_test)
tot_acc = (y_hat == y_test).mean()
white_acc = (y_hat == y_test)[group_test == 1].mean()
black_acc = (y_hat == y_test)[group_test == 2].mean()
print(f"Total accuracy: {tot_acc}")
print(f"The accuracy for white individuals: {white_acc}")
print(f"The accuracy for black individuals: {black_acc}")

Total accuracy: 0.7803521779425394
The accuracy for white individuals: 0.7833114897335081
The accuracy for black individuals: 0.7806122448979592


In [17]:
df = pd.DataFrame(X_train, columns = features_to_use)
df["group"] = group_train
df["label"] = y_train

# print(f"Total number of individuals: {len(df)}")
# print(f"Proportion of individuals with target label equal to 1: {df['label'].mean()}")
# print(f"Number of individuals in each group: {df.groupby('group')['label'].count()}")
# print(f"Proportion of individuals with target label equal to 1 in each group: {df.groupby('group')['label'].mean()}")

# Check for intersectional trends by studying the proportion of positive target labels broken out by your chosen group labels and an additional group label.
# use race (RAC1P) as your group, then you could also choose sex (SEX) and compute the proportion of positive labels by both race and sex.
# This might be a good opportunity to use a visualization such as a bar chart, e.g. via the seaborn package.
# group by race then check employment based on sex
df.groupby('group')['SEX', 'label'].count()


# group_dict = {i+1: status for i, status in enumerate(races)}
# working_by_race = df.query("DIS==1")[["label", "group"]].value_counts(sort=False)
# fig, axarr = plt.subplots(1, 6, figsize=(15, 5))
# for i, (index, race) in enumerate(group_dict.items()):
#     axarr[i].pie((working_by_race[False][index], working_by_race[True][index]), autopct='%1.1f%%')
#     axarr[i].set_title(race)
# plt.legend(labels=["Not employed", "Employed"], loc="lower right")


  df.groupby('group')['label', 'SEX'].count()


Unnamed: 0_level_0,label,SEX
group,Unnamed: 1_level_1,Unnamed: 2_level_1
1,45515,45515
2,3405,3405
3,66,66
4,1,1
5,24,24
6,3778,3778
7,24,24
8,1698,1698
9,1593,1593
