In [5]:
import pandas as pd
import numpy as np
import pathlib as pl
import os
import requests

# Get ASEC data from CPS API

## Variables to Retrieve

In [68]:
# CPS ASEC variable names requested from the Census API. The fetch helper below
# lower-cases them, so they show up as lower-case DataFrame columns.
# NOTE(review): the saved outputs further down list columns that are not
# requested here (pearnval, peio1cow, wemocg), and the single-year preview lacks
# pemlr, so this cell was presumably edited after those outputs were produced.
# Restart & Run All to confirm the outputs match this list.
ASEC_VARIABLES = [
    'PEMLR', # labor force status
    'A_AGE',  # renamed to `age` for modeling
    'A_SEX',  # value 1 is coded as male downstream
    'A_HGA',  # binned into the `edu` categories downstream
    'PRDTRACE',  # recoded into the `race` groups downstream
    'PEHSPNON',  # value 1 is coded as Hispanic downstream
    'PRDISFLG',  # value 1 is coded as having a disability downstream
    'PRCITSHP',  # binned into the `citshp` categories downstream
    'MARSUPWT',  # kept as the weight column; the only variable parsed as float
    'A_LFSR',  # not used by the transforms visible in this notebook
    'HRCHECK',  # value 1 is coded as part-time downstream
    'A_CLSWKR',  # not used by the transforms visible in this notebook
    'CLWK',  # not used by the transforms visible in this notebook
    'A_DTOCC', # last year major occ
    # NOTE(review): A_DTOCC looks like a current-job occupation recode rather
    # than a last-year one; in the saved model-data preview the row labelled
    # leaver=1 has occ == 0. Confirm this does not leak the label (the stale
    # outputs mention `wemocg`, which may have been the intended variable).
    'WEMIND', # last year major ind
    'A_MJIND',  # not used by the transforms visible in this notebook
    'LJCW',  # values 1-6 are kept by the row filter, then binned into `cow`
    'HTOTVAL',  # source of `log_income`
    'A_MARITL',  # values 1-3 are coded as married downstream
    # 'HCOV',  # health insurance coverage last year
    'FPERSONS',  # number of persons in family
    'FRELU18',  # number of persons in family under 18
]

## Get single year of ASEC data

In [24]:
# this function will retrieve an entire year of ASEC data
def get_asec_year_df(api_key, asec_variables, year):
    """Download one year of CPS ASEC microdata from the Census API.

    Parameters
    ----------
    api_key : str
        Census API key, sent as the ``key`` query parameter.
    asec_variables : iterable of str
        Variable names to request; joined with commas into the ``get``
        query parameter.
    year : int
        Survey year, used in the endpoint path and stored in a ``year`` column.

    Returns
    -------
    pandas.DataFrame
        One row per record in the response. Column names are the lower-cased
        response header plus ``year``. ``marsupwt`` is float64 and every
        other column is int64.

    Raises
    ------
    requests.HTTPError
        If the API answers with a 4xx/5xx status.
    ValueError
        If the body is not JSON, or is not a non-empty list whose first
        element is the header row.
    """
    base_url = f'https://api.census.gov/data/{year}/cps/asec/mar'
    # Let requests build the query string so the key never has to be
    # interpolated into a URL by hand.
    query = {'get': ','.join(asec_variables), 'key': api_key}
    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(base_url, params=query, timeout=120)
    # Fail loudly on HTTP errors instead of trying to parse an error page.
    response.raise_for_status()

    data = response.json()
    if not isinstance(data, list) or not data:
        raise ValueError(
            f'Unexpected response for ASEC {year}: expected a header row followed by data rows'
        )

    header, *rows = data
    df = pd.DataFrame(rows, columns=[name.lower() for name in header])
    df['year'] = year

    # The API returns every value as a string; cast all columns in one pass.
    dtypes = {
        col: (np.float64 if col == 'marsupwt' else np.int64)
        for col in df.columns
    }
    return df.astype(dtypes)

# Prefer the CENSUS_API_KEY environment variable so the notebook is not tied to
# one machine's folder layout; fall back to the local key file when it is unset.
api_key = os.environ.get('CENSUS_API_KEY', '').strip()
if not api_key:
    # I've saved my API key in a local file
    with open('c:/users/calvi/coding/census_api_key.txt', 'r') as f:
        api_key = f.readline().strip()

# Smoke-test the download with a single year before pulling the full range.
asec_2024 = get_asec_year_df(api_key, ASEC_VARIABLES, 2024)
asec_2024.head()

Unnamed: 0,a_age,a_sex,a_hga,prdtrace,pehspnon,prdisflg,prcitshp,marsupwt,a_lfsr,hrcheck,...,a_dtocc,wemind,a_mjind,ljcw,pearnval,htotval,a_maritl,fpersons,frelu18,year
0,68,2,39,1,2,2,1,1061.89,1,2,...,17,4,10,1,55000,262579,1,2,0,2024
1,68,1,43,1,2,2,1,1061.89,1,2,...,16,5,5,1,24000,262579,1,2,0,2024
2,58,2,44,1,2,1,1,4788.1,1,2,...,8,10,10,4,95000,96802,5,1,0,2024
3,71,2,43,1,2,2,1,5180.75,7,0,...,0,15,0,0,0,104000,1,2,0,2024
4,70,1,46,1,2,2,1,5180.75,7,2,...,0,10,0,1,55000,104000,1,2,0,2024


In [69]:
base_year = 2024
start_year = base_year - 9

# Collect the per-year frames and concatenate once at the end. Growing a frame
# with pd.concat inside the loop re-copies all earlier rows on every pass and
# starts from an empty DataFrame, which newer pandas versions warn about.
year_frames = []
for year in range(start_year, base_year + 1):
    print(f'Processing year {year}')
    df_year = get_asec_year_df(api_key, ASEC_VARIABLES, year)
    year_frames.append(df_year)

# ignore_index gives the stacked frame one unique 0..n-1 row index instead of
# repeating each year's own 0..k labels.
asec_df = pd.concat(year_frames, ignore_index=True)

asec_df.head()

Processing year 2015
Processing year 2016
Processing year 2017
Processing year 2018
Processing year 2019
Processing year 2020
Processing year 2021
Processing year 2022
Processing year 2023
Processing year 2024


Unnamed: 0,pemlr,a_age,a_sex,a_hga,prdtrace,pehspnon,prdisflg,prcitshp,marsupwt,a_lfsr,...,a_dtocc,wemind,a_mjind,ljcw,pearnval,htotval,a_maritl,fpersons,frelu18,year
0,1,48,1,41,1,2,2,1,690.05,1,...,22,2,6,1,40000,63930,1,2,0,2015
1,6,45,2,40,2,2,2,1,690.05,7,...,0,11,0,1,6000,63930,1,2,0,2015
2,1,23,2,39,1,2,2,1,583.8,1,...,16,5,5,1,16000,63930,3,2,1,2015
3,0,2,2,0,1,2,-1,1,468.04,0,...,0,0,0,0,0,63930,7,2,1,2015
4,5,80,2,39,1,2,1,1,735.83,7,...,0,15,0,0,0,11831,4,1,0,2015


## Save raw ASEC data to parquet

In [70]:
# 'model.ipynb' is resolved against the current working directory, so the
# parquet folder sits next to wherever the kernel was started
# (presumably the notebook's own folder - confirm).
parquets_dir = pl.Path('model.ipynb').resolve().parent/'parquets'
# Create the folder if needed. exist_ok avoids the check-then-create race of a
# separate exists() test, and parents=True mirrors os.makedirs.
parquets_dir.mkdir(parents=True, exist_ok=True)
parquet_file_path = parquets_dir/f'asec_{start_year}to{base_year}.parquet'

asec_df.to_parquet(parquet_file_path)

# Transform ASEC data to prepare for modeling

In [71]:
# Rebuild the parquet location and read the raw extract back in, so the
# transforms below work from the saved file rather than the in-memory download.
# Still relies on start_year and base_year from the download cell.
parquets_dir = pl.Path('model.ipynb').resolve().parent.joinpath('parquets')
parquet_file_path = parquets_dir.joinpath(f'asec_{start_year}to{base_year}.parquet')
asec = pd.read_parquet(parquet_file_path)

## Remove rows

Starting with roughly 1.7 million rows (1,695,242).

In [72]:
# Row count of the reloaded raw extract, before any rows are filtered out.
print(len(asec))

1695242


In [73]:
# ! filter with WKSWORK to exclude people who barely worked?

# Restrict to people who worked in paying jobs last year: keep rows whose
# `ljcw` code is one of 1 through 6, then renumber the rows from 0.
asec = (
    asec
    .loc[lambda frame: frame['ljcw'].isin(list(range(1, 7)))]
    .reset_index(drop=True)
)

len(asec)

847026

## Create modeling variables

### Education

In [74]:
def get_edu_var(df):
    """Return a copy of ``df`` with an ordered categorical ``edu`` column.

    ``a_hga`` codes are grouped into left-closed intervals:
    [30, 39) -> LTHS, [39, 40) -> HS, [40, 41) -> SCND, [41, 43) -> AD,
    [43, 44) -> BA, [44, 45) -> MA, 45 and above -> DOC.
    Codes below 30 fall outside every interval and become missing.
    The input frame is not modified.
    """
    edu_edges = [30, 39, 40, 41, 43, 44, 45, np.inf]
    edu_labels = ['LTHS', 'HS', 'SCND', 'AD', 'BA', 'MA', 'DOC']
    edu_codes = pd.cut(df['a_hga'], bins=edu_edges, labels=edu_labels, right=False)
    return df.assign(edu=edu_codes)

# Add the education category to the working frame and tabulate its levels.
asec = get_edu_var(asec)
asec.edu.value_counts()

edu
HS      220119
BA      194849
SCND    145246
AD       90375
MA       85058
LTHS     79707
DOC      31672
Name: count, dtype: int64

### Race

In [75]:
def get_race_var(df):
    """Return a copy of ``df`` with a string ``race`` column.

    ``prdtrace`` 1 maps to 'white', 2 to 'black' and 4 to 'asian'; every other
    value is labelled 'other'. The input frame is not modified.
    """
    out = df.copy()
    code = out['prdtrace']
    out['race'] = np.select(
        [code == 1, code == 2, code == 4],
        ['white', 'black', 'asian'],
        default='other',
    )
    return out

# Add the race group to the working frame and tabulate it.
asec = get_race_var(asec)
asec.race.value_counts()

race
white    661835
black     94406
asian     56524
other     34261
Name: count, dtype: int64

### Hispanic

In [76]:
def get_hispanic_var(df):
    """Return a copy of ``df`` with a 0/1 ``hisp`` indicator.

    ``hisp`` is 1 on rows where ``pehspnon`` equals 1 and 0 on all other rows.
    The input frame is not modified.
    """
    is_hispanic = df['pehspnon'].eq(1)
    return df.assign(hisp=is_hispanic.astype('int64'))

# Add the Hispanic indicator to the working frame and tabulate it.
asec = get_hispanic_var(asec)
asec.hisp.value_counts()

hisp
0    685960
1    161066
Name: count, dtype: int64

### Sex

In [77]:
def get_male_var(df):
    """Return a copy of ``df`` with a 0/1 ``male`` indicator.

    ``male`` is 1 where ``a_sex`` equals 1 and 0 otherwise.
    The input frame is not modified.
    """
    out = df.copy()
    out['male'] = (out['a_sex'] == 1).astype('int64')
    return out

# Add the male indicator to the working frame and tabulate it.
asec = get_male_var(asec)
asec.male.value_counts()

male
1    441229
0    405797
Name: count, dtype: int64

### Citizenship

In [78]:
def get_citizenship_var(df):
    """Return a copy of ``df`` with a categorical ``citshp`` column.

    ``prcitshp`` codes are grouped into left-closed intervals:
    [1, 4) -> 'native', [4, 5) -> 'naturalized', 5 and above -> 'noncitizen'.
    Codes below 1 become missing. The input frame is not modified.
    """
    out = df.copy()
    edges = [1, 4, 5, np.inf]
    labels = ['native', 'naturalized', 'noncitizen']
    citizenship = pd.cut(
        out['prcitshp'],
        bins=edges,
        labels=labels,
        right=False,
        include_lowest=True,
    )
    out.loc[:, 'citshp'] = citizenship
    return out

# Add the citizenship category to the working frame and tabulate it.
asec = get_citizenship_var(asec)
asec.citshp.value_counts()

citshp
native         700363
noncitizen      76079
naturalized     70584
Name: count, dtype: int64

### Class of worker

In [79]:
def get_cow_var(df):
    """Return a copy of ``df`` with a categorical class-of-worker ``cow`` column.

    ``ljcw`` codes are grouped into left-closed intervals:
    [1, 2) -> 'ws', [2, 5) -> 'gov', [5, 6) -> 'seinc', 6 and above -> 'seuninc'.
    Codes below 1 become missing. The input frame is not modified.
    """
    result = df.copy()
    cut_points = [1, 2, 5, 6, np.inf]
    class_names = ['ws', 'gov', 'seinc', 'seuninc']
    result.loc[:, 'cow'] = pd.cut(
        result['ljcw'],
        bins=cut_points,
        labels=class_names,
        right=False,
        include_lowest=True,
    )
    return result

# Add the class-of-worker category to the working frame and tabulate it.
asec = get_cow_var(asec)
asec.cow.value_counts()

cow
ws         639444
gov        128180
seuninc     50159
seinc       29243
Name: count, dtype: int64

### Full-time/part-time

In [80]:
def get_pt_var(df):
    """Return a copy of ``df`` with a 0/1 part-time indicator ``pt``.

    ``pt`` is 1 where ``hrcheck`` equals 1 and 0 otherwise.
    The input frame is not modified.
    """
    flagged = df.copy()
    flagged['pt'] = np.where(flagged['hrcheck'] == 1, 1, 0).astype(np.int64)
    return flagged

# Add the part-time indicator to the working frame and tabulate it.
asec = get_pt_var(asec)
asec.pt.value_counts()

pt
0    682731
1    164295
Name: count, dtype: int64

### Disability

In [81]:
def get_disability_var(df):
    """Return a copy of ``df`` with a 0/1 ``disability`` indicator.

    ``disability`` is 1 where ``prdisflg`` equals 1 and 0 for every other
    value (including negative codes). The input frame is not modified.
    """
    return df.assign(
        disability=lambda frame: frame['prdisflg'].eq(1).astype('int64')
    )

# Add the disability indicator to the working frame and tabulate it.
asec = get_disability_var(asec)
asec.disability.value_counts()

disability
0    809534
1     37492
Name: count, dtype: int64

### Marital Status

In [84]:
def get_married_var(df):
    """Return a copy of ``df`` with a 0/1 ``married`` indicator.

    ``married`` is 1 where ``a_maritl`` is 1, 2 or 3 and 0 otherwise.
    The input frame is not modified.
    """
    married_codes = [1, 2, 3]
    out = df.copy()
    out['married'] = out['a_maritl'].isin(married_codes).astype('int64')
    return out

# Add the married indicator to the working frame and tabulate it.
asec = get_married_var(asec)
asec.married.value_counts()

married
1    478408
0    368618
Name: count, dtype: int64

### Children

In [86]:
def get_children_var(df):
    """Return a copy of ``df`` with a 0/1 ``children`` indicator.

    ``children`` is 1 where ``frelu18`` is greater than 0 and 0 otherwise.
    The input frame is not modified.
    """
    has_children = df['frelu18'].gt(0)
    return df.assign(children=has_children.astype('int64'))

# Add the children indicator to the working frame and tabulate it.
asec = get_children_var(asec)
asec.children.value_counts()

children
0    464544
1    382482
Name: count, dtype: int64

### Income

Use log income because the income distribution is right-skewed.

In [87]:
# ! need to adjust for inflation
def get_log_income_var(df):
    """Return a copy of ``df`` with a float ``log_income`` column.

    ``log_income`` is the natural log of ``htotval`` where ``htotval`` is
    positive. Zero and negative incomes are all assigned 0.0.
    The input frame is not modified.
    """
    out = df.copy()
    is_positive = out['htotval'] > 0
    # Swap non-positive incomes for 1 before taking the log: log(1) == 0, which
    # yields the intended 0.0 without ever calling log on an invalid value.
    out['log_income'] = np.log(out['htotval'].where(is_positive, 1))
    return out

# Add log income to the working frame and summarise its distribution.
asec = get_log_income_var(asec)
asec.log_income.describe()

count    847026.000000
mean         11.402650
std           0.858329
min           0.000000
25%          10.933107
50%          11.461843
75%          11.931682
max          15.009578
Name: log_income, dtype: float64

### LF leaver label

In [89]:
def get_leaver_var(df):
    """Return a copy of ``df`` with the 0/1 ``leaver`` label.

    ``leaver`` is 0 where ``pemlr`` is 1 or 2 and 1 for every other value.
    The input frame is not modified.

    NOTE(review): every code other than 1 and 2 counts as a leaver. If codes 3
    and 4 denote unemployed people (still in the labor force), confirm that
    labelling them as leavers is intended.
    """
    out = df.copy()
    stayed = out['pemlr'].isin([1, 2])
    out['leaver'] = (~stayed).astype('int64')
    return out

# Add the leaver label to the working frame and tabulate the class balance.
asec = get_leaver_var(asec)
asec.leaver.value_counts()

leaver
0    762546
1     84480
Name: count, dtype: int64

In [91]:
# Inspect how many rows fall under each `wemind` code before it is renamed.
asec.wemind.value_counts()

wemind
10    194653
5     107915
9      98135
11     81823
4      79697
3      60256
8      53674
6      45739
13     43683
12     40626
1      15785
7      14841
2       5687
14      4512
Name: count, dtype: int64

## Rename some variables

In [93]:
# Give the occupation, industry and age columns short modeling names.
# NOTE(review): `occ` comes from a_dtocc; confirm that variable describes last
# year's job like the other job fields, otherwise it may leak the label.
asec = asec.rename(columns=dict(a_dtocc='occ', wemind='ind', a_age='age'))

asec.columns

Index(['pemlr', 'age', 'a_sex', 'a_hga', 'prdtrace', 'pehspnon', 'prdisflg',
       'prcitshp', 'marsupwt', 'a_lfsr', 'hrcheck', 'a_clswkr', 'clwk',
       'peio1cow', 'wemocg', 'occ', 'ind', 'a_mjind', 'ljcw', 'pearnval',
       'htotval', 'a_maritl', 'fpersons', 'frelu18', 'year', 'edu', 'race',
       'hisp', 'male', 'citshp', 'cow', 'pt', 'disability', 'married',
       'children', 'log_income', 'leaver'],
      dtype='object')

## Save modeling data

In [101]:
# Keep only the columns the model needs, as an independent copy of the frame.
model_data = asec.loc[:, [
    # weight and label
    'marsupwt', 'leaver',
    # continuous variables
    'age', 'log_income',
    # binary variables
    'pt', 'hisp', 'male', 'disability', 'married', 'children',
    # categorical variables
    'cow', 'race', 'citshp', 'edu', 'occ', 'ind',
]].copy()

model_data.head()

Unnamed: 0,marsupwt,leaver,age,log_income,pt,hisp,male,disability,married,children,cow,race,citshp,edu,occ,ind
0,690.05,0,48,11.065544,0,0,1,0,1,0,ws,white,native,AD,22,2
1,690.05,1,45,11.065544,1,0,0,0,1,0,ws,black,native,SCND,0,11
2,583.8,0,23,11.065544,0,0,0,0,1,1,ws,white,native,HS,16,5
3,513.98,0,64,10.196344,0,0,0,0,0,1,gov,white,native,SCND,8,10
4,785.82,0,46,11.611141,0,0,0,0,1,0,gov,white,native,HS,2,13


In [102]:
cat_vars = ['cow', 'race', 'citshp', 'edu', 'occ', 'ind']

# Cast every categorical predictor to the pandas category dtype in one call.
model_data = model_data.astype(dict.fromkeys(cat_vars, 'category'))

# One-hot encode the categorical variables; drop_first omits each variable's
# first category so it serves as the reference level.
model_data = pd.get_dummies(
    model_data,
    columns=cat_vars,
    drop_first=True,
    dtype=np.int8,
)

model_data.head()

Unnamed: 0,marsupwt,leaver,age,log_income,pt,hisp,male,disability,married,children,...,ind_5,ind_6,ind_7,ind_8,ind_9,ind_10,ind_11,ind_12,ind_13,ind_14
0,690.05,0,48,11.065544,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,690.05,1,45,11.065544,1,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
2,583.8,0,23,11.065544,0,0,0,0,1,1,...,1,0,0,0,0,0,0,0,0,0
3,513.98,0,64,10.196344,0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
4,785.82,0,46,11.611141,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0


Save to a parquet file

In [103]:
# Write the encoded modeling frame into the parquet folder used earlier
# and report where it landed.
model_data_parquet_path = parquets_dir.joinpath(f'leavers_model_data{base_year}.parquet')
model_data.to_parquet(model_data_parquet_path)

print(f'Saved model data as a parquet file: {str(model_data_parquet_path)}')

Saved model data as a parquet file: C:\Users\calvi\github\asec-separations-model\parquets\leavers_model_data2024.parquet
