### US Census Bureau ACS 5-year data collection and processing

Author: Bartosz Bonczak

This script collects Census Bureau American Community Survey (ACS) data through the Census API for selected variables, states, years, and geographic resolution.

In [1]:
import requests
import json
import pandas as pd
import numpy as np

from IPython.display import clear_output

In [2]:
def collect_acs_data(years, states, tables, geo_unit):
    """Collect 5-year ACS data from the Census API for the selected tables,
    vintages, states and geographical unit level.

    One request is sent per (year, state) pair to
    ``https://api.census.gov/data/{year}/acs/acs5``. The responses are stacked
    into a single Data Frame with two extra columns: ``GEOID`` (concatenation
    of the geography identifiers returned by the API) and ``year``.

    Args:
        years (list):   List of ACS data vintages
        states (list):  List of state names in the form of XX_STATE_NAME/,
                        where XX is a 2-digit state FIPS code
        tables (dict):  Dictionary of ACS Variables in the form {code:description}
                        based on https://api.census.gov/data/2019/acs/acs5/variables.html
        geo_unit (str): 'ct' for Census Tract, 'cbg' for Census Block Group

    Returns:
        pandas.DataFrame: Data Frame with the selected ACS data for every
        requested state and vintage at the chosen geography. The columns named
        by ``tables`` values are converted to numeric (unparseable values
        become NaN). An empty Data Frame is returned when ``years`` or
        ``states`` is empty.

    Raises:
        ValueError: If ``geo_unit`` is neither 'ct' nor 'cbg'.
        requests.HTTPError: If the API answers with an error status code.

    """

    # validate the geography once, up front, and derive everything that
    # depends on it: the query suffix and the columns that make up the GEOID
    if geo_unit == 'ct':
        geo_clause = 'for=tract:*&in=state:{state_code}&in=county:*'
        geoid_parts = ['state', 'county', 'tract']
    elif geo_unit == 'cbg':
        geo_clause = 'for=block%20group:*&in=state:{state_code}&in=county:*&in=tract:*'
        geoid_parts = ['state', 'county', 'tract', 'block group']
    else:
        raise ValueError("Wrong geometry specified. \nPlease select 'ct' for Census Tract, 'cbg' for Census Block Group.")

    # the requested variable list is the same for every call
    columns = ','.join(tables)

    # per-(year, state) frames are gathered here and concatenated once at the
    # end; DataFrame.append was removed in pandas 2.0 and re-copied the whole
    # frame on every iteration
    frames = []

    # iterate over the vintage years
    for year in years:
        # initiate base API URL
        base_API_url = f'https://api.census.gov/data/{year}/acs/acs5?'

        # iterate over the state info
        for state in states:
            # generate state code and name
            state_code, state_name = state.split('_', maxsplit=1)

            # progress report (the trailing '/' of the state entry is dropped)
            print(f'year: {year}')
            print(f"---{state_name.rstrip('/')}")

            # build the query for this state
            query = base_API_url + f'get={columns}&' + geo_clause.format(state_code=state_code)

            # call API and record response
            with requests.get(query) as response:
                # fail with the HTTP error itself rather than with a JSON
                # decoding error on the error page
                response.raise_for_status()
                data = json.loads(response.content)

            # create temporary data frame (first record holds the header)
            temp_df = pd.DataFrame.from_records(data[1:], columns=data[0])

            # generate GEOID by concatenating the geography identifiers
            geoid = temp_df[geoid_parts[0]].astype(str)
            for part in geoid_parts[1:]:
                geoid = geoid + temp_df[part].astype(str)
            temp_df['GEOID'] = geoid

            # rename variable codes to their descriptions
            temp_df = temp_df.rename(columns=tables)

            # create vintage column
            temp_df['year'] = year

            frames.append(temp_df)

            # clear the notebook cell output so only the latest progress shows
            clear_output(wait=True)

    # nothing was requested: pd.concat would raise on an empty list
    if not frames:
        return pd.DataFrame()

    # build and clean main DF
    acs_data = pd.concat(frames, ignore_index=True)
    acs_data = acs_data.fillna(np.nan)

    # convert eligible columns to numeric
    for column in tables.values():
        acs_data[column] = pd.to_numeric(acs_data[column], errors='coerce')

    return acs_data

In [3]:
# State identifiers in the form 'XX_STATE_NAME/', where XX is the 2-digit
# state FIPS code. Order matters: other cells pick entries by position.
states_names = [
    '01_ALABAMA/', '02_ALASKA/', '04_ARIZONA/', '05_ARKANSAS/',
    '06_CALIFORNIA/', '08_COLORADO/', '09_CONNECTICUT/', '10_DELAWARE/',
    '11_DISTRICT_OF_COLUMBIA/', '12_FLORIDA/', '13_GEORGIA/', '15_HAWAII/',
    '16_IDAHO/', '17_ILLINOIS/', '18_INDIANA/', '19_IOWA/',
    '20_KANSAS/', '21_KENTUCKY/', '22_LOUISIANA/', '23_MAINE/',
    '24_MARYLAND/', '25_MASSACHUSETTS/', '26_MICHIGAN/', '27_MINNESOTA/',
    '28_MISSISSIPPI/', '29_MISSOURI/', '30_MONTANA/', '31_NEBRASKA/',
    '32_NEVADA/', '33_NEW_HAMPSHIRE/', '34_NEW_JERSEY/', '35_NEW_MEXICO/',
    '36_NEW_YORK/', '37_NORTH_CAROLINA/', '38_NORTH_DAKOTA/', '39_OHIO/',
    '40_OKLAHOMA/', '41_OREGON/', '42_PENNSYLVANIA/', '44_RHODE_ISLAND/',
    '45_SOUTH_CAROLINA/', '46_SOUTH_DAKOTA/', '47_TENNESSEE/', '48_TEXAS/',
    '49_UTAH/', '50_VERMONT/', '51_VIRGINIA/', '53_WASHINGTON/',
    '54_WEST_VIRGINIA/', '55_WISCONSIN/', '56_WYOMING/', '72_PUERTO_RICO/',
]

# FIPS codes only (the text before the first underscore), for use with the API
states_codes = [entry.partition('_')[0] for entry in states_names]

In [4]:
# ACS variables to request, in the form {variable code: output column name}.
# Codes come from https://api.census.gov/data/2019/acs/acs5/variables.html
# Key order determines the order of the columns requested from the API.
column_codes_ct = {
    'B01001_001E':'total_population',
    'B03002_003E':'white_no_lat',
    'B03002_004E':'black_no_lat',
    'B03002_005E':'native_no_lat',
    'B03002_006E':'asian_no_lat',
    'B03002_007E':'pacific_no_lat',
    'B03002_008E':'other_race_no_lat',
    'B03002_009E':'two_or_more_no_lat',
    'B01001I_001E':'hispanic_or_latino',
    'B01001_002E':'males',
    'B01002_001E':'median_age',
    'B19001_001E':'household_number',
    'B11001_002E':'family_households_number',
    'B06011_001E':'median_income',
    # This code used to be listed twice ('gini_index' here and
    # 'gini_income_ineq' further down). A dict literal keeps the first
    # position and the last value, so the single entry below reproduces the
    # dictionary that was actually being built.
    'B19083_001E':'gini_income_ineq',
    'B08303_001E':'total_workers',
    'B08303_002E':'Less than 5 minutes',
    'B08303_003E':'5 to 9 minutes',
    'B08303_004E':'10 to 14 minutes',
    'B08303_005E':'15 to 19 minutes',
    'B08303_006E':'20 to 24 minutes',
    'B08303_007E':'25 to 29 minutes',
    'B08303_008E':'30 to 34 minutes',
    'B08303_009E':'35 to 39 minutes',
    'B08303_010E':'40 to 44 minutes',
    'B08303_011E':'45 to 59 minutes',
    'B08303_012E':'60 to 89 minutes',
    'B08303_013E':'90 minutes or more',
    'B08101_009E':'car_commute',
    'B08101_025E':'public_transport_commute',
    'B16001_002E':'only_english',
    'B17001_001E':'poverty_status',
    'B19301_001E':'income_per_capita',
    'B15003_001E':'education_pop_over25',
    'B15003_017E':'high_school',
    'B15003_022E':'BS_diploma',
    'B15003_023E':'MS_diploma',
    'B15003_024E':'prof_diploma',
    # NOTE(review): 'phd_dimploma' is misspelled but is kept unchanged because
    # it becomes an output column name; rename only together with consumers.
    'B15003_025E':'phd_dimploma',
}

In [5]:
# define parameters
# vintages 2014 through 2019 (the end of the range is exclusive)
years = list(range(2014, 2020))
# single-state run: position 32 in states_names is '36_NEW_YORK/'
states = [states_names[32]]

In [6]:
# collect census-tract level data for the selected years and states
acs_ct_data = collect_acs_data(
    years=years,
    states=states,
    tables=column_codes_ct,
    geo_unit='ct',
)

year: 2019
---NEW_YORK


In [7]:
# save data in the form of a CSV file
import os

target_dir = "../output/"
# to_csv does not create missing folders, so make sure the target exists
os.makedirs(target_dir, exist_ok=True)
acs_ct_data.to_csv(os.path.join(target_dir, 'acs_5yr_data.csv'), index=False)