In [163]:
import pandas as pd
import us
import requests
import json
from collections import defaultdict


In [164]:
# Endpoints of the data.census.gov API. Both end in '?' because
# _read_census_json() appends a raw query string directly to them.
# Endpoint queried for the table contents.
DATA_URL = 'https://data.census.gov/api/access/data/table?'
# Endpoint queried for the table metadata (ids and labels of measures/dimensions).
METADATA_URL = 'https://data.census.gov/api/search/metadata/table?'

In [165]:
# Query strings passed to _read_census_json(), one per table to download.
# NOTE(review): `g=0100000US$0500000` presumably selects every county within
# the United States -- confirm against the Census API documentation.
EDUCATION_URL_QUERY = 'id=ACSST5Y2020.S1501&t=Educational+Attainment&g=0100000US$0500000'
RACE_URL_QUERY      = 'id=DECENNIALPL2020.P2&t=race&g=0100000US$0500000&tp=true'

In [166]:
# Label prefix shared by every education-table column used in this notebook.
EDUCATION_COLUMNS_PREFIX = 'Estimate!!Total!!RACE AND HISPANIC OR LATINO ORIGIN BY EDUCATIONAL ATTAINMENT!!'

# Census column label -> short column name used downstream.
EDUCATION_COLUMNS_DICT = {
    f'{EDUCATION_COLUMNS_PREFIX}White alone, not Hispanic or Latino': 'total_white',
    f"{EDUCATION_COLUMNS_PREFIX}White alone, not Hispanic or Latino!!Bachelor's degree or higher": 'white_college',
}


In [167]:
# Label prefix shared by the single-race, non-Hispanic columns of the race table.
RACE_COLUMNS_PREFIX = ' !!Total:!!Not Hispanic or Latino:!!Population of one race:!!'

# Census column label -> short column name used downstream. Insertion order
# matters: it fixes the column order wherever list(RACE_COLUMNS_DICT.values())
# is used to select columns.
RACE_COLUMNS_DICT = {
    f'{RACE_COLUMNS_PREFIX}Black or African American alone': 'black',
    ' !!Total:!!Hispanic or Latino': 'hispanic',
    f'{RACE_COLUMNS_PREFIX}American Indian and Alaska Native alone': 'native',
    f'{RACE_COLUMNS_PREFIX}Asian alone': 'asian',
    f'{RACE_COLUMNS_PREFIX}Native Hawaiian and Other Pacific Islander alone': 'hawaiian_pi',
    f'{RACE_COLUMNS_PREFIX}Some Other Race alone': 'other',
    ' !!Total:!!Not Hispanic or Latino:!!Population of two or more races:': 'mixed',
}



In [168]:
def _read_census_json(query: str, timeout: float = 60) -> pd.DataFrame:
    """Download one data.census.gov table and give its columns readable labels.

    Two requests are made with the same query string: one to METADATA_URL to
    learn the label of every measure/dimension id, and one to DATA_URL for
    the table itself.

    Args:
        query: URL query string appended verbatim to both endpoints.
        timeout: Seconds to wait on each HTTP request before giving up.
            Without a timeout `requests` would wait forever on a stalled
            connection and hang the notebook.

    Returns:
        A DataFrame with one row per data row of the response (the header
        row is consumed as column ids). Ids found in the metadata are
        replaced by their labels; unknown ids are kept unchanged.

    Raises:
        requests.HTTPError: if either endpoint answers with an error status.
        requests.Timeout: if either endpoint does not answer within `timeout`.
        ValueError: if the data response contains no rows at all.
    """
    metadata_response = requests.get(METADATA_URL + query, timeout=timeout)
    metadata_response.raise_for_status()
    metadata_content = metadata_response.json()['response']['metadataContent']

    # id -> label, taken from the measures and from every dimension that
    # carries an 'item' entry.
    column_mapping: dict[str, str] = {
        measure['id']: measure['label'] for measure in metadata_content['measures']
    }
    for dimension in metadata_content['dimensions']:
        if 'item' in dimension:
            column_mapping[dimension['item']['id']] = dimension['item']['label']

    data_response = requests.get(DATA_URL + query, timeout=timeout)
    data_response.raise_for_status()
    data = data_response.json()['response']['data']
    if not data:
        raise ValueError(f'Census API returned no rows for query: {query}')

    raw = pd.DataFrame(data)

    # The first row stores the hard-to-read column ids; split it off without
    # mutating `raw` and translate each id to its label where one is known.
    column_ids = raw.iloc[0]
    table = raw.iloc[1:].reset_index(drop=True)
    table.columns = [column_mapping.get(col_id, col_id) for col_id in column_ids]

    return table

In [169]:
# Download both county-level tables (network access required).
county_2020_education_raw = _read_census_json(query=EDUCATION_URL_QUERY)
county_2020_race_raw = _read_census_json(query=RACE_URL_QUERY)

In [170]:
# Dump the raw race table to 'test.csv' in the working directory for inspection.
county_2020_race_raw.to_csv(path_or_buf='test.csv')

In [171]:
# Add a 'FIPS' column to both raw tables: whatever follows 'US' in the
# 'Geography' identifier of each row.
for _raw_table in (county_2020_education_raw, county_2020_race_raw):
    _raw_table['FIPS'] = _raw_table['Geography'].apply(
        lambda geo_id: geo_id.split('US')[1]
    )

In [172]:
def _only_voting_states_filter(row) -> bool:
    """Tell whether a row belongs to one of the states or to DC.

    Args:
        row: Mapping-like row (e.g. a DataFrame row) whose 'FIPS' entry is a
            string; its first two characters are taken as the state FIPS code.

    Returns:
        True when the code is that of the District of Columbia or of a state
        listed in `us.states.STATES`; False otherwise (territories, empty or
        unknown codes).
    """
    state_fips = row['FIPS'][:2]

    # Compare FIPS codes directly instead of going through us.states.lookup().
    # Depending on the installed `us` release and the DC_STATEHOOD environment
    # variable, lookup() may not resolve DC at all, which would silently drop
    # DC rows even though the intent is clearly to keep them.
    if state_fips == us.states.DC.fips:
        return True
    return any(state_fips == state.fips for state in us.states.STATES)

In [173]:
# Keep only the rows accepted by _only_voting_states_filter, give the columns
# of interest their short names, and drop every other column.
_education_row_mask = county_2020_education_raw.apply(_only_voting_states_filter, axis=1)
county_2020_education = (
    county_2020_education_raw[_education_row_mask]
    .rename(columns=EDUCATION_COLUMNS_DICT)
    .loc[:, ['FIPS', *EDUCATION_COLUMNS_DICT.values()]]
)

In [174]:
# Keep only the rows accepted by _only_voting_states_filter, give the columns
# of interest their short names, and drop every other column.
_race_row_mask = county_2020_race_raw.apply(_only_voting_states_filter, axis=1)
county_2020_race = (
    county_2020_race_raw[_race_row_mask]
    .rename(columns=RACE_COLUMNS_DICT)
    .loc[:, ['FIPS', *RACE_COLUMNS_DICT.values()]]
)

In [175]:
# Convert every renamed count column to int so the counts can be added and
# subtracted; 'FIPS' is left as it is.
county_2020_education = county_2020_education.astype(
    {name: int for name in EDUCATION_COLUMNS_DICT.values()}
)
county_2020_race = county_2020_race.astype(
    {name: int for name in RACE_COLUMNS_DICT.values()}
)

In [176]:
# Join the race and education tables on 'FIPS' (default inner join: rows
# present in only one table are dropped) and index the result by 'FIPS'.
_race_with_education = pd.merge(county_2020_race, county_2020_education, on='FIPS')
merged = _race_with_education.set_index('FIPS')

In [177]:
# Collapse the merged columns into the five groups written to the JSON file.
_asian_other_total = (
    merged['native']
    + merged['asian']
    + merged['hawaiian_pi']
    + merged['other']
    + merged['mixed']
)
output = pd.DataFrame({
    # 'total_white' minus the 'white_college' subset.
    'white-nocollege': merged['total_white'] - merged['white_college'],
    'white-college': merged['white_college'],
    'hispanic': merged['hispanic'],
    'black': merged['black'],
    'asian-other': _asian_other_total,
})


In [180]:
# Merge this notebook's results into '2020-pres/demographics.json', keeping
# any other keys the file already holds.
try:
    with open('2020-pres/demographics.json') as fh:
        obj = json.load(fh)
except (FileNotFoundError, json.JSONDecodeError):
    # No usable file yet: start from an empty document.
    obj = {}

obj['demographicDataByCounty'] = output.to_dict('index')

# Update the totals entry by entry so totals stored under other keys survive.
nationwide = obj.setdefault('nationwidePopulation', {})
for col in output.columns:
    # int() makes sure a plain Python int (JSON-serializable) is stored.
    nationwide[col] = int(output[col].sum())

# Build and encode the whole document BEFORE opening the file for writing:
# opening in write mode truncates it, so any failure above must not be able
# to leave the existing file empty or half-written.
serialized = json.dumps(obj, indent=4)
with open('2020-pres/demographics.json', 'w') as fh:
    fh.write(serialized)