In [19]:
from pathlib import Path
import pandas as pd
import numpy as np
import plotly.express as px
import csv
import json
import us

In [2]:
# Root folder for all input data: a 'DATA' directory that sits next to the
# current working directory (i.e. under the parent of cwd).
data_dir = Path.cwd().parent.joinpath('DATA')

In [3]:
# Year under analysis; the same string is the name of the per-year
# sub-folder inside the data directory.
YEAR = '2020'
year_data = data_dir.joinpath(YEAR)

In [4]:
# Map each data-source folder name to its path. Only directories found
# directly under the year folder count; plain files and dot-prefixed
# (hidden) directories are ignored.
sources = {}
for entry in year_data.glob('*'):
    if not entry.is_dir() or entry.name.startswith('.'):
        continue
    sources[entry.name] = entry
print(f"data sources: {list(sources.keys())}")

data sources: ['AHRQ', 'CDC_PLACES', 'CDC_SVI', 'EPA']


## Reference: County FIPS to County Name
Interactive page, plus multiple download formats  
<https://coastwatch.pfeg.noaa.gov/erddap/convert/fipscounty.html>

In [14]:
# load crosswalk
# The JSON file wraps its payload in a top-level 'table' object; the 'rows'
# entry of that object holds the individual crosswalk records.
fips_xref = 'FipsCountyCodes'
fips_xref_js = data_dir / f"{fips_xref}.json"
# Read as UTF-8 explicitly: without `encoding`, open() falls back to the
# platform locale (e.g. cp1252 on Windows), which can mis-decode or reject
# non-ASCII characters in place names.
with open(fips_xref_js, 'r', encoding='utf-8') as js_in:
    fips_raw = json.load(js_in)['table']
print(fips_raw.keys())
fips_codes = fips_raw['rows']
print(f"code entries loaded: {len(fips_codes)}")

dict_keys(['columnNames', 'columnTypes', 'rows'])
code entries loaded: 3353


In [21]:
# Lookup built by the `us` package from its 'abbr' and 'name' fields --
# presumably {abbreviation: full name} for states/territories (TODO confirm
# which territories the installed `us` version includes). It is queried with
# .get(<abbr>) when the crosswalk rows are split into states and counties.
st_abbr_names = us.states.mapping('abbr', 'name')

In [30]:
# Split the crosswalk rows into state/territory entries and county entries.
#
# Each row is a (code, name) pair. A name without a comma is treated as a
# state/territory abbreviation; a name containing a comma is treated as a
# county entry.
#
# Results:
#   states_terr          {abbr: {'fips': first two chars of code, 'name': full name}}
#   counties             {county name: code}  (last row wins on repeated names)
#   county_names_by_fips {code: county name}  (lossless, one entry per code)
#   name_collisions      [(name, earlier code, later code), ...]
#
# `counties` is keyed by name, so a name that appears with more than one code
# overwrites the earlier code. The recorded run of this notebook loaded 3353
# rows but only 55 + 4 + 3289 = 3348 were accounted for, so collisions are
# recorded and reported here instead of being dropped silently.
states_terr = {}
counties = {}
county_names_by_fips = {}
name_collisions = []
for code, name in fips_codes:
    if ',' not in name:
        st_name = st_abbr_names.get(name)
        if st_name is None:
            # Abbreviation unknown to the state lookup: report it and move on.
            print(f"skipping abbr: {name}, fips: {code}")
            continue
        states_terr[name] = {
            'fips': code[0:2],
            'name': st_name,
        }
    else:
        earlier_code = counties.get(name)
        if earlier_code is not None and earlier_code != code:
            name_collisions.append((name, earlier_code, code))
        counties[name] = code
        county_names_by_fips[code] = name
print(f"loaded states/territories: {len(states_terr)}")
print(f"loaded counties: {len(counties)}")
if name_collisions:
    print(
        f"WARNING: {len(name_collisions)} county name(s) appear with more than one "
        "FIPS code; `counties` keeps only the last code for each "
        "(see `name_collisions` and `county_names_by_fips`)"
    )

skipping abbr: FM, fips: 64000
skipping abbr: MH, fips: 68000
skipping abbr: PW, fips: 70000
skipping abbr: UM, fips: 74000
loaded states/territories: 55
loaded counties: 3289


## NOTE: the goal is to merge the data sources at the US county level

In [6]:
# For every data source, select the county-level tabular file: the file name
# must contain 'COUNTY' and must not contain 'GIS' (case-insensitive), and
# the extension must be .xlsx or .csv. If several files in one folder match,
# the last one visited is the one kept.
source_files = {}
for src, src_dir in sources.items():
    print(src)
    for candidate in src_dir.glob('*'):
        upper_name = candidate.name.upper()
        if 'COUNTY' not in upper_name or 'GIS' in upper_name:
            continue
        if candidate.suffix.upper() not in ('.XLSX', '.CSV'):
            continue
        print(' ', candidate.name)
        source_files[src] = candidate
source_files

AHRQ
  SDOH_2020_COUNTY_1_0.XLSX
CDC_PLACES
  PLACES-Local-Data-for-Better-Health-County-Data.csv
CDC_SVI
  SVI2020_US_COUNTY.csv
EPA
  daily_aqi_by_county_2020.csv


{'AHRQ': WindowsPath('C:/Users/aaimr/Desktop/Dev/FDA-TOP-Sprint/DATA/2020/AHRQ/SDOH_2020_COUNTY_1_0.XLSX'),
 'CDC_PLACES': WindowsPath('C:/Users/aaimr/Desktop/Dev/FDA-TOP-Sprint/DATA/2020/CDC_PLACES/PLACES-Local-Data-for-Better-Health-County-Data.csv'),
 'CDC_SVI': WindowsPath('C:/Users/aaimr/Desktop/Dev/FDA-TOP-Sprint/DATA/2020/CDC_SVI/SVI2020_US_COUNTY.csv'),
 'EPA': WindowsPath('C:/Users/aaimr/Desktop/Dev/FDA-TOP-Sprint/DATA/2020/EPA/daily_aqi_by_county_2020.csv')}

In [7]:
# Preview every selected source file: load it into a DataFrame, then print
# its row/column counts and its column index. Each iteration replaces
# `src_data`, so only the last loaded frame remains after the loop.
for src, src_file in source_files.items():
    suffix = src_file.suffix.upper()
    if suffix == '.XLSX':
        # Open the workbook in a context manager so its file handle is closed
        # once the 'Data' sheet has been read (an unclosed ExcelFile keeps the
        # file open, which locks it on Windows).
        with pd.ExcelFile(src_file) as xl_file:
            src_data = pd.read_excel(xl_file, sheet_name='Data')
    elif suffix == '.CSV':
        src_data = pd.read_csv(src_file)
    else:
        # Guard against reporting the previous iteration's frame (or hitting a
        # NameError on the first iteration) for an unexpected file type.
        print(f"{src}: unsupported file type {src_file.suffix!r}, skipping")
        continue
    n_rows, n_cols = src_data.shape
    print(f"{src} data: {n_rows} rows, {n_cols} cols")
    print(src_data.columns)

AHRQ data: 3229 rows, 682 cols
Index(['YEAR', 'COUNTYFIPS', 'STATEFIPS', 'STATE', 'COUNTY', 'REGION',
       'TERRITORY', 'ACS_TOT_POP_WT', 'ACS_TOT_POP_US_ABOVE1',
       'ACS_TOT_POP_ABOVE5',
       ...
       'POS_TOT_HOSP_AMBULANCE', 'POS_HOSP_AMBULANCE_RATE',
       'POS_TOT_HOSP_CHEMO', 'POS_HOSP_CHEMO_RATE', 'POS_TOT_HOSP_ED',
       'POS_HOSP_ED_RATE', 'POS_PCT_HOSP_FOR_PROFIT',
       'POS_PCT_HOSP_NON_PROFIT', 'POS_PCT_HOSP_GOV', 'CEN_AIAN_NH_IND'],
      dtype='object', length=682)
CDC_PLACES data: 176008 rows, 21 cols
Index(['Year', 'StateAbbr', 'StateDesc', 'LocationName', 'DataSource',
       'Category', 'Measure', 'Data_Value_Unit', 'Data_Value_Type',
       'Data_Value', 'Data_Value_Footnote_Symbol', 'Data_Value_Footnote',
       'Low_Confidence_Limit', 'High_Confidence_Limit', 'TotalPopulation',
       'Latitude', 'CategoryID', 'MeasureId', 'DataValueTypeID',
       'Short_Question_Text', 'Geolocatioin'],
      dtype='object')
CDC_SVI data: 3143 rows, 158 cols
Index(['