In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
import plotly.express as px

In [2]:
# Root folder holding all input data: the 'DATA' directory inside the parent of
# the current working directory, so the result depends on where the kernel runs.
data_dir = Path.cwd().parent.joinpath('DATA')

In [3]:
# Year to analyse; also used as the name of the per-year sub-folder of data_dir.
YEAR = '2020'
year_data = data_dir.joinpath(YEAR)

In [13]:
# Map each data-source name to its directory under the selected year.
# Entries whose name starts with '.' are skipped. The listing is sorted because
# Path.glob() yields results in no guaranteed order, which would otherwise make
# the dict order (and everything printed from it) filesystem-dependent.
sources = {
    src_dir.name: src_dir
    for src_dir in sorted(year_data.glob('*'))
    if src_dir.is_dir() and not src_dir.name.startswith('.')
}
print(f"data sources: {list(sources.keys())}")

data sources: ['AHRQ', 'CDC_PLACES', 'CDC_SVI', 'EPA_AQI']


## NOTE: the goal is to merge these data sources at the US county level

In [28]:
# For every source directory, select the county-level data file: an .xlsx or
# .csv file whose name contains 'COUNTY' but not 'GIS' (case-insensitive).
# Only one file is kept per source.
source_files = {}
for src, src_dir in sources.items():
    print(src)
    # Sorted so the file that ends up selected does not depend on the
    # (unspecified) order in which the filesystem lists directory entries.
    for f_obj in sorted(src_dir.glob('*')):
        name_upper = f_obj.name.upper()
        if 'COUNTY' not in name_upper or 'GIS' in name_upper:
            continue
        if f_obj.suffix.upper() not in ('.XLSX', '.CSV'):
            continue
        print(' ', f_obj.name)
        if src in source_files:
            # A later match replaces an earlier one; make that visible instead
            # of silently dropping the earlier file.
            print(f"  NOTE: replaces earlier match {source_files[src].name}")
        source_files[src] = f_obj
source_files

AHRQ
  SDOH_2020_COUNTY_1_0.XLSX
CDC_PLACES
  PLACES-Local-Data-for-Better-Health-County-Data.csv
CDC_SVI
  SVI2020_US_COUNTY.csv
EPA_AQI
  daily_aqi_by_county_2020.csv


{'AHRQ': WindowsPath('C:/Users/AliHashmi/Desktop/Development/FDA-TOP-Sprint/DATA/2020/AHRQ/SDOH_2020_COUNTY_1_0.XLSX'),
 'CDC_PLACES': WindowsPath('C:/Users/AliHashmi/Desktop/Development/FDA-TOP-Sprint/DATA/2020/CDC_PLACES/PLACES-Local-Data-for-Better-Health-County-Data.csv'),
 'CDC_SVI': WindowsPath('C:/Users/AliHashmi/Desktop/Development/FDA-TOP-Sprint/DATA/2020/CDC_SVI/SVI2020_US_COUNTY.csv'),
 'EPA_AQI': WindowsPath('C:/Users/AliHashmi/Desktop/Development/FDA-TOP-Sprint/DATA/2020/EPA_AQI/daily_aqi_by_county_2020.csv')}

In [32]:
# Load each selected source file and report its shape and column names.
# NOTE(review): county FIPS columns may be parsed as integers and lose leading
# zeros - confirm dtypes before merging on them.
for src, src_file in source_files.items():
    suffix = src_file.suffix.upper()
    if suffix == '.XLSX':
        # The context manager closes the workbook handle once the sheet is
        # read; leaving it open leaks the handle and can keep the file locked.
        with pd.ExcelFile(src_file) as xl_file:
            src_data = pd.read_excel(xl_file, sheet_name='Data')
    elif suffix == '.CSV':
        src_data = pd.read_csv(src_file)
    else:
        # Without this guard the report below would describe the previous
        # iteration's frame under the wrong source name.
        print(f"{src}: skipping unsupported file type {src_file.suffix}")
        continue
    n_rows, n_cols = src_data.shape
    print(f"{src} data: {n_rows} rows, {n_cols} cols")
    print(src_data.columns)

AHRQ data: 3229 rows, 682 cols
Index(['YEAR', 'COUNTYFIPS', 'STATEFIPS', 'STATE', 'COUNTY', 'REGION',
       'TERRITORY', 'ACS_TOT_POP_WT', 'ACS_TOT_POP_US_ABOVE1',
       'ACS_TOT_POP_ABOVE5',
       ...
       'POS_TOT_HOSP_AMBULANCE', 'POS_HOSP_AMBULANCE_RATE',
       'POS_TOT_HOSP_CHEMO', 'POS_HOSP_CHEMO_RATE', 'POS_TOT_HOSP_ED',
       'POS_HOSP_ED_RATE', 'POS_PCT_HOSP_FOR_PROFIT',
       'POS_PCT_HOSP_NON_PROFIT', 'POS_PCT_HOSP_GOV', 'CEN_AIAN_NH_IND'],
      dtype='object', length=682)
CDC_PLACES data: 1000 rows, 21 cols
Index(['year', 'stateabbr', 'statedesc', 'locationname', 'datasource',
       'category', 'measure', 'data_value_unit', 'data_value_type',
       'data_value', 'data_value_footnote_symbol', 'data_value_footnote',
       'low_confidence_limit', 'high_confidence_limit', 'totalpopulation',
       'latitude', 'categoryid', 'measureid', 'datavaluetypeid',
       'short_question_text', 'geolocation'],
      dtype='object')
CDC_SVI data: 3143 rows, 158 cols
Index(['ST'