In [1]:
from pathlib import Path
import requests
from bs4 import BeautifulSoup
import json
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
sources = {
    'AHRQ SDOH': 'https://www.ahrq.gov/sdoh/data-analytics/sdoh-data.html',
    'CDC EPH': 'https://ephtracking.cdc.gov/',
    'CDC PLACES': 'https://www.cdc.gov/places/',
    'CDC PLACES DATA': 'https://chronicdata.cdc.gov/',
    'CDC/ATSDR SVI': 'https://www.atsdr.cdc.gov/placeandhealth/svi/',
    'NCHS Data Access': 'https://www.cdc.gov/nchs/data_access/ftp_data.htm',
    'EPA AQI': 'https://aqs.epa.gov/aqsweb/airdata/download_files.html',
}

In [3]:
DATA_YEAR = '2020'

In [4]:
data_dir = Path.cwd().parent / 'DATA'
data_dir.mkdir(exist_ok=True, parents=True)

In [5]:
year_data = data_dir / DATA_YEAR
year_data.mkdir(exist_ok=True)

In [6]:
for source, url in sources.items():
    try:
        resp = requests.get(url)
        resp.raise_for_status()
        print(f"site: {source}, OK")
    except:
        print(f"site: {source}, status: {resp.status_code}")

site: AHRQ SDOH, OK
site: CDC EPH, OK
site: CDC PLACES, OK
site: CDC PLACES DATA, OK
site: CDC/ATSDR SVI, OK
site: NCHS Data Access, OK
site: EPA AQI, OK


In [7]:
KB = 1024
CHUNK_SIZE = 8 * KB

## AHRQ SDOH

In [8]:
# AHRQ SDOH data files
# docs: https://www.ahrq.gov/sites/default/files/wysiwyg/sdoh/SDOH-Data-Sources-Documentation-v1-Final.pdf
SOURCE = 'AHRQ SDOH'
ahrq_url = 'https://www.ahrq.gov'
sdoh_url = sources[SOURCE]

In [9]:
# collect file urls
sdoh_data = {}
resp = requests.get(sdoh_url)
if resp.status_code == 200:
    print(f"site: {SOURCE}, OK")
    soup = BeautifulSoup(resp.content)
    file_table = soup.find_all('table')[0]
    table_rows = file_table.find_all('tr')
    if len(table_rows):
        print(f"file table found, row count: {len(table_rows)}\nheadlings:")
    for r, row in enumerate(table_rows):
        if not r:
            for head_cell in row.find_all('th'):
                print('\t', head_cell.text)
                cell_elems = head_cell.text.split()
                cell_elems.pop(-1)
                sdoh_data[' '.join(cell_elems)] = None
        else:
            first_col = row.find_all('td')[0]
            row_year = first_col.text.split()[0]
            if row_year == DATA_YEAR:
                print(f"data year found: {row_year}")
                row_data = zip(sdoh_data.keys(), row.find_all('td'))
                for data_key, cell_data in row_data:
                    cell_link_tag = cell_data.select_one('a')
                    cell_link = cell_link_tag.get('href')
                    if '://' not in cell_link:
                        cell_link = ahrq_url + cell_link
                    sdoh_data[data_key] = {
                        'url': cell_link,
                    }

site: AHRQ SDOH, OK
file table found, row count: 13
headlings:
	 Codebook Files 2009-2020
	 County Data 2009-2020
	 ZIP Code Data 2011-2020
	 Census Tract Data 2009-2020
data year found: 2020


In [10]:
ahrq_dir = year_data / 'AHRQ'
ahrq_dir.mkdir(exist_ok=True)

In [11]:
ahrq_sources = ahrq_dir / 'source_files.json'
with open(ahrq_sources, 'w') as js_out:
    json.dump(sdoh_data, js_out, indent=4)

In [12]:
# download data
for file_type, file_info in sdoh_data.items():
    file_url = file_info['url']
    file_name = file_url.split('/')[-1].upper()
    try:
        resp = requests.get(file_url, stream=True)
        resp.raise_for_status()
        local_file = ahrq_dir / file_name
        with open(local_file, 'wb') as f_out:
            for chunk in resp.iter_content(chunk_size=CHUNK_SIZE):
                f_out.write(chunk)
            file_size = local_file.stat().st_size / KB
            print(f"file written: {file_name}, size: {round(file_size, 2):,} kb")
    except Exception as e:
        print(e)

file written: SDOH_2020_CODEBOOK_1_0.XLSX, size: 440.0 kb
file written: SDOH_2020_COUNTY_1_0.XLSX, size: 9,984.0 kb
file written: SDOH_2020_ZIPCODE_1_0.XLSX, size: 60,056.0 kb
file written: SDOH_2020_TRACT_1_0.XLSX, size: 129,120.0 kb


## CDC SVI

Data & download page: <https://www.atsdr.cdc.gov/placeandhealth/svi/data_documentation_download.html>

**NOTE**: There is no clear API or easy method to automate the download from this page. So, simply use the form to manually download 2020 data in the following configurations:

* **GEO**: United States, **Geo Type**: Census Tracks, **File Type**: Shapefile
* **GEO**: United States, **Geo Type**: Census Tracks, **File Type**: csv
* **GEO**: United States, **Geo Type**: Counties, **File Type**: Shapefile
* **GEO**: United States, **Geo Type**: Counties, **File Type**: csv

In [13]:
# CDC SVI data files
# docs: https://www.atsdr.cdc.gov/placeandhealth/svi/documentation/pdf/SVI2020Documentation_08.05.22.pdf
svi_dir = year_data / 'CDC_SVI'
svi_dir.mkdir(exist_ok=True)

In [33]:
for f_obj in svi_dir.glob('*'):
    print(f_obj.name)
    if f_obj.is_dir():
        for f in f_obj.glob('*'):
            print(f"-- {f.name}")

SVI2020_US_census_track
-- SVI2020_US_tract.cpg
-- SVI2020_US_tract.dbf
-- SVI2020_US_tract.prj
-- SVI2020_US_tract.sbn
-- SVI2020_US_tract.sbx
-- SVI2020_US_tract.shp
-- SVI2020_US_tract.shp.xml
-- SVI2020_US_tract.shx
SVI2020_US_census_track.csv
SVI2020_US_COUNTY
-- SVI2020_US_county.cpg
-- SVI2020_US_county.dbf
-- SVI2020_US_county.prj
-- SVI2020_US_county.sbn
-- SVI2020_US_county.sbx
-- SVI2020_US_county.shp
-- SVI2020_US_county.shp.xml
-- SVI2020_US_county.shx
SVI2020_US_COUNTY.csv


## CDC PLACES

In [15]:
# CDC SVI data files
places_dir = year_data / 'CDC_PLACES'
places_dir.mkdir(exist_ok=True)

In [16]:
# set up query url
SOURCE = 'CDC PLACES DATA'
places_url = sources[SOURCE]
places_query_url = places_url + f"browse?q=PLACES%20{DATA_YEAR}"
print(f"PLACES dataset url: {places_query_url}")

PLACES dataset url: https://chronicdata.cdc.gov/browse?q=PLACES%202020


In [24]:
chronic_data = 'https://chronicdata.cdc.gov'
resources_url = chronic_data + '/resource'
api_url = chronic_data + '/api/views'

In [18]:
dataset_link_class = 'browse2-result-name-link'

In [19]:
dwnld_sfx = 'rows.csv?accessType=DOWNLOAD'

In [25]:
# collect dataset pages
places_datasets = {}
resp = requests.get(places_query_url)
if resp.status_code == 200:
    soup = BeautifulSoup(resp.content)
    n = 0
    for ds_link in soup.find_all('a', class_=dataset_link_class):
        if DATA_YEAR in ds_link.text or 'Dictionary' in ds_link.text:
            n += 1
            ds_page = ds_link.get('href')
            ds_id = ds_page.split('/')[-1]
            places_datasets[ds_link.text] = {
                'page': ds_page,
                'ds_id': ds_id,
                'download': f"{api_url}/{ds_id}/{dwnld_sfx}" ,
            }
            print(f"{n}. dataset: {ds_link.text}")
            print(f"-- page: {ds_page}")
            print(f"-- ID: {ds_id}\n")            

1. dataset: PLACES and 500 Cities: Data Dictionary
-- page: https://chronicdata.cdc.gov/500-Cities-Places/PLACES-and-500-Cities-Data-Dictionary/m35w-spkz
-- ID: m35w-spkz

2. dataset: PLACES: Local Data for Better Health, Place Data 2020 release
-- page: https://chronicdata.cdc.gov/500-Cities-Places/PLACES-Local-Data-for-Better-Health-Place-Data-202/q8xq-ygsk
-- ID: q8xq-ygsk

3. dataset: PLACES: Local Data for Better Health, County Data 2020 release
-- page: https://chronicdata.cdc.gov/500-Cities-Places/PLACES-Local-Data-for-Better-Health-County-Data-20/dv4u-3x3q
-- ID: dv4u-3x3q

4. dataset: PLACES: Local Data for Better Health, ZCTA Data 2020 release
-- page: https://chronicdata.cdc.gov/500-Cities-Places/PLACES-Local-Data-for-Better-Health-ZCTA-Data-2020/fbbf-hgkc
-- ID: fbbf-hgkc

5. dataset: PLACES: Local Data for Better Health, Census Tract Data 2020 release
-- page: https://chronicdata.cdc.gov/500-Cities-Places/PLACES-Local-Data-for-Better-Health-Census-Tract-D/4ai3-zynv
-- ID: 

In [21]:
places_sources = places_dir / 'source_files.json'
with open(places_sources, 'w') as js_out:
    json.dump(places_datasets, js_out, indent=4)

In [28]:
download_type = 'csv'
remv = [':', ',', '(', ')', '2020', 'release']

In [32]:
# download data
for dataset, ds_info in places_datasets.items():
    download_url = ds_info['download']
    raw_name = dataset
    for rm in remv:
        raw_name = raw_name.replace(rm, '')
    ds_filenm = '-'.join(raw_name.split())
    ds_file = places_dir / f"{ds_filenm}.{download_type}"
    try:
        resp = requests.get(download_url, stream=True)
        resp.raise_for_status()
        print(f"successful response: {raw_name}")
        with open(ds_file, 'wb') as f_out:
            for n, chunk in enumerate(resp.iter_content(chunk_size=CHUNK_SIZE), start=1):
                f_out.write(chunk)
                if not n % 250:
                    print(f"-- chunk {n} written")
            file_size = ds_file.stat().st_size / KB
            print(f"file written: {ds_file.name}, size: {round(file_size, 2):,} kb")
    except Exception as e:
        print(e)

successful response: PLACES and 500 Cities Data Dictionary
file written: PLACES-and-500-Cities-Data-Dictionary.csv, size: 0.0 kb
successful response: PLACES Local Data for Better Health Place Data  
-- chunk 250 written
-- chunk 500 written
-- chunk 750 written
-- chunk 1000 written
-- chunk 1250 written
-- chunk 1500 written
-- chunk 1750 written
-- chunk 2000 written
-- chunk 2250 written
-- chunk 2500 written
-- chunk 2750 written
-- chunk 3000 written
-- chunk 3250 written
file written: PLACES-Local-Data-for-Better-Health-Place-Data.csv, size: 376,901.21 kb
successful response: PLACES Local Data for Better Health County Data  
-- chunk 250 written
file written: PLACES-Local-Data-for-Better-Health-County-Data.csv, size: 41,931.67 kb
successful response: PLACES Local Data for Better Health ZCTA Data  
-- chunk 250 written
-- chunk 500 written
-- chunk 750 written
-- chunk 1000 written
-- chunk 1250 written
-- chunk 1500 written
-- chunk 1750 written
file written: PLACES-Local-Data-fo

## EPA AQI

Manual download of 2020 Daily AQI data from: <https://aqs.epa.gov/aqsweb/airdata/download_files.html>

Documentation: <https://aqs.epa.gov/aqsweb/airdata/FileFormats.html>

## 5. Daily Summary Files

### 5.1. Content

Each daily summary file contains data for every monitor (sampled parameter) in our database for each day. These files are separated by parameter (or parameter group) to make the sizes more manageable.

This file will contain a daily summary record that is:

1) The aggregate of all sub-daily measurements taken at the monitor.
2) The single sample value if the monitor takes a single, daily sample (e.g., there is only one sample with a 24-hour duration). In this case, the mean and max daily sample will have the same value.

The daily summary files contain (at least) one record for each monitor that reported data for the given day. There may be multiple records for the monitor if:

* There are calculated sample durations for the pollutant. For example, PM2.5 is sometimes reported as 1-hour samples and EPA calculates 24-hour averages.
* There are multiple standards for the pollutant (q.v. pollutant standards).
* There were exceptional events associated with some measurements that the monitoring agency has or may request be excluded from comparison to the standard.

### 5.2. Format

The file is comma separated variables (CSV) with a header row.

| Field Position | Field Name | Description |
|---|---|---|
| 1 | State Code | The FIPS code of the state in which the monitor resides. |
| 2 | County Code | The FIPS code of the county in which the monitor resides. |
| 3 | Site Num | A unique number within the county identifying the site. |
| 4 | Parameter Code | The AQS code corresponding to the parameter measured by the monitor. |
| 5 | POC | This is the "Parameter Occurrence Code" used to distinguish different instruments that measure the same parameter at the same site. |
| 6 | Latitude | The monitoring site's angular distance north of the equator measured in decimal degrees |
| 7 | Longitude | The monitoring site's angular distance east of the prime meridian measured in decimal degrees. |
| 8 | Datum | The Datum associated with the Latitude and Longitude measures. |
| 9 | Parameter Name | The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants. |
| 10 | Sample Duration | The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour). |
| 11 | Pollutant Standard | A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.) |
| 12 | Date Local | The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor. |
| 13 | Units of Measure | The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations. |
| 14 | Event Type | Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data form them is excluded from the summary. Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor. |
| 15 | Observation Count | The number of observations (samples) taken during the day. |
| 16 | Observation Percent | The percent representing the number of observations taken with respect to the number scheduled to be taken during the day. This is only calculated for monitors where measurements are required (e.g., only certain parameters). |
| 17 | Arithmetic Mean | The average (arithmetic mean) value for the day. |
| 18 | 1 st Max Value | The highest value for the day. |
| 19 | 1 st Max Hour | The hour (on a 24-hour clock) when the highest value for the day (the previous field) was taken. |
| 20 | AQI | The Air Quality Index for the day for the pollutant, if applicable. |
| 21 | Method Code | An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column. |
| 22 | Method Name | A short description of the processes, equipment, and protocols used in gathering and measuring the sample. |
| 23 | Local Site Name | The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it. |
| 24 | Address | The approximate street address of the monitoring site. |
| 25 | State Name | The name of the state where the monitoring site is located. |
| 26 | County Name | The name of the county where the monitoring site is located. |
| 27 | City Name | The name of the city where the monitoring site is located. This represents the legal incorporated boundaries of cities and not urban areas. |
| 28 | CBSA Name | The name of the core bases statistical area (metropolitan area) where the monitoring site is located. |
| 29 | Date of Last Change | The date the last time any numeric values in this record were updated in the AQS data system. |