# 3. EPA Data

Format data into some relevant CSVs for later analysis

In [None]:
# set up path to app credentials - see exploration/README.md
# (%env sets the variable for this kernel session; the path is relative to the
#  notebook's working directory)
%env GOOGLE_APPLICATION_CREDENTIALS=../google_app_credentials.json

# set up bigquery client
# `bq` is the shared client used by every query in this notebook. It is built
# with no explicit arguments - presumably it picks up the credentials file
# configured above, so this cell must run first.
from google.cloud import bigquery
bq = bigquery.Client()

In [None]:
# set up some dependencies
import json, time, random, csv
import geopandas as gp
import pandas as pd
import matplotlib.pyplot as plt
import descartes # for plotting with geopandas

In [None]:
# Find every distinct `do_date` across the tables matching `acs_cbsa_20*`,
# in ascending order. The resulting `years` list drives the per-year export
# loop at the end of the notebook.
resp = bq.query('''
    SELECT DISTINCT do_date AS year
    FROM `eosc410-project.data.acs_cbsa_20*`
    ORDER BY do_date ASC
''')
# iterating the query response yields one row per distinct year
years = [row["year"] for row in resp]

## Selecting Metrics for Air Quality

The EPA data provides a wide variety of measurements for interpreting air quality, taken with various methods and timescales. We are interested in the following measurements in particular, which we will use in this exploration:

* PM2.5 (https://aqs.epa.gov/aqsweb/documents/codetables/methods_speciation.html)
    * Acceptable PM2.5 AQI & Speciation Mass (88502) - note AQI is air quality index
* Meteorological (https://aqs.epa.gov/aqsweb/documents/codetables/methods_met.html):
    * ~Ammonia (62604) in rain~ - unfortunately this does not seem to be collected by any stations in the EPA dataset
    * Average Ambient Temperature (68105)

Since availability of data at each station varies significantly, we'll avoid taking too many features.

TODO: continue assessing what features are available

In [196]:
def collect_measurements(year: str, epa: pd.DataFrame, parameters = (88502, 68105)) -> pd.DataFrame:
    '''
    Utility function to collect relevant measurements from each station.

    Queries `eosc410-project.data.epa_air_quality_annual` for a single year,
    keeping only the requested parameter codes, 24-hour sample durations and
    the sites listed in `epa`. Matching rows are averaged per
    "state.county.site.parameter" measurement id, then joined back to `epa`
    so each measurement carries its ACS geography.

    Parameters
    ----------
    year
        Year to query; interpolated into the SQL as `epa.year = <year>`.
    epa
        Site index with at least the columns `epa_site`
        ("state.county.site") and `acs_geoid`.
    parameters
        Iterable of EPA parameter codes to keep. Defaults to
        88502 (PM2.5) and 68105 (average ambient temperature).

    Returns
    -------
    pd.DataFrame
        Columns `epa_measurement`, `epa_site`, `mean`, `stdev`, `units`,
        `acs_geoid` and `combined_id`
        ("<acs_geoid>.epa.<epa_measurement>"). Empty (same columns) when no
        sites or no parameters are given.
    '''
    # Each site only needs to appear once in the IN-list.
    regions = epa['epa_site'].drop_duplicates()
    parameters = list(parameters)

    # An empty IN-list cannot match anything, so skip the round-trip to
    # BigQuery and hand back an empty frame with the usual columns.
    if len(regions) == 0 or len(parameters) == 0:
        return pd.DataFrame(columns=[
            'epa_measurement', 'epa_site', 'mean', 'stdev', 'units',
            'acs_geoid', 'combined_id',
        ])

    # NOTE: values are interpolated straight into the SQL text. That is fine
    # for the project's own CSVs; do not feed this untrusted input.
    regions_query = ','.join(['"%s"' % r for r in regions])
    parameters_query = ','.join([str(p) for p in parameters])
    resp = bq.query('''
        WITH query AS ( SELECT [ %s ] AS sites, [ %s ] AS parameters ),
            filtered AS (
            SELECT
                CONCAT(epa.state_code, ".", epa.county_code, ".", epa.site_num) AS epa_site,
                CONCAT(epa.state_code, ".", epa.county_code, ".", epa.site_num, ".", epa.parameter_code) AS epa_measurement,
                epa.arithmetic_mean,
                epa.arithmetic_standard_dev,
                epa.units_of_measure
            FROM
                query, `eosc410-project.data.epa_air_quality_annual` AS epa
            WHERE
                epa.year = %s                                                                              # one year at a time
                AND epa.parameter_code IN UNNEST(query.parameters)                                         # only relevant parameters
                AND (epa.sample_duration = '24 HOUR' OR epa.sample_duration = '24-HR BLK AVG')             # 1hr is too granular
                AND CONCAT(epa.state_code, ".", epa.county_code, ".", epa.site_num) IN UNNEST(query.sites) # relevant sites only
        )

        SELECT
            filtered.epa_measurement,
            ANY_VALUE(filtered.epa_site) AS epa_site,
            AVG(filtered.arithmetic_mean) AS mean,
            AVG(filtered.arithmetic_standard_dev) AS stdev,
            ANY_VALUE(filtered.units_of_measure) AS units
        FROM
            filtered
        GROUP BY
            filtered.epa_measurement
        ORDER BY
            epa_site
    ''' % (regions_query, parameters_query, year))
    measures = resp.to_dataframe()

    # Attach the ACS geography of every site; both id columns are compared
    # as strings.
    index = epa[['epa_site', 'acs_geoid']].astype(str)
    merged = measures.merge(index, on='epa_site')

    # Vectorised concatenation instead of a row-wise apply: on an empty merge
    # apply(axis=1) returns a DataFrame, which cannot be assigned to a single
    # column.
    merged['combined_id'] = (
        merged['acs_geoid'].astype(str) + '.epa.' + merged['epa_measurement'].astype(str)
    )
    return merged

In [197]:
# test on one year
# Reads that year's site index CSV (it must provide the `epa_site` and
# `acs_geoid` columns that collect_measurements uses) and keeps the result
# in `measurements` for inspection.
# Note: `y` is reused later as the loop variable of the export loop.
y = '2018'
measurements = collect_measurements(y, pd.read_csv('../_data/epa_to_acs_cbsa/%s.csv' % y))

In [199]:
# Sanity-check the single-year `measurements` frame: schema and first rows...
print(measurements.dtypes, measurements.head(), sep='\n')

# ...how many distinct sites, measurement ids and units came back...
print('\nsites with measurements:', len(measurements.epa_site.unique()))
print('measurement types:', len(measurements.epa_measurement.unique()))
print('measurement units:', len(measurements['units'].unique()))

# ...and a spot check that one specific combined id matches a single row.
rows = measurements.loc[measurements['combined_id'] == '19300.epa.01.003.0010.68105']
print(rows, len(rows), sep='\n')

epa_measurement     object
epa_site            object
mean               float64
stdev              float64
units               object
acs_geoid           object
combined_id         object
dtype: object
     epa_measurement     epa_site       mean     stdev  \
0  01.055.0010.88502  01.055.0010  10.339130  5.738066   
1  01.073.0023.88502  01.073.0023   8.746729  4.327083   
2  01.073.0023.68105  01.073.0023  18.580149  8.734221   
3  01.073.1005.88502  01.073.1005   9.015773  3.956376   
4  01.073.1005.68105  01.073.1005  18.362618  8.386037   

                         units acs_geoid                  combined_id  
0  Micrograms/cubic meter (LC)     23460  23460.epa.01.055.0010.88502  
1  Micrograms/cubic meter (LC)     13820  13820.epa.01.073.0023.88502  
2           Degrees Centigrade     13820  13820.epa.01.073.0023.68105  
3  Micrograms/cubic meter (LC)     13820  13820.epa.01.073.1005.88502  
4           Degrees Centigrade     13820  13820.epa.01.073.1005.68105  

sites with meas

In [None]:
# query 2007 measurements for some deets
# NOTE(review): `measurements_2007` is not defined in any visible cell. This
# cell needs a frame of 2007 rows with `epa_site`, `parameter_code` and
# `sample_duration` columns - TODO restore the cell that builds it.
print(measurements_2007)
print('distinct parameters', measurements_2007['parameter_code'].unique())
print('distinct sample durations:', measurements_2007['sample_duration'].unique())

# Restrict to PM2.5 (88502) first, so the per-site counts share an index with
# the frame they filter. Masking the unfiltered frame with a mask built on a
# subset fails whenever other parameters are present.
pm25_2007 = measurements_2007[measurements_2007.parameter_code == 88502]
counts = pm25_2007.groupby('epa_site')['sample_duration'].transform('nunique').rename('Unique durations')

# The original line ended in a stray `s` (a SyntaxError) - presumably a
# truncated `.size()`, which gives the number of rows per site and
# duration count.
has_multiple = counts > 1
sites = pm25_2007[has_multiple].groupby(['epa_site', counts[has_multiple]]).size()
print('stations with multiple 88502 measurements', sites)

In [201]:
# Build two views of the yearly measurements and export them as CSV:
#   * timeseries_flat - one row per year, with a `<combined_id>.mean` /
#     `<combined_id>.stdev` column pair per measurement
#   * timeseries_vert - one row per combined_id, with a `<year>.mean` /
#     `<year>.stdev` column pair per year (empty where a year has no value)
#
# Rows are collected in plain Python containers and turned into DataFrames
# once at the end: DataFrame.append was removed in pandas 2.0, and growing a
# frame row by row copies it on every iteration.
flat_records = []  # one dict per year
vert_records = {}  # combined_id -> row dict, in order of first appearance

for y in years:
    print('%s: processing' % y)

    # load data
    epa = pd.read_csv('../_data/epa_to_acs_cbsa/%s.csv' % y)
    # collect_measurements returns one row per measurement id and geography,
    # so a repeated combined_id can only come from a site/geoid pair listed
    # twice in the CSV and carries identical values - keep a single copy.
    ms = collect_measurements(y, epa).drop_duplicates(subset='combined_id')

    # do iteration
    flat_row = {'year': y}
    for combined_id, mean, stdev in zip(ms['combined_id'], ms['mean'], ms['stdev']):
        flat_row['%s.mean' % combined_id] = mean
        flat_row['%s.stdev' % combined_id] = stdev

        # first sighting creates the row; later years just add their columns
        vert_row = vert_records.setdefault(combined_id, {'combined_id': combined_id})
        vert_row['%s.mean' % y] = mean
        vert_row['%s.stdev' % y] = stdev

    flat_records.append(flat_row)

timeseries_flat = pd.DataFrame(flat_records)

# Fix the column order explicitly (id first, then years in query order) so it
# does not depend on which measurement happened to be seen first.
vert_columns = ['combined_id'] + [
    column
    for year in years
    for column in ('%s.mean' % year, '%s.stdev' % year)
]
timeseries_vert = pd.DataFrame(list(vert_records.values()), columns=vert_columns)

print('exporting...')
timeseries_flat.to_csv('../_data/epa/timeseries_flat.csv')
timeseries_vert.to_csv('../_data/epa/timeseries_vert.csv')
print('done')

2007: processing
2008: processing
2009: processing
2010: processing
2011: processing
2012: processing
2013: processing
2014: processing
2015: processing
2016: processing
2017: processing
2018: processing
exporting...
done
