# 3. Historical Trends

For each region, what kind of changes do we see over time?

In [2]:
# set up path to app credentials - see exploration/README.md
%env GOOGLE_APPLICATION_CREDENTIALS=../google_app_credentials.json

# set up bigquery client
from google.cloud import bigquery
bq = bigquery.Client()

env: GOOGLE_APPLICATION_CREDENTIALS=../google_app_credentials.json


In [7]:
# set up some dependencies
import json, time, random, csv
import geopandas as gp
import pandas as pd
import matplotlib.pyplot as plt
import descartes # for plotting with geopanadas

In [10]:
resp = bq.query('''
    SELECT DISTINCT do_date AS year
    FROM `eosc410-project.data.acs_cbsa_20*`
    ORDER BY do_date ASC
''')
years = [row["year"] for row in resp]

## Air Quality Over Time

The EPA data provides a wide variety measurements for interpreting air quality, taken with various methods and timescales. We are interested in the following measurements in particular, which we will use in this exploration:

* PM2.5 (https://aqs.epa.gov/aqsweb/documents/codetables/methods_speciation.html)
    * Acceptable PM2.5 AQI & Speciation Mass (88502) - note AQI is air quality index
* Meteorological (https://aqs.epa.gov/aqsweb/documents/codetables/methods_met.html):
    * ~Ammoia (62604) in rain~ - unfortunately this does not seem to be collected by any stations in the EPA dataset
    * Average Ambient Temperature (68105)

Since availability of data at each station varies significantly, we'll avoid taking too many features.

TODO: continue assessing what features are available

In [46]:
'''
Utility function to collect relevant measurements from each station
'''
def collect_measurements(year: str, epa: pd.DataFrame, parameters = [88502, 68105]):
    regions = epa['epa_site']
    regions_query = ','.join(['"%s"' % r for r in regions])
    parameters_query = ','.join([str(p) for p in parameters])
    resp = bq.query('''
        WITH query AS ( SELECT [ %s ] AS sites, [ %s ] AS parameters ),
            filtered AS (
            SELECT
                CONCAT(epa.state_code, ".", epa.county_code, ".", epa.site_num) AS epa_site,
                CONCAT(epa.state_code, ".", epa.county_code, ".", epa.site_num, ".", epa.parameter_code) AS epa_measurement,
                epa.arithmetic_mean,
                epa.arithmetic_standard_dev,
                epa.units_of_measure
            FROM
                query, `eosc410-project.data.epa_air_quality_annual` AS epa
            WHERE
                epa.year = %s                                                                              # one year at a time
                AND epa.parameter_code IN UNNEST(query.parameters)                                         # only relevant parameters
                AND (epa.sample_duration = '24 HOUR' OR epa.sample_duration = '24-HR BLK AVG')             # 1hr is too granular
                AND CONCAT(epa.state_code, ".", epa.county_code, ".", epa.site_num) IN UNNEST(query.sites) # relevant sites only
        )

        SELECT
            filtered.epa_measurement,
            ANY_VALUE(filtered.epa_site) AS epa_site,
            AVG(filtered.arithmetic_mean) AS mean,
            AVG(filtered.arithmetic_standard_dev) AS stdev,
            ANY_VALUE(filtered.units_of_measure) AS units
        FROM
            filtered
        GROUP BY
            filtered.epa_measurement
        ORDER BY
            epa_site
    ''' % (regions_query, parameters_query, year))
    return resp.to_dataframe()

In [47]:
# test on one year
y = '2007'
measurements = collect_measurements(y, pd.read_csv('../_data/epa_to_acs_cbsa/%s.csv' % y))

In [50]:
print('sites with measurements:', len(measurements['epa_site'].unique()))
print('measurement types:', len(measurements['epa_measurement'].unique()))
print('measurement units:', len(measurements['units'].unique()))

sites with measurements: 1137
measurement types: 1461
measurement units: 2


In [28]:
# query 2007 measurements for some deets
print(measurements_2007)
print('distinct parameters', measurements_2007['parameter_code'].unique())
print('distinct sample durations:', measurements_2007['sample_duration'].unique())
counts = measurements_2007[measurements_2007.parameter_code==88502].groupby('epa_site')['sample_duration'].transform('nunique').rename('Unique durations')
sites = measurements_2007[counts > 1].groupby(['epa_site', counts])s
print('stations with multiple 88502 measurements', sites)

SyntaxError: invalid syntax (<ipython-input-28-6db22e7a1e86>, line 6)

In [6]:
# collect all records in a time series
timeseries = pd.DataFrame()
# for y in ['2007']:
    # epa = pd.read_csv('../_data/epa_to_acs_cbsa/%s.csv' % y)
    # TODO
    # .append({'Name' : 'Sahil' , 'Age' : 22} , ignore_index=True)

18795
