In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive.
# The final cell of this notebook writes its CSV output under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Importing necessary libraries

In [None]:
#
#    These are standard python modules
#
#import json, time, urllib.parse
import json
import os
import time

import numpy as np
import pandas as pd
#
#    The 'requests' module is a distribution module for making web requests.
#
import requests
from tqdm import tqdm

In [None]:
#########
#
#    CONSTANTS
#

#
#    This is the root of all AQS API URLs
#
API_REQUEST_URL = 'https://aqs.epa.gov/data/api'

#
#    These are 'actions' we can ask the API to take or requests that we can make of the API
#
#    Each action is a str.format() template that the request functions append to API_REQUEST_URL.
#    The {placeholders} are filled from a request dictionary whose keys match the
#    placeholder names (see AQS_REQUEST_TEMPLATE).
#
#    Sign-up request - generally only performed once - unless you lose your key
API_ACTION_SIGNUP = '/signup?email={email}'
#
#    List actions provide information on API parameter values that are required by some other actions/requests
API_ACTION_LIST_CLASSES = '/list/classes?email={email}&key={key}'
API_ACTION_LIST_PARAMS = '/list/parametersByClass?email={email}&key={key}&pc={pclass}'
API_ACTION_LIST_SITES = '/list/sitesByCounty?email={email}&key={key}&state={state}&county={county}'
#
#    Monitor actions are requests for monitoring stations that meet specific criteria
#    (either a state/county pair or a latitude/longitude bounding box)
API_ACTION_MONITORS_COUNTY = '/monitors/byCounty?email={email}&key={key}&param={param}&bdate={begin_date}&edate={end_date}&state={state}&county={county}'
API_ACTION_MONITORS_BOX = '/monitors/byBox?email={email}&key={key}&param={param}&bdate={begin_date}&edate={end_date}&minlat={minlat}&maxlat={maxlat}&minlon={minlon}&maxlon={maxlon}'
#
#    Summary actions are requests for summary data. These are for daily summaries
#    (again selectable by state/county or by bounding box)
API_ACTION_DAILY_SUMMARY_COUNTY = '/dailyData/byCounty?email={email}&key={key}&param={param}&bdate={begin_date}&edate={end_date}&state={state}&county={county}'
API_ACTION_DAILY_SUMMARY_BOX = '/dailyData/byBox?email={email}&key={key}&param={param}&bdate={begin_date}&edate={end_date}&minlat={minlat}&maxlat={maxlat}&minlon={minlon}&maxlon={maxlon}'
#
#    It is always nice to be respectful of a free data resource.
#    We're going to observe a 100 requests per minute limit - which is fairly nice
#
#    The wait is expressed in seconds per request: 60 seconds / 100 requests = 0.6 s,
#    minus the latency we assume each request already costs. (Dividing 1.0 by 100.0
#    instead would only wait ~0.01 s, i.e. roughly 100 requests per *second*.)
API_LATENCY_ASSUMED = 0.002       # Assuming roughly 2ms latency on the API and network
API_THROTTLE_WAIT = (60.0/100.0)-API_LATENCY_ASSUMED
#
#
#    This is a template that covers most of the parameters for the actions we might take, from the set of actions
#    above. In the examples below, most of the time parameters can either be supplied as individual values to a
#    function - or they can be set in a copy of the template and passed in with the template.
#
#    The keys match the {placeholder} names used in the API_ACTION_* strings, so a filled-in
#    copy can be passed straight to str.format(**template).
#
AQS_REQUEST_TEMPLATE = {
    "email":      "",
    "key":        "",
    "state":      "",     # the two digit state FIPS # as a string
    "county":     "",     # the three digit county FIPS # as a string
    "begin_date": "",     # the start of a time window in YYYYMMDD format
    "end_date":   "",     # the end of a time window in YYYYMMDD format, begin_date and end_date must be in the same year
    "minlat":    0.0,     # bounding box limits, only used by the 'byBox' actions
    "maxlat":    0.0,
    "minlon":    0.0,
    "maxlon":    0.0,
    "param":     "",     # a list of comma separated 5 digit codes, max 5 codes requested
    "pclass":    ""      # parameter class is only used by the List calls
}

API Access details

In [None]:
# AQS credentials are read from the environment instead of being stored in the notebook,
# so the API key is never saved, committed or shared together with the code.
# Set the AQS_EMAIL and AQS_KEY environment variables before running this cell.
# A missing variable is left as an empty string; the request functions in this notebook
# then refuse to call the API with a "Must supply ..." error.
USERNAME = os.environ.get("AQS_EMAIL", "")
APIKEY = os.environ.get("AQS_KEY", "")

Only filtering for specific pollutants

In [None]:
# Name of the AQS parameter class covering the AQI pollutants.
# NOTE(review): presumably the value for the 'pclass' field of the list/parametersByClass
# action; it is not referenced by any other cell shown in this notebook - confirm before removing.
AQI_PARAM_CLASS = "AQI POLLUTANTS"

In [None]:
#   Gaseous AQI pollutants CO, SO2, NO2, and O3 (ozone)
#   (comma separated AQS parameter codes, in that order)
AQI_PARAMS_GASEOUS = "42101,42401,42602,44201"
#
#   Particulate AQI pollutants PM10, PM2.5, and Acceptable PM2.5
#   (comma separated AQS parameter codes, in that order)
AQI_PARAMS_PARTICULATES = "81102,88101,88502"
#
#

Find the data for North Platte, Nebraska

In [None]:
# Location metadata for the city being analysed, keyed by a short city id.
#   'fips'   : 5 character county FIPS code = 2 digit state code + 3 digit county code.
#              Lincoln County, Nebraska is 31 + 111. (A value such as '35000' splits into
#              state=35 / county=000, which is not a Nebraska county, so every
#              county-based request comes back with no data.)
#   'latlon' : [latitude, longitude] of the city in decimal degrees
CITY_LOCATIONS = {
    'np' :       {'city'   : 'North Platte',
                  'county' : 'Lincoln',
                  'state'  : 'Nebraska',
                  'fips'   : '31111',
                  'latlon' : [41.1403, -100.7601] }
}

In [None]:
#
#    This implements the daily summary request. Daily summary provides a daily summary value for each sensor being requested
#    from the start date to the end date.
#
#    Like the two other functions, this can be called with a mixture of a defined parameter dictionary, or with function
#    parameters. If function parameters are provided, those take precedence over any parameters from the request template.
#
def request_daily_summary(email_address = None, key = None, param=None,
                          begin_date = None, end_date = None, fips = None,
                          endpoint_url = API_REQUEST_URL,
                          endpoint_action = API_ACTION_DAILY_SUMMARY_COUNTY,
                          request_template = AQS_REQUEST_TEMPLATE,
                          headers = None):
    """Request daily summary data from the AQS API and return the decoded JSON response.

    Values passed as function arguments take precedence over the values already present
    in ``request_template``. The template itself is never modified: the request is built
    from a private copy, so neither the caller's dictionary nor the shared module-level
    default template picks up values from an earlier call.

    Parameters:
        email_address: account email; overrides ``request_template['email']`` when truthy.
        key:           API key; overrides ``request_template['key']`` when truthy.
        param:         comma separated parameter codes; overrides ``'param'`` when truthy.
        begin_date:    start of the time window (YYYYMMDD); overrides ``'begin_date'``.
        end_date:      end of the time window (YYYYMMDD); overrides ``'end_date'``.
        fips:          5 character county FIPS string; when given with exactly 5 characters
                       it is split into ``'state'`` (first 2) and ``'county'`` (last 3).
                       Any other length is ignored.
        endpoint_url:    root URL of the API.
        endpoint_action: str.format() template of the action, appended to ``endpoint_url``.
        request_template: dictionary providing the values for the action's placeholders.
        headers:       optional HTTP headers passed to ``requests.get``.

    Returns:
        The decoded JSON response, or ``None`` when the request or the JSON decoding
        raised an exception (the exception is printed).

    Raises:
        Exception: if email, key, param, begin_date or end_date is missing or empty.
    """
    # Work on a copy so the template passed in (or the shared default) is left untouched
    request_params = dict(request_template)

    #  This prioritizes the info from the call parameters - not what's already in the template
    overrides = (
        ('email', email_address),
        ('key', key),
        ('param', param),
        ('begin_date', begin_date),
        ('end_date', end_date),
    )
    for field, value in overrides:
        if value:
            request_params[field] = value
    if fips and len(fips)==5:
        request_params['state'] = fips[:2]
        request_params['county'] = fips[2:]

    # Make sure there are values that allow us to make a call - these are always required
    required_fields = (
        ('email', "an email address"),
        ('key', "a key"),
        ('param', "param values"),
        ('begin_date', "a begin_date"),
        ('end_date', "an end_date"),
    )
    for field, description in required_fields:
        if not request_params[field]:
            raise Exception("Must supply {} to call 'request_daily_summary()'".format(description))
    # Note we're not validating FIPS fields because not all of the daily summary actions require the FIPS numbers

    # compose the request
    request_url = endpoint_url+endpoint_action.format(**request_params)

    # make the request
    try:
        # Wait first, to make sure we don't exceed a rate limit in the situation where an exception occurs
        # during the request processing - throttling is always a good practice with a free data source
        if API_THROTTLE_WAIT > 0.0:
            time.sleep(API_THROTTLE_WAIT)
        response = requests.get(request_url, headers=headers)
        json_response = response.json()
    except Exception as e:
        print(e)
        json_response = None
    return json_response



In [None]:
request_data = AQS_REQUEST_TEMPLATE.copy()
request_data['email'] = USERNAME
request_data['key'] = APIKEY
request_data['state'] = CITY_LOCATIONS['np']['fips'][:2]
request_data['county'] = CITY_LOCATIONS['np']['fips'][2:]

# Request the 2020 daily summaries for the city's county, once per pollutant group.
# The same request/inspect steps apply to both groups, so they run in a loop.
county_summaries = {}
for pollutant_group, param_codes in (("gaseous", AQI_PARAMS_GASEOUS),
                                     ("particulate", AQI_PARAMS_PARTICULATES)):
    request_data['param'] = param_codes
    # request daily summary data for 2020
    summary = request_daily_summary(request_template=request_data, begin_date="20200101", end_date="20201231")
    county_summaries[pollutant_group] = summary
    print("Response for the {} pollutants ...".format(pollutant_group))
    #
    if summary is None:
        # request_daily_summary() returns None when the request or the JSON decoding failed
        print("The request failed, so there is no response to inspect.")
    elif summary["Header"][0]['status'] == "Success":
        print(json.dumps(summary['Data'],indent=4))
    elif summary["Header"][0]['status'].startswith("No data "):
        print("Looks like the response generated no data. You might take a closer look at your request and the response data.")
    else:
        print(json.dumps(summary,indent=4))

# Keep the per-group responses available under their original names
gaseous_aqi = county_summaries["gaseous"]
particulate_aqi = county_summaries["particulate"]

Response for the gaseous pollutants ...
Looks like the response generated no data. You might take a closer look at your request and the response data.
Response for the particulate pollutants ...
Looks like the response generated no data. You might take a closer look at your request and the response data.


There is no data being returned for the exact FIPS code location of North Platte, Nebraska.

Hence, we extend our search radius to find surrounding stations

In [None]:
"#
#   These are rough estimates for creating bounding boxes based on a city location
#   You can find these rough estimates on the USGS website:
#   https://www.usgs.gov/faqs/how-much-distance-does-a-degree-minute-and-second-cover-your-maps
#
LAT_25MILES = 25.0 * (1.0/69.0)    # This is about 25 miles of latitude in decimal degrees
LON_25MILES = 25.0 * (1.0/54.6)    # This is about 25 miles of longitude in decimal degrees
#
#   Compute a rough estimates for a bounding box around a given place
#   The bounding box is scaled in 50 mile increments. That is the bounding box will have sides that
#   are rough multiples of 50 miles, with the center of the box around the indicated place.
#   The scale parameter determines the scale (size) of the bounding box
#
def bounding_latlon(place=None,scale=1.0):
    minlat = place['latlon'][0] - float(scale) * LAT_25MILES
    maxlat = place['latlon'][0] + float(scale) * LAT_25MILES
    minlon = place['latlon'][1] - float(scale) * LON_25MILES
    maxlon = place['latlon'][1] + float(scale) * LON_25MILES
    return [minlat,maxlat,minlon,maxlon]

In [None]:
#
#    This implements the monitors request. This requests monitoring stations. This can be done by state, county, or bounding box.
#
#    Like the two other functions, this can be called with a mixture of a defined parameter dictionary, or with function
#    parameters. If function parameters are provided, those take precedence over any parameters from the request template.
#
def request_monitors(email_address = None, key = None, param=None,
                          begin_date = None, end_date = None, fips = None,
                          endpoint_url = API_REQUEST_URL,
                          endpoint_action = API_ACTION_MONITORS_COUNTY,
                          request_template = AQS_REQUEST_TEMPLATE,
                          headers = None):
    """Request monitoring stations from the AQS API and return the decoded JSON response.

    Values passed as function arguments take precedence over the values already present
    in ``request_template``. The template itself is never modified: the request is built
    from a private copy, so neither the caller's dictionary nor the shared module-level
    default template picks up values from an earlier call.

    Parameters:
        email_address: account email; overrides ``request_template['email']`` when truthy.
        key:           API key; overrides ``request_template['key']`` when truthy.
        param:         comma separated parameter codes; overrides ``'param'`` when truthy.
        begin_date:    start of the time window (YYYYMMDD); overrides ``'begin_date'``.
        end_date:      end of the time window (YYYYMMDD); overrides ``'end_date'``.
        fips:          5 character county FIPS string; when given with exactly 5 characters
                       it is split into ``'state'`` (first 2) and ``'county'`` (last 3).
                       Any other length is ignored.
        endpoint_url:    root URL of the API.
        endpoint_action: str.format() template of the action, appended to ``endpoint_url``
                         (by county by default; pass the bounding box action to search by box).
        request_template: dictionary providing the values for the action's placeholders.
        headers:       optional HTTP headers passed to ``requests.get``.

    Returns:
        The decoded JSON response, or ``None`` when the request or the JSON decoding
        raised an exception (the exception is printed).

    Raises:
        Exception: if email, key, param, begin_date or end_date is missing or empty.
    """
    # Work on a copy so the template passed in (or the shared default) is left untouched
    request_params = dict(request_template)

    #  This prioritizes the info from the call parameters - not what's already in the template
    overrides = (
        ('email', email_address),
        ('key', key),
        ('param', param),
        ('begin_date', begin_date),
        ('end_date', end_date),
    )
    for field, value in overrides:
        if value:
            request_params[field] = value
    if fips and len(fips)==5:
        request_params['state'] = fips[:2]
        request_params['county'] = fips[2:]

    # Make sure there are values that allow us to make a call - these are always required
    required_fields = (
        ('email', "an email address"),
        ('key', "a key"),
        ('param', "param values"),
        ('begin_date', "a begin_date"),
        ('end_date', "an end_date"),
    )
    for field, description in required_fields:
        if not request_params[field]:
            raise Exception("Must supply {} to call 'request_monitors()'".format(description))
    # Note we're not validating FIPS fields because not all of the monitors actions require the FIPS numbers

    # compose the request
    request_url = endpoint_url+endpoint_action.format(**request_params)

    # make the request
    try:
        # Wait first, to make sure we don't exceed a rate limit in the situation where an exception occurs
        # during the request processing - throttling is always a good practice with a free data source
        if API_THROTTLE_WAIT > 0.0:
            time.sleep(API_THROTTLE_WAIT)
        response = requests.get(request_url, headers=headers)
        json_response = response.json()
    except Exception as e:
        print(e)
        json_response = None
    return json_response


In [None]:
#
request_data = AQS_REQUEST_TEMPLATE.copy()
request_data['email'] = USERNAME
request_data['key'] = APIKEY
request_data['param'] = AQI_PARAMS_PARTICULATES     # remember we have both gaseous and particulates
#
#   Look for monitoring stations in the county given by the North Platte FIPS code
request_data['state'] = CITY_LOCATIONS['np']['fips'][:2]
request_data['county'] = CITY_LOCATIONS['np']['fips'][2:]
#
# this uses the default action - request monitors by county, we'll just use a recent date for now
response = request_monitors(request_template=request_data, begin_date="20210701", end_date="20210731")
#
# we should only get monitors that monitor the AQI_PARAMS_PARTICULATES set of params.
#
if response is None:
    # request_monitors() returns None when the request or the JSON decoding failed
    print("The monitors request failed, so there is no response to inspect.")
elif response["Header"][0]['status'] == "Success":
    print(json.dumps(response['Data'],indent=4))
else:
    print(json.dumps(response,indent=4))


{
    "Header": [
        {
            "status": "No data matched your selection",
            "request_time": "2023-11-08T01:45:21-05:00",
            "url": "https://aqs.epa.gov/data/api/monitors/byCounty?email=andixit@uw.edu&key=ecruosprey95&param=81102,88101,88502&bdate=20210701&edate=20210731&state=35&county=000",
            "rows": 0
        }
    ],
    "Data": []
}


There are no monitoring stations in North Platte.

Expanding the search radius to 50,100,150,200,250 miles, I get multiple stations which encompass a lot of years of air quality index data

In [None]:
#
#    Create a copy of the AQS_REQUEST_TEMPLATE
#
request_data = AQS_REQUEST_TEMPLATE.copy()
request_data['email'] = USERNAME
request_data['key'] = APIKEY
request_data['param'] = AQI_PARAMS_PARTICULATES     # same particulate request as the one above
#
#   No state/county values are set here - this search is by bounding box instead.
#
#   bounding_latlon() scales the box in steps of roughly 50 miles:
#   scale=1.0 -> 50 mile box, 2.0 -> 100 miles, 3.0 -> 150 miles, 4.0 -> 200 miles, 5.0 -> 250 miles
#   Using the 250 mile box here.
bbox = bounding_latlon(CITY_LOCATIONS['np'],scale=5.0)

# the bbox response comes back as a list - [minlat,maxlat,minlon,maxlon]

#   put our bounding box into the request_data
request_data['minlat'] = bbox[0]
request_data['maxlat'] = bbox[1]
request_data['minlon'] = bbox[2]
request_data['maxlon'] = bbox[3]

#
#   we need to change the action for the API from the default to the bounding box.
#   The request asks for monitors over a two day window in June 2002.
response = request_monitors(request_template=request_data, begin_date="20020602", end_date="20020603",
                            endpoint_action = API_ACTION_MONITORS_BOX)
#
#   Show where each station is (state/county codes) and the dates it opened and closed
#
if response is None:
    # request_monitors() returns None when the request or the JSON decoding failed
    print("The monitors request failed, so there is no response to inspect.")
elif response["Header"][0]['status'] == "Success":
    for station in response['Data']:
        print("state_code: ",station["state_code"])
        print("county_code: ",station["county_code"])
        print("open_date: ",station["open_date"])
        print("close_date: ",station["close_date"])
        print(" ")
else:
    print(json.dumps(response,indent=4))


state_code:  20
county_code:  181
open_date:  1985-08-01
close_date:  2017-03-31
 
state_code:  31
county_code:  111
open_date:  1999-03-01
close_date:  2005-12-31
 
state_code:  31
county_code:  069
open_date:  2002-06-01
close_date:  None
 
state_code:  31
county_code:  047
open_date:  1991-09-01
close_date:  2016-03-08
 
state_code:  31
county_code:  049
open_date:  1999-08-04
close_date:  2002-11-13
 
state_code:  31
county_code:  171
open_date:  2002-06-01
close_date:  None
 
state_code:  31
county_code:  047
open_date:  1994-10-01
close_date:  2016-03-08
 


These are the station locations and the time durations of data they have that I will use.

'20181' -> 1985-1990

'31047' -> 1991-2001

'31171' -> 2002-2022

In [None]:
request_data = AQS_REQUEST_TEMPLATE.copy()
request_data['email'] = USERNAME
request_data['key'] = APIKEY
# State and county are FIPS codes and must be strings: county codes can start with a
# zero (e.g. '047'), which an integer would silently drop when the URL is built.
request_data['state'] = '31'
request_data['county'] = '171'

# Request the daily summaries for the first two days of Jan 2020, once per pollutant group.
# The same request/inspect steps apply to both groups, so they run in a loop.
station_summaries = {}
for pollutant_group, param_codes in (("gaseous", AQI_PARAMS_GASEOUS),
                                     ("particulate", AQI_PARAMS_PARTICULATES)):
    request_data['param'] = param_codes
    summary = request_daily_summary(request_template=request_data, begin_date="20200101", end_date="20200102")
    station_summaries[pollutant_group] = summary
    print("Response for the {} pollutants ...".format(pollutant_group))
    #
    if summary is None:
        # request_daily_summary() returns None when the request or the JSON decoding failed
        print("The request failed, so there is no response to inspect.")
    elif summary["Header"][0]['status'] == "Success":
        print(json.dumps(summary['Data'],indent=4))
    elif summary["Header"][0]['status'].startswith("No data "):
        print("Looks like the response generated no data. You might take a closer look at your request and the response data.")
    else:
        print(json.dumps(summary,indent=4))

# Keep the per-group responses available under their original names
gaseous_aqi = station_summaries["gaseous"]
particulate_aqi = station_summaries["particulate"]

Response for the gaseous pollutants ...
Looks like the response generated no data. You might take a closer look at your request and the response data.
Response for the particulate pollutants ...
[
    {
        "state_code": "31",
        "county_code": "171",
        "site_number": "9000",
        "parameter_code": "88502",
        "poc": 1,
        "latitude": 41.888789,
        "longitude": -100.339141,
        "datum": "WGS84",
        "parameter": "Acceptable PM2.5 AQI & Speciation Mass",
        "sample_duration_code": "7",
        "sample_duration": "24 HOUR",
        "pollutant_standard": null,
        "date_local": "2020-01-01",
        "units_of_measure": "Micrograms/cubic meter (LC)",
        "event_type": "No Events",
        "observation_count": 1,
        "observation_percent": 100.0,
        "validity_indicator": "Y",
        "arithmetic_mean": 1.2,
        "first_max_value": 1.2,
        "first_max_hour": 0,
        "aqi": 5,
        "method_code": "707",
        "metho

I get AQI data for particulate pollutants but not for gaseous pollutants, hence I eliminate those.

Developing the logic to iterate across years: for each year, call the API to get the AQI data for the entire year, aggregate it, and save the average AQI value for that year.

In [None]:
# County FIPS code (2 digit state + 3 digit county) of the station area used for each year.
# range() excludes its end value, so the spans are 1985-1990, 1991-2001 and 2002-2022.
stations = {}
stations.update({year: '20181' for year in range(1985, 1991)})
stations.update({year: '31047' for year in range(1991, 2002)})
stations.update({year: '31171' for year in range(2002, 2023)})

request_data = AQS_REQUEST_TEMPLATE.copy()
request_data['email'] = USERNAME
request_data['key'] = APIKEY
request_data['param'] = AQI_PARAMS_PARTICULATES
years = sorted(stations)
aqi_data = dict()       # year -> average AQI of that year (0 when there is no usable data)

for year in tqdm(years):
    fips = stations[year]
    request_data['state'] = fips[0:2]
    request_data['county'] = fips[2:5]
    # begin_date and end_date must be in the same year, so request one full year per call
    start_date = str(year)+'0101'
    end_date = str(year)+'1231'
    particulate_aqi = request_daily_summary(request_template=request_data, begin_date=start_date, end_date=end_date)

    # Collect the usable AQI values of the year. A failed request (None) or any status
    # other than "Success" leaves the list empty.
    all_aqis = []
    if particulate_aqi is not None and particulate_aqi["Header"][0]['status'] == "Success":
        # Skip records without an AQI value (missing key or null): a None in the list
        # would make the average below fail.
        all_aqis = [day_data["aqi"] for day_data in particulate_aqi['Data']
                    if day_data.get("aqi") is not None]

    if all_aqis:
        aqi_data[year] = round(np.average(all_aqis))
    else:
        # NOTE(review): years without data are recorded as 0, which is indistinguishable
        # from a real AQI of 0 - consider a missing-value marker if downstream code allows it.
        print("No data for year ",year)
        aqi_data[year] = 0



100%|██████████| 38/38 [00:17<00:00,  2.19it/s]


In [None]:
# Turn the {year: average AQI} mapping into a two column table.
# from_dict(orient='index') puts the years in the index and the values in a column
# named 0; reset_index() moves the years into a column called 'index'.
aqi_df = (
    pd.DataFrame.from_dict(aqi_data, orient='index')
    .reset_index()
    .rename(columns={'index': 'Year', 0: 'avg_aqi'})
)
aqi_df.head()

Unnamed: 0,Year,avg_aqi
0,1985,24
1,1986,32
2,1987,36
3,1988,27
4,1989,26


In [None]:
# Save the yearly averages to the mounted Google Drive folder.
# The DataFrame index is written as well (to_csv default), so the file gets an extra
# leading unnamed column in front of 'Year' and 'avg_aqi'.
aqi_df.to_csv('/content/drive/MyDrive/AUT 2023/DATA 512/P1/aqi.csv')