<a href="https://colab.research.google.com/github/cathyxinchangli/ATMS-597-SP-2020/blob/master/ATMS-597-SP-2020-Project-2/Project2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import requests
import numpy as np
import pandas as pd
import datetime

import matplotlib as mlp
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
from matplotlib.collections import PatchCollection
from matplotlib.colors import ListedColormap
import matplotlib.dates as mdates

%matplotlib inline

Two goals,
* Return station and city information to add in the picture names
* Loop through cities and stations

In [2]:
def make_request(endpoint, payload=None):
    """
    Make a request to a specific endpoint on the weather API
    passing headers and optional payload.
    
    Parameters:
        - endpoint: The endpoint of the API you want to 
                    make a GET request to.
        - payload: A dictionary of data to pass along 
                   with the request.
    
    Returns:
        Response object.
    ---
    S. Moline, 'Hands on Data Analysis with Pandas'
    https://github.com/stefmolin/Hands-On-Data-Analysis-with-Pandas/blob/master/ch_04/0-weather_data_collection.ipynb
    """
    return requests.get(
        f'https://www.ncdc.noaa.gov/cdo-web/api/v2/{endpoint}',
        headers={
            'token': 'HfZphTTGbXckpmEuqevAXQZFRCTEJgkJ' # obtained by X. Li, 2/4/2020
        },
        params=payload
    )

def fetch_data(locationid, startdate, enddate, endpoint="data",
               datasetid="GHCND", stationid=None, units="metric", limit=1000):
    """
    Simple fetch request for requesting TMIN and TMAX data from API within 
    the same year.

    Parameters:
        - datasetid: default 'GHCND',
        - locationid
        - stationid 
        - startdate: a datetime object
        - enddate: a datetime object
        - units: specify preferred units for retrieving the data
        - limit: max. 1000 requests

    Returns:
        - DataFrame of the requested dataset

    """
    response = make_request(
        endpoint,
        {
            "datasetid": datasetid,
            "datatypeid": ["TMAX", "TMIN"],
            "locationid": locationid,
            "stationid": stationid,
            "startdate": startdate,
            "enddate": enddate,
            "unit": units,
            "limit": limit,
        })
    response = pd.DataFrame(response.json()["results"])
    return response

def loop_request(locationid, startdate, enddate, endpoint="data",
                 datasetid="GHCND", stationid=None, units="metric", limit=1000):
    """
    Fetch TMIN and TMAX data from API over arbitrary time period without 
    running into data request limit.

    Parameters:
        - datasetid: default 'GHCND',
        - locationid
        - stationid: default None
        - startdate: a datetime object
        - enddate: a datetime object
        - units: preferred units for retrieving the data, default 'metric'
        - limit: max. 1000 requests

    Returns:
        - DataFrame of the requested dataset

    """
    if startdate.year == enddate.year:
        return fetch_data(locationid, startdate, enddate, endpoint, datasetid, 
                          stationid, units, limit)
    else:
        enddate_0 = datetime.date(startdate.year, 12, 31)
        df_tmp_0 = fetch_data(locationid, startdate, enddate_0, endpoint, datasetid, 
                            stationid, units, limit)
        for year in range(startdate.year + 1, enddate.year):
            startdate_tmp = datetime.date(year, 1, 1)
            enddate_tmp = datetime.date(year + 1, 1, 1)
            df_tmp = fetch_data(locationid, startdate_tmp, enddate_tmp, endpoint, datasetid, 
                                stationid, units, limit)
            df_tmp_0 = pd.concat([df_tmp_0, df_tmp])
        startdate_1 = datetime.date(enddate.year, 1, 1)
        df_tmp_1 = fetch_data(locationid, startdate_1, enddate, endpoint, datasetid, 
                                stationid, units, limit)
        df_tmp_0 = pd.concat([df_tmp_0, df_tmp_1])
        return df_tmp_0
        
def clean_up(datain):
    '''
    Clean up missing values, adjust the units of the values, covert datetime 
    format to pandas-datetime and re-index using datetime.

    Parameters:
        - datain: the DataFrame to perform the function on
    
    Returns:
        cleaned up DataFrame
    ---
    Adapted from S. Nesbitt, Module 2, ATMS 597 SP 2020
    '''
    # clean up missing values
    datain["value"][(np.abs(datain["value"])==9999)] = np.nan
    datain.dropna(inplace=True)
    # adjust unit from 10th of degree C to degree C
    datain["value"] = datain["value"].astype("float") / 10.
    # convert datetime format and re-index
    datain["date"] = pd.to_datetime(datain["date"], infer_datetime_format=True) # this is hard coded
    datain.index = datain["date"]
    return datain

In [None]:
def list_stations(locationid, endpoint="stations",
                  datasetid="GHCND", units="metric", limit=1000):

In [5]:
df = loop_request(locationid="CITY:US360019", startdate=datetime.date(1998, 12, 1), enddate=datetime.date(1999, 2, 1), stationid="GHCND:USC00280907")
df = clean_up(df)
display(df.describe(), df.head(), df.tail())

KeyError: 'results'