In [2]:
import requests

def make_request(endpoint, payload=None):
    """
    Parameters:
    - endpoint: The endpoint of the API you want to
    make a GET request to.
    - payload: A dictionary of data to pass along
    with the request.

    To use the NCEI API, you will have to request a token by filling out this form
    with your email address: https://www.ncdc.noaa.gov/cdo-web/
    token.
    """

    return requests.get(
        'https://www.ncdc.noaa.gov/cdo-web/'
        f'api/v2/{endpoint}',
        headers={'token': 'aZVfQQPqRwFaRhBDymFJJggLiugEbSNd'},
        params=payload
    )

In [3]:
response = make_request('datasets', {'startdate': '2018-10-01'})

In [4]:
# check if the request is successful 
response.status_code

200

In [5]:
# another way to check if the request the successful
response.ok

True

Once we have our response, we can use the json() method to get the payload. Then, we
can use dictionary methods to determine which part we want to look at

In [7]:
payload = response.json()
payload.keys()

dict_keys(['metadata', 'results'])

In [8]:
# metadata tell us information about the result
payload['metadata']

{'resultset': {'offset': 1, 'count': 11, 'limit': 25}}

The results key contains a list of dictionaries. If we select the first one, we can
look at the keys to see what fields the data contains. 

In [10]:
payload['results'][0].keys()

dict_keys(['uid', 'mindate', 'maxdate', 'name', 'datacoverage', 'id'])

In [11]:
# we want to look at the IDs and names of the datasets
[(data['id'], data['name']) for data in payload['results']]

[('GHCND', 'Daily Summaries'),
 ('GSOM', 'Global Summary of the Month'),
 ('GSOY', 'Global Summary of the Year'),
 ('NEXRAD2', 'Weather Radar (Level II)'),
 ('NEXRAD3', 'Weather Radar (Level III)'),
 ('NORMAL_ANN', 'Normals Annual/Seasonal'),
 ('NORMAL_DLY', 'Normals Daily'),
 ('NORMAL_HLY', 'Normals Hourly'),
 ('NORMAL_MLY', 'Normals Monthly'),
 ('PRECIP_15', 'Precipitation 15 Minute'),
 ('PRECIP_HLY', 'Precipitation Hourly')]

Now that we have a value for
datasetid (GHCND), we proceed to identify one for datacategoryid

In [12]:
response = make_request(
    'datacategories', payload={'datasetid': 'GHCND'}
)

In [13]:
response.status_code

200

In [14]:
response.json()['results']

[{'name': 'Evaporation', 'id': 'EVAP'},
 {'name': 'Land', 'id': 'LAND'},
 {'name': 'Precipitation', 'id': 'PRCP'},
 {'name': 'Sky cover & clouds', 'id': 'SKY'},
 {'name': 'Sunshine', 'id': 'SUN'},
 {'name': 'Air Temperature', 'id': 'TEMP'},
 {'name': 'Water', 'id': 'WATER'},
 {'name': 'Wind', 'id': 'WIND'},
 {'name': 'Weather Type', 'id': 'WXTYPE'}]

Based on the previous result, we know that we want a value of TEMP for
datacategoryid. Next, we use this to identify the data types we want by using the
datatypes endpoint.

In [15]:
response = make_request(
    'datatypes',
    payload={'datacategoryid' : 'TEMP',
    'limit': 100}
)

In [17]:
[(datatype['id'], datatype['name']) 
  for datatype in response.json()['results']]

[('CDSD', 'Cooling Degree Days Season to Date'),
 ('DATN',
  'Number of days included in the multiday minimum temperature (MDTN)'),
 ('DATX',
  'Number of days included in the multiday maximum temperature (MDTX)'),
 ('DLY-DUTR-NORMAL', 'Long-term averages of daily diurnal temperature range'),
 ('DLY-DUTR-STDDEV',
  'Long-term standard deviations of daily diurnal temperature range'),
 ('DLY-TAVG-NORMAL', 'Long-term averages of daily average temperature'),
 ('DLY-TAVG-STDDEV',
  'Long-term standard deviations of daily average temperature'),
 ('DLY-TMAX-NORMAL', 'Long-term averages of daily maximum temperature'),
 ('DLY-TMAX-STDDEV',
  'Long-term standard deviations of daily maximum temperature'),
 ('DLY-TMIN-NORMAL', 'Long-term averages of daily minimum temperature'),
 ('DLY-TMIN-STDDEV',
  'Long-term standard deviations of daily minimum temperature'),
 ('EMNT', 'Extreme minimum temperature for the period.'),
 ('EMXT', 'Extreme maximum temperature for the period.'),
 ('HDSD', 'Heating De

We are looking for the TAVG, TMAX, and TMIN data types. Now that we have everything
we need to request temperature data for all locations.

In [48]:
# get location category id 
response = make_request(
    'locationcategories', 
    payload={'datasetid': 'GHCND'}
)
response.status_code

200

In [49]:
import pprint
pprint.pprint(response.json())

{'metadata': {'resultset': {'count': 12, 'limit': 25, 'offset': 1}},
 'results': [{'id': 'CITY', 'name': 'City'},
             {'id': 'CLIM_DIV', 'name': 'Climate Division'},
             {'id': 'CLIM_REG', 'name': 'Climate Region'},
             {'id': 'CNTRY', 'name': 'Country'},
             {'id': 'CNTY', 'name': 'County'},
             {'id': 'HYD_ACC', 'name': 'Hydrologic Accounting Unit'},
             {'id': 'HYD_CAT', 'name': 'Hydrologic Cataloging Unit'},
             {'id': 'HYD_REG', 'name': 'Hydrologic Region'},
             {'id': 'HYD_SUB', 'name': 'Hydrologic Subregion'},
             {'id': 'ST', 'name': 'State'},
             {'id': 'US_TERR', 'name': 'US Territory'},
             {'id': 'ZIP', 'name': 'Zip Code'}]}


We want to look at New York City, so, for the locationcategoryid filter, CITY is the proper value. The notebook we are working in has a function to search for a field by name using binary search on the API; binary search is a more efficient way of searching through an ordered list.

In [56]:
def get_item(name, what, endpoint, start=1, end=None):
    # find the midpoint to cut the data in half each time
    mid = (start + (end or 1)) // 2
    # # lowercase the name so this is not case-sensitive
    name = name.lower()
    # # define the payload we will send with each request
    payload = {
        'datasetid': 'GHCND', 'sortfield': 'name',
        'offset': mid, # we'll change the offset each time
        'limit': 1 # we only want one value back
    }

    # make request adding additional filters from `what`
    response = make_request(endpoint, {**payload, **what})

    if response.ok:
        payload = response.json()
        # if ok, grab the end index from the response
        # metadata the first time through
        end = end or \
            payload['metadata']['resultset']['count']
        # grab the lowercase version of the current name
        current_name = \
            payload['results'][0]['name'].lower()
        # if what we are searching for is in the current
        # name, we have found our item
        if name in current_name:
            # return the found item
            return payload['results'][0]
        else:
            if start >= end:
            # if start index is greater than or equal
            # to end index, we couldn't find it
                return {}
            elif name < current_name:
                # name comes before the current name in the
                # alphabet => search further to the left
                return get_item(name, what, endpoint,
                                start, mid - 1)
            elif name > current_name:
                # name comes after the current name in the
                # alphabet => search further to the right
                return get_item(name, what, endpoint,
                                mid + 1, end)
    else:
        # # response wasn't ok, use code to determine why
        print('Response not OK, '
              f'status: {response.status_code}')
        
        

Now, let's use the binary search implementation to find the ID for New York City

In [57]:
nyc = get_item(
    'New York', {'locationcategoryid': 'CITY'},
                 'locations')

nyc

{'mindate': '1869-01-01',
 'maxdate': '2024-01-23',
 'name': 'New York, NY US',
 'datacoverage': 1,
 'id': 'CITY:US360019'}

When searching a very long ordered list, think of binary search.

we can drill down to the ID of the station that is collecting the data. This is the
most granular level.

In [59]:
central_park = get_item(
    'NY City Central Park',
    {'locationid': nyc['id']},
    'stations'
)
central_park

{'elevation': 42.7,
 'mindate': '1869-01-01',
 'maxdate': '2024-01-22',
 'latitude': 40.77898,
 'name': 'NY CITY CENTRAL PARK, NY US',
 'datacoverage': 1,
 'id': 'GHCND:USW00094728',
 'elevationUnit': 'METERS',
 'longitude': -73.96925}

Now, let's request NYC's temperature data in Celsius for October 2018, recorded from
Central Park.

In [65]:
response = make_request(
    'data',
    {'datasetid': 'GHCND',
     'locationid': nyc['id'],
     'startdate': '2018-10-01',
     'enddate': '2018-10-31',
     'datatypeid': ['TAVG', 'TMAX', 'TMIN'],
     'units': 'metric',
     'limit': 1000}
)
response.status_code

200

Lastly, we will create a DataFrame object; since the results portion of the JSON
payload is a list of dictionaries, we can pass it directly to pd.DataFrame()

In [66]:
import pandas as pd
df = pd.DataFrame(response.json()['results'])
df.head()

Unnamed: 0,date,datatype,station,attributes,value
0,2018-10-01T00:00:00,TMAX,GHCND:USC00280907,",,7,0700",21.1
1,2018-10-01T00:00:00,TMIN,GHCND:USC00280907,",,7,0700",8.9
2,2018-10-01T00:00:00,TMAX,GHCND:USC00281335,",,7,0700",22.8
3,2018-10-01T00:00:00,TMIN,GHCND:USC00281335,",,7,0700",11.7
4,2018-10-01T00:00:00,TMAX,GHCND:USC00283704,",,7,0700",22.8


In [69]:
df.datatype.unique()

array(['TMAX', 'TMIN', 'TAVG'], dtype=object)

In [70]:
if get_item(
    'NY City Central Park', {'locationid': nyc['id'], 'datatypeid': 'TAVG'}, 'stations'
):
    print('Found!')

Found!


In [71]:
df.datatype.value_counts()

datatype
TMAX    421
TMIN    421
TAVG     93
Name: count, dtype: int64

Try a different station

In [73]:
laguardia = get_item(
    'LaGuardia', {'locationid': nyc['id']}, 'stations'
)
laguardia

{'elevation': 3,
 'mindate': '1939-10-07',
 'maxdate': '2024-01-23',
 'latitude': 40.77945,
 'name': 'LAGUARDIA AIRPORT, NY US',
 'datacoverage': 1,
 'id': 'GHCND:USW00014732',
 'elevationUnit': 'METERS',
 'longitude': -73.88027}

In [74]:
# get NYC daily summaries data 
response = make_request(
    'data', 
    {
        'datasetid': 'GHCND',
        'stationid': laguardia['id'],
        'locationid': nyc['id'],
        'startdate': '2018-10-01',
        'enddate': '2018-10-31',
        'datatypeid': ['TAVG', 'TMAX', 'TMIN'], # temperature at time of observation, min, and max
        'units': 'metric',
        'limit': 1000
    }
)
response.status_code

200

In [75]:
df = pd.DataFrame(response.json()['results'])
df.head()

Unnamed: 0,date,datatype,station,attributes,value
0,2018-10-01T00:00:00,TAVG,GHCND:USW00014732,"H,,S,",21.2
1,2018-10-01T00:00:00,TMAX,GHCND:USW00014732,",,W,2400",25.6
2,2018-10-01T00:00:00,TMIN,GHCND:USW00014732,",,W,2400",18.3
3,2018-10-02T00:00:00,TAVG,GHCND:USW00014732,"H,,S,",22.7
4,2018-10-02T00:00:00,TMAX,GHCND:USW00014732,",,W,2400",26.1


In [76]:
df.datatype.value_counts()

datatype
TAVG    31
TMAX    31
TMIN    31
Name: count, dtype: int64

In [77]:
df.to_csv('data/nyc_temperatures.csv', index=False)