<a href="https://colab.research.google.com/github/asg017/cdo-weather/blob/master/weather.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# pudding.cool Weather Explore

## Build API

In [0]:
!pip install arcgis2geojson

from google.colab import files
import json
import requests
from arcgis2geojson import arcgis2geojson
from time import sleep
import pandas as pd


Collecting arcgis2geojson
  Downloading https://files.pythonhosted.org/packages/11/b6/de1ad83d3bc7165bb1558d942050e4267de0bb8c12af3a089b54cc89c2bd/arcgis2geojson-1.4.0.tar.gz
Building wheels for collected packages: arcgis2geojson
  Running setup.py bdist_wheel for arcgis2geojson ... [?25l- done
[?25h  Stored in directory: /root/.cache/pip/wheels/9f/20/ac/f92942895a80c06f734b0b68843ee630601bd89656cad885a0
Successfully built arcgis2geojson
Installing collected packages: arcgis2geojson
Successfully installed arcgis2geojson-1.4.0


In [0]:
# Ask the user for a secrets.json upload and read the CDO API token out of it.
print("Please upload a secrets.json with \"token\" inside ")
uploaded = files.upload()

# Look the file up with .get(): a file uploaded under another name is simply an
# absent key, and indexing directly would raise KeyError before the message below.
secrets_bytes = uploaded.get('secrets.json')
if secrets_bytes is None:
  raise Exception("please upload a file with the name secrets.json!")

cdo_token = json.loads(secrets_bytes.decode('utf-8')).get('token')
# Fail here rather than sending requests with a missing token later on.
if not cdo_token:
  raise Exception("secrets.json must contain a non-empty \"token\" entry!")

Please upload a secrets.json with "token" inside 


Saving secrets.json to secrets.json


In [0]:
class CDOApi():
  """Small client for the NOAA Climate Data Online (CDO) v2 web API.

  Every request carries the access token in a ``token`` header. The list
  endpoints (``locations``, ``datasets``, ``stations``) return a tuple
  ``(response, next_params)`` where ``next_params`` is either a dict with the
  ``offset``/``limit`` of the following page or ``None`` when there is no
  further page. The single-item endpoints (``location``, ``station``) return
  the decoded JSON response only.
  """

  host = 'www.ncdc.noaa.gov'
  base_path = '/cdo-web/api/v2'
  protocol = 'https'
  full_path = '{0}://{1}{2}'.format(protocol, host, base_path)
  # Seconds to wait for the server before giving up; without a timeout a
  # stalled connection would block the notebook indefinitely.
  timeout = 60

  def __init__(self, token, *args, **kwargs):
    """Store the API token. Extra positional/keyword arguments are accepted and ignored."""
    self.token = token

  def request(self, path, params=None, method='GET'):
    """Send an HTTP request to ``path`` and return the decoded JSON body.

    Args:
      path: Full URL to request (see ``build_path``).
      params: Optional dict of query-string parameters.
      method: HTTP verb to use, ``'GET'`` by default.

    Raises:
      requests.HTTPError: if the server answers with a 4xx/5xx status.
    """
    r = requests.request(
        method,
        path,
        headers={"token": self.token},
        params=params,
        timeout=self.timeout,
    )
    # Surface HTTP failures here instead of letting an error payload be
    # mistaken for an empty result page further down.
    r.raise_for_status()
    return r.json()

  def build_path(self, endpoint):
    """Return the full URL for ``endpoint`` under the API base path."""
    return '{0}/{1}'.format(self.full_path, endpoint)

  def _get_next(self, res, params=None):
    """Compute the paging parameters that follow the response ``res``.

    Args:
      res: Decoded list response; expected to hold ``results`` and
        ``metadata.resultset`` with ``offset`` and ``count``.
      params: Parameters of the request that produced ``res``; only ``limit``
        is read. Defaults to ``{"limit": 25}``.

    Returns:
      ``{"offset": ..., "limit": ...}`` for the next page, or ``None`` when
      the response has no results, has no paging metadata, or was the last page.
    """
    if params is None:
      params = {"limit": 25}

    results = res.get('results') or []
    resultset = (res.get('metadata') or {}).get('resultset')
    # An empty page cannot advance the offset, so treat it as the end; this
    # also covers responses that carry no results/metadata at all.
    if not results or not resultset:
      return None

    next_offset = resultset.get('offset') + len(results)
    if next_offset <= resultset.get('count'):
      return {
          "offset": next_offset,
          "limit": params.get('limit')
      }

    return None

  def _list(self, endpoint, params=None):
    """Fetch one page of a list endpoint and return ``(response, next_params)``."""
    params = {} if params is None else params
    res = self.request(self.build_path(endpoint), params)
    return res, self._get_next(res, params)

  def locations(self, params=None):
    """Fetch one page of ``locations``; returns ``(response, next_params)``."""
    return self._list('locations', params)

  def datasets(self, params=None):
    """Fetch one page of ``datasets``; returns ``(response, next_params)``."""
    return self._list('datasets', params)

  def stations(self, params=None):
    """Fetch one page of ``stations``; returns ``(response, next_params)``."""
    return self._list('stations', params)

  def location(self, id, params=None):
    """Fetch the single location identified by ``id``."""
    return self.request(self.build_path('locations/{}'.format(id)), params)

  def station(self, id, params=None):
    """Fetch the single station identified by ``id``."""
    return self.request(self.build_path('stations/{}'.format(id)), params)
  

In [0]:
class GISCDOApi():
  """Small client for layer 16 of the NCDC ArcGIS ``geo/references`` MapServer.

  No authentication is sent. ``query`` forwards arbitrary parameters to the
  layer's ``query`` endpoint; ``city`` builds a ``where`` clause that selects
  a single city by its ``CITY_FIPS`` value.
  """

  host = 'gis.ncdc.noaa.gov'
  base_path = '/arcgis/rest/services/geo/references/MapServer/16'
  protocol = 'https'
  full_path = '{0}://{1}{2}'.format(protocol, host, base_path)
  # Seconds to wait for the server before giving up on a request.
  timeout = 60

  def request(self, path, params=None, method='GET'):
    """Send an HTTP request to ``path`` and return the decoded JSON body.

    Args:
      path: Full URL to request (see ``build_path``).
      params: Optional dict of query-string parameters.
      method: HTTP verb to use, ``'GET'`` by default.

    Raises:
      requests.HTTPError: if the server answers with a 4xx/5xx status.
    """
    r = requests.request(method, path, params=params, timeout=self.timeout)
    r.raise_for_status()
    return r.json()

  def build_path(self, endpoint):
    """Return the full URL for ``endpoint`` under the layer's base path."""
    return '{0}/{1}'.format(self.full_path, endpoint)

  def _get_next(self, res, params=None):
    """Compute ``offset``/``limit`` for the page following ``res``.

    Reads ``results`` and ``metadata.resultset`` from ``res`` and ``limit``
    from ``params`` (default ``{"limit": 25}``). Returns ``None`` when the
    response has no results, has no paging metadata, or was the last page.
    Nothing in this class calls it.
    """
    if params is None:
      params = {"limit": 25}

    results = res.get('results') or []
    resultset = (res.get('metadata') or {}).get('resultset')
    if not results or not resultset:
      return None

    next_offset = resultset.get('offset') + len(results)
    if next_offset <= resultset.get('count'):
      return {
          "offset": next_offset,
          "limit": params.get('limit')
      }

    return None

  # query? f=json & where=CITY_FIPS='US000001'"
  def query(self, params=None):
    """Call the layer's ``query`` endpoint; ``params`` defaults to ``{"f": "json"}``."""
    if params is None:
      params = {"f": "json"}
    return self.request(self.build_path('query'), params)

  def city(self, id, params=None):
    """Query the feature whose ``CITY_FIPS`` equals ``id``.

    Args:
      id: City FIPS identifier, e.g. ``'US000001'``.
      params: Extra query parameters; defaults to ``{"f": "json"}``. The dict
        is copied, so neither the caller's dict nor a shared default is
        modified when the ``where`` clause is added.
    """
    query_params = {"f": "json"} if params is None else dict(params)
    # Double any single quote so the value cannot terminate the quoted literal.
    safe_id = str(id).replace("'", "''")
    query_params["where"] = "CITY_FIPS='{}'".format(safe_id)
    return self.query(params=query_params)

In [0]:
# Client authenticated with the token read from secrets.json.
api = CDOApi(cdo_token)

# First page (up to 1000 entries) of the stations attached to one city id;
# the cell displays the (response, next_params) tuple.
api.stations(params=dict(locationid="CITY:US020001", limit=1000))

({'metadata': {'resultset': {'count': 98, 'limit': 1000, 'offset': 1}},
  'results': [{'datacoverage': 0.9261,
    'elevation': 67.1,
    'elevationUnit': 'METERS',
    'id': 'COOP:500172',
    'latitude': 61.18889,
    'longitude': -149.80556,
    'maxdate': '2004-10-01',
    'mindate': '1993-07-01',
    'name': 'ALASKA PACIFIC UNIVERSITY, AK US'},
   {'datacoverage': 0.8624,
    'elevation': 35.7,
    'elevationUnit': 'METERS',
    'id': 'COOP:500272',
    'latitude': 61.17472,
    'longitude': -149.905,
    'maxdate': '2009-10-01',
    'mindate': '2005-08-01',
    'name': 'ANCHORAGE ARCTIC AND INTERNATIONAL, AK US'},
   {'datacoverage': 0.9956,
    'elevation': 39.9,
    'elevationUnit': 'METERS',
    'id': 'COOP:500275',
    'latitude': 61.1561,
    'longitude': -149.9847,
    'maxdate': '2015-11-01',
    'mindate': '1998-02-01',
    'name': 'ANCHORAGE FORECAST OFFICE, AK US'},
   {'datacoverage': 1,
    'elevation': 25.9,
    'elevationUnit': 'METERS',
    'id': 'COOP:500276',
   

In [0]:
# Fetch all possible locations, load in pandas dataframe (and focus on USA)

locations = []
params = {"locationcategoryid": "CITY", "limit": 1000}

# Walk the paginated endpoint: request a page, collect its results, and stop
# once the client reports no further page. Pause briefly between requests.
while True:
  res, _next = api.locations(params)
  locations += res.get('results')
  if _next is None:
    break
  sleep(.25)
  params.update(_next)

# One row per location record returned by the API.
df = pd.DataFrame(locations)

def extract_country(idx):
  """Return the two characters that follow the first ':' in ``idx``.

  For an id such as ``'CITY:US000001'`` this yields ``'US'``. When ``idx``
  contains no colon, the first two characters of ``idx`` are returned.
  """
  start = idx.find(':') + 1
  return idx[start:start + 2]

# Tag each location with the two-letter code embedded in its id.
df['country'] = df['id'].map(extract_country)

# Parse both coverage dates strictly; a malformed value raises.
for date_col in ('mindate', 'maxdate'):
  df[date_col] = pd.to_datetime(df[date_col], format='%Y-%m-%d', errors='raise')

# Keep the US rows only and display them.
us_df = df.loc[df['country'] == 'US']
us_df

Unnamed: 0,datacoverage,id,maxdate,mindate,name,country
1068,1.0,CITY:US000001,2018-12-20,1872-01-01,"Washington D.C., US",US
1069,1.0,CITY:US010001,2018-12-20,1895-11-01,"Alexander City, AL US",US
1070,1.0,CITY:US010002,2018-12-20,1903-02-01,"Anniston, AL US",US
1071,1.0,CITY:US010003,2018-12-20,1906-04-01,"Auburn, AL US",US
1072,1.0,CITY:US010004,2018-12-20,1900-10-01,"Birmingham, AL US",US
1073,1.0,CITY:US010005,2018-12-20,1907-08-01,"Cullman, AL US",US
1074,1.0,CITY:US010006,2018-12-20,1931-01-01,"Dothan, AL US",US
1075,1.0,CITY:US010007,2018-12-20,1902-05-01,"Enterprise, AL US",US
1076,1.0,CITY:US010008,2018-12-20,1892-06-01,"Eufaula, AL US",US
1077,1.0,CITY:US010009,2018-12-20,1893-01-01,"Florence, AL US",US


In [0]:
# Query the GIS layer for one city by its FIPS id, print the raw response,
# then display the same response converted to GeoJSON.
a = GISCDOApi()
r = a.city(id='US000001')
print(r)
arcgis2geojson(r)

## Small exploration: GHCN daily data for 2018

In [0]:
!wget https://www1.ncdc.noaa.gov/pub/data/ghcn/daily/by_year/2018.csv.gz
!gunzip 2018.csv.gz

--2018-12-17 05:44:37--  https://www1.ncdc.noaa.gov/pub/data/ghcn/daily/by_year/2018.csv.gz
Resolving www1.ncdc.noaa.gov (www1.ncdc.noaa.gov)... 205.167.25.171, 205.167.25.172, 2610:20:8040:2::171, ...
Connecting to www1.ncdc.noaa.gov (www1.ncdc.noaa.gov)|205.167.25.171|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 178834282 (171M) [application/x-gzip]
Saving to: ‘2018.csv.gz’


2018-12-17 05:44:44 (23.8 MB/s) - ‘2018.csv.gz’ saved [178834282/178834282]

gzip: 2018.csv already exists; do you wish to overwrite (y or n)? y
y


In [0]:
import pandas as pd

In [0]:
# Load the decompressed 2018 file. header=None: the first line is treated as data,
# so the columns get integer labels (0, 1, 2, ...).
df = pd.read_csv('2018.csv', header=None)

In [0]:
# Name the first four positional columns and discard columns 4-7.
# index=str also converts the row labels to strings.
df = (
    df
    .rename(index=str, columns={0: "station", 1: "date", 2: "type", 3: "value"})
    .drop(columns=[4, 5, 6, 7])
)

In [0]:
# Keep only the rows whose type is TMAX or TMIN.
df = df[df['type'].isin(['TMAX', 'TMIN'])]

In [0]:
# Parse the YYYYMMDD date strictly (a malformed value raises). assign() builds a new
# frame instead of writing a column into the filtered slice from the previous cell,
# which avoids pandas' SettingWithCopyWarning.
df = df.assign(date=pd.to_datetime(df['date'], format='%Y%m%d', errors='raise'))

In [0]:
# Summary statistics of the measured value per (station, type). Selecting 'value'
# explicitly (as a one-column frame, so the result keeps its two-level column
# header) stops newer pandas versions from also summarizing the datetime 'date' column.
df.groupby(['station', 'type'])[['value']].describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,value,value,value,value,value,value,value,value
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean,std,min,25%,50%,75%,max
station,type,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
AE000041196,TMAX,213.0,360.539906,68.017995,220.0,305.00,381.0,419.00,466.0
AE000041196,TMIN,178.0,228.000000,62.543454,98.0,175.75,239.5,280.75,337.0
AEM00041194,TMAX,196.0,357.994898,65.634067,220.0,311.75,375.0,413.25,470.0
AEM00041194,TMIN,159.0,260.918239,60.378235,136.0,212.50,273.0,313.00,355.0
AEM00041217,TMAX,163.0,362.153374,73.663531,212.0,304.50,371.0,422.00,491.0
AEM00041217,TMIN,159.0,245.213836,64.377680,103.0,195.00,260.0,301.00,364.0
AEM00041218,TMAX,171.0,375.713450,72.278097,220.0,317.50,397.0,436.50,483.0
AEM00041218,TMIN,141.0,239.007092,67.378621,106.0,194.00,250.0,295.00,368.0
AFM00040938,TMAX,11.0,229.000000,70.388920,130.0,188.00,200.0,274.00,361.0
AFM00040938,TMIN,94.0,119.712766,97.464261,-53.0,34.00,137.0,204.75,268.0


## GHCN FTP

In [0]:
!mkdir data/

In [0]:
from ftplib import FTP
from io import BytesIO
from os import path

NCDC_FTP = 'ftp.ncdc.noaa.gov'

def connect_to_ftp():
    """Open an FTP connection to ``NCDC_FTP`` and log in without credentials.

    The server's login response is printed. Returns the connected ``FTP`` object.
    """
    connection = FTP(NCDC_FTP)
    print(connection.login())
    return connection

In [0]:
# Open the FTP session used by the cells below; the login response is printed.
ftp = connect_to_ftp()

 ** This is a United States Department of Commerce computer     **
 ** system, which may be accessed and used only for             **
 ** official Government business by authorized personnel.       **
 ** Unauthorized access or use of this computer system may      **
 ** subject violators to criminal, civil, and/or administrative **
 ** action.  All information on this computer system may be     **
 ** intercepted, recorded, read, copied, and disclosed by and   **
 ** to authorized personnel for official purposes, including    **
 ** criminal investigations.  Access or use of this computer    **
 ** system by any person, whether authorized or unauthorized,   **
 ** constitutes consent to these terms.                         **
230 Anonymous access granted, restrictions apply


In [0]:
# Download the GHCN states list from the open FTP session into local_path.
states_path = '/pub/data/ghcn/daily/ghcnd-states.txt'
local_path = 'data/states.csv'
print(states_path)

# RETR takes a path on the server, so the host name is not part of it. The bytes
# are written to local_path (the data/ directory must already exist) so the
# following cell can read the file back.
with open(local_path, 'wb') as states_file:
    ftp.retrbinary('RETR ' + states_path, states_file.write)

In [0]:
# Print the local states file; this expects data/states.csv to exist already.
!cat data/states.csv

In [0]:
# Close the FTP session opened earlier; the cell displays the value returned by quit().
ftp.quit()

In [0]:
!wget https://www1.ncdc.noaa.gov/pub/data/ghcn/daily/ghcnd-stations.txt

In [0]:
from google.colab import files

# Hand ghcnd-stations.txt from the runtime's working directory to Colab's file download helper.
files.download('ghcnd-stations.txt')