In [1]:
import pandas as pd
import numpy as np
import intake

In [2]:
# Enumerate the driver names currently registered with intake.
[driver_name for driver_name in intake.registry]

['parquet',
 'alias',
 'catalog',
 'csv',
 'intake_remote',
 'json',
 'jsonl',
 'ndzarr',
 'numpy',
 'textfiles',
 'tiled',
 'tiled_cat',
 'yaml_file_cat',
 'yaml_files_cat',
 'zarr_cat',
 'foo',
 'some_test_driver']

In [3]:
from intake.catalog import Catalog
from intake.catalog.local import LocalCatalogEntry, UserParameter

# Entry for the per-year GHCN-Daily archives (one gzipped CSV per year).
# `{{ year }}` in the URL is filled from the `year` parameter below, and
# `csv_kwargs` is rendered from the `daily_summary_kwargs` template variable.
# NOTE(review): the dict-typed `csv_kwargs` parameter presumably coerces the
# rendered template back into a dict -- confirm against intake's parameter docs.
daily_summaries_by_year = LocalCatalogEntry(
    name="Daily summaries by year",
    description="NOAA daily summaries .csv.gz files",
    driver="csv",
    args=dict(
        urlpath="filecache::https://www.ncei.noaa.gov/pub/data/ghcn/daily/by_year/{{ year }}.csv.gz",
        csv_kwargs="{{ daily_summary_kwargs }}",
    ),
    parameters=[
        UserParameter(name="year", description="data collection year", type="str", default="2023"),
        UserParameter(name="csv_kwargs", description="", type="dict"),
    ],
    metadata=dict(source="https://www.ncei.noaa.gov/pub/data/ghcn/daily/by_year/"),
)

# Entry for the per-station GHCN-Daily archives (one gzipped CSV per station).
# `{{ station }}` in the URL is filled from the `station` parameter below, and
# `csv_kwargs` is rendered from the `daily_summary_kwargs` template variable.
daily_summaries_by_station = LocalCatalogEntry(
    # Fixed copy/paste slip: this entry was previously labelled "... by year".
    name='Daily summaries by station',
    description="NOAA daily summaries .csv.gz files",
    driver='csv',
    args={
        'urlpath': 'filecache::https://www.ncei.noaa.gov/pub/data/ghcn/daily/by_station/{{ station }}.csv.gz',
        'csv_kwargs': '{{ daily_summary_kwargs }}'
    },
    parameters=[
        UserParameter(
            name="station",
            description="data collection station",
            type="str",
            default="ACW00011604",
        ),
        # NOTE(review): dict-typed parameter sharing the name of the `csv_kwargs`
        # argument; presumably used to coerce the rendered template into a dict
        # -- confirm against intake's parameter handling.
        UserParameter(
            name="csv_kwargs",
            description="",
            type="dict",
        )
    ],
    metadata={
        "source": "https://www.ncei.noaa.gov/pub/data/ghcn/daily/by_station/"
    }
)

# Plain-text README that accompanies the per-year files.
daily_summaries_by_year_readme = LocalCatalogEntry(
    name="Daily summaries by year",
    description="README for NOAA daily summaries files",
    driver="textfiles",
    args=dict(
        urlpath="filecache::https://www.ncei.noaa.gov/pub/data/ghcn/daily/by_year/readme-by_year.txt",
    ),
)

# Plain-text README that accompanies the per-station files.
daily_summaries_by_station_readme = LocalCatalogEntry(
    name="Daily summaries by station",
    description="README for NOAA daily summaries files",
    driver="textfiles",
    args=dict(
        urlpath="filecache::https://www.ncei.noaa.gov/pub/data/ghcn/daily/by_station/readme-by_station.txt",
        text_mode=True,
    ),
)

# daily_summaries_by_station_readme_transform = LocalCatalogEntry(
#     name="Daily summaries by station readme",
#     description='README for NOAA daily summaries files',
#     driver='intake.source.derived.GenericTransform',
#     args={
#         'targets': ['by_station_readme'],
#         'transform': 'builtins.str.join',
#         'transform_kwargs': {},
#     }

# )

# Entry for the station lookup table that accompanies the daily summaries.
# Only the first three whitespace-separated fields are read and they are
# labelled ID / latitude / longitude.
# NOTE(review): assumes those three leading fields really are the station id
# and its coordinates -- confirm against the GHCN-Daily stations file format.
daily_summaries_stations = LocalCatalogEntry(
    name='Daily summaries stations',
    description='Table of stations included in daily summaries files',
    driver='csv',
    args={
        'urlpath': 'filecache::https://www.ncei.noaa.gov/pub/data/ghcn/daily/ghcnd-stations.txt',
        'csv_kwargs': {
            # Raw string: "\s" is not a valid Python escape, so the non-raw
            # literal triggers an invalid-escape warning. The value is unchanged.
            'sep': r"\s+",
            'usecols': [0, 1, 2],
            'names': ['ID', 'latitude', 'longitude'],
        },
    },
)

# Assemble the daily-summaries catalog from the entries defined above and
# write it to YAML. The catalog-level `daily_summary_kwargs` parameter holds
# the column names, dtypes and blocksize referenced by the by_year/by_station
# entries through their `{{ daily_summary_kwargs }}` template.
daily_summaries_cat = Catalog.from_dict(
    dict(
        by_year=daily_summaries_by_year,
        by_year_readme=daily_summaries_by_year_readme,
        by_station=daily_summaries_by_station,
        by_station_readme=daily_summaries_by_station_readme,
        stations=daily_summaries_stations,
    ),
    name="NOAA Climate Data Daily Summaries",
    description=(
        "Global Historical Climate Network includes daily land surface observations from around the world. "
        "The GHCN-Daily was developed to meet the needs of climate analysis and monitoring studies that require data at a sub-monthly time resolution (e.g., assessments of the frequency of heavy rainfall, heat wave duration, etc.). "
        "The dataset includes observations from World Meteorological Organization, Cooperative, and CoCoRaHS networks. "
        "If observed, the station dataset includes max and minimum temperatures, total precipitation, snowfall, and depth of snow on ground. "
        "Some U.S. station data are typically delayed only 24 hours."
    ),
    metadata={
        "parameters": {
            "daily_summary_kwargs": dict(
                description="Header info and other kwargs applying to all daily summary files",
                type="dict",
                default=dict(
                    names=["id", "date", "element", "data_value", "m_flag", "q_flag", "s_flag", "obs_time"],
                    dtype=dict(
                        id="object",
                        date="object",
                        element="object",
                        data_value="Int64",
                        m_flag="object",
                        q_flag="object",
                        s_flag="object",
                        obs_time="Int64",
                    ),
                    blocksize=None,
                ),
            ),
        },
    },
)
daily_summaries_cat.save("noaa_daily_summaries.yaml")

# Top-level catalog: a single entry that points at the daily-summaries YAML
# saved next to it (located through the CATALOG_DIR template variable).
noaa_cat = Catalog.from_dict(
    dict(
        daily_summaries=LocalCatalogEntry(
            name=daily_summaries_cat.name,
            description=daily_summaries_cat.description,
            driver="intake.catalog.local.YAMLFileCatalog",
            args=dict(path="{{ CATALOG_DIR }}/noaa_daily_summaries.yaml"),
        ),
    ),
    name="NOAA Climate Data",
    description="NOAA Climate Data",
    metadata=dict(
        source="https://www.ncei.noaa.gov/metadata/geoportal/rest/metadata/item/gov.noaa.ncdc:C00861/html",
    ),
)

# Persist the catalog, then rebind `noaa_cat` to the version loaded back from
# disk so the following cells exercise the saved YAML files.
noaa_cat.save("noaa_cat.yaml")
noaa_cat = intake.open_catalog("noaa_cat.yaml")

In [4]:
# Read the by-station README through the nested catalog; the result shown
# below is the file's content as a list of text lines.
noaa_cat.daily_summaries.by_station_readme.read()

['The "station".csv files contain all daily elements for that GHCN station for its entire period of record. \n',
 'Each element-day is provided on a separate line and all files are updated daily for the entire period of record.\n',
 '\n',
 'The following information serves as a definition of each field for all element-day records. \n',
 'Each field described below is separated by a comma ( , ) and follows the order below:\n',
 '\n',
 'ID = 11 character station identification code\n',
 'YEAR/MONTH/DAY = 8 character date in YYYYMMDD format (e.g. 19860529 = May 29, 1986)\n',
 'ELEMENT = 4 character indicator of element type \n',
 'DATA VALUE = 5 character data value for ELEMENT \n',
 'M-FLAG = 1 character Measurement Flag \n',
 'Q-FLAG = 1 character Quality Flag \n',
 'S-FLAG = 1 character Source Flag \n',
 'OBS-TIME = 4-character time of observation in hour-minute format (i.e. 0700 =7:00 am); if no ob time information \n',
 'is available, the field is left empty\n',
 '\n',
 'See section 

In [5]:
# Print the YAML file written earlier by daily_summaries_cat.save(...).
!cat noaa_daily_summaries.yaml

description: Global Historical Climate Network includes daily land surface observations
  from around the world. The GHCN-Daily was developed to meet the needs of climate
  analysis and monitoring studies that require data at a sub-monthly time resolution
  (e.g., assessments of the frequency of heavy rainfall, heat wave duration, etc.).
  The dataset includes observations from World Meteorological Organization, Cooperative,
  and CoCoRaHS networks. If observed, the station dataset includes max and minimum
  temperatures, total precipitation, snowfall, and depth of snow on ground. Some U.S.
  station data are typically delayed only 24 hours.
metadata:
  parameters:
    daily_summary_kwargs:
      default:
        blocksize: null
        dtype:
          data_value: Int64
          date: object
          element: object
          id: object
          m_flag: object
          obs_time: Int64
          q_flag: object
          s_flag: object
        names:
        -

In [6]:
# Alternative inspections, left disabled:
#   noaa_cat.daily_summaries.by_year.read()
#   noaa_cat.daily_summaries.user_parameters["daily_summary_kwargs"].default
# Display the entry itself instead of calling .read(); the output shows its
# resolved arguments (default year in the URL, expanded csv_kwargs).
noaa_cat.daily_summaries.by_year

by_year:
  args:
    csv_kwargs:
      blocksize: null
      dtype:
        data_value: Int64
        date: object
        element: object
        id: object
        m_flag: object
        obs_time: Int64
        q_flag: object
        s_flag: object
      names:
      - id
      - date
      - element
      - data_value
      - m_flag
      - q_flag
      - s_flag
      - obs_time
    urlpath: filecache::https://www.ncei.noaa.gov/pub/data/ghcn/daily/by_year/2023.csv.gz
  description: NOAA daily summaries .csv.gz files
  driver: intake.source.csv.CSVSource
  metadata:
    catalog_dir: /Users/brosenthal/code/catalogs/
    source: https://www.ncei.noaa.gov/pub/data/ghcn/daily/by_year/


In [7]:
# Override the entry's default year ("2023") with 1999 and read the whole
# file; the result below has roughly 34.5 million rows.
noaa_cat.daily_summaries.by_year(year='1999').read()

Unnamed: 0,id,date,element,data_value,m_flag,q_flag,s_flag,obs_time
0,AE000041196,19990101,TMAX,282,,,I,
1,AE000041196,19990101,TAVG,213,H,,S,
2,AEM00041194,19990101,TMAX,270,,,S,
3,AEM00041194,19990101,TMIN,170,,,S,
4,AEM00041194,19990101,TAVG,221,H,,S,
...,...,...,...,...,...,...,...,...
34558736,ZI000067975,19991231,PRCP,30,,,S,
34558737,ZI000067975,19991231,TAVG,242,H,,S,
34558738,ZI000067983,19991231,TMAX,334,,,I,
34558739,ZI000067983,19991231,TMIN,181,,,I,
