In [1]:
%%capture
!pip install intake xarray[complete] pystac

In [2]:
import pandas as pd
import xarray  as xr
import pystac
import json
from datetime import datetime,timezone

In [3]:
# S3 URI of the CHIRPS precipitation Zarr store referenced throughout the notebook.
chirpsS3 = (
    "s3://climate-action-datalake/"
    "zone=raw/source=chirps/"
    "variable=precipitation.zarr/"
)

# Timezone-aware "now" in UTC, used later as the STAC item's datetime.
datetime_utc = datetime.now(timezone.utc)


In [4]:
# Root STAC catalog that the dataset items are attached to below.
catalog = pystac.Catalog(
    id="datacube-catalog",
    description="Datacube catalog stored in S3",
)

In [5]:
# Sanity check: list the catalog's items, then its children, one list per line.
print(
    list(catalog.get_all_items()),
    list(catalog.get_children()),
    sep="\n",
)

[]
[]


In [6]:
# STAC item describing the CHIRPS dataset. No spatial footprint is recorded
# (geometry and bbox are both None); the item is stamped with datetime_utc.
dataset_item = pystac.Item(
    id="chirps",
    datetime=datetime_utc,
    geometry=None,
    bbox=None,
    # Reference the Zarr store location
    properties=dict(zarr_store=chirpsS3),
)

In [7]:
# Attach the CHIRPS item to the catalog.
catalog.add_item(dataset_item)

In [8]:
# Inspect the item's parent; presumably this is `catalog` now that the item
# has been added to it.
dataset_item.get_parent()

In [9]:
# Print the catalog tree (the catalog and the items nested under it).
catalog.describe()

* <Catalog id=datacube-catalog>
  * <Item id=chirps>


In [10]:
# Register the Zarr store as the item's data asset. Descriptive metadata that
# has no dedicated Asset argument is carried in `extra_fields`, which are
# serialized alongside the standard asset fields.
# NOTE(review): the asset key 'chips-zarr' looks like a typo for 'chirps-zarr'.
# It is left unchanged because it is the lookup handle for this asset —
# confirm nothing reads it before renaming.
dataset_item.add_asset(
    key='chips-zarr',
    asset=pystac.Asset(
        href=chirpsS3,
        title="Data cube chirps",
        description="Historical chirps global data transformed into cloud native format Zarr",
        media_type=pystac.MediaType.ZARR,
        # Mark the asset as the actual data payload (STAC asset role).
        roles=["data"],
        extra_fields={
            'authentication_enabled': 'yes',
            'source': 'https://data.chc.ucsb.edu/products/CHIRPS-2.0/global_daily/tifs/p05',
            # Spelling fixed: was 'update_frecuency': 'Monthy'.
            'update_frequency': 'Monthly',
            'resolution': '0.05 x 0.05',
            'dimensions': [
                {
                    'lat': 'latitude',
                    'description': 'latitude of the data',
                },
                {
                    'lon': 'longitude',
                    'description': 'longitude of the data',
                },
                {
                    'time': 'date',
                    'description': 'The date of the registry at daily level',
                    'format': 'yyyy-mm-dd',
                },
            ],
            'variables': [
                {
                    'precipitation': 'precipitation',
                    'description': 'Precipitation of the geo point per date',
                    'units': 'mm',
                },
            ],
            'transformations': "From geotiff to zarr",
        },
    ),
)


In [11]:
# Exploratory: show the pystac.Asset constructor signature and argument docs.
# NOTE(review): leftover API lookup; consider removing from the final notebook.
help(pystac.Asset)

Help on class Asset in module pystac.asset:

class Asset(builtins.object)
 |  Asset(href: 'str', title: 'str | None' = None, description: 'str | None' = None, media_type: 'str | None' = None, roles: 'list[str] | None' = None, extra_fields: 'dict[str, Any] | None' = None) -> 'None'
 |  
 |  An object that contains a link to data associated with an Item or Collection that
 |  can be downloaded or streamed.
 |  
 |  Args:
 |      href : Link to the asset object. Relative and absolute links are both
 |          allowed.
 |      title : Optional displayed title for clients and users.
 |      description : A description of the Asset providing additional details,
 |          such as how it was processed or created. CommonMark 0.29 syntax MAY be used
 |          for rich text representation.
 |      media_type : Optional description of the media type. Registered Media Types
 |          are preferred. See :class:`~pystac.MediaType` for common media types.
 |      roles : Optional, Semantic role

In [12]:
# Exploratory: show the signature and docs of Item.add_asset.
# NOTE(review): leftover API lookup; consider removing from the final notebook.
help(dataset_item.add_asset)

Help on method add_asset in module pystac.asset:

add_asset(key: 'str', asset: 'Asset') -> 'None' method of pystac.item.Item instance
    Adds an Asset to this object.
    
    Args:
        key : The unique key of this asset.
        asset : The Asset to add.



In [13]:
# Pretty-print the item's dictionary form as JSON to review the metadata,
# including the asset added above.
print(json.dumps(dataset_item.to_dict(), indent=4))

{
    "type": "Feature",
    "stac_version": "1.0.0",
    "id": "chirps",
    "properties": {
        "zarr_store": "s3://climate-action-datalake/zone=raw/source=chirps/variable=precipitation.zarr/",
        "datetime": "2024-03-15T20:11:06.299112Z"
    },
    "geometry": null,
    "links": [
        {
            "rel": "root",
            "href": null,
            "type": "application/json"
        },
        {
            "rel": "parent",
            "href": null,
            "type": "application/json"
        }
    ],
    "assets": {
        "chips-zarr": {
            "href": "s3://climate-action-datalake/zone=raw/source=chirps/variable=precipitation.zarr/",
            "type": "application/vnd+zarr",
            "title": "Data cube chirps",
            "description": "Historical chirps global data transformed into cloud native format Zarr",
            "authentication_enabled": "yes",
            "source": "https://data.chc.ucsb.edu/products/CHIRPS-2.0/global_daily/tifs/p05",
   

In [14]:
# Assign hrefs for the catalog and its contents rooted at '../catalog'
# (a path relative to the notebook's working directory — verify before running).
catalog.normalize_hrefs('../catalog')

In [15]:
# Write the catalog to disk as a self-contained catalog, using the hrefs
# assigned by normalize_hrefs.
catalog.save(catalog_type=pystac.CatalogType.SELF_CONTAINED)

In [16]:
# Exploratory: list the media types that pystac.MediaType defines.
# NOTE(review): leftover API lookup; consider removing from the final notebook.
help(pystac.MediaType)

Help on class MediaType in module pystac.media_type:

class MediaType(pystac.utils.StringEnum)
 |  MediaType(value, names=None, *, module=None, qualname=None, type=None, start=1)
 |  
 |  A list of common media types that can be used in STAC Asset and Link metadata.
 |  
 |  Method resolution order:
 |      MediaType
 |      pystac.utils.StringEnum
 |      builtins.str
 |      enum.Enum
 |      builtins.object
 |  
 |  Data and other attributes defined here:
 |  
 |  COG = image/tiff; application=geotiff; profile=cloud-optimized
 |  
 |  FLATGEOBUF = application/vnd.flatgeobuf
 |  
 |  GEOJSON = application/geo+json
 |  
 |  GEOPACKAGE = application/geopackage+sqlite3
 |  
 |  GEOTIFF = image/tiff; application=geotiff
 |  
 |  HDF = application/x-hdf
 |  
 |  HDF5 = application/x-hdf5
 |  
 |  HTML = text/html
 |  
 |  JPEG = image/jpeg
 |  
 |  JPEG2000 = image/jp2
 |  
 |  JSON = application/json
 |  
 |  KML = application/vnd.google-earth.kml+xml
 |  
 |  PDF = application/pdf
 |  


In [17]:
# Open the CHIRPS Zarr store from S3 as an xarray dataset.
# NOTE(review): presumably requires valid AWS credentials in the environment — confirm.
chirps = xr.open_zarr(chirpsS3)

severe performance issues, see also https://github.com/dask/dask/issues/10276

To fix, you should specify a lower version bound on s3fs, or
update the current installation.



In [18]:
# Display the dataset summary (array size, chunking, dtype).
chirps

Unnamed: 0,Array,Chunk
Bytes,844.14 GiB,54.93 MiB
Shape,"(15736, 1, 2000, 7200)","(1, 1, 2000, 7200)"
Dask graph,15736 chunks in 2 graph layers,15736 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 844.14 GiB 54.93 MiB Shape (15736, 1, 2000, 7200) (1, 1, 2000, 7200) Dask graph 15736 chunks in 2 graph layers Data type float32 numpy.ndarray",15736  1  7200  2000  1,

Unnamed: 0,Array,Chunk
Bytes,844.14 GiB,54.93 MiB
Shape,"(15736, 1, 2000, 7200)","(1, 1, 2000, 7200)"
Dask graph,15736 chunks in 2 graph layers,15736 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
