Authors: A.Radhakrishnan, GFDL
Ack: Anderson Banihirwe for intake-esm updates, GFDL colleagues for data

esgf-world.json is the ESM collection spec file for the netCDF data in the S3 bucket esgf-world.
For now, the catalog is updated on demand.
For the most recent catalogs, see https://github.com/aradhakrishnanGFDL/gfdl-aws-analysis/tree/community/esm-collection-spec-examples
More examples can be found at https://github.com/aradhakrishnanGFDL/gfdl-aws-analysis/tree/community/examples


In [6]:
# URL of the collection spec file (esgf-world.json) that is opened below with intake.open_esm_datastore.
col_url = "https://cmip6-nc.s3.us-east-2.amazonaws.com/esgf-world.json" 

In [None]:
from glob import glob
import xarray as xr
import cftime
import numpy as np
import matplotlib.pyplot as plt
import intake, intake_esm
import pandas as pd
pd.set_option("display.max_colwidth", None)
#!pip install cmip6_preprocessing

%pip install git+https://github.com/jbusecke/cmip6_preprocessing.git


from cmip6_preprocessing.preprocessing import combined_preprocessing
from cmip6_preprocessing.preprocessing import (correct_units,rename_cmip6)

In [8]:
# Open the collection at col_url as a datastore.
col = intake.open_esm_datastore(col_url)
# Keep the datastore's esmcol_data; it is passed back to open_esm_datastore
# later, when a datastore is rebuilt from a filtered DataFrame.
esmcol_data = col.esmcol_data

In [9]:
def latest_version(cat):
    """
    Keep only the most recent version of every entry in a catalog.

    input
    cat: object exposing the catalog table as a pandas DataFrame in ``cat.df``
         (e.g. an esmdatastore)
    output
    pandas DataFrame (not a datastore) with one row per unique combination of
    the identifying columns: the row that sorts last by ('version', 'path')
    """
    # Columns that together identify one entry irrespective of its version.
    identity_columns = ['temporal subset', 'model', 'mip_table',
                        'institute', 'variable', 'ensemble_member',
                        'grid_label', 'experiment_id']
    # Ascending sort places the highest version string last within each group,
    # so keep='last' retains exactly that row.
    ordered = cat.df.sort_values(by=['version', 'path'])
    return ordered.drop_duplicates(subset=identity_columns, keep='last')

In [10]:
# Search criteria for the catalog; the commented-out list items are alternative
# experiments/models that are currently switched off.
query_Omon_zos = dict(experiment_id=['abrupt-4xCO2'],#,'historical'],
                 mip_table=['Omon'],
                 ensemble_member=["r1i1p1f1"],
                 model=['IPSL-CM6A-LR'],#,'GFDL-ESM4'],
                 grid_label=['gn'],
                 variable=["zos"])

# Filter the collection, then reduce the matches to the latest version of each
# entry (latest_version returns a DataFrame).
cat_Omon_zos = col.search(**query_Omon_zos)
cat_Omon_zos_lat = latest_version(cat_Omon_zos)


# Rebuild a datastore from the filtered DataFrame, reusing the esmcol_data of
# the original collection.
cat_Omon_zos_latest = intake.open_esm_datastore(cat_Omon_zos_lat,esmcol_data=esmcol_data)

In [11]:
cat_Omon_zos_latest.df

Unnamed: 0,project,institute,model,experiment_id,frequency,modeling_realm,mip_table,ensemble_member,grid_label,variable,temporal subset,version,path
3,CMIP6,IPSL,IPSL-CM6A-LR,abrupt-4xCO2,mon,ocean,Omon,r1i1p1f1,gn,zos,185001-214912,v20190118,s3://esgf-world/CMIP6/CMIP/IPSL/IPSL-CM6A-LR/abrupt-4xCO2/r1i1p1f1/Omon/zos/gn/v20190118/zos_Omon_IPSL-CM6A-LR_abrupt-4xCO2_r1i1p1f1_gn_185001-214912.nc
4,CMIP6,IPSL,IPSL-CM6A-LR,abrupt-4xCO2,mon,ocean,Omon,r1i1p1f1,gn,zos,215001-244912,v20190118,s3://esgf-world/CMIP6/CMIP/IPSL/IPSL-CM6A-LR/abrupt-4xCO2/r1i1p1f1/Omon/zos/gn/v20190118/zos_Omon_IPSL-CM6A-LR_abrupt-4xCO2_r1i1p1f1_gn_215001-244912.nc
5,CMIP6,IPSL,IPSL-CM6A-LR,abrupt-4xCO2,mon,ocean,Omon,r1i1p1f1,gn,zos,245001-274912,v20190118,s3://esgf-world/CMIP6/CMIP/IPSL/IPSL-CM6A-LR/abrupt-4xCO2/r1i1p1f1/Omon/zos/gn/v20190118/zos_Omon_IPSL-CM6A-LR_abrupt-4xCO2_r1i1p1f1_gn_245001-274912.nc


In [85]:
# Optional preprocessing hook for zos data; pass this function along when needed
def pp_zos(ds):
    """Run ``ds`` through ``rename_cmip6`` and then ``correct_units``.

    Both helpers are imported from cmip6_preprocessing.preprocessing; the
    result of the second call is returned.
    """
    # NOTE: a ``fix_time`` step used to sit between the two calls; it is disabled.
    renamed = rename_cmip6(ds)
    return correct_units(renamed)

TEST 1 - intake-esm with decode_times=False: ds_zos_ipsl_abrupt.sel(time=slice("2150","2151")) fails with a KeyError

In [12]:
# TEST 1 load: decode_times=False, one time step per chunk, anon=True passed as a storage option.
# The string-based time selection a few cells below fails with KeyError: '2150' on this result.
dset_dict_zos = cat_Omon_zos_latest.to_dataset_dict(cdf_kwargs={'decode_times': False,'chunks': {'time': 1}}, storage_options={'anon':True},
                                                   )



--> The keys in the returned dictionary of datasets are constructed as follows:
	'project.institute.model.experiment_id.mip_table'


In [13]:
ds_zos_ipsl_abrupt = dset_dict_zos["CMIP6.IPSL.IPSL-CM6A-LR.abrupt-4xCO2.Omon"]


In [14]:
ds_zos_ipsl_abrupt

Unnamed: 0,Array,Chunk
Bytes,480.74 kB,480.74 kB
Shape,"(332, 362)","(332, 362)"
Count,2 Tasks,1 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 480.74 kB 480.74 kB Shape (332, 362) (332, 362) Count 2 Tasks 1 Chunks Type float32 numpy.ndarray",362  332,

Unnamed: 0,Array,Chunk
Bytes,480.74 kB,480.74 kB
Shape,"(332, 362)","(332, 362)"
Count,2 Tasks,1 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,480.74 kB,480.74 kB
Shape,"(332, 362)","(332, 362)"
Count,2 Tasks,1 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 480.74 kB 480.74 kB Shape (332, 362) (332, 362) Count 2 Tasks 1 Chunks Type float32 numpy.ndarray",362  332,

Unnamed: 0,Array,Chunk
Bytes,480.74 kB,480.74 kB
Shape,"(332, 362)","(332, 362)"
Count,2 Tasks,1 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.92 MB,1.92 MB
Shape,"(332, 362, 4)","(332, 362, 4)"
Count,2 Tasks,1 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 1.92 MB 1.92 MB Shape (332, 362, 4) (332, 362, 4) Count 2 Tasks 1 Chunks Type float32 numpy.ndarray",4  362  332,

Unnamed: 0,Array,Chunk
Bytes,1.92 MB,1.92 MB
Shape,"(332, 362, 4)","(332, 362, 4)"
Count,2 Tasks,1 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.92 MB,1.92 MB
Shape,"(332, 362, 4)","(332, 362, 4)"
Count,2 Tasks,1 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 1.92 MB 1.92 MB Shape (332, 362, 4) (332, 362, 4) Count 2 Tasks 1 Chunks Type float32 numpy.ndarray",4  362  332,

Unnamed: 0,Array,Chunk
Bytes,1.92 MB,1.92 MB
Shape,"(332, 362, 4)","(332, 362, 4)"
Count,2 Tasks,1 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,480.74 kB,480.74 kB
Shape,"(332, 362)","(332, 362)"
Count,2 Tasks,1 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 480.74 kB 480.74 kB Shape (332, 362) (332, 362) Count 2 Tasks 1 Chunks Type float32 numpy.ndarray",362  332,

Unnamed: 0,Array,Chunk
Bytes,480.74 kB,480.74 kB
Shape,"(332, 362)","(332, 362)"
Count,2 Tasks,1 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,172.80 kB,16 B
Shape,"(10800, 2)","(1, 2)"
Count,21603 Tasks,10800 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 172.80 kB 16 B Shape (10800, 2) (1, 2) Count 21603 Tasks 10800 Chunks Type float64 numpy.ndarray",2  10800,

Unnamed: 0,Array,Chunk
Bytes,172.80 kB,16 B
Shape,"(10800, 2)","(1, 2)"
Count,21603 Tasks,10800 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,5.19 GB,480.74 kB
Shape,"(1, 10800, 332, 362)","(1, 1, 332, 362)"
Count,32403 Tasks,10800 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 5.19 GB 480.74 kB Shape (1, 10800, 332, 362) (1, 1, 332, 362) Count 32403 Tasks 10800 Chunks Type float32 numpy.ndarray",1  1  362  332  10800,

Unnamed: 0,Array,Chunk
Bytes,5.19 GB,480.74 kB
Shape,"(1, 10800, 332, 362)","(1, 1, 332, 362)"
Count,32403 Tasks,10800 Chunks
Type,float32,numpy.ndarray


In [23]:
# Expected to fail: the dataset was loaded with decode_times=False and this
# string-based time slice raises the KeyError shown in the output.
ds_zos_ipsl_abrupt.sel(time=slice("2150","2151"))


KeyError: '2150'

In [19]:
ds_zos_ipsl_abrupt.time

TEST 2 - intake-esm with decode_times=True: ds_zos_ipsl_abrupt_t.sel(time=slice("2150","2151")) fails with a TypeError (cftime datetime compared with str)

In [20]:
# TEST 2 load: identical to the TEST 1 call except that decode_times=True.
# Note that this rebinds dset_dict_zos, replacing the TEST 1 dictionary.
dset_dict_zos = cat_Omon_zos_latest.to_dataset_dict(cdf_kwargs={'decode_times': True,'chunks': {'time': 1}}, storage_options={'anon':True},
                                                   )



--> The keys in the returned dictionary of datasets are constructed as follows:
	'project.institute.model.experiment_id.mip_table'


  dtype = _decode_cf_datetime_dtype(data, units, calendar, self.use_cftime)
  return array(a, dtype, copy=False, order=order)
  dtype = _decode_cf_datetime_dtype(data, units, calendar, self.use_cftime)
  dtype = _decode_cf_datetime_dtype(data, units, calendar, self.use_cftime)
  return array(a, dtype, copy=False, order=order)


In [24]:
ds_zos_ipsl_abrupt_t = dset_dict_zos["CMIP6.IPSL.IPSL-CM6A-LR.abrupt-4xCO2.Omon"]


In [25]:
# Expected to fail: even with decode_times=True this string-based time slice
# raises the TypeError shown in the output (cftime datetime compared with str).
ds_zos_ipsl_abrupt_t.sel(time=slice("2150","2151"))

TypeError: '<' not supported between instances of 'cftime._cftime.DatetimeGregorian' and 'str'

TEST 3 - works when the objects are opened directly with xarray and s3fs: open_dataset on one object and open_mfdataset on several, each followed by .sel(time=slice(...))

In [None]:
import xarray as xr
import s3fs

# Anonymous access, matching storage_options={'anon': True} used for the
# intake-esm loads of the same bucket, so this cell does not depend on AWS
# credentials being configured on the machine running the notebook.
fs_s3 = s3fs.S3FileSystem(anon=True)


In [27]:
xr.show_versions()


INSTALLED VERSIONS
------------------
commit: None
python: 3.8.6 | packaged by conda-forge | (default, Oct  7 2020, 19:08:05) 
[GCC 7.5.0]
python-bits: 64
OS: Linux
OS-release: 4.14.219-161.340.amzn2.x86_64
machine: x86_64
processor: x86_64
byteorder: little
LC_ALL: C.UTF-8
LANG: C.UTF-8
LOCALE: en_US.UTF-8
libhdf5: 1.10.6
libnetcdf: 4.7.4

xarray: 0.16.1
pandas: 1.1.4
numpy: 1.19.4
scipy: 1.5.3
netCDF4: 1.5.4
pydap: installed
h5netcdf: 0.8.1
h5py: 2.10.0
Nio: None
zarr: 2.5.0
cftime: 1.2.1
nc_time_axis: 1.2.0
PseudoNetCDF: None
rasterio: 1.1.8
cfgrib: 0.9.8.4
iris: None
bottleneck: 1.3.2
dask: 2.30.0
distributed: 2.30.1
matplotlib: 3.3.2
cartopy: 0.18.0
seaborn: None
numbagg: None
pint: 0.16.1
setuptools: 49.6.0.post20201009
pip: 20.2.4
conda: None
pytest: None
IPython: 7.19.0
sphinx: 3.3.0


In [54]:
# Open one catalog object (the 215001-244912 file) directly through s3fs, bypassing intake-esm.
s3path = "s3://esgf-world/CMIP6/CMIP/IPSL/IPSL-CM6A-LR/abrupt-4xCO2/r1i1p1f1/Omon/zos/gn/v20190118/zos_Omon_IPSL-CM6A-LR_abrupt-4xCO2_r1i1p1f1_gn_215001-244912.nc"
# NOTE: `ds` is the open remote file object here, not a dataset; a later cell
# appends it to a list of files and then rebinds `ds` to an xarray dataset.
ds = fs_s3.open(s3path, mode='rb')
xr_ds =xr.open_dataset(ds) #works 
# Same kind of string-based time slice as in TEST 1 and TEST 2.
xr_ds.zos.sel(time= slice("2150","2151")) 

  dtype = _decode_cf_datetime_dtype(data, units, calendar, self.use_cftime)
  dtype = _decode_cf_datetime_dtype(data, units, calendar, self.use_cftime)
  return array(a, dtype, copy=False, order=order)


In [64]:
xr_ds.time

In [67]:
# Open the remote files explicitly instead of seeding the list with `ds` from
# the earlier cell: `ds` is rebound to the combined dataset at the end of this
# cell, so re-running the cell would otherwise put a dataset into the file list.
# The earlier index arithmetic also regenerated the 215001-244912 path, so that
# file was opened twice; each file is now opened exactly once.
# As before, the 185001-214912 file is not part of this test.
s3_prefix = 's3://esgf-world/CMIP6/CMIP/IPSL/IPSL-CM6A-LR/abrupt-4xCO2/r1i1p1f1/Omon/zos/gn/v20190118/zos_Omon_IPSL-CM6A-LR_abrupt-4xCO2_r1i1p1f1_gn_'
time_ranges = ['215001-244912', '245001-274912']

dataset = [fs_s3.open(s3_prefix + time_range + '.nc', mode='rb')
           for time_range in time_ranges]

# Combine the files along their coordinates, one time step per chunk.
ds = xr.open_mfdataset(dataset, chunks={'time': 1}, combine='by_coords')

  dtype = _decode_cf_datetime_dtype(data, units, calendar, self.use_cftime)
  dtype = _decode_cf_datetime_dtype(data, units, calendar, self.use_cftime)
  return array(a, dtype, copy=False, order=order)
  dtype = _decode_cf_datetime_dtype(data, units, calendar, self.use_cftime)
  dtype = _decode_cf_datetime_dtype(data, units, calendar, self.use_cftime)
  return array(a, dtype, copy=False, order=order)
  dtype = _decode_cf_datetime_dtype(data, units, calendar, self.use_cftime)


In [68]:
ds.zos.sel(time= slice("2749","2749"))

Unnamed: 0,Array,Chunk
Bytes,5.77 MB,480.74 kB
Shape,"(12, 332, 362)","(1, 332, 362)"
Count,14414 Tasks,12 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 5.77 MB 480.74 kB Shape (12, 332, 362) (1, 332, 362) Count 14414 Tasks 12 Chunks Type float32 numpy.ndarray",362  332  12,

Unnamed: 0,Array,Chunk
Bytes,5.77 MB,480.74 kB
Shape,"(12, 332, 362)","(1, 332, 362)"
Count,14414 Tasks,12 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,480.74 kB,480.74 kB
Shape,"(332, 362)","(332, 362)"
Count,5 Tasks,1 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 480.74 kB 480.74 kB Shape (332, 362) (332, 362) Count 5 Tasks 1 Chunks Type float32 numpy.ndarray",362  332,

Unnamed: 0,Array,Chunk
Bytes,480.74 kB,480.74 kB
Shape,"(332, 362)","(332, 362)"
Count,5 Tasks,1 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,480.74 kB,480.74 kB
Shape,"(332, 362)","(332, 362)"
Count,5 Tasks,1 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 480.74 kB 480.74 kB Shape (332, 362) (332, 362) Count 5 Tasks 1 Chunks Type float32 numpy.ndarray",362  332,

Unnamed: 0,Array,Chunk
Bytes,480.74 kB,480.74 kB
Shape,"(332, 362)","(332, 362)"
Count,5 Tasks,1 Chunks
Type,float32,numpy.ndarray


In [70]:
ds['time']

In [None]:
%pip install git+https://github.com/intake/intake-esm.git


In [35]:
# Repeat of the catalog search and the decode_times=True load, placed after the
# cell that installs intake-esm from its git repository.
# NOTE(review): re-importing an already imported package does not by itself pick
# up a newly installed version; presumably the kernel was restarted first - confirm.
import intake, intake_esm
col = intake.open_esm_datastore(col_url)
esmcol_data = col.esmcol_data

query_Omon_zos = dict(experiment_id=['abrupt-4xCO2'],#,'historical'],
                 mip_table=['Omon'],
                 ensemble_member=["r1i1p1f1"],
                 model=['IPSL-CM6A-LR'],#,'GFDL-ESM4'],
                 grid_label=['gn'],
                 variable=["zos"])

cat_Omon_zos = col.search(**query_Omon_zos)
# Relies on latest_version defined in an earlier cell.
cat_Omon_zos_lat = latest_version(cat_Omon_zos)


cat_Omon_zos_latest = intake.open_esm_datastore(cat_Omon_zos_lat,esmcol_data=esmcol_data)

dset_dict_zos = cat_Omon_zos_latest.to_dataset_dict(cdf_kwargs={'decode_times': True,'chunks': {'time': 1}}, storage_options={'anon':True},
                                                   )


--> The keys in the returned dictionary of datasets are constructed as follows:
	'project.institute.model.experiment_id.mip_table'


  return array(a, dtype, copy=False, order=order)
  dtype = _decode_cf_datetime_dtype(data, units, calendar, self.use_cftime)
  dtype = _decode_cf_datetime_dtype(data, units, calendar, self.use_cftime)
  return array(a, dtype, copy=False, order=order)
  dtype = _decode_cf_datetime_dtype(data, units, calendar, self.use_cftime)
  return array(a, dtype, copy=False, order=order)
  dtype = _decode_cf_datetime_dtype(data, units, calendar, self.use_cftime)
  dtype = _decode_cf_datetime_dtype(data, units, calendar, self.use_cftime)
  return array(a, dtype, copy=False, order=order)
  dtype = _decode_cf_datetime_dtype(data, units, calendar, self.use_cftime)
  dtype = _decode_cf_datetime_dtype(data, units, calendar, self.use_cftime)
  return array(a, dtype, copy=False, order=order)


In [55]:
# Pick the dataset produced by the repeated load above and display its time
# coordinate; the string-based selection is left commented out.
ds_zos_ipsl_abrupt_r = dset_dict_zos["CMIP6.IPSL.IPSL-CM6A-LR.abrupt-4xCO2.Omon"]

ds_zos_ipsl_abrupt_r.time # sel(time=slice("2150","2151"))

In [58]:
ds_zos_ipsl_abrupt_r['time']#.sel(time=slice("1850","1850"))