In [59]:
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import xarray as xr
import gcsfs
from scipy import signal
import util
import intake

In [60]:
# --- Analysis configuration -------------------------------------------------

# CMIP6 experiments to include
experiment_ids = ['historical', 'ssp370']

# Seasons to analyse ('all' is listed alongside the DJF and JJA seasons)
seasons = ['all', 'DJF', 'JJA']

# First and last calendar year of interest
time_range = [1976, 2100]

# Thirty-year windows, as [first_year, last_year] strings, stepping forward
# one decade at a time from 1991-2020 through 2071-2100
time_slices = [[str(first_year), str(first_year + 29)]
               for first_year in range(1991, 2072, 10)]

# Variables to request from the catalog
variable_ids = ['pr', 'tas', 'huss']

# CMIP6 tables to request from the catalog
table_ids = ['Amon']

In [61]:
# Choose the catalog file according to where the notebook is running:
# the glade catalog on an NCAR host, the pangeo catalog everywhere else.
catalog_path = ("../catalogs/glade-cmip6.json" if util.is_ncar_host()
                else "../catalogs/pangeo-cmip6.json")
col = intake.open_esm_datastore(catalog_path)

In [50]:
# Narrow the full catalog down to the experiments, variables and tables
# selected in the configuration cell.
search_criteria = dict(experiment_id=experiment_ids,
                       variable_id=variable_ids,
                       table_id=table_ids)
subset = col.search(**search_criteria)

In [51]:
# Keep only the models that provide every requested variable in the
# requested table(s).
# NOTE(review): the per-variable lookup below searches the full catalog
# (`col`), so it is not restricted to `experiment_ids` — confirm that a
# variable being available in *any* experiment is the intended criterion.

# Start from every model that appears in the subset ...
uni_dict = subset.unique(['source_id','table_id','variable_id'])
models = set(uni_dict['source_id']['values'])

# ... then drop any model that is missing one of the variables.
for v in variable_ids:
    # Use the configured tables rather than a hard-coded 'Amon' so this cell
    # stays consistent with the search that built `subset`.
    query = dict(variable_id=v, table_id=table_ids)
    cat = col.search(**query)
    models &= set(cat.df.source_id.unique())

# Sets have no defined order; sort so the model list is reproducible
# from run to run.
models = sorted(models)

# subset to relevant models
our_models = subset.search(source_id=models)

In [52]:
## subset on years
# Work on a copy so the collection's catalog is only replaced once, at the end.
model_df = our_models.df.copy()

# Each time_range entry is split on '-' into a start and an end stamp
# (e.g. '200001-201412'); the first four characters of each are the year.
years = model_df.time_range.str.split('-', expand=True).apply(lambda ser: ser.str[:4]).astype(int)
start_year, end_year = years[0], years[1]

# Keep every file whose [start_year, end_year] interval overlaps
# [time_range[0], time_range[1]].  Testing only whether the start or the end
# falls inside the window would wrongly drop a file that begins before the
# window and ends after it.
valid = (start_year <= time_range[1]) & (end_year >= time_range[0])

# return to our collection
our_models.df = model_df[valid]

In [53]:
# Sanity check on the ssp370 entries: no two catalog rows may agree on every
# column other than activity_id, grid_label, version and path, because rows
# differing only in those columns are not really separate models.
ignored_columns = {'activity_id','grid_label','path','version'}
identity_columns = set(our_models.df.columns) - ignored_columns
ssp370_entries = our_models.search(experiment_id='ssp370').df
assert not ssp370_entries.duplicated(subset=identity_columns).any()

In [54]:
# Display the collection's summary (entry count and number of unique values
# per catalog column) after the model and year filtering.
our_models

glade-cmip6-ESM Collection with 4243 entries:
	> 3 activity_id(s)

	> 17 institution_id(s)

	> 25 source_id(s)

	> 2 experiment_id(s)

	> 81 member_id(s)

	> 1 table_id(s)

	> 3 variable_id(s)

	> 3 grid_label(s)

	> 0 dcpp_init_year(s)

	> 70 version(s)

	> 167 time_range(s)

	> 4243 path(s)

In [55]:
# Show the column names of the collection's underlying catalog DataFrame.
our_models.df.columns

Index(['activity_id', 'institution_id', 'source_id', 'experiment_id',
       'member_id', 'table_id', 'variable_id', 'grid_label', 'dcpp_init_year',
       'version', 'time_range', 'path'],
      dtype='object')

In [56]:
# Collect the unique source_id values (model names) left in the filtered
# collection.  Note that this rebinds `uni_dict`, which an earlier cell used
# for the unique values of the unfiltered subset.
uni_dict = our_models.unique(['source_id'])

In [57]:
# Display the list of model names that survived the filtering.
uni_dict['source_id']['values']

['BCC-ESM1',
 'CNRM-CM6-1',
 'CNRM-ESM2-1',
 'BCC-CSM2-MR',
 'CESM2-WACCM',
 'CESM2',
 'FGOALS-g3',
 'SAM0-UNICON',
 'UKESM1-0-LL',
 'HadGEM3-GC31-LL',
 'GFDL-ESM4',
 'GFDL-CM4',
 'GISS-E2-1-H',
 'GISS-E2-1-G',
 'CanESM5',
 'E3SM-1-0',
 'MCM-UA-1-0',
 'EC-Earth3',
 'EC-Earth3-Veg',
 'MRI-ESM2-0',
 'MIROC-ES2L',
 'MIROC6',
 'IPSL-CM6A-LR',
 'NorESM2-LM',
 'MPI-ESM1-2-HR']

In [58]:
# Load every group in the filtered collection into a dictionary of datasets.
# Separate option sets are supplied for zarr stores and for netCDF files;
# both pass decode_times=False (presumably forwarded to xarray so the time
# coordinate is left undecoded — TODO confirm).
# NOTE(review): with 'chunks': {} the library reports that each array is
# loaded as a single dask chunk, and the recorded run of this cell ended in
# an OSError (NetCDF: HDF error) on an E3SM-1-0 historical pr file.
zarr_options = {'consolidated': True, 'decode_times': False}
netcdf_options = {'chunks': {}, 'decode_times': False}
dset_dict = our_models.to_dataset_dict(zarr_kwargs=zarr_options,
                                       cdf_kwargs=netcdf_options)


xarray will load netCDF datasets with dask using a single chunk for all arrays.
For effective chunking, please provide chunks in cdf_kwargs.
For example: cdf_kwargs={'chunks': {'time': 36}}

--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.table_id.grid_label'

--> There will be 39 group(s)


  use_cftime=use_cftime,
  use_cftime=use_cftime,
  use_cftime=use_cftime,
  use_cftime=use_cftime,
  use_cftime=use_cftime,
  use_cftime=use_cftime,
  use_cftime=use_cftime,
  use_cftime=use_cftime,
  use_cftime=use_cftime,
  use_cftime=use_cftime,
  use_cftime=use_cftime,
  use_cftime=use_cftime,
  use_cftime=use_cftime,
  use_cftime=use_cftime,
  use_cftime=use_cftime,
  use_cftime=use_cftime,


OSError: [Errno -101] NetCDF: HDF error: b'/glade/collections/cmip/CMIP6/CMIP/E3SM-Project/E3SM-1-0/historical/r2i1p1f1/Amon/pr/gr/v20190729/pr/pr_Amon_E3SM-1-0_historical_r2i1p1f1_gr_200001-201412.nc'

In [None]:
# List the keys of the loaded dataset dictionary.
# NOTE(review): the recorded run of the loading cell ended in an OSError, so
# `dset_dict` may not be defined when this cell is reached — confirm.
dset_dict.keys()

  use_cftime=use_cftime,
  use_cftime=use_cftime,
  use_cftime=use_cftime,
  use_cftime=use_cftime,
  use_cftime=use_cftime,
  use_cftime=use_cftime,
  use_cftime=use_cftime,
  use_cftime=use_cftime,
  use_cftime=use_cftime,
  use_cftime=use_cftime,
  use_cftime=use_cftime,
  use_cftime=use_cftime,
  use_cftime=use_cftime,
  use_cftime=use_cftime,
  use_cftime=use_cftime,
  use_cftime=use_cftime,
  use_cftime=use_cftime,
  use_cftime=use_cftime,
  use_cftime=use_cftime,
  use_cftime=use_cftime,
  use_cftime=use_cftime,
  use_cftime=use_cftime,
  use_cftime=use_cftime,
  use_cftime=use_cftime,
  use_cftime=use_cftime,
  use_cftime=use_cftime,
  use_cftime=use_cftime,
  use_cftime=use_cftime,
  use_cftime=use_cftime,
  use_cftime=use_cftime,
  use_cftime=use_cftime,
  use_cftime=use_cftime,
  use_cftime=use_cftime,
