In [17]:
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import xarray as xr
import gcsfs
from scipy import signal
import util
import intake

In [18]:
# ---- Analysis configuration ----

# CMIP6 experiments to process
experiment_ids = ["historical", "ssp370"]

# Seasons to process
seasons = ["all", "DJF", "JJA"]

# Overall [first_year, last_year] window used when filtering catalog entries by year
time_range = [1976, 2085]

# Future time slices to process: 30-year [start, end] windows (years as strings),
# with start years stepping by a decade from 2006 through 2056
time_slices = [[str(start), str(start + 29)] for start in range(2006, 2057, 10)]

# Variables to process
variable_ids = ["pr", "tas", "huss"]

# CMIP6 table(s) to query
table_ids = ["Amon"]

In [19]:
# Open the catalog that matches the host: the GLADE catalog when
# util.is_ncar_host() is true, the Pangeo catalog otherwise.
col = intake.open_esm_datastore(
    "../catalogs/glade-cmip6.json"
    if util.is_ncar_host()
    else "../catalogs/pangeo-cmip6.json"
)

In [20]:
# Restrict the full catalog to the configured experiments, variables, and tables.
subset_query = {
    "experiment_id": experiment_ids,
    "variable_id": variable_ids,
    "table_id": table_ids,
}
subset = col.search(**subset_query)

In [21]:
# Keep only the models that provide every requested variable (variable_ids)
# in the monthly 'Amon' table, judged within the experiments selected in `subset`.
# NOTE: a daily-`pr` requirement (table_id='day') was sketched here previously
# but is not enforced.
uni_dict = subset.unique(['source_id','table_id','variable_id'])
models = set(uni_dict['source_id']['values'])  # every model present in the subset
for v in variable_ids:
    query = dict(variable_id=v, table_id='Amon')
    # Search `subset`, not the full catalog `col`: searching `col` would count a
    # model as providing a variable even when that variable only exists under an
    # experiment that is not part of this analysis.
    cat = subset.search(**query)
    models &= set(cat.df.source_id.unique().tolist())

models = list(models)

# Restrict the collection to the models that passed the check above
our_models = subset.search(source_id=models)

In [22]:
## Keep only catalog entries whose files overlap the configured `time_range`
model_df = our_models.df.copy()

# Parse each `time_range` string: split on '-' and keep the first four characters
# of each part as an integer year (column 0 = start year, column 1 = end year).
years = model_df.time_range.str.split('-', expand=True).apply(lambda ser: ser.str[:4]).astype(int)

# Interval-overlap test: a file overlaps the window when it starts no later than
# the window's end AND ends no earlier than the window's start. Checking only
# whether the start or the end year lies inside the window would drop files
# that span the entire window (start before it and end after it).
valid = (years[0] <= time_range[1]) & (years[1] >= time_range[0])

# Write the filtered table back into the collection
our_models.df = model_df[valid]

In [23]:
# Sanity check on the ssp370 entries: no two rows may agree on every column
# other than activity_id, grid_label, path, and version, since rows that differ
# only in those fields aren't really separate models.
ignored_columns = {'activity_id', 'grid_label', 'path', 'version'}
identity_columns = set(our_models.df.columns) - ignored_columns
ssp370_df = our_models.search(experiment_id='ssp370').df
assert not ssp370_df.duplicated(subset=identity_columns).any()

In [24]:
# Display the summary of the filtered collection (entry counts per catalog column)
our_models

glade-cmip6-ESM Collection with 3888 entries:
	> 3 activity_id(s)

	> 17 institution_id(s)

	> 25 source_id(s)

	> 2 experiment_id(s)

	> 81 member_id(s)

	> 1 table_id(s)

	> 3 variable_id(s)

	> 3 grid_label(s)

	> 0 dcpp_init_year(s)

	> 70 version(s)

	> 149 time_range(s)

	> 3888 path(s)