In [28]:
import intake
import xarray as xr
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re

# Load CMIP6 via intake-esm
col_url = "https://storage.googleapis.com/cmip6/pangeo-cmip6.json"
col = intake.open_esm_datastore(col_url)

# Search catalog
query = col.search(
    experiment_id="historical",
    table_id="SImon",
    variable_id=["siu", "siv", "siconc"],
    member_id="r1i1p1f1",
    grid_label="gn"
)

print(query.df.head())

# Convert query to dataset dictionary
dset_dict = query.to_dataset_dict(
    zarr_kwargs={'consolidated': True},
    xarray_combine_by_coords_kwargs={'compat': 'override'}
)

  activity_id institution_id     source_id experiment_id member_id table_id  \
0        CMIP      NOAA-GFDL      GFDL-CM4    historical  r1i1p1f1    SImon   
1        CMIP      NOAA-GFDL      GFDL-CM4    historical  r1i1p1f1    SImon   
2        CMIP      NOAA-GFDL      GFDL-CM4    historical  r1i1p1f1    SImon   
3        CMIP           IPSL  IPSL-CM6A-LR    historical  r1i1p1f1    SImon   
4        CMIP           IPSL  IPSL-CM6A-LR    historical  r1i1p1f1    SImon   

  variable_id grid_label                                             zstore  \
0         siu         gn  gs://cmip6/CMIP6/CMIP/NOAA-GFDL/GFDL-CM4/histo...   
1         siv         gn  gs://cmip6/CMIP6/CMIP/NOAA-GFDL/GFDL-CM4/histo...   
2      siconc         gn  gs://cmip6/CMIP6/CMIP/NOAA-GFDL/GFDL-CM4/histo...   
3      siconc         gn  gs://cmip6/CMIP6/CMIP/IPSL/IPSL-CM6A-LR/histor...   
4         siu         gn  gs://cmip6/CMIP6/CMIP/IPSL/IPSL-CM6A-LR/histor...   

   dcpp_init_year   version  
0             NaN  2

In [29]:
query['model_name'] = query.df.apply(
    lambda row: f"CMIP.{row['institution_id']}.{row['source_id']}.{row['experiment_id']}.{row['table_id']}.{row['grid_label']}",
    axis=1
)

model_names = query['model_name'].unique()

print("models in  query:")
for model_name in model_names:
    print(model_name)

models in  query:
CMIP.NOAA-GFDL.GFDL-CM4.historical.SImon.gn
CMIP.IPSL.IPSL-CM6A-LR.historical.SImon.gn
CMIP.NASA-GISS.GISS-E2-1-G.historical.SImon.gn
CMIP.BCC.BCC-CSM2-MR.historical.SImon.gn
CMIP.BCC.BCC-ESM1.historical.SImon.gn
CMIP.MIROC.MIROC6.historical.SImon.gn
CMIP.AWI.AWI-CM-1-1-MR.historical.SImon.gn
CMIP.NCAR.CESM2-WACCM.historical.SImon.gn
CMIP.NCAR.CESM2.historical.SImon.gn
CMIP.SNU.SAM0-UNICON.historical.SImon.gn
CMIP.NASA-GISS.GISS-E2-1-H.historical.SImon.gn
CMIP.CCCma.CanESM5.historical.SImon.gn
CMIP.HAMMOZ-Consortium.MPI-ESM-1-2-HAM.historical.SImon.gn
CMIP.NUIST.NESM3.historical.SImon.gn
CMIP.CAMS.CAMS-CSM1-0.historical.SImon.gn
CMIP.MPI-M.MPI-ESM1-2-LR.historical.SImon.gn
CMIP.MPI-M.MPI-ESM1-2-HR.historical.SImon.gn
CMIP.NOAA-GFDL.GFDL-ESM4.historical.SImon.gn
CMIP.NCC.NorESM2-LM.historical.SImon.gn
CMIP.MRI.MRI-ESM2-0.historical.SImon.gn
CMIP.CAS.FGOALS-f3-L.historical.SImon.gn
CMIP.NCC.NorESM2-MM.historical.SImon.gn
CMIP.NCAR.CESM2-WACCM-FV2.historical.SImon.gn
CMI

In [30]:
model = 'CMIP.CAS.CAS-ESM2-0.historical.SImon.gn'
ds = dset_dict[model]

ds.attrs

{'Conventions': 'CF-1.7 CMIP-6.2',
 'activity_id': 'CMIP',
 'branch_method': 'Spin-up documentation',
 'branch_time_in_child': 674885.0,
 'branch_time_in_parent': 36135.0,
 'cmor_version': '3.5.0',
 'contact': 'He Zhang(zhanghe@mail.iap.ac.cn)',
 'data_specs_version': '01.00.31',
 'experiment': 'all-forcing simulation of the recent past',
 'experiment_id': 'historical',
 'external_variables': 'areacello',
 'forcing_index': 1,
 'frequency': 'mon',
 'further_info_url': 'https://furtherinfo.es-doc.org/CMIP6.CAS.CAS-ESM2-0.historical.none.r1i1p1f1',
 'grid': 'gs1x1',
 'grid_label': 'gn',
 'history': '2020-12-25T11:51:49Z ;rewrote data to be consistent with CMIP for variable siage found in table SImon.',
 'initialization_index': 1,
 'institution': 'Chinese Academy of Sciences, Beijing 100029, China',
 'institution_id': 'CAS',
 'license': 'CMIP6 model data produced by Chinese Academy of Sciences (CAS) is licensed under a Creative Commons Attribution ShareAlike 4.0 International License (http

In [31]:
# Initialize list to collect metadata for each model
metadata = []

# Function to extract ocean and atm models
def extract_ocean_model(source_attr):
    match = re.search(r"ocean:\s*([A-Za-z0-9.]+)", source_attr)
    return match.group(1) if match else "N/A"

def extract_atmosphere_model(source_attr):
    match = re.search(r"atmos:\s*([A-Za-z0-9. ]+)", source_attr)
    return match.group(1).strip() if match else "N/A"


# Loop through each model in dataset dictionary
for ds_name, ds in dset_dict.items():
    print(f"Dataset: {ds_name}")
    print(ds.data_vars)
    
    # Retrieve attributes for metadata table
    institution = ds.attrs.get('institution', 'N/A')
    resolution = ds.attrs.get('nominal_resolution', 'N/A')
    grid_type = ds.attrs.get('grid_label', 'N/A')
    num_ensembles = len(ds.member_id) if 'member_id' in ds.coords else 'N/A'
    rheology_scheme = ds.attrs.get('rheology_scheme', 'N/A')
    atmosphere_model = ds.attrs.get('atmosphere', 'N/A')
    references = ds.attrs.get('references', 'N/A')
    
    # source attr
    source_attr = ds.attrs.get('source', 'N/A')
    ocean_model = extract_ocean_model(source_attr)
    atmosphere_model = extract_atmosphere_model(source_attr)
    
    metadata.append([
        ds_name, institution, resolution, grid_type, num_ensembles, 
        rheology_scheme, atmosphere_model, ocean_model, references
    ])

# Convert metadata list to DataFrame
metadata_table = pd.DataFrame(metadata, columns=[
    'Model', 'Institution', 'Resolution', 'Grid Type', 
    'Number of Ensembles', 'Rheology Scheme', 
    'Atmosphere Model', 'Ocean Model', 'References'
])

# Save the metadata table to a CSV file
# metadata_table.to_csv('metadata.csv', index=False)

# Display the metadata table
print(metadata_table)

Dataset: CMIP.IPSL.IPSL-CM6A-LR.historical.SImon.gn
Data variables:
    siconc   (member_id, dcpp_init_year, time, y, x) float32 dask.array<chunksize=(1, 1, 600, 332, 362), meta=np.ndarray>
    siu      (member_id, dcpp_init_year, time, y, x) float32 dask.array<chunksize=(1, 1, 845, 332, 362), meta=np.ndarray>
    siv      (member_id, dcpp_init_year, time, y, x) float32 dask.array<chunksize=(1, 1, 833, 332, 362), meta=np.ndarray>
Dataset: CMIP.NCAR.CESM2-FV2.historical.SImon.gn
Data variables:
    siconc   (member_id, dcpp_init_year, time, nj, ni) float32 dask.array<chunksize=(1, 1, 705, 384, 320), meta=np.ndarray>
    siu      (member_id, dcpp_init_year, time, nj, ni) float32 dask.array<chunksize=(1, 1, 912, 384, 320), meta=np.ndarray>
    siv      (member_id, dcpp_init_year, time, nj, ni) float32 dask.array<chunksize=(1, 1, 898, 384, 320), meta=np.ndarray>
Dataset: CMIP.EC-Earth-Consortium.EC-Earth3-CC.historical.SImon.gn
Data variables:
    siconc   (member_id, dcpp_init_year, time,

In [37]:
metadata_table.head(9)
metadata_table.tail()
metadata_table.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43 entries, 0 to 42
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Model                43 non-null     object
 1   Institution          43 non-null     object
 2   Resolution           43 non-null     object
 3   Grid Type            43 non-null     object
 4   Number of Ensembles  43 non-null     int64 
 5   Rheology Scheme      43 non-null     object
 6   Atmosphere Model     43 non-null     object
 7   Ocean Model          43 non-null     object
 8   References           43 non-null     object
dtypes: int64(1), object(8)
memory usage: 3.1+ KB


In [40]:
metadata_table.groupby("Atmosphere Model").count()

# groupby can take any column name or take a ist of column names if you specify it. 
# alternative to count could be median, mean, std(), variance etc. 
# could also apply and give fctn 


Unnamed: 0_level_0,Model,Institution,Resolution,Grid Type,Number of Ensembles,Rheology Scheme,Ocean Model,References
Atmosphere Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
BCC,2,2,2,2,2,2,2,2
CAM,3,3,3,3,3,3,3,3
CAM4,2,2,2,2,2,2,2,2
CAM5.3,2,2,2,2,2,2,2,2
CAM5.3 with UNICON,1,1,1,1,1,1,1,1
CCSR AGCM,1,1,1,1,1,1,1,1
CIESM,1,1,1,1,1,1,1,1
CanAM5,1,1,1,1,1,1,1,1
ECHAM v6.3,1,1,1,1,1,1,1,1
ECHAM5,1,1,1,1,1,1,1,1
