This notebook aims to process CMIP6 climate model data for a specific geographical region and time period. The main steps include:

1.  **Accessing CMIP6 Catalog**: Connecting to the CMIP6 datastore hosted on Google Cloud to fetch model data.
2.  **Reading Model Table**: Loading an Excel file containing information about the models and variables to be processed.
3.  **Defining Area of Interest**: Reading a KML file that defines the geographical region for which data will be extracted.
4.  **Data Processing and Analysis**: Iterating over the specified models, applying auxiliary functions to:
    *   Search for and load model data.
    *   Attach the grid-cell area (`areacello`) of each model to its dataset.
    *   Correct and standardize the time dimension.
    *   Rename and standardize coordinates and depth units.
    *   Apply a geographical mask to select only the region of interest.
    *   Calculate the temporal mean of the data.
    *   Vertically interpolate the data to a new depth grid.
    *   Calculate the volume of each cell as grid-cell area × layer thickness (assuming a thickness of 5 m, the spacing of the interpolated levels).


In [None]:
!pip install openpyxl regionmask numpy==1.20.0

Collecting numpy==1.20.0
  Downloading numpy-1.20.0-cp39-cp39-manylinux2010_x86_64.whl (15.4 MB)
[K     |████████████████████████████████| 15.4 MB 799 kB/s eta 0:00:01
Collecting regionmask
  Downloading regionmask-0.12.1-py3-none-any.whl (73 kB)
[K     |████████████████████████████████| 73 kB 26.0 MB/s eta 0:00:01
[?25h  Downloading regionmask-0.11.0-py3-none-any.whl (71 kB)
[K     |████████████████████████████████| 71 kB 38.4 MB/s eta 0:00:01
[?25h  Downloading regionmask-0.10.0-py3-none-any.whl (69 kB)
[K     |████████████████████████████████| 69 kB 32.5 MB/s eta 0:00:01
Installing collected packages: numpy, regionmask
  Attempting uninstall: numpy
    Found existing installation: numpy 2.0.0
    Uninstalling numpy-2.0.0:
      Successfully uninstalled numpy-2.0.0
  Attempting uninstall: regionmask
    Found existing installation: regionmask 0.12.1
    Uninstalling regionmask-0.12.1:
      Successfully uninstalled regionmask-0.12.1
[31mERROR: pip's dependency resolver does no

In [None]:
#Importing libraries
import os
import re

import cftime
import fiona
import gcsfs
import geopandas as gpd
import intake
import numpy as np
import pandas as pd
import regionmask
import xarray as xr
# Registers the KML driver in Fiona with read/write ('rw') mode; the area-of-interest
# file is later read with driver="KML" (presumably KML is not enabled by default - confirm).
fiona.drvsupport.supported_drivers['KML'] = 'rw'

In [None]:
def rename_coords(ds):
    """Standardize coordinate names to 'lat', 'lon', 'lev' and 'lev_bnds'.

    Every coordinate of ``ds`` whose name appears in the alias table below is
    renamed to its standard counterpart; all other coordinates are left alone.

    Parameters
    ----------
    ds : object exposing ``coords``, ``rename`` and ``copy``
        In this notebook it is called with both a Dataset and a DataArray.

    Returns
    -------
    A copy of the renamed object.
    """
    # Alias -> standard name. Identity entries are kept so that already
    # standard names go through the same code path as the original version.
    standard_names = {
        'latitude': 'lat', 'nav_lat': 'lat', 'lat': 'lat',
        'longitude': 'lon', 'nav_lon': 'lon', 'lon': 'lon',
        'olevel': 'lev',
        'olevel_bounds': 'lev_bnds'
    }
    # The names are taken from the object as received, before any renaming.
    matched = [name for name in ds.coords.keys() if name in standard_names]

    renamed = ds
    for old_name in matched:
        renamed = renamed.rename({old_name: standard_names[old_name]})
    return renamed.copy()

# Rewrites the time coordinate so that every model shares the same kind of time stamp
# (some CMIP6 time formats are hard to manipulate otherwise).
def to_360day_monthly(da):
    ''' Replace every time stamp by day 16 of the same year and month,
        expressed as a cftime.Datetime360Day.

        Function created by Claire Carouge on the CLEX CMS Blog.

        Parameters: da - object with a `time` coordinate whose values provide
        `.timetuple()` (presumably cftime datetimes; the notebook passes a
        Dataset - TODO confirm it is also used with DataArrays).

        Returns: a copy of `da` with the rebuilt `time` coordinate assigned. '''
    val = da.copy()
    time1 = da.time.copy()
    for itime in range(val.sizes['time']):
        # bb[0] is the year and bb[1] the month of the original time stamp
        bb = val.time.values[itime].timetuple()
        # The day is fixed to 16; the value is written in place into time1.
        # NOTE(review): this relies on `.values` exposing a writable array - confirm with
        # the xarray version in use.
        time1.values[itime] = cftime.Datetime360Day(bb[0], bb[1], 16)
    val = val.assign_coords({'time': time1})
    return val

# Function that fetches the grid-cell area used later for the volume calculation
def assing_area(dic):
    """Fetch the ocean grid-cell area (``areacello``) matching a model search.

    The module-level ``cmip6`` catalog is queried for the ``Ofx`` table of the
    same source, experiment and member as ``dic``.

    Parameters
    ----------
    dic : dict
        Search parameters; must contain the keys 'source_id',
        'experiment_id' and 'member_id'.

    Returns
    -------
    The ``areacello`` variable of the native-grid ('gn') dataset, with its
    coordinates renamed by ``rename_coords``, or ``None`` when the search
    returns nothing or no native-grid dataset is available.
    """
    consulta = {'source_id': dic['source_id'],
                'table_id': "Ofx",
                'variable_id': 'areacello',
                'experiment_id': dic['experiment_id'],
                'member_id': dic['member_id']}

    cat_area = cmip6.search(require_all_on='source_id', **consulta)
    cat_area = cat_area.to_dataset_dict(aggregate=True,
                            storage_options={'token': 'anon'},
                            zarr_kwargs={'consolidated': True,
                            'decode_times': True,
                            'use_cftime': True})

    # Keep only native-grid ('gn') entries. Without any, there is no usable
    # area, so report it the same way as an empty search result.
    gn_keys = [key for key in cat_area if ".gn" in key]
    if not gn_keys:
        return None

    # When several entries match, the last one wins (as in the original loop).
    ds_area = cat_area[gn_keys[-1]]

    # Drop the bookkeeping coordinates when present; errors="ignore" keeps this
    # working for stores that do not carry 'member_id' or 'dcpp_init_year'.
    ds_area = ds_area.squeeze().drop_vars(["member_id", "dcpp_init_year"],
                                          errors="ignore")
    return rename_coords(ds_area["areacello"])

# Function that converts depth from centimeters to meters
def depth_m(ds):
    """Convert the ``lev`` coordinate from centimeters to meters when needed.

    The conversion happens only when ``ds`` has a ``lev`` entry whose ``units``
    attribute is a centimeter spelling (case and surrounding whitespace are
    ignored). All other attributes of ``lev`` are preserved and ``units`` is
    set to ``"m"``.

    Parameters
    ----------
    ds : mapping-like dataset (modified in place when a conversion happens)

    Returns
    -------
    The same ``ds`` object.
    """
    # NOTE(review): 'lev_bnds' is not rescaled here; if it is used downstream it
    # presumably needs the same conversion - confirm.
    centimeter_units = {"cm", "centimeter", "centimeters", "centimetre", "centimetres"}

    if "lev" not in ds:
        return ds

    lev_attrs = dict(ds["lev"].attrs)
    units = lev_attrs.get("units")
    # A missing or non-text units attribute cannot be interpreted: leave as is.
    if not isinstance(units, str):
        return ds

    if units.strip().lower() in centimeter_units:
        # Arithmetic drops the attributes, so they are restored afterwards.
        ds["lev"] = ds["lev"] / 100
        lev_attrs["units"] = "m"
        ds["lev"].attrs.update(lev_attrs)
    return ds

# Standardizes the name of the bounds dimension
def rename_lev_bnds(ds):
    """Rename the bounds dimension to 'bnds'.

    'axis_nbounds' takes precedence over 'd2'; only the first one found is
    renamed. When neither is present, ``ds`` is returned unchanged.
    """
    for legacy_dim in ('axis_nbounds', 'd2'):
        if legacy_dim in ds.dims:
            return ds.rename_dims({legacy_dim: 'bnds'})
    return ds


In [None]:
# Opens the CMIP6 catalog (metadata store) hosted on Google Cloud.
catalog_url = "https://storage.googleapis.com/cmip6/pangeo-cmip6.json"
cmip6 = intake.open_esm_datastore(catalog_url)

In [None]:
# Loads the Model Table from the sheet at index 1 of the workbook and strips
# surrounding whitespace from the column names.
# Check the path: here the folder sits next to the notebook.
df = pd.read_excel("Tabela_Modelos/Tabela_Modelos.xlsx", sheet_name=1).rename(
    columns=lambda nome: nome.strip()
)

In [None]:
# Builds one catalog query (a dict of search parameters) per row of the Model Table
pesquisas = []
for index, row in df.iterrows():
    # 'variable_id' may list several variables separated by commas. Splitting on
    # the bare comma and stripping each item accepts "a, b", "a,b" and "a , b" alike,
    # and a trailing comma does not produce an empty variable name.
    variaveis = [nome.strip() for nome in row['variable_id'].split(',') if nome.strip()]
    pesquisa = {'source_id': row['source_id'],
                'table_id': row['table_id'],
                'variable_id': variaveis,
                'experiment_id': row['experiment_id'],
                'member_id': row['member_id']}
    pesquisas.append(pesquisa)

In [None]:
# Area of interest, read from the project KML file into a GeoDataFrame.
kml_path = 'Area_Projeto/50S_20S/50S_20S.kml'
gdf = gpd.read_file(kml_path, driver="KML")

In [None]:
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
# First and last year of the period used for the temporal mean
time_inicio = 1981
time_fim = 2010

# Target depth levels for the vertical interpolation. LEVEL_STEP is both the
# spacing of the levels and the layer thickness used for the volume.
LEVEL_STEP = 5
new_levels = np.arange(0, 3000, LEVEL_STEP)

# Coordinates that are kept; every other coordinate is dropped
KEEP_COORDS = ('lat', 'lon', 'time', 'lev', 'lev_bnds')

# Folder receiving one NetCDF file per processed dataset; created if missing so
# that the notebook runs on a fresh checkout.
OUTPUT_DIR = "Medias_nc/50S_20S"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# ---------------------------------------------------------------------------
# Processing: selects the area of interest for all models
# ---------------------------------------------------------------------------
for pesquisa in pesquisas:
    cat_busca = cmip6.search(require_all_on='source_id', **pesquisa)
    cat = cat_busca.to_dataset_dict(aggregate=True,
                            storage_options={'token': 'anon'},
                            zarr_kwargs={'consolidated': True,
                            'decode_times': True,
                            'use_cftime': True})

    # Only native-grid ('gn') datasets are processed
    gn_keys = [key for key in cat if ".gn" in key]
    if not gn_keys:
        continue

    # Model area. It depends only on the search parameters, so it is fetched
    # once per model instead of once per dataset key.
    ds_area = assing_area(pesquisa)
    if ds_area is None:
        continue

    for key in gn_keys:
        ds = cat[key]

        # Correcting times. Uses the previously defined function to correct the time variable
        ds = to_360day_monthly(ds)

        # Renames variables to standardize names across models
        ds = rename_coords(ds)

        # Renames the bounds dimension
        ds = rename_lev_bnds(ds)

        # Converts depth units to meters
        ds = depth_m(ds)

        # Removes the coordinates that are not of interest
        ds_drop = ds.drop_vars([v for v in ds.coords if v not in KEEP_COORDS])

        # Adds the area as a coordinate of the dataset
        ds_with_area = ds_drop.assign_coords(area=ds_area)

        # Selects the time slice
        ds_time = ds_with_area.sel(time=slice(str(time_inicio), str(time_fim)))

        # Squeezes to remove length-one dimensions
        ds_time = ds_time.squeeze()

        # Rasterizes the GeoDataFrame onto the model grid
        mask = regionmask.mask_geopandas(gdf, ds_time['lon'], ds_time['lat'])

        # Keeps only the cells that fall inside a region (mask is not null there)
        ds_masked = ds_time.where(mask.notnull(), drop=True)

        # Temporal mean
        ds_mean = ds_masked.mean(dim='time').compute()

        # Vertical interpolation
        ds_interp = ds_mean.interp(lev=new_levels)

        # Volume of one layer: area times the constant layer thickness.
        # NOTE(review): 'area' is a coordinate, so it is presumably not set to NaN by the
        # mask above nor reduced where the data are missing - confirm this is intended.
        ds_interp["vol"] = LEVEL_STEP * ds_interp["area"]

        ds_interp.to_netcdf("{}/{}.nc".format(OUTPUT_DIR, key))


--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.table_id.grid_label'


TypeError: to_dataset_dict() got an unexpected keyword argument 'xarray_open_kwargs'