### This notebook downloads the files needed to compute zonal-mean differences (longitude averages) for the study region. It interpolates the data to the World Ocean Atlas (WOA) depth levels in case a comparison with observational data is desired, although this step is optional. The notebook computes the zonal mean while preserving the time dimension.

#### This workflow can be done in two ways:

#### 1. Download only the subset for the study region, as implemented here, and compute the zonal mean directly; or

#### 2. Download the full model data, extract the study region afterwards (preserving time, longitude, and latitude), and compute the zonal mean during the figure-generation stage.

In [None]:
!pip install geopandas regionmask openpyxl numpy==1.20.0

Collecting regionmask
  Downloading regionmask-0.12.1-py3-none-any.whl (73 kB)
[K     |████████████████████████████████| 73 kB 7.4 MB/s eta 0:00:011
[?25hCollecting openpyxl
  Downloading openpyxl-3.1.5-py2.py3-none-any.whl (250 kB)
[K     |████████████████████████████████| 250 kB 11.5 MB/s eta 0:00:01
Collecting et-xmlfile
  Downloading et_xmlfile-2.0.0-py3-none-any.whl (18 kB)
Collecting regionmask
  Downloading regionmask-0.11.0-py3-none-any.whl (71 kB)
[K     |████████████████████████████████| 71 kB 19.0 MB/s eta 0:00:01
[?25h  Downloading regionmask-0.10.0-py3-none-any.whl (69 kB)
[K     |████████████████████████████████| 69 kB 52.5 MB/s eta 0:00:01
Installing collected packages: et-xmlfile, regionmask, openpyxl
Successfully installed et-xmlfile-2.0.0 openpyxl-3.1.5 regionmask-0.10.0


In [None]:
#Importing libraries
import gcsfs
import intake
import xarray as xr
import pandas as pd
import cftime
import geopandas as gpd
import regionmask
import re
import numpy as np
import fiona
import xesmf as xe
import gsw
# Register the KML driver in fiona's supported-driver table in read/write ('rw') mode.
# Presumably this lets geopandas open KML files; the notebook as shown only reads a shapefile.
fiona.drvsupport.supported_drivers['KML'] = 'rw'

In [None]:
#WOA levels for comparison:
new_levels = [0.00e+00, 5.00e+00, 1.00e+01, 1.50e+01, 2.00e+01, 2.50e+01,
       3.00e+01, 3.50e+01, 4.00e+01, 4.50e+01, 5.00e+01, 5.50e+01,
       6.00e+01, 6.50e+01, 7.00e+01, 7.50e+01, 8.00e+01, 8.50e+01,
       9.00e+01, 9.50e+01, 1.00e+02, 1.25e+02, 1.50e+02, 1.75e+02,
       2.00e+02, 2.25e+02, 2.50e+02, 2.75e+02, 3.00e+02, 3.25e+02,
       3.50e+02, 3.75e+02, 4.00e+02, 4.25e+02, 4.50e+02, 4.75e+02,
       5.00e+02, 5.50e+02, 6.00e+02, 6.50e+02, 7.00e+02, 7.50e+02,
       8.00e+02, 8.50e+02, 9.00e+02, 9.50e+02, 1.00e+03, 1.05e+03,
       1.10e+03, 1.15e+03, 1.20e+03, 1.25e+03, 1.30e+03, 1.35e+03,
       1.40e+03, 1.45e+03, 1.50e+03, 1.55e+03, 1.60e+03, 1.65e+03,
       1.70e+03, 1.75e+03, 1.80e+03, 1.85e+03, 1.90e+03, 1.95e+03,
       2.00e+03, 2.10e+03, 2.20e+03, 2.30e+03, 2.40e+03, 2.50e+03,
       2.60e+03, 2.70e+03, 2.80e+03, 2.90e+03, 3.00e+03, 3.10e+03,
       3.20e+03, 3.30e+03, 3.40e+03, 3.50e+03, 3.60e+03, 3.70e+03,
       3.80e+03, 3.90e+03, 4.00e+03, 4.10e+03, 4.20e+03, 4.30e+03,
       4.40e+03, 4.50e+03, 4.60e+03, 4.70e+03, 4.80e+03, 4.90e+03,
       5.00e+03, 5.10e+03, 5.20e+03, 5.30e+03, 5.40e+03, 5.50e+03]

In [None]:
def rename_coords(ds):
    """Return a copy of ``ds`` with latitude/longitude/depth coordinates renamed.

    Known aliases found among ``ds.coords`` are mapped onto the short names
    used by the rest of the notebook:

    * ``latitude`` / ``nav_lat``  -> ``lat``
    * ``longitude`` / ``nav_lon`` -> ``lon``
    * ``olevel``                  -> ``lev``
    * ``olevel_bounds``           -> ``lev_bnds``

    Coordinates already carrying the short name are left untouched. An alias
    is also left untouched when its target name is already taken by another
    coordinate, so the rename never fails on a name collision.

    Parameters
    ----------
    ds : object exposing ``coords``, ``rename(mapping)`` and ``copy()``
        In this notebook, an xarray Dataset.

    Returns
    -------
    A copy of ``ds`` with the applicable coordinates renamed; the input
    object itself is not modified.
    """
    # Alias -> canonical name. Identity entries are unnecessary: a coordinate
    # that already has the canonical name simply needs no rename.
    aliases = {
        'latitude': 'lat', 'nav_lat': 'lat',
        'longitude': 'lon', 'nav_lon': 'lon',
        'olevel': 'lev',
        'olevel_bounds': 'lev_bnds',
    }

    taken = set(ds.coords)
    renames = {}
    for old_name in ds.coords:
        new_name = aliases.get(old_name)
        if new_name is None:
            continue
        # Skip when the canonical name already exists or has just been claimed
        # by another alias; renaming onto it would collide.
        if new_name in taken or new_name in renames.values():
            continue
        renames[old_name] = new_name

    # Apply all renames in a single call instead of one call per coordinate.
    if renames:
        ds = ds.rename(renames)
    return ds.copy()

# Normalises the time axis: some calendars used by CMIP6 models are awkward to
# manipulate, so every time step is re-expressed on a 360-day calendar.
def to_360day_monthly(da):
    """Return a copy of ``da`` whose ``time`` coordinate uses a 360-day calendar.

    Each time step keeps its year and month and is placed on day 16 of that
    month as a ``cftime.Datetime360Day``.

    Adapted from a function by Claire Carouge (CLEX CMS Blog).

    Parameters
    ----------
    da : xarray object with a one-dimensional ``time`` coordinate whose values
        provide ``timetuple()`` (e.g. cftime or datetime objects).

    Returns
    -------
    A copy of ``da`` with the replaced ``time`` coordinate; ``da`` itself is
    left unchanged.
    """
    # Build the new axis as a fresh list rather than writing element-by-element
    # into ``.values`` of a copied coordinate: for index-backed coordinates such
    # writes are not guaranteed to reach (only) the copy.
    monthly_times = []
    for stamp in da.time.values:
        year, month = stamp.timetuple()[:2]
        monthly_times.append(cftime.Datetime360Day(year, month, 16))

    # Re-attach the original coordinate attributes together with the new values.
    return da.copy().assign_coords(time=("time", monthly_times, da.time.attrs))

# Depth-unit helper: brings the vertical coordinate from centimetres to metres.
def depth_m(ds):
    """Convert the ``lev`` entry of ``ds`` from centimetres to metres, in place.

    The conversion only happens when ``ds`` contains ``lev`` and its ``units``
    attribute (compared case-insensitively) is ``"cm"`` or ``"centimeters"``.
    In that case ``lev`` is divided by 100 and its ``units`` attribute is set
    to ``"m"``. In every other case ``ds`` is returned untouched.

    Returns the same ``ds`` object that was passed in.
    """
    if "lev" not in ds:
        return ds

    lev_attrs = ds["lev"].attrs
    if "units" not in lev_attrs:
        return ds

    unit_name = lev_attrs["units"].lower()
    if unit_name in ("cm", "centimeters"):
        ds["lev"] = ds["lev"] / 100
        ds["lev"].attrs["units"] = "m"
    return ds

In [None]:
#Accesses the CMIP6 metadata set hosted on Google Cloud.
# The search loop further down calls `cmip6.search(...)`, so the catalogue has
# to be opened in this cell (requires the intake-esm plugin for intake).
cmip6 = intake.open_esm_datastore("https://storage.googleapis.com/cmip6/pangeo-cmip6.json")

In [None]:
# Load the model table (sheet 'SSP585') and trim stray whitespace from the
# column headers. The path is relative to the notebook: here the table folder
# sits next to it, so adjust the path if your layout differs.
df = (
    pd.read_excel("Tabela_Modelos/Tabela_Modelos.xlsx", sheet_name='SSP585')
    .rename(columns=lambda col_name: col_name.strip())
)

In [None]:
# Read the region shapefile (path relative to the notebook) into a GeoDataFrame;
# `gdf` is used further down to build the region mask applied to each model.
gdf = gpd.read_file('Area_Projeto/Regiao_Juntas/Regiao_Juntas.shp')

In [None]:
# Builds one catalogue query (a dict of search facets) per row of the model table.
pesquisas = []
for _, row in df.iterrows():
    # Split on commas and trim each name, so "thetao, so", "thetao,so" and
    # "thetao ,  so" all yield ['thetao', 'so'] instead of a malformed name.
    variable_ids = [name.strip() for name in row['variable_id'].split(',') if name.strip()]
    pesquisas.append({'source_id': row['source_id'],
                      'table_id': row['table_id'],
                      'variable_id': variable_ids,
                      'experiment_id': row['experiment_id'],
                      'member_id': row['member_id']})

In [None]:
# Keep only the queries whose model (source_id) is in the list of desired models,
# then display the remaining queries as the cell output.
modelos_desejados = ['MIROC6', 'CESM2', 'GFDL-ESM4']
pesquisas = list(
    filter(lambda consulta: consulta['source_id'] in modelos_desejados, pesquisas)
)

pesquisas

[{'source_id': 'CESM2',
  'table_id': 'Omon',
  'variable_id': ['thetao', 'so'],
  'experiment_id': 'ssp585',
  'member_id': 'r10i1p1f1'},
 {'source_id': 'MIROC6',
  'table_id': 'Omon',
  'variable_id': ['thetao', 'so'],
  'experiment_id': 'ssp585',
  'member_id': 'r1i1p1f1'},
 {'source_id': 'GFDL-ESM4',
  'table_id': 'Omon',
  'variable_id': ['thetao', 'so'],
  'experiment_id': 'ssp585',
  'member_id': 'r1i1p1f1'}]

In [None]:
# For every requested model: open the data, bring it onto a regular 1-degree
# grid if needed, derive pressure and absolute salinity, interpolate to the WOA
# depth levels, mask to the study region, average over longitude and save the
# result as NetCDF (one file per model, named after the dataset key).
for pesquisa in pesquisas:
    cat = cmip6.search(require_all_on='source_id', **pesquisa)
    datasets = cat.to_dataset_dict(aggregate=True,
                                   storage_options={'token': 'anon'},
                                   zarr_kwargs={'consolidated': True,
                                                'decode_times': True,
                                                'use_cftime': True})

    # Dataset keys end with the grid label. Prefer the regridded output ("gr")
    # and fall back to the native grid ("gn") when no "gr" dataset exists.
    key = next((k for k in datasets if k.endswith("gr")), None)
    if key is None:
        key = next((k for k in datasets if k.endswith("gn")), None)
    if key is None:
        # Fail with an explicit message instead of an opaque error on `None`.
        raise ValueError(
            "No 'gr' or 'gn' dataset found for {}; available keys: {}".format(
                pesquisa['source_id'], list(datasets)))
    ds = datasets[key]

    # Keeps only temperature and salinity and drops length-1 dimensions.
    # NOTE(review): the time axis is left exactly as decoded;
    # to_360day_monthly is not applied in this loop.
    ds = ds[["thetao", "so"]].squeeze()

    #Fixes depth
    ds = depth_m(ds)

    if key.endswith("gn"):
        # Native grid: regrid bilinearly onto a regular 1-degree lat/lon grid.
        target_grid = xr.Dataset(
            {
                "lat" : (["lat"], np.arange(-90, 90, 1.0)),
                "lon" : (["lon"], np.arange(-180, 180, 1.0)),
            }
        )

        regridder = xe.Regridder(ds, target_grid, "bilinear", ignore_degenerate=True, periodic=True)
        ds_out = regridder(ds, keep_attrs = True)

    else:
        # Already on a regular grid: use the data as is.
        ds_out = ds

    #Renames olevel to level
    ds_out = rename_coords(ds_out)

    #Pressure from depth (gsw expects height, i.e. negative depth)
    ds_out["press"] = gsw.p_from_z(-ds_out.lev, ds_out.lat)

    #Transforms preformed salinity to absolute salinity
    # NOTE(review): this treats `so` as preformed salinity; confirm that this
    # matches each model's salinity definition before relying on `sa`.
    ds_out["sa"] = gsw.SA_from_Sstar(ds_out.so, ds_out.press ,ds_out.lon, ds_out.lat)

    # Vertical interpolation using linear interpolation
    ds_interp = ds_out.interp(lev=new_levels, method='linear')

    #Converts the GeoDataFrame to a region mask object
    mask = regionmask.mask_geopandas(gdf, ds_interp['lon'], ds_interp['lat'])

    #Applies the mask to the dataset: keeps the cells where the mask is not NaN
    ds_masked = ds_interp.where(mask.notnull(), drop=True)

    #Calculates the mean for longitude
    ds_masked = ds_masked.mean(dim='lon')

    #Saves the netcdf
    ds_masked.to_netcdf("{}.nc".format(key))


--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.table_id.grid_label'


    >>> with dask.config.set(**{'array.slicing.split_large_chunks': False}):
    ...     array[indexer]

To avoid creating the large chunks, set the option
    >>> with dask.config.set(**{'array.slicing.split_large_chunks': True}):
    ...     array[indexer]
  value = value[(slice(None),) * axis + (subkey,)]
