# Preliminaries
To get to this point...

In [1]:
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import xarray as xr
import zarr
import gcsfs
from dask.diagnostics import ProgressBar
from numpy import unique
import cf_xarray
import operator
import os
import sys
sys.path.append('Code/')
import fcts
import glob

# Setting up model info

This section sets up the configuration options including:
 * Model sources
 * Variable(s) of interest
 * Frequency of measurements
 * output path for saving subset data from CMIP6 models

In [38]:
# models we care about
# # These weren't working for Andrew, lets try again
source_list = [
    'IPSL-CM6A-LR', #
    'CMCC-CM2-SR5',
    'MIROC6', #
    'CanESM5',
    'MRI-ESM2-0',
    # 'HadGEM3-GC31-LL', # 
    'GFDL-CM4',
    'CESM2',
    'CESM2-WACCM',
    # 'CIESM', #
    'CNRM-CM6-1', #
    'CNRM-ESM2-1', #
    'CanESM5-CanOE',
    'EC-Earth3',
    # 'EC-Earth3-Veg', #
    # 'EC-Earth3-Veg-LR', #
    'FGOALS-g3',
    'FGOALS-f3-L',
    'FIO-ESM-2-0',
    'GISS-E2-1-G', #
    'INM-CM4-8',
    'INM-CM5-0'
    'MIROC-ES2L', #
    # 'NESM3', #
    # 'NorESM2-LM', #
    'NorESM2-MM', #
    'UKESM1-0-LL' #
]
# Enter the var of interest {so, thetao, tos}
# thetao = 
# so = 
# tos = sea surface temperature

# Set the variable to download
variable_id = 'thetao'

# True/False for bottom salinity or temperature from thetao or so variables
TOP = False 

# enter the table (based on the frequency of measurements)
table_id = 'Omon'

# Enter the experiments of interest {'historical', 'ssp585', 'ssp126'}
filter_list = ['ssp585']
experiment_run = 'ssp585'
grp1 = 'source_id' # used for grouping normally don't need to change
grp2 = 'member_id' # used for grouping normally don't need to change

# Output path
# path = "/Users/aallyn/Library/CloudStorage/Box-Box/RES_Data/CMIP6/SSP5_85/"
path = "/Users/adamkemberling/Library/CloudStorage/Box-Box/RES_Data/CMIP6/SSP5_85/"


# Set some filtering options for x,y,z extents
mod_lims = {
    "xmin" : -100,
    "xmax" : -40,
    "ymin" : 20,
    "ymax" : 70,
    "max_z" : 400
}


### Query Model List

Using those config options we can query the cmip6 consolidated stores catalog for the suite of models that fit the criteria we need. This catalog contains information on which institution ran the model, which scenario it used, and other information about the model. But most importantly, it provides a `ztore` field which has information on where to access the data.

In [39]:
# Data catalog is stored as a 30MB CSV file
# the columns correspond to the CMIP6 controlled vocab
AllModels = pd.read_csv('https://storage.googleapis.com/cmip6/cmip6-zarr-consolidated-stores.csv')


# To run/query a suite of models
# We can filter using the config options in the above chunk
df_var = AllModels.query(f"variable_id == '{variable_id}' & table_id == '{table_id}' & experiment_id == @filter_list")
filteredModels = fcts.ExperimentFilter(df_var, grp1, grp2)
filteredModels_grid = filteredModels.query(f"source_id == @source_list").reset_index(drop=True)

# look at it's structure
print(filteredModels_grid[0:3])

      source_id  member_id  Number_of_exp  activity_id institution_id  \
0         CESM2  r10i1p1f1              1  ScenarioMIP           NCAR   
1   CESM2-WACCM   r1i1p1f1              1  ScenarioMIP           NCAR   
2  CMCC-CM2-SR5   r1i1p1f1              1  ScenarioMIP           CMCC   

  experiment_id table_id variable_id grid_label  \
0        ssp585     Omon      thetao         gn   
1        ssp585     Omon      thetao         gr   
2        ssp585     Omon      thetao         gn   

                                              zstore  dcpp_init_year   version  
0  gs://cmip6/CMIP6/ScenarioMIP/NCAR/CESM2/ssp585...             NaN  20200528  
1  gs://cmip6/CMIP6/ScenarioMIP/NCAR/CESM2-WACCM/...             NaN  20190815  
2  gs://cmip6/CMIP6/ScenarioMIP/CMCC/CMCC-CM2-SR5...             NaN  20200622  


# Iterate List and Extract + Save Data

From that list of candidate models, we can open slice out the variable(s) we need for the xyz limits we're using. For bottom layer data these datasets are renamed for identification later.

In [40]:

# Salinity?
if variable_id in ["so", "thetao"]:

    # Only has to be defined once
    gcs = gcsfs.GCSFileSystem(token = 'anon')

    # Loop over each row in the query results
    for i in range(len(filteredModels_grid)):

        # Record information on the source, member, experiment, and variable
        source_id     = filteredModels_grid.source_id[i]
        member_id     = filteredModels_grid.member_id[i]
        experiment_id = filteredModels_grid.experiment_id[i]
        variable_id   = filteredModels_grid.variable_id[i]
        
        # Set export path(s)

        # Toggle for surface/bottom - sets filename convention for saving
        if TOP == True:
            savePath = f'{path}RawTmpFiles/Surface_{variable_id}_{source_id}_{member_id}_{experiment_id}.nc'
        else:
            savePath = f'{path}RawTmpFiles/{variable_id}_{source_id}_{member_id}_{experiment_id}.nc'
        
        # Skip if the download exists
        if os.path.exists(savePath):
            print(f"Skipping {savePath} as it already exists.")
            continue
        
        # Proceed if not
        else :
            # Progress update
            print(f"Working on {savePath}")
            
            # get the path to a specific zarr store 0 index is first on list
            zstore = filteredModels_grid.zstore.values[i]

            # create a mutable-mapping-styly interface to the store
            mapper = gcs.get_mapper(zstore)

            # open it using xarray and zarr
            ds = xr.open_zarr(mapper, consolidated=True)
            # ds = ds.compute() # Forces it to load locally

            # Pull coordinates
            lonNames = list(ds.cf[['longitude']].coords)
            latNames = list(ds.cf[['latitude']].coords)

            # ?
            try:
                vertNames = list(ds.cf[['vertical']].coords)
            except KeyError:
                vertNames = list(ds.cf[['Z']].coords)

            # Lists of possible coordinate names:
            lons = ['lon', 'longitude', 'nav_lon']
            lats = ['lat', 'latitude', 'nav_lat']
            verts = ['lev', 'olevel']

            # Grab the coordinate used as the generic "x_coord" etc.
            x_coord = list(set(lonNames).intersection(lons))[0]
            y_coord = list(set(latNames).intersection(lats))[0]
            depth_coord = list(set(vertNames).intersection(verts))[0]

            # Handling multiIndex
            if len(ds[variable_id][x_coord].dims) == 2:
                multiIndex = True
            else:
                multiIndex = False

            try:
                levUnits = ds[depth_coord].units
            
            except AttributeError:
                print('No depth units')
                print(ds[depth_coord])
                print('Enter units')
                levUnits = input()

            # Set max depth - adjust for units
            if levUnits in ['m', 'meters']:
                maxDepth = mod_lims["max_z"]

            elif levUnits in ['cm', 'centimeters']:
                maxDepth = mod_lims["max_z"] * 100

            else:
                ds[depth_coord]
                print('Check attributes')

            # Download atlantic data using xlims/ylims

            # Determine whether its 0-360 or -180 to 180
            x_coordMin = ds[x_coord].values.min()
            x_coordMax = ds[x_coord].values.max()

            if x_coordMin < 0:
                xmin = mod_lims["xmin"]
                xmax = mod_lims["xmax"]
            else:
                xmin = mod_lims["xmin"] + 360
                xmax = mod_lims["xmax"] + 360
            
            # Get subset logic for slicing coordinates
            kwlon = {x_coord: slice(xmin, xmax)}
            kwlat = {y_coord: slice(mod_lims["ymin"], mod_lims["ymax"])}
            
            if multiIndex == True:
                
                # for multi index - subset xy limits

                # # This needs compute() to work, which would explode, was repeated below
                # atlantic = ds.where(
                #     (xmin < ds[x_coord]) & 
                #     (ds[x_coord] < xmax) & 
                #     (mod_lims["ymin"] < ds[y_coord]) & 
                #     (ds[y_coord] < mod_lims["ymax"]),
                #     drop=True)

                # changed to this
                # "x" & "y" are not consistent coords either
                atlantic = ds.sel(
                    x = slice(xmin, xmax), 
                    y = slice(mod_lims["ymin"], mod_lims["ymax"])
                )       
                
                if experiment_id == 'historical':
                    atlantic = atlantic.sel(time=slice('1950-01-01', None))
                elif experiment_id == experiment_run:   # Need to update this experiment ID for each run
                    atlantic = atlantic.isel(time=slice(None, 1032))
                else:
                    print("Need to enter date range")
            else:
                # Single index
                if experiment_id == 'historical':
                    atlantic = ds.sel(**kwlon, **kwlat, time=slice("1950-01-01", None))

                elif experiment_id == experiment_run: # Need to update this experiment ID for each run
                    atlantic = ds.sel(**kwlon, **kwlat, time=slice(None, '2100-12-31'))
                
                else:
                    print("Need to enter date range")

            # Subset top layer if pulling surface
            if TOP == True:
                kwargs = {depth_coord: 0}
                ds = atlantic.isel(**kwargs)
                ds = ds.rename({depth_coord: 'surface'})

            # Pull up to max depth for bottom
            else:

                kwargs = {depth_coord: slice(0, maxDepth)}
                bottom_400 = atlantic.sel(**kwargs)
                temp_array = bottom_400[variable_id]

                if multiIndex == True:
                    dims0 = bottom_400[y_coord].dims[0]
                    dims1 = bottom_400[y_coord].dims[1]
                else:
                    dims0 = y_coord
                    dims1 = x_coord

                # Get the bottom z index in space
                depth_indices = fcts.find_deepest_depth_indices_CMIP6(bottom_400, dims0, dims1, variable_id, y_coord, x_coord)
                ind = xr.DataArray(depth_indices, dims=[dims0, dims1])

                kwdepth = {depth_coord: ind}
                dsSel = temp_array.isel(**kwdepth)
                ds = dsSel.to_dataset()
                ds = ds.rename({depth_coord: 'bottom'})

            # Prepare to save
            delayed_obj = ds.to_netcdf(savePath, compute=False)

            # Use compute to bring everything over locally
            with ProgressBar():
                results = delayed_obj.compute()


            # Handling cases for each variable where experiment_id not historical
            if variable_id == "so" and experiment_id not in "historical":
                folder = glob.glob(f'{path}SurSalinity/StGrid/*')
                for file in folder:
                    df = fcts.checkDates(file)
                    ncTimes = ncTimes.append(df, ignore_index=True)
            
            # Some min/max checking log?
            # if variable_id == "thetao" and experiment_id not in "historical":
            #     folder = glob.glob(f'{path}BottomT/StGrid/*')
            #     minmax = {'name': [name], 'minVal': [minVal], 'maxVal': [maxVal]}
            #     minmaxdf = pd.DataFrame(data=minmax)
            #     for file in folder:
            #         df = fcts.checkMinMax(file, variable_id)
            #         minmaxdf = minmaxdf.append(df, ignore_index=True)

            print(f'Finished {variable_id}_{source_id}_{member_id}_{experiment_id}.nc')



# Sea Surface Temperature
if variable_id == 'tos':

    # Iterate over
    for i in range(len(filteredModels_grid)):

        # Record information on the source, member, experiment, and variable
        source_id     = filteredModels_grid.source_id[i]
        member_id     = filteredModels_grid.member_id[i]
        experiment_id = filteredModels_grid.experiment_id[i]
        variable_id   = filteredModels_grid.variable_id[i]
        savePath      = f'{path}RawTmpFiles/{variable_id}_{source_id}_{member_id}_{experiment_id}.nc'
        
        # If file exists, skip to next one
        if os.path.exists(savePath):
            print(f"Skipping {savePath} as it already exists.")
            continue
        
        # If it isn't downloaded, work on that
        else :

            # get the path to a specific zarr store 0 index is first on list
            zstore = filteredModels_grid.zstore.values[i]

            # create a mutable-mapping-styly interface to the store
            mapper = gcs.get_mapper(zstore)

            # open it using xarray and zarr
            ds = xr.open_zarr(mapper, consolidated=True)
            # ds = ds.compute()

            # Lists of possible coordinate names:
            lons = ['lon', 'longitude', 'nav_lon']
            lats = ['lat', 'latitude', 'nav_lat']

            # Pull coordinates
            lonNames = list(ds.cf[['longitude']].coords)
            latNames = list(ds.cf[['latitude']].coords)

            # Grab the coordinate used as the generic "x_coord" etc.
            x_coord = list(set(lonNames).intersection(lons))[0]
            y_coord = list(set(latNames).intersection(lats))[0]

            # Determine whether its 0-360 or -180 to 180
            x_coordMin = ds[x_coord].values.min()
            x_coordMax = ds[x_coord].values.max()
            
            # Adjust xlim cropping based on longitude coordinate system
            if x_coordMin < 0:
                xmin = mod_lims["xmin"]
                xmax = mod_lims["xmax"]
            else:
                xmin = mod_lims["xmin"] + 360
                xmax = mod_lims["xmax"] + 360

            # Check if its a multiIndex situation
            if len(ds[variable_id][x_coord].dims) == 2:
                multiIndex = True
            else:
                multiIndex = False

            # Get subset logic for slicing coordinates
            kwlon = {x_coord: slice(xmin, xmax)}
            kwlat = {y_coord: slice(mod_lims["ymin"], mod_lims["ymax"])}

            # Handling multiIndex Subsetting differences
            if multiIndex == True:
    
                # for multi index - subset xy limits
                atlantic = ds.sel(
                    x = slice(xmin, xmax), 
                    y = slice(mod_lims["ymin"], mod_lims["ymax"])
                )
                
                # Filter dates
                if experiment_id == 'historical':
                    atlantic = atlantic.sel(time=slice('1950-01-01', None))

                elif experiment_id == experiment_run:  # Need to update this experiment ID for each run
                    atlantic = atlantic.isel(time=slice(None, 1032))
                else:
                    print("Need to enter date range")
            
            # single index
            else:
                if experiment_id == 'historical':
                    atlantic = ds.sel(**kwlon, **kwlat, time=slice("1950-01-01", None))

                elif experiment_id == experiment_run:  # Need to update this experiment ID for each run
                    atlantic = ds.sel(**kwlon, **kwlat, time=slice(None, '2100-12-31'))

                else:
                    print("Need to enter date range")

            # Prepare to save
            delayed_obj = atlantic.to_netcdf(savePath, compute=False)

            # Use compute to finish
            with ProgressBar():
                results = delayed_obj.compute()

# # Log some basic info?
# names = {'name': [], 'minDate': [], 'maxData': [], 'length': []}
# ncTimes = pd.DataFrame(data = names)

Skipping /Users/adamkemberling/Library/CloudStorage/Box-Box/RES_Data/CMIP6/SSP5_85/RawTmpFiles/thetao_CESM2_r10i1p1f1_ssp585.nc as it already exists.
Skipping /Users/adamkemberling/Library/CloudStorage/Box-Box/RES_Data/CMIP6/SSP5_85/RawTmpFiles/thetao_CESM2-WACCM_r1i1p1f1_ssp585.nc as it already exists.
Skipping /Users/adamkemberling/Library/CloudStorage/Box-Box/RES_Data/CMIP6/SSP5_85/RawTmpFiles/thetao_CMCC-CM2-SR5_r1i1p1f1_ssp585.nc as it already exists.
Skipping /Users/adamkemberling/Library/CloudStorage/Box-Box/RES_Data/CMIP6/SSP5_85/RawTmpFiles/thetao_CNRM-CM6-1_r1i1p1f2_ssp585.nc as it already exists.
Skipping /Users/adamkemberling/Library/CloudStorage/Box-Box/RES_Data/CMIP6/SSP5_85/RawTmpFiles/thetao_CNRM-ESM2-1_r1i1p1f2_ssp585.nc as it already exists.
Skipping /Users/adamkemberling/Library/CloudStorage/Box-Box/RES_Data/CMIP6/SSP5_85/RawTmpFiles/thetao_CanESM5_r10i1p1f1_ssp585.nc as it already exists.
Skipping /Users/adamkemberling/Library/CloudStorage/Box-Box/RES_Data/CMIP6/SSP

KeyError: "'x' is not a valid dimension or coordinate for Dataset with dimensions FrozenMappingWarningOnValuesAccess({'i': 360, 'j': 385, 'lev': 70, 'bnds': 2, 'time': 1032, 'vertices': 4})"

---

# Debugging failed downloads:

Due to inconsistencies between the models, the code above will fail on occassion. These can be due to a number of problems including: trying to load too much data at a time with compute(), variable/index name differences, and multiIndex usage.


#### Presently, These are the model runs that are failingand their causes:

thetao_GFDL-CM4_r1i1p1f1_ssp585.nc <- Problem was loading with compute() at beginning

thetao_EC-CESM2_r1i1p1f1_ssp585.nc <- x is not a valid dimension

thetao_EC-Earth3-Veg_r1i1p1f1_ssp585.nc <- x is not a valid dimension

thetao_EC-Earth3-Veg-LR_r1i1p1f1_ssp585.nc <- x is not a valid dimension

thetao_HadGEM3-GC31-LL_r1i1p1f3_ssp585.nc <- x is not a valid dimension

thetao_MIROC6_r10i1p1f1_ssp585.nc <- No depth units

thetao_NESM3_r1i1p1f1_ssp585.nc <- 'x' is not a valid dimension

thetao_NorESM2-LM_r1i1p1f1_ssp585.nc <- if len(ds[variable_id][x_coord].dims) == 2: list index out of range

thetao_NorESM2-MM_r1i1p1f1_ssp585.nc <- 'x' is not a valid dimension

They can be opened for inspection this way:


In [None]:
# Get the row/index

# Known values
variable_id   = "thetao"
source_id     = "NESM3"
member_id     = "r1i1p1f1"
experiment_id = "ssp585"

# Find the index by matching
df = filteredModels_grid
matching_index = df[(df['variable_id'] == variable_id) & (df['source_id'] == source_id) & (df['member_id'] == member_id) & (df['experiment_id'] == experiment_id)].index

# Check it
print("Matching index:", matching_index[0]) 
df[(df['variable_id'] == variable_id) & (df['source_id'] == source_id) & (df['member_id'] == member_id) & (df['experiment_id'] == experiment_id)]


Matching index: 17


Unnamed: 0,source_id,member_id,Number_of_exp,activity_id,institution_id,experiment_id,table_id,variable_id,grid_label,zstore,dcpp_init_year,version
17,NESM3,r1i1p1f1,1,ScenarioMIP,NUIST,ssp585,Omon,thetao,gn,gs://cmip6/CMIP6/ScenarioMIP/NUIST/NESM3/ssp58...,,20190731


Use those indices to open them for inspection to debug

In [31]:
# Only has to be defined once
gcs = gcsfs.GCSFileSystem(token = 'anon')

# get the path to a specific zarr store 0 index is first on list
zstore = filteredModels_grid.zstore.values[matching_index[0]]

# create a mutable-mapping-styly interface to the store
mapper = gcs.get_mapper(zstore)

# open it using xarray and zarr
ds = xr.open_zarr(mapper, consolidated=True)

# Don't load the whole thing
#ds = ds.compute()

# Inspect it
ds

Unnamed: 0,Array,Chunk
Bytes,412.91 kiB,412.91 kiB
Shape,"(292, 362)","(292, 362)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 412.91 kiB 412.91 kiB Shape (292, 362) (292, 362) Dask graph 1 chunks in 2 graph layers Data type float32 numpy.ndarray",362  292,

Unnamed: 0,Array,Chunk
Bytes,412.91 kiB,412.91 kiB
Shape,"(292, 362)","(292, 362)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,736 B,736 B
Shape,"(46, 2)","(46, 2)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 736 B 736 B Shape (46, 2) (46, 2) Dask graph 1 chunks in 2 graph layers Data type float64 numpy.ndarray",2  46,

Unnamed: 0,Array,Chunk
Bytes,736 B,736 B
Shape,"(46, 2)","(46, 2)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,412.91 kiB,412.91 kiB
Shape,"(292, 362)","(292, 362)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 412.91 kiB 412.91 kiB Shape (292, 362) (292, 362) Dask graph 1 chunks in 2 graph layers Data type float32 numpy.ndarray",362  292,

Unnamed: 0,Array,Chunk
Bytes,412.91 kiB,412.91 kiB
Shape,"(292, 362)","(292, 362)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,16.12 kiB,16.12 kiB
Shape,"(1032, 2)","(1032, 2)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,object numpy.ndarray,object numpy.ndarray
"Array Chunk Bytes 16.12 kiB 16.12 kiB Shape (1032, 2) (1032, 2) Dask graph 1 chunks in 2 graph layers Data type object numpy.ndarray",2  1032,

Unnamed: 0,Array,Chunk
Bytes,16.12 kiB,16.12 kiB
Shape,"(1032, 2)","(1032, 2)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,object numpy.ndarray,object numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,18.69 GiB,222.58 MiB
Shape,"(1032, 46, 292, 362)","(12, 46, 292, 362)"
Dask graph,86 chunks in 2 graph layers,86 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 18.69 GiB 222.58 MiB Shape (1032, 46, 292, 362) (12, 46, 292, 362) Dask graph 86 chunks in 2 graph layers Data type float32 numpy.ndarray",1032  1  362  292  46,

Unnamed: 0,Array,Chunk
Bytes,18.69 GiB,222.58 MiB
Shape,"(1032, 46, 292, 362)","(12, 46, 292, 362)"
Dask graph,86 chunks in 2 graph layers,86 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.61 MiB,1.61 MiB
Shape,"(292, 362, 4)","(292, 362, 4)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 1.61 MiB 1.61 MiB Shape (292, 362, 4) (292, 362, 4) Dask graph 1 chunks in 2 graph layers Data type float32 numpy.ndarray",4  362  292,

Unnamed: 0,Array,Chunk
Bytes,1.61 MiB,1.61 MiB
Shape,"(292, 362, 4)","(292, 362, 4)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.61 MiB,1.61 MiB
Shape,"(292, 362, 4)","(292, 362, 4)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 1.61 MiB 1.61 MiB Shape (292, 362, 4) (292, 362, 4) Dask graph 1 chunks in 2 graph layers Data type float32 numpy.ndarray",4  362  292,

Unnamed: 0,Array,Chunk
Bytes,1.61 MiB,1.61 MiB
Shape,"(292, 362, 4)","(292, 362, 4)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


Now that we can look at it, here are the things we're pulling out:

 1. lat/lon/depth coordinate
 2. variables by name

In [14]:

# Get the depth index names
# ?
try:
    vertNames = list(ds.cf[['vertical']].coords)
except KeyError:
    vertNames = list(ds.cf[['Z']].coords)

In [None]:
# Pull coordinates
lonNames = list(ds.cf[['longitude']].coords)
latNames = list(ds.cf[['latitude']].coords)

# Lists of possible coordinate names:
lons = ['lon', 'longitude', 'nav_lon']
lats = ['lat', 'latitude', 'nav_lat']
verts = ['lev', 'olevel']

# Grab the coordinate used as the generic "x_coord" etc.
x_coord = list(set(lonNames).intersection(lons))[0]
y_coord = list(set(latNames).intersection(lats))[0]
depth_coord = list(set(vertNames).intersection(verts))[0]

# Handling multiIndex
if len(ds[variable_id][x_coord].dims) == 2:
    multiIndex = True
else:
    multiIndex = False

try:
    levUnits = ds[depth_coord].units

except AttributeError:
    print('No depth units')
    print(ds[depth_coord])
    print('Enter units')
    levUnits = input()


In [None]:

# Set max depth - adjust for units
if levUnits in ['m', 'meters']:
    maxDepth = mod_lims["max_z"]

elif levUnits in ['cm', 'centimeters']:
    maxDepth = mod_lims["max_z"] * 100

else:
    ds[depth_coord]
    print('Check attributes')

# Download atlantic data using xlims/ylims

# Determine whether its 0-360 or -180 to 180
x_coordMin = ds[x_coord].values.min()
x_coordMax = ds[x_coord].values.max()

if x_coordMin < 0:
    xmin = mod_lims["xmin"]
    xmax = mod_lims["xmax"]
else:
    xmin = mod_lims["xmin"] + 360
    xmax = mod_lims["xmax"] + 360

# Get subset logic for slicing coordinates
kwlon = {x_coord: slice(xmin, xmax)}
kwlat = {y_coord: slice(mod_lims["ymin"], mod_lims["ymax"])}



{'lon': slice(-100, -40, None)}

## Check the subsetting

In [24]:
ds.sel(x = slice(xmin, xmax), y = slice(mod_lims["ymin"], mod_lims["ymax"]))

Unnamed: 0,Array,Chunk
Bytes,286.99 kiB,286.99 kiB
Shape,"(310, 237)","(310, 237)"
Dask graph,1 chunks in 3 graph layers,1 chunks in 3 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 286.99 kiB 286.99 kiB Shape (310, 237) (310, 237) Dask graph 1 chunks in 3 graph layers Data type float32 numpy.ndarray",237  310,

Unnamed: 0,Array,Chunk
Bytes,286.99 kiB,286.99 kiB
Shape,"(310, 237)","(310, 237)"
Dask graph,1 chunks in 3 graph layers,1 chunks in 3 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.12 MiB,1.12 MiB
Shape,"(310, 237, 4)","(310, 237, 4)"
Dask graph,1 chunks in 3 graph layers,1 chunks in 3 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 1.12 MiB 1.12 MiB Shape (310, 237, 4) (310, 237, 4) Dask graph 1 chunks in 3 graph layers Data type float32 numpy.ndarray",4  237  310,

Unnamed: 0,Array,Chunk
Bytes,1.12 MiB,1.12 MiB
Shape,"(310, 237, 4)","(310, 237, 4)"
Dask graph,1 chunks in 3 graph layers,1 chunks in 3 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,560 B,560 B
Shape,"(35, 2)","(35, 2)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 560 B 560 B Shape (35, 2) (35, 2) Dask graph 1 chunks in 2 graph layers Data type float64 numpy.ndarray",2  35,

Unnamed: 0,Array,Chunk
Bytes,560 B,560 B
Shape,"(35, 2)","(35, 2)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,286.99 kiB,286.99 kiB
Shape,"(310, 237)","(310, 237)"
Dask graph,1 chunks in 3 graph layers,1 chunks in 3 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 286.99 kiB 286.99 kiB Shape (310, 237) (310, 237) Dask graph 1 chunks in 3 graph layers Data type float32 numpy.ndarray",237  310,

Unnamed: 0,Array,Chunk
Bytes,286.99 kiB,286.99 kiB
Shape,"(310, 237)","(310, 237)"
Dask graph,1 chunks in 3 graph layers,1 chunks in 3 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.12 MiB,1.12 MiB
Shape,"(310, 237, 4)","(310, 237, 4)"
Dask graph,1 chunks in 3 graph layers,1 chunks in 3 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 1.12 MiB 1.12 MiB Shape (310, 237, 4) (310, 237, 4) Dask graph 1 chunks in 3 graph layers Data type float32 numpy.ndarray",4  237  310,

Unnamed: 0,Array,Chunk
Bytes,1.12 MiB,1.12 MiB
Shape,"(310, 237, 4)","(310, 237, 4)"
Dask graph,1 chunks in 3 graph layers,1 chunks in 3 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,16.12 kiB,16.12 kiB
Shape,"(1032, 2)","(1032, 2)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,object numpy.ndarray,object numpy.ndarray
"Array Chunk Bytes 16.12 kiB 16.12 kiB Shape (1032, 2) (1032, 2) Dask graph 1 chunks in 2 graph layers Data type object numpy.ndarray",2  1032,

Unnamed: 0,Array,Chunk
Bytes,16.12 kiB,16.12 kiB
Shape,"(1032, 2)","(1032, 2)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,object numpy.ndarray,object numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,9.89 GiB,9.81 MiB
Shape,"(1032, 35, 310, 237)","(1, 35, 310, 237)"
Dask graph,1032 chunks in 3 graph layers,1032 chunks in 3 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 9.89 GiB 9.81 MiB Shape (1032, 35, 310, 237) (1, 35, 310, 237) Dask graph 1032 chunks in 3 graph layers Data type float32 numpy.ndarray",1032  1  237  310  35,

Unnamed: 0,Array,Chunk
Bytes,9.89 GiB,9.81 MiB
Shape,"(1032, 35, 310, 237)","(1, 35, 310, 237)"
Dask graph,1032 chunks in 3 graph layers,1032 chunks in 3 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


In [None]:
# Subset
if multiIndex == True:
    
    # for multi index - subset xy limits
    atalantic = ds.sel(
        x = slice(xmin, xmax), 
        y = slice(mod_lims["ymin"], mod_lims["ymax"])
    )
    
    # If historical, start at 1950
    if experiment_id == 'historical':
        atlantic = atlantic.sel(time=slice('1950-01-01', None))
    
    # If its an experimental run, start whenever possible
    elif experiment_id == experiment_run:   # Need to update this experiment ID for each run
        atlantic = atlantic.isel(time=slice(None, 1032))
    else:
        print("Need to enter date range")

# For single index
else:
    # Single index
    if experiment_id == 'historical':
        atlantic = ds.sel(**kwlon, **kwlat, time=slice("1950-01-01", None))

    elif experiment_id == experiment_run: # Need to update this experiment ID for each run
        atlantic = ds.sel(**kwlon, **kwlat, time=slice(None, '2100-12-31'))
    
    else:
        print("Need to enter date range")

# Subset top layer if pulling surface
if TOP == True:
    kwargs = {depth_coord: 0}
    ds = atlantic.isel(**kwargs)
    ds = ds.rename({depth_coord: 'surface'})

# Pull up to max depth for bottom
else:

    kwargs = {depth_coord: slice(0, maxDepth)}
    bottom_400 = atlantic.sel(**kwargs)
    temp_array = bottom_400[variable_id]

    if multiIndex == True:
        dims0 = bottom_400[y_coord].dims[0]
        dims1 = bottom_400[y_coord].dims[1]
    else:
        dims0 = y_coord
        dims1 = x_coord

    # Get the bottom z index in space
    depth_indices = fcts.find_deepest_depth_indices_CMIP6(bottom_400, dims0, dims1, variable_id, y_coord, x_coord)
    ind = xr.DataArray(depth_indices, dims=[dims0, dims1])

    kwdepth = {depth_coord: ind}
    dsSel = temp_array.isel(**kwdepth)
    ds = dsSel.to_dataset()
    ds = ds.rename({depth_coord: 'bottom'})