In [3]:

import os, sys

import numpy as np
import pandas as pd
import xarray as xr
from glob import glob # useful package for opening files


# Configure how xarray renders objects in the notebook, in a single call:
#   display_expand_attrs=False -> attributes (attrs) start collapsed
#   display_expand_data=True   -> data values start expanded
#                                 (change to False to stop xarray displaying
#                                 all the data)
xr.set_options(display_expand_attrs=False, display_expand_data=True)

<xarray.core.options.set_options at 0x7bf1361259f0>

In [4]:
# Mount your Google Drive at /content/drive so the notebook can access the
# files stored there.
# NOTE(review): google.colab is presumably only importable inside Google Colab,
# so this cell is expected to fail elsewhere - confirm before reusing.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# Folder on the mounted drive that this notebook reads its files from.
# Built piece by piece with os.path.join (same result as writing the path out
# by hand on Colab, where '/' is the path separator).
ROOT_DIR = os.path.join('/content/drive', 'My Drive', 'Colab Notebooks')

In [6]:
# List everything in ROOT_DIR to check the files we expect are there.
# os.listdir returns names in arbitrary order, so sort them to get the same
# listing every time the notebook is run.
sorted(os.listdir(ROOT_DIR))

['ds1900.nc',
 'ds1901.nc',
 'ds1902.nc',
 'ds1904.nc',
 'supervision_notebook_01.ipynb',
 'supervision_notebook_02.ipynb']

In [8]:
# xr.open_mfdataset can open many files at once and combine them into a single
# dataset. The wildcard (*) symbol means anything can appear in that position,
# so 'ds*.nc' matches every file in ROOT_DIR whose name starts with 'ds' and
# ends with '.nc'.
# Note os.path.join just adds a '/' between ROOT_DIR and 'ds*.nc'.
netcdf_pattern = os.path.join(ROOT_DIR, 'ds*.nc')
ds_joined = xr.open_mfdataset(netcdf_pattern)
ds_joined

Unnamed: 0,Array,Chunk
Bytes,398.54 kiB,103.52 kiB
Shape,"(77, 25, 53)","(20, 25, 53)"
Dask graph,4 chunks in 9 graph layers,4 chunks in 9 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 398.54 kiB 103.52 kiB Shape (77, 25, 53) (20, 25, 53) Dask graph 4 chunks in 9 graph layers Data type float32 numpy.ndarray",53  25  77,

Unnamed: 0,Array,Chunk
Bytes,398.54 kiB,103.52 kiB
Shape,"(77, 25, 53)","(20, 25, 53)"
Dask graph,4 chunks in 9 graph layers,4 chunks in 9 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


In [9]:
# If you don't want all the files, you can use 'glob' to get the matching file
# paths as a list and then pick from it.
# glob returns matches in arbitrary order, so sort them: the files are selected
# by their position in this list, which must be the same on every run.
all_netcdf_files = sorted(glob(os.path.join(ROOT_DIR, '*.nc')))
all_netcdf_files

['/content/drive/My Drive/Colab Notebooks/ds1900.nc',
 '/content/drive/My Drive/Colab Notebooks/ds1901.nc',
 '/content/drive/My Drive/Colab Notebooks/ds1902.nc',
 '/content/drive/My Drive/Colab Notebooks/ds1904.nc']

In [10]:
# Get only the first two netcdf files ([:2] keeps the items at index 0 and 1).
# To get the last two instead, you would slice with [-2:].
subset_of_netcdf_files = all_netcdf_files[:2]
subset_of_netcdf_files

['/content/drive/My Drive/Colab Notebooks/ds1900.nc',
 '/content/drive/My Drive/Colab Notebooks/ds1901.nc']

In [13]:
# xr.open_mfdataset also accepts a list of file paths, so only the files in
# subset_of_netcdf_files are opened and combined.
ds_joined = xr.open_mfdataset(paths=subset_of_netcdf_files)
ds_joined

Unnamed: 0,Array,Chunk
Bytes,201.86 kiB,103.52 kiB
Shape,"(39, 25, 53)","(20, 25, 53)"
Dask graph,2 chunks in 5 graph layers,2 chunks in 5 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 201.86 kiB 103.52 kiB Shape (39, 25, 53) (20, 25, 53) Dask graph 2 chunks in 5 graph layers Data type float32 numpy.ndarray",53  25  39,

Unnamed: 0,Array,Chunk
Bytes,201.86 kiB,103.52 kiB
Shape,"(39, 25, 53)","(20, 25, 53)"
Dask graph,2 chunks in 5 graph layers,2 chunks in 5 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


# Clients

This content is a bit trickier, but I have added it anyway, as it will come in handy at some point. <p> When using large data, it is efficient to split it up so that the computations are handled by multiple cores (essentially multiple computers). Xarray is great, as it can automatically do this for you. However, you

In [None]:
# Don't worry about the details here: this just creates a (somewhat) large
# array of random values to demonstrate chunking on.
n_days = 365         # number of daily time steps (one year)
shape = (2000, 100)  # (number of lat points, number of lon points)

# Use a seeded generator so that re-running the notebook gives the same data.
rng = np.random.default_rng(42)
data = rng.random((n_days, *shape))  # uniform values in [0, 1), float64

coords = {
    'time': pd.date_range('2000-01-01', periods=n_days, freq='D'),  # Daily frequency for a year
    'lat': np.linspace(-90, 90, shape[0]),
    'lon': np.linspace(-180, 180, shape[1])
}
# .chunk('auto') lets dask pick the chunk sizes.
da = xr.DataArray(data, coords=coords, dims=['time', 'lat', 'lon']).chunk('auto')

In [None]:
# This array contains the dimensions time, lat and lon.
# This array has been 'chunked'. This means that it is split up into smaller
# sections (chunks) along one or more of its dimensions - each of these
# sections can be sent off and have a calculation performed on it, and the
# results are then pieced back together.
da

Unnamed: 0,Array,Chunk
Bytes,556.95 MiB,127.82 MiB
Shape,"(365, 2000, 100)","(365, 459, 100)"
Dask graph,5 chunks in 1 graph layer,5 chunks in 1 graph layer
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 556.95 MiB 127.82 MiB Shape (365, 2000, 100) (365, 459, 100) Dask graph 5 chunks in 1 graph layer Data type float64 numpy.ndarray",100  2000  365,

Unnamed: 0,Array,Chunk
Bytes,556.95 MiB,127.82 MiB
Shape,"(365, 2000, 100)","(365, 459, 100)"
Dask graph,5 chunks in 1 graph layer,5 chunks in 1 graph layer
Data type,float64 numpy.ndarray,float64 numpy.ndarray


Chunking is a really useful thing to do to help speed up the computation of your
code. However, there are a few things you need to do to make this efficient, or you may end up making your code slower.


1.   Chunks need to be kept to ~100 MB
2.   The dimension you are calculating along needs to be 1 chunk (if possible).



In [None]:
# 1. Chunks need to be kept to ~100 MB
#
# To rechunk, give the chunk size you want along each dimension.
# E.g. below asks for chunks of 320 in lat, 200 in lon and 300 in time.
# (lon only has 100 points, so each chunk simply holds all 100 of them.)
da.chunk(dict(lat=320, lon=200, time=300))

Unnamed: 0,Array,Chunk
Bytes,556.95 MiB,73.24 MiB
Shape,"(365, 2000, 100)","(300, 320, 100)"
Dask graph,14 chunks in 2 graph layers,14 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 556.95 MiB 73.24 MiB Shape (365, 2000, 100) (300, 320, 100) Dask graph 14 chunks in 2 graph layers Data type float64 numpy.ndarray",100  2000  365,

Unnamed: 0,Array,Chunk
Bytes,556.95 MiB,73.24 MiB
Shape,"(365, 2000, 100)","(300, 320, 100)"
Dask graph,14 chunks in 2 graph layers,14 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray


In [None]:
# 2.   The dimension you are calculating along needs to be 1 chunk (if possible).

# What this means is that if you are doing an average in time, then you don't
# want time to be split up across several chunks - each chunk should contain
# all of time.
# E.g. the example below shows you what that looks like.

In [None]:
# This would be a good way of chunking if you are doing an average or a
# resample over time.
# A chunk size of -1 just means that dimension (here time) will be one chunk.
da.chunk(dict(time=-1))

Unnamed: 0,Array,Chunk
Bytes,556.95 MiB,127.82 MiB
Shape,"(365, 2000, 100)","(365, 459, 100)"
Dask graph,5 chunks in 1 graph layer,5 chunks in 1 graph layer
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 556.95 MiB 127.82 MiB Shape (365, 2000, 100) (365, 459, 100) Dask graph 5 chunks in 1 graph layer Data type float64 numpy.ndarray",100  2000  365,

Unnamed: 0,Array,Chunk
Bytes,556.95 MiB,127.82 MiB
Shape,"(365, 2000, 100)","(365, 459, 100)"
Dask graph,5 chunks in 1 graph layer,5 chunks in 1 graph layer
Data type,float64 numpy.ndarray,float64 numpy.ndarray


In [None]:
# This would be a good way of chunking if you are doing an average in space.
# 'lat':-1 and 'lon':-1 mean lat and lon will each be one chunk, while time is
# split into pieces of 70 time steps.
da.chunk(dict(time=70, lat=-1, lon=-1))

Unnamed: 0,Array,Chunk
Bytes,556.95 MiB,106.81 MiB
Shape,"(365, 2000, 100)","(70, 2000, 100)"
Dask graph,6 chunks in 2 graph layers,6 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 556.95 MiB 106.81 MiB Shape (365, 2000, 100) (70, 2000, 100) Dask graph 6 chunks in 2 graph layers Data type float64 numpy.ndarray",100  2000  365,

Unnamed: 0,Array,Chunk
Bytes,556.95 MiB,106.81 MiB
Shape,"(365, 2000, 100)","(70, 2000, 100)"
Dask graph,6 chunks in 2 graph layers,6 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
