In [None]:
#libs for reading data
import xarray as xr
import numpy as np
import matplotlib.pyplot as plt
import intake
import dask
import gcsfs

#libs for dask gateway
from dask_gateway import Gateway
from dask.distributed import Client

### Start a cluster, a group of computers that will work together.

(A cluster is the key to big data analysis on the cloud.)

- This will set up a [dask kubernetes](https://docs.dask.org/en/latest/setup/kubernetes.html) cluster for your analysis and give you a path that you can paste into the top of the Dask dashboard to visualize parts of your cluster.  
- You don't need to paste the link below into the Dask dashboard for this to work, but it will help you visualize progress.
- Try 20 workers to start (during the tutorial), but you can increase this later to speed things up.

In [None]:
# Bounds for adaptive scaling. Raise MAX_WORKERS to speed things up once
# you are past the tutorial.
MIN_WORKERS = 1
MAX_WORKERS = 20

# Connect to the Dask Gateway server and request a new cluster from it.
gateway = Gateway()
cluster = gateway.new_cluster()

# Let the cluster grow and shrink between the two bounds instead of
# holding a fixed number of workers.
cluster.adapt(minimum=MIN_WORKERS, maximum=MAX_WORKERS)

# Attach a client to the cluster so the dask-backed computations in the
# cells below are sent to it.
client = Client(cluster)

# Last expression of the cell: displays the cluster summary, including the
# dashboard link.
cluster

**☝️ Don’t forget to click the link above or copy it to the Dask dashboard ![images.png](attachment:images.png) on the left to view the scheduler dashboard!**

### Initialize Dataset

Here we load the dataset from the zarr store. Note that this very large dataset (273 GB) initializes nearly instantly, and we can see the full list of variables and coordinates.

### Examine Metadata

For those unfamiliar with this dataset, the variable metadata is very helpful for understanding what the variables actually represent.
Printing the dataset will show you the dimensions, coordinates, and data variables with clickable icons at the end that show more metadata and size.

In [None]:
%%time
cat_pangeo = intake.open_catalog("https://raw.githubusercontent.com/pangeo-data/pangeo-datastore/master/intake-catalogs/master.yaml")

ds_ccmp = cat_pangeo.atmosphere.nasa_ccmp_wind_vectors.to_dask()

ds_ccmp['wspd'] = np.sqrt(ds_ccmp.uwnd**2 + ds_ccmp.vwnd**2)

ds_ccmp

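### Time series plot

Mean wind speed over all grid points with latitude 0–50 and longitude 180–210 (a plain mean, not area-weighted), plotted against time.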

In [None]:
%%time

ds_ccmp.sel(latitude=slice(0,50),longitude=slice(180,210)).mean({'latitude','longitude'}).wspd.plot()

### Year average plot

Wind speed averaged over all time steps in the year 2000.

In [None]:
%%time

ds_ccmp.sel(time=slice('2000-01-01','2000-12-31')).mean({'time'}).wspd.plot()

### Hovmöller-type plot

Wind speed along latitude 0.125 between longitudes 120 and 275, shown for every time step.

In [None]:
%%time

ds_ccmp.sel(latitude=0.125,longitude=slice(120,275)).wspd.plot(vmin=3,vmax=15,cmap='magma')

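### Read the data without intake

The zarr store can also be opened directly with `gcsfs` and `xarray`, skipping the intake catalog.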

In [None]:
%%time
import gcsfs
zstore = 'gs://pangeo-nasa-ccmp/zarr'

fs = gcsfs.GCSFileSystem(requester_pays=True)

ds = xr.open_zarr(fs.get_mapper(zstore), consolidated=True)

ds['wspd'] = np.sqrt(ds.uwnd**2 + ds.vwnd**2)

ds