In [None]:
# Execute the helper script from the parent directory inside this kernel
# (its contents are not visible from this notebook).
%run ../talktools.py

# xarray

<img src="https://docs.xarray.dev/en/stable/_static/dataset-diagram-logo.png">

Adding dimension names and coordinate indexes to NumPy's ndarray makes many powerful array operations possible:

- Apply operations over dimensions by name: `x.sum('time')`.
- Select values by label instead of integer location: `x.loc['2022-01-01']` or `x.sel(time='2022-01-01')`.
- Mathematical operations (e.g., `x - y`) vectorize across multiple dimensions (array broadcasting) based on dimension names, not shape.
- Flexible split-apply-combine operations with groupby: `x.groupby('time.dayofyear').mean()`.
- Database-like alignment based on coordinate labels that smoothly handles missing values: `x, y = xr.align(x, y, join='outer')`.
- Keep track of arbitrary metadata in the form of a Python dictionary: `x.attrs`.

Works with dask too.

- **DataArray**: labeled, N-dimensional array. It is an N-D generalization of a pandas.Series. 

- **Dataset**: multi-dimensional, in-memory array database. It is a dict-like container of DataArray objects aligned along any number of shared dimensions, and serves a similar purpose in xarray to the pandas.DataFrame.

<img src="https://docs.xarray.dev/en/stable/_images/dataset-diagram.png">

http://xarray.pydata.org/en/stable/why-xarray.html#features

In [None]:
#!conda install xarray netCDF4 -y

In [None]:
import xarray as xr
import numpy as np
import pandas as pd

Let's create a small time series of wind speeds at different California airports.

In [None]:
# Seeded generator so the demo values are identical on every
# "Restart Kernel -> Run All" instead of changing with each run.
rng = np.random.default_rng(42)

# 4 time steps x 3 airports of synthetic wind speeds, uniform in [3, 13).
wind_array = 10 * rng.random((4, 3)) + 3

# Airport codes used as labels along the second axis.
locs = ["OAK", "LAX", "SFO"]

# Four consecutive daily timestamps starting 2022-02-28 (labels for the first axis).
times = pd.date_range("2022-02-28", periods=4)

In [None]:
# Wrap the raw values in a labelled array: the first axis is named "time"
# and indexed by `times`, the second is named "space" and indexed by `locs`.
foo = xr.DataArray(
    data=wind_array,
    dims=("time", "space"),
    coords={"time": times, "space": locs},
)

In [None]:
# Rich display of the DataArray: values, dimensions, coordinates and attributes.
foo

In [None]:
# Dimension names, in axis order.
foo.dims

In [None]:
# `.values` exposes the underlying array data; show what type it is.
type(foo.values)

In [None]:
# Free-form metadata dictionary attached to the array.
foo.attrs

In [None]:
# Give the array a name and record its physical units as metadata.
foo.name = "airport speed"
foo.attrs.update(units="m/s")

In [None]:
# Positional (integer) indexing, like numpy. Indexing with the list [0]
# rather than the scalar 0 keeps the first dimension, with length 1.
foo[[0]]

In [None]:
# Attach an extra (non-index) coordinate along "space": one ranking per airport.
foo.coords["airport_ranking"] = ("space", [1, 3, 2])

In [None]:
# Select by dimension name and coordinate label. Passing the list ['OAK']
# rather than the scalar 'OAK' keeps the "space" dimension, with length 1.
foo.sel(space=['OAK'])

In [None]:
# Reduce over the "time" dimension by name, leaving one mean value per airport.
foo.mean(dim='time')

In [None]:
!rm /tmp/wind.nc
foo.to_netcdf("/tmp/wind.nc")

In [None]:
# Read the file back from disk; `open_dataset` returns an xarray Dataset.
ds1 = xr.open_dataset("/tmp/wind.nc")

In [None]:
# Label-based selection works on a Dataset too; a scalar label (no list)
# picks a single airport and drops the "space" dimension.
ds1.sel(space="LAX")

## OPeNDAP

`xarray` includes support for OPeNDAP (via the netCDF4 library or Pydap), which lets us access large datasets over HTTP. Let's get some climate data from: http://thredds.northwestknowledge.net:8080/thredds/catalog.html

In [None]:
# OPeNDAP endpoint of the remote dataset (server root + dataset file name).
data_path = (
    "http://thredds.northwestknowledge.net:8080/thredds/dodsC/"
    "agg_macav2metdata_tasmax_BNU-ESM_r1i1p1_historical_1950_2005_CONUS_monthly.nc"
)

# Open the remote dataset over HTTP.
max_temp_xr = xr.open_dataset(data_path)

# Display the dataset summary.
max_temp_xr

In [None]:
# Summarize the spatial and temporal extent of the air-temperature variable.
# Pull each coordinate array out once instead of re-indexing the dataset per call.
air_temp = max_temp_xr["air_temperature"]
lat_values = air_temp["lat"].values
lon_values = air_temp["lon"].values
time_values = air_temp["time"].values

# View first 5 latitude values. Only the last expression of a cell is
# displayed automatically, so a value in the middle must be printed explicitly.
print("The first 5 latitude values are:", lat_values[:5])

print("The min and max latitude values in the data is:",
      lat_values.min(),
      lat_values.max())
print("The min and max longitude values in the data is:",
      lon_values.min(),
      lon_values.max())

print("The earliest date in the data is:", time_values.min())
print("The latest date in the data is:", time_values.max())


In [None]:
# Dataset-level metadata attributes.
max_temp_xr.attrs

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
#!pip install nc-time-axis

In [None]:
# Plot the time series at a single grid cell, chosen by integer position.
# The same index is used along both the lat and lon axes (arbitrary demo choice).
key = 500

air_temp = max_temp_xr["air_temperature"]

# Coordinate values of the chosen cell, kept for reference.
longitude = air_temp["lon"].values[key]
latitude = air_temp["lat"].values[key]

# Select by position directly with `isel`, rather than looking the coordinate
# values up by index and then matching them back by label with `sel`.
one_point = air_temp.isel(lat=key, lon=key)

# Explicit figure/axes; the trailing `;` suppresses the plot-object repr.
fig, ax = plt.subplots(figsize=(14, 8))
one_point.plot(ax=ax);
