# xarray

In [None]:
from __future__ import absolute_import, division, print_function, unicode_literals

import pandas as pd
import numpy as np
import xarray as xr

def show_component(cmp):
    s = '\n{}'.format(cmp)
    return s.replace('\n', '\n\t')

def show_df(df):
    from IPython.display import display, HTML
    html = '<div style="margin-left:55px">{}</div>'.format(df.to_html())
    display(HTML(html))

## DataArray

### 创建 DataArray 对象

- 1D 数据 => DataArray

In [None]:
data = np.arange(1, 5)
print('* when "data" is: {}'.format(show_component(data)))

da = xr.DataArray(data=data, name='data')
print('  then DataArray "da" is: {}'.format(show_component(da)))
print('  and value of "da" is: {}'.format(show_component(da.values)))
print('  and dims of "da" are: {}'.format(da.dims))
print('  and dataframe of "da" is')
show_df(da.to_dataframe())

- 2D 数据 => DataArray

In [None]:
data = np.random.rand(4, 3)
print('* when "data" is: {}'.format(show_component(data)))

da = xr.DataArray(data=data, name='data')
print('  then DataArray "da" is: {}'.format(show_component(da)))
print('  and value of "da" is: {}'.format(show_component(da.values)))
print('  and dims of "da" are: {}'.format(da.dims))
print('  and dataframe of "da" is')
show_df(da.to_dataframe())

- 3D 数据 => DataArray

In [None]:
data = np.random.rand(4, 3, 2)
print('* when "data" is: {}'.format(show_component(data)))

da = xr.DataArray(data=data, name='data')
print('  then DataArray "da" is: {}'.format(show_component(da)))
print('  and value of "da" is: {}'.format(show_component(da.values)))
print('  and dataframe of "da" is')
show_df(da.to_dataframe())

- 为维度（dimensions）命名
    - 维度的值对应“索引”，即数据的“坐标”
    - 默认情况，维度命名为`dim_0`, `dim_1`, ...，可以为维度重新命名

In [None]:
data = np.random.rand(4, 3)
print('* when "data" is: {}'.format(show_component(data)))

da = xr.DataArray(data=data, dims=['time', 'space'], name='data')
print('  then DataArray "da" is: {}'.format(show_component(da)))
print('  and value of "da" is: {}'.format(show_component(da.values)))
print('  and dataframe of "da" is')
show_df(da.to_dataframe())

- 为索引（坐标）设置值
    - 默认情况下，每个维度的索引均是从`0`开始，可以为其指定特殊值
    - 注意，每个维度设置的索引值必须和该维度数据量相匹配

In [None]:
data = np.reshape(np.arange(1, 7), (2, 3))
print('* when "data" is: {}'.format(show_component(data)))

da = xr.DataArray(data=data, 
                  dims=['a', 'b'],
                  coords=[[1, 2], [4, 5, 6]],
                  name='numbers')
print('  then DataArray "da" is: {}'.format(show_component(da)))
print('  and value of "da" is: {}'.format(show_component(da.values)))
print('  and dataframe of "da" is')
show_df(da.to_dataframe())

- 为命名维度指定索引值（specific index）

In [None]:
data = np.random.rand(4, 3)
print('* when "data" is: {}'.format(show_component(data)))

locs = ['IA', 'IL', 'IN']
print('  and "locs" is: {}'.format(show_component(locs)))

times = pd.date_range('2000-01-01', periods=4)
print('  and "times" is: {}'.format(show_component(times.values)))

da = xr.DataArray(data,
                  dims=['time', 'space'],
                  coords={'time': times, 'space': locs},
                  name='data')

print('\n  then DataArray "da" with index("time"=times, "space"=locs) is: {}'.format(show_component(da)))
print('  and value of "da" is: {}'.format(show_component(da.values)))
print('  and dataframe of "da" is')
show_df(da.to_dataframe())

- 增加属性 (attrs)

In [None]:
data = np.random.rand(4, 3)
locs = ['IA', 'IL', 'IN']
times = pd.date_range('2000-01-01', periods=4)

da = xr.DataArray(data,
                  dims=['time', 'space'],
                  coords={'time': times, 'space': locs},
                  attrs={'for': 'testing', 'ver': 1.0},
                  name='data')
print('* when DataArray "da" is: {}'.format(show_component(da)))

if da.attrs['ver'] == 1.0:
    da.attrs['ver'] = 2.0
print('\n  then DataArray "da" is: {}'.format(show_component(da)))

### 索引操作

- 获取指定维度的索引

In [None]:
data = np.random.rand(4, 3)
locs = ['IA', 'IL', 'IN']
times = pd.date_range('2000-01-01', periods=4)

da = xr.DataArray(data,
                  dims=['time', 'space'],
                  coords={'time': times, 'space': locs},
                  name='data')
print('* when DataArray "da" is')
show_df(da.to_dataframe())

print('\n  then values of "da[\'time\']" is: {}'.format(show_component(da['time'].values)))
print('  and values of "da[\'space\']" is: {}'.format(show_component(da['space'].values)))

- 删除指定索引

In [None]:
data = np.random.rand(4, 3)
locs = ['IA', 'IL', 'IN']
times = pd.date_range('2000-01-01', periods=4)

da = xr.DataArray(data,
                  dims=['time', 'space'],
                  coords={'time': times, 'space': locs},
                  name='data')
print('* when DataArray "da" is')
show_df(da.to_dataframe())

dim1_data = da.sel(time=[pd.to_datetime('2000-01-01'), pd.to_datetime('2000-01-03')])
dim2_data = dim1_data.sel(space=['IL', 'IA'])

print('\n  then values of index time=["2000-01-01", "2000-01-03"] is: {}'.format(show_component(dim1_data.values)))
print('  and values of index time="2000-01-03" and space=["IL", "IA"] is: {}'.format(show_component(dim2_data.values)))

dim1_data = da.isel(time=[0, 1])
dim2_data = dim1_data.isel(space=[1, 2])
print('\n  then values of index time=[0, 1] is: {}'.format(show_component(dim1_data.values)))
print('  and values of index time=[0, 1] and space=[1, 2] is: {}'.format(show_component(dim2_data.values)))

droped_da = da.drop_sel(labels={'time': [pd.to_datetime('2000-01-01'), pd.to_datetime('2000-01-03')]})
print('\n  then drop index time=["2000-01-01", "2000-01-03"] is: {}')
show_df(da.to_dataframe())

droped_da = da.drop_sel(space=['IL', 'IN'])
show_df(droped_da.to_dataframe())

- 获取数据集概览
    - `da.head()`获取前`n`条数据
    - `da.tail()`获取后`n`条数据
    - `da.thin()`获取其中的`n`条数据

In [None]:
data = np.random.rand(4, 3)
locs = ['IA', 'IL', 'IN']
times = pd.date_range('2000-01-01', periods=4)

da = xr.DataArray(data,
                  dims=['time', 'space'],
                  coords={'time': times, 'space': locs},
                  name='data')
print('* when DataArray "da" is')
show_df(da.to_dataframe())

head = da.head(indexers=2)  # left 2 pieces data by all index
print('\n  then "da.head(indexers=2)" is')
show_df(head.to_dataframe())

tail = da.tail(indexers={'space': 2})  # left 2 pieces data by index 'space'
print('\n  then "da.tail(indexers={\'space\': 2})" is')
show_df(tail.to_dataframe())

thin = da.thin(time=2)  # left 2 pieces data by index 'time'
print('\n  then "da.thin(time=2)" is')
show_df(thin.to_dataframe())

### 计算

In [343]:
data = np.random.rand(4, 3)
locs = ['IA', 'IL', 'IN']
times = pd.date_range('2000-01-01', periods=4)

da = xr.DataArray(data,
                  dims=['time', 'space'],
                  coords={'time': times, 'space': locs},
                  name='data')
print('* when DataArray "da" is')
show_df(da.to_dataframe())

groupby = da.groupby(...)
print(groupby.count().values)

* when DataArray "da" is


Unnamed: 0_level_0,Unnamed: 1_level_0,data
time,space,Unnamed: 2_level_1
2000-01-01,IA,0.761668
2000-01-01,IL,0.635032
2000-01-01,IN,0.154558
2000-01-02,IA,0.246071
2000-01-02,IL,0.061431
2000-01-02,IN,0.177915
2000-01-03,IA,0.149839
2000-01-03,IL,0.334726
2000-01-03,IN,0.227742
2000-01-04,IA,0.692295


ValueError: cannot reduce over dimensions ['space']. expected either '...' to reduce over all dimensions or one or more of ('stacked_time_space',).

In [331]:
da = xr.DataArray(np.linspace(0, 1826, num=1827),
                  coords=[pd.date_range('1/1/2000', '31/12/2004', freq='D')],
                  dims='time',
                  name='data')

groupby = da.groupby('time.dayofyear') - da.groupby('time.dayofyear').mean('time')
print(groupby.count().values)


# <xarray.DataArray(time: 1827) >
# array([-730.8, -730.8, -730.8, ..., 730.2, 730.2, 730.5])
# Coordinates:
#     * time(time) datetime64[ns] 2000-01-01 2000-01-02 2000-01-03 ...
#     dayofyear(time) int64 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 ...

1827


In [347]:
np.random.seed(123)

times = pd.date_range("2000-01-01", "2001-12-31", name="time")
annual_cycle = np.sin(2 * np.pi * (times.dayofyear.values / 365.25 - 0.28))

base = 10 + 15 * annual_cycle.reshape(-1, 1)
tmin_values = base + 3 * np.random.randn(annual_cycle.size, 3)
tmax_values = base + 10 + 3 * np.random.randn(annual_cycle.size, 3)

ds = xr.Dataset(
    {
        "tmin": (("time", "location"), tmin_values),
        "tmax": (("time", "location"), tmax_values),
    },
    {"time": times, "location": ["IA", "IN", "IL"]},
)
show_df(ds.to_dataframe())

Unnamed: 0_level_0,Unnamed: 1_level_0,tmin,tmax
location,time,Unnamed: 2_level_1,Unnamed: 3_level_1
IA,2000-01-01,-8.037369,12.980549
IA,2000-01-02,-9.341157,0.447856
IA,2000-01-03,-12.139719,5.322699
IA,2000-01-04,-7.492914,1.889425
IA,2000-01-05,-0.447129,0.791176
IA,2000-01-06,-6.24854,5.965144
IA,2000-01-07,-1.953092,0.472022
IA,2000-01-08,-0.508395,4.359156
IA,2000-01-09,-8.753138,3.291817
IA,2000-01-10,-9.284007,7.679195


In [None]:
temp = 15 + 8 * np.random.randn(2, 2, 3)
print('* when "temp" is: {}'.format(show_component(temp)))

precip = 10 * np.random.rand(2, 2, 3)
print('  and "precip" is: {}'.format(show_component(precip)))

lon = [[-99.83, -99.32], [-99.79, -99.23]]
print('  and "lon" is: {}'.format(show_component(lon)))

lat = [[42.25, 42.21], [42.63, 42.59]]
print('  and "lat" is: {}'.format(show_component(lat)))

print('\n  then "dataset" is:')
ds = xr.Dataset(
    {
        'temperature': (['x', 'y', 'time'], temp),
        'precipitation': (['x', 'y', 'time'], precip)
    },
    coords={
        'lon': (['x', 'y'], lon),
        'lat': (['x', 'y'], lat),
        'time': pd.date_range('2014-09-06', periods=3),
        'reference_time': pd.Timestamp('2014-09-05')
    }
)
show_df(ds.to_dataframe())