# Extract climate timeseries over the ecoregions

The results are saved to disk because this process is time-consuming.


***

**To Do:**
* Consider sourcing climate data from somewhere other than ANUClim, perhaps ERA5

In [None]:
%matplotlib inline
import os
import sys
import math
import pickle
import warnings
import dask
import xarray as xr
import rioxarray as rxr
import geopandas as gpd
import numpy as np

from odc.geo.xr import assign_crs
from odc.geo.geom import Geometry


In [None]:
# Put the AusEFlux source directory on the import path so that its `_utils`
# module can be imported below. The append must run before the import.
sys.path.append('/g/data/os22/chad_tmp/AusEFlux/src/')
from _utils import start_local_dask
# NOTE(review): presumably starts a local Dask cluster/client used by the
# delayed computations later in this notebook -- confirm against `_utils`.
start_local_dask()

## Analysis Parameters


In [None]:
# Output pickle that will hold the dictionary of per-polygon climate timeseries.
# NOTE(review): the two commented-out alternatives below are named "*_NDVI_*";
# confirm they are the intended targets before enabling them for climate output.
# save_file = '/g/data/os22/chad_tmp/Aus_phenology/data/ecoregions_NDVI_timeseries.pkl'
# save_file = '/g/data/os22/chad_tmp/Aus_phenology/data/IBRA_regions_NDVI_timeseries.pkl'
save_file = '/g/data/os22/chad_tmp/Aus_phenology/data/IBRA_subregions_climate_timeseries.pkl'

# Vector file containing the polygons to summarise over. Keep the active choice
# in step with `save_file` above and `var` below.
# ecoregions_file = '/g/data/os22/chad_tmp/Aus_phenology/data/vectors/Ecoregions2017_modified.geojson'
# ecoregions_file = '/g/data/os22/chad_tmp/Aus_phenology/data/vectors/IBRAv7_regions_modified.geojson'
ecoregions_file = '/g/data/os22/chad_tmp/Aus_phenology/data/vectors/IBRAv7_subregions_modified.geojson'

# Column in the vector file whose values become the keys of the results dictionary
# (one commented-out option per vector file listed above).
# var='ECO_NAME'
# var='REG_NAME_7'
var='SUB_NAME_7'

## Load climate data

In [None]:
# Directory holding the monthly climate NetCDF files that are merged below.
base_clim = '/g/data/os22/chad_tmp/AusENDVI/data/5km/'

# Open each climate variable. The rainfall file is clipped to 1982-2022 on load
# (its filename indicates it starts in 1981). 'spatial_ref' is dropped from
# every dataset except CO2 before merging.
co2 = xr.open_dataset(base_clim+'CO2_5km_monthly_1982_2022.nc')
rain = xr.open_dataset(base_clim+'rain_5km_monthly_1981_2022.nc').sel(time=slice('1982','2022')).drop_vars('spatial_ref')
srad = xr.open_dataset(base_clim+'srad_5km_monthly_1982_2022.nc').drop_vars('spatial_ref')
tavg = xr.open_dataset(base_clim+'tavg_5km_monthly_1982_2022.nc').drop_vars('spatial_ref')
vpd = xr.open_dataset(base_clim+'vpd_5km_monthly_1982_2022.nc').drop_vars('spatial_ref')

# Combine into a single dataset, attach the CRS, fix the dimension order and
# restrict the merged result to the 1982-2022 analysis period.
climate = xr.merge([co2, rain, srad, tavg, vpd])
climate = assign_crs(climate, crs='EPSG:4326')
climate = climate.transpose('time', 'latitude','longitude')
climate = climate.sel(time=slice('1982', '2022'))

# Remove the 'grid_mapping' attribute from every variable that carries one.
# `pop` with a default is a no-op when the attribute is absent, so no bare
# `except:` is needed and unrelated errors are no longer silently swallowed.
# NOTE(review): presumably this avoids a conflict when the dataset is written
# to NetCDF in the next cell -- confirm.
for v in climate.data_vars:
    climate[v].attrs.pop('grid_mapping', None)

In [None]:
# Write the merged climate dataset to disk. The delayed tasks built further
# down re-open this same file path rather than receiving the in-memory
# `climate` object.
climate.to_netcdf('/g/data/os22/chad_tmp/Aus_phenology/data/climate.nc')

### Summarise climate data over polygons

This step is slow, so Dask is used to process the polygons in parallel.

In [None]:
# Load the polygons into memory; this copy is only used to iterate over the
# rows (index and `var` name) when building the delayed tasks below.
gdf = gpd.read_file(ecoregions_file)

In [None]:
# Wrapped with dask.delayed: calling this builds a lazy task instead of running it
@dask.delayed
def clim_zonal_timeseries(index, ds, gdf, var):
    """Average the climate dataset over one polygon, giving a 1-D timeseries.

    Parameters
    ----------
    index : int
        Row position of the polygon in ``gdf`` (looked up with ``.iloc``).
    ds : xarray.Dataset
        Climate data with ``time``, ``latitude`` and ``longitude`` dimensions
        and a ``rain`` variable. It is tagged as EPSG:4326 before masking.
    gdf : geopandas.GeoDataFrame
        Polygons; the geometry at ``index`` and ``gdf.crs`` are used.
    var : str
        Not used inside the function; kept for the existing call signature.

    Returns
    -------
    xarray.Dataset or float
        The mean over ``latitude`` and ``longitude`` of the masked data, or
        ``np.nan`` when ``rain`` is NaN at every time step.
    """
    georeferenced = assign_crs(ds, crs='EPSG:4326')
    polygon = Geometry(geom=gdf.iloc[index].geometry, crs=gdf.crs)

    # Mask with the polygon, then trim longitude columns and latitude rows
    # that contain no data at all.
    masked = georeferenced.odc.mask(poly=polygon)
    for dim in ('longitude', 'latitude'):
        masked = masked.dropna(dim=dim, how='all')

    # Collapse the spatial dimensions into a single value per time step
    zonal_mean = masked.mean(['latitude', 'longitude'])

    # If rain is missing at every time step, return a NaN placeholder instead
    n_missing = np.isnan(zonal_mean['rain']).sum()
    if n_missing == len(zonal_mean.time):
        return np.nan

    return zonal_mean

# Open the inputs lazily so the file reads become part of the Dask task graph
path='/g/data/os22/chad_tmp/Aus_phenology/data/climate.nc'
dss = dask.delayed(xr.open_dataset)(path)
gdff = dask.delayed(gpd.read_file)(ecoregions_file)

# Build one delayed task per polygon, keyed by the polygon's `var` value.
# Nothing is computed here; the tasks are only queued.
results_clim = {
    region_name: clim_zonal_timeseries(row_label, dss, gdff, var)
    for row_label, region_name in zip(gdf.index, gdf[var])
}

In [None]:
%%time
# Run every delayed task. dask.compute returns a tuple with one entry per
# argument, so element 0 is the computed dictionary.
results_clim = dask.compute(results_clim)[0] #bring into memory

# Drop polygons for which clim_zonal_timeseries returned its NaN placeholder.
# isinstance also catches float subclasses such as np.float64, which an exact
# `type(...) is float` comparison would let through.
results_clim = {
    name: series
    for name, series in results_clim.items()
    if not isinstance(series, float)
}

## Save data

In [None]:
# Serialise the dictionary of per-polygon timeseries to `save_file`
with open(save_file, 'wb') as out_file:
    pickle.dump(results_clim, out_file)