## Develop a method for harmonizing data from OzWALD

Requirements:
* Must be reproducible in an operational context, i.e. a minimum of fuss to rerun the whole process each year. First, though, we need a ~20-year archive to build the models and run historic predictions
* For now, run at 5 km resolution
* Intermediate files are fine, but let's keep the number of steps to a minimum
* Some variables are already computed by OzWALD, but others need to be either computed on-the-fly or saved and stored as intermediate files.
* Many of the pre-computed variables available in OzWALD require resampling spatially and temporally
* A Python environment is required, but it should be as small as possible (though it will undoubtedly still be cumbersome)
* There is a soft requirement that the model be built on features as close as possible to those in the published 'AusEFlux' article in Biogeosciences.



In [None]:
import os
import pandas as pd
import xarray as xr
import numpy as np
from odc.geo.xr import assign_crs
from odc.geo.geobox import zoom_out

import sys
sys.path.append('/g/data/os22/chad_tmp/AusEFlux/src/')
from _utils import start_local_dask, round_coords

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
# Create the dask client through the project helper `start_local_dask`
# (imported from _utils). Presumably this starts a local cluster and keeps
# ~2 GB of memory in reserve -- confirm against the helper's implementation.
client = start_local_dask(mem_safety_margin='2Gb')
# Bare expression so the notebook renders the client summary as cell output.
client

In [None]:
# Root of the input archive; relative input paths are appended to this.
base = '/g/data/ub8/au/'

# Root of the output archive; one sub-folder per variable is written below it.
results = '/g/data/os22/chad_tmp/AusEFlux/data/5km/'

# Years to process (2004-2022 inclusive), kept as strings because they are
# spliced directly into file names.
years = [str(y) for y in range(2004, 2023)]

# Dask chunking used when opening datasets: 500x500 spatial tiles with the
# whole time axis in a single chunk (-1).
# NOTE: the key must match the dataset's dimension name exactly; it was
# previously misspelled 'longtiude', so longitude was never chunked as intended.
chunks = dict(latitude=500, longitude=500, time=-1)

In [None]:
def pre_preprocess(ds, var):
    """Aggregate a dataset to monthly time steps and reproject it to the common grid.

    Relies on the module-level ``common_grid`` geobox, which must be defined
    before this function is called.

    Parameters
    ----------
    ds : xarray.Dataset
        Dataset with 'latitude', 'longitude' and 'time' dimensions.
        Assumed to hold a single data variable, since the 'variable'
        dimension is squeezed away below -- TODO confirm for all inputs.
    var : str
        Name of the variable being processed. 'rain' is summed over each
        month; every other variable is averaged.

    Returns
    -------
    xarray.DataArray
        Monthly data on ``common_grid`` with a NaN ``nodata`` attribute.
    """
    ds = ds.transpose('latitude', 'longitude', 'time')
    ds = assign_crs(ds, crs='epsg:4326')

    # Monthly bins labelled at the month start; totals for rain, means otherwise.
    monthly = ds.resample(time='MS')
    ds = monthly.sum() if var == 'rain' else monthly.mean()

    # Shift the month-start labels forward by 14 days. This is done explicitly
    # because the `loffset` argument of resample() is deprecated/removed in
    # recent xarray and pandas releases.
    ds = ds.assign_coords(time=ds.get_index('time') + pd.Timedelta(14, 'd'))

    ds = ds.to_array()
    ds.attrs['nodata'] = np.nan
    ds = ds.odc.reproject(common_grid, resampling='average')

    # Remove the length-1 'variable' dimension/coordinate added by to_array().
    ds = ds.squeeze().drop_vars('variable')
    ds = round_coords(ds)

    # Set nodata again on the final object; presumably the steps above do not
    # reliably carry attributes through -- verify before removing.
    ds.attrs['nodata'] = np.nan
    return ds

In [None]:
# Build the common grid that everything is reprojected to: open one MODIS
# LST mosaic (1 km according to its file name), tag it as EPSG:4326, then
# zoom its geobox out by a factor of 5.
xx = assign_crs(
    xr.open_dataset(base+'MODIS/mosaic/MYD11A1.006/MYD11A1.006.LST_Day_1km.2019.nc'),
    crs='epsg:4326',
)
common_grid = zoom_out(xx.odc.geobox, 5)

# Variables that are not resampled through pre_preprocess(): each is reduced
# to a single 2-D layer and then repeated for every month of the year.
different = ['WCF', 'VegH']

#loop through each year
for year in years:

    print(year)

    # Variable name -> input file. Entries are toggled by (un)commenting.
    # Paths are relative to `base`, except VegH which is an absolute path.
    inputs = {
        # 'rain': 'OzWALD/daily/meteo/Pg/OzWALD.daily.Pg.'+year+'.nc',
        # 'NDVI': 'OzWALD/8day/NDVI/OzWALD.NDVI.'+year+'.nc',
        # 'LST':'MODIS/mosaic/MYD11A1.006/MYD11A1.006.LST_Day_1km.'+year+'.nc',
        'Tmin': 'OzWALD/daily/meteo/Tmin/OzWALD.Tmin.'+year+'.nc', #tons of memory, deal with differently
        # 'Tmax': 'OzWALD/daily/meteo/Tmax/OzWALD.Tmax.'+year+'.nc',
        # 'VPeff': 'OzWALD/daily/meteo/VPeff/OzWALD.VPeff.'+year+'.nc',
        # 'PV': 'OzWALD/8day/PV/OzWALD.PV.'+year+'.nc',
        # 'BS': 'OzWALD/8day/BS/OzWALD.BS.'+year+'.nc',
        # 'WCF': 'OzWALD/annual/OzWALD.WCF.Annual.nc',
        # 'VegH': '/g/data/os22/chad_tmp/NEE_modelling/data/1km/VegH_1km_monthly_2002_2022.nc'
         }

    for k,i in inputs.items():
        # One output file per variable and year; skip it if it already exists
        # so the whole process can be rerun cheaply.
        folder = results+k
        outfile = folder+'/'+k+'_5km_monthly_'+year+'.nc'

        if os.path.exists(outfile):
            # Previously this printed an undefined `name`, raising NameError
            # whenever an output was already present.
            print('skipping '+k+' '+year)
            continue

        print('', k)

        # Handle some datasets differently
        if k in different:
            if k=='VegH':
                ds = xr.open_dataset(i, chunks=chunks)
                ds = ds.isel(time=-1).squeeze().drop_vars('time') #grab any time, its all the same

            elif k=='WCF':
                ds = xr.open_dataset(base+i, chunks=chunks)
                ds = ds.sel(time=year).squeeze().drop_vars('time')
                ds = ds.transpose('latitude', 'longitude')
                ds = assign_crs(ds, crs='epsg:4326')

            ds = ds.to_array()
            ds.attrs['nodata'] = np.nan
            ds = ds.odc.reproject(common_grid, resampling='average')

            # Repeat the single layer along a monthly time axis whose labels
            # are each month start plus 14 days.
            time = pd.date_range(year+"-01", year+"-12", freq='MS')
            time = [t+pd.Timedelta(14, 'd') for t in time]
            ds = ds.expand_dims(time=time)
            ds = ds.squeeze().drop_vars('variable')
            ds = round_coords(ds)
            ds.attrs['nodata'] = np.nan

        else:
            ds = xr.open_dataset(base+i, chunks=chunks)
            ds = pre_preprocess(ds, k)

        ds = ds.rename(k)

        #bring into memory
        ds = ds.compute()

        #export result (folder is derived from `results` so both stay in sync)
        os.makedirs(folder, exist_ok=True)

        ds.to_netcdf(outfile)

        # Release the in-memory array before the next variable is processed.
        del ds
