## Develop a method for harmonizing data from OzWALD

Until we do this we can't extract training data...

Try to develop a 1-year prediction. But we'll need to train a model on the full archive...
So the training data will require extraction of ~20 years of data. Let's test this at 5 km resolution... rainfall is 5 km anyway, so we can't do better than that for now.


In [None]:
import os
import pandas as pd
import xarray as xr
import numpy as np
from odc.geo.xr import assign_crs
from odc.geo.geobox import zoom_out

import sys
sys.path.append('/g/data/os22/chad_tmp/AusEFlux/src/')
from dask_utils import start_local_dask
from _collect_prediction_data import round_coords

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
# Start a Dask client through the project's `start_local_dask` helper.
# NOTE(review): `mem_safety_margin='2Gb'` presumably keeps ~2 GB of memory
# out of the workers' budget -- confirm against dask_utils.
client = start_local_dask(mem_safety_margin='2Gb')
# Bare expression on the last line of the cell so the notebook displays it.
client

In [None]:
# Root directory that the relative OzWALD / MODIS input paths are joined to.
base = '/g/data/ub8/au/'
# Root directory the harmonised 5 km monthly NetCDF files are written under.
results = '/g/data/os22/chad_tmp/AusEFlux/data/5km/'
# Years to process, as strings so they can be spliced into file names
# (2004 to 2022 inclusive).
years = [str(year) for year in range(2004, 2023)]
# Dask chunking passed to xr.open_dataset. The key must be spelled
# 'longitude' to match the dimension name used by the datasets (see the
# transpose in pre_preprocess); it was previously misspelled 'longtiude'.
chunks = dict(latitude=500, longitude=500, time=-1)

In [None]:
def pre_preprocess(ds, var):
    """
    Aggregate a dataset to monthly steps and reproject it onto the common grid.

    Parameters
    ----------
    ds : xarray.Dataset
        Dataset with 'latitude', 'longitude' and 'time' dimensions.
    var : str
        Key of the variable being processed. 'rain' is summed over each
        month; every other variable is averaged.

    Returns
    -------
    xarray.DataArray
        Monthly array on `common_grid`, with the 'variable' coordinate
        dropped, coordinates passed through `round_coords`, and
        ``attrs['nodata']`` set to NaN.

    Notes
    -----
    Reads the module-level `common_grid`, which must be defined before this
    function is called.
    """
    ds = ds.transpose('latitude', 'longitude', 'time')
    ds = assign_crs(ds, crs='epsg:4326')

    # Monthly aggregation: totals for rainfall, means for everything else.
    monthly = ds.resample(time='MS')
    ds = monthly.sum() if var == 'rain' else monthly.mean()

    # Shift the month-start labels by 14 days. This replaces the deprecated
    # `loffset` argument of `resample` with explicit index arithmetic.
    ds = ds.assign_coords(time=ds.get_index('time') + pd.Timedelta(14, 'd'))

    ds = ds.to_array()
    ds.attrs['nodata'] = np.nan
    ds = ds.odc.reproject(common_grid, resampling='average')
    # `drop_vars` is the non-deprecated spelling of `drop` for coordinates.
    ds = ds.squeeze().drop_vars('variable')
    ds = round_coords(ds)
    # Set again after reprojecting/rounding, as the original code did.
    ds.attrs['nodata'] = np.nan
    return ds

In [None]:
# Grab the common grid to reproject to: take the geobox of one MODIS LST
# mosaic and zoom it out by a factor of 5.
xx = assign_crs(
    xr.open_dataset(base+'MODIS/mosaic/MYD11A1.006/MYD11A1.006.LST_Day_1km.2019.nc'),
    crs='epsg:4326',
)
common_grid = zoom_out(xx.odc.geobox, 5)

# Datasets that are not resampled by pre_preprocess(): a single layer is
# reprojected and then repeated for every month of the year.
different = ['WCF', 'VegH']

#loop through each year
for year in years:

    print(year)

    # Input path per variable. Paths are relative to `base`, except VegH
    # which is an absolute path.
    inputs = {
        'rain': 'OzWALD/daily/meteo/Pg/OzWALD.daily.Pg.'+year+'.nc',
        'NDVI': 'OzWALD/8day/NDVI/OzWALD.NDVI.'+year+'.nc',
        'LST':'MODIS/mosaic/MYD11A1.006/MYD11A1.006.LST_Day_1km.'+year+'.nc',
        # 'Tmin': 'OzWALD/daily/meteo/Tmin/OzWALD.Tmin.'+year+'.nc', #tons of memory, deal with differently
        # 'Tmax': 'OzWALD/daily/meteo/Tmax/OzWALD.Tmax.'+year+'.nc',
        'VPeff': 'OzWALD/daily/meteo/VPeff/OzWALD.VPeff.'+year+'.nc',
        'PV': 'OzWALD/8day/PV/OzWALD.PV.'+year+'.nc',
        'BS': 'OzWALD/8day/BS/OzWALD.BS.'+year+'.nc',
        'WCF': 'OzWALD/annual/OzWALD.WCF.Annual.nc',
        'VegH': '/g/data/os22/chad_tmp/NEE_modelling/data/1km/VegH_1km_monthly_2002_2022.nc'
         }

    for k, i in inputs.items():
        # Output location is derived from `results` so the existence check,
        # the folder creation and the write all agree.
        folder = results + k
        out_file = folder + '/' + k + '_5km_monthly_' + year + '.nc'

        # Skip variables already exported for this year. (The original
        # message referenced an undefined `name` and raised NameError.)
        if os.path.exists(out_file):
            print('skipping ' + k + ' ' + year)
            continue
        print('', k)

        # Handle some datasets differently. Exact membership is used here,
        # not a substring test against each entry of `different`.
        if k in different:
            if k == 'VegH':
                ds = xr.open_dataset(i, chunks=chunks)
                # grab any time, its all the same
                # NOTE(review): unlike WCF, no transpose/assign_crs is applied
                # here -- presumably the file already carries its CRS; confirm.
                ds = ds.isel(time=-1).squeeze().drop_vars('time')
            else:  # k == 'WCF'
                ds = xr.open_dataset(base+i, chunks=chunks)
                ds = ds.sel(time=year).squeeze().drop_vars('time')
                ds = ds.transpose('latitude', 'longitude')
                ds = assign_crs(ds, crs='epsg:4326')

            ds = ds.to_array()
            ds.attrs['nodata'] = np.nan
            ds = ds.odc.reproject(common_grid, resampling='average')

            # Repeat the single layer across 12 monthly steps, labelled
            # 14 days after each month start (same labels as pre_preprocess).
            time = pd.date_range(year+"-01", year+"-12", freq='MS')
            time = [t+pd.Timedelta(14, 'd') for t in time]
            ds = ds.expand_dims(time=time)
            ds = ds.squeeze().drop_vars('variable')
            ds = round_coords(ds)
            ds.attrs['nodata'] = np.nan

        else:
            ds = xr.open_dataset(base+i, chunks=chunks)
            ds = pre_preprocess(ds, k)

        ds = ds.rename(k)

        #bring into memory
        ds = ds.compute()

        #export result
        os.makedirs(folder, exist_ok=True)
        ds.to_netcdf(out_file)

        # Release the in-memory array before the next variable is loaded.
        del ds


# Extract training data from the OzFlux eddy covariance sites

More datasets are available here: https://dap.tern.org.au/thredds/catalog/ecosystem_process/ozflux/catalog.html



## Load modules

In [None]:
import sys
sys.path.append('/g/data/os22/chad_tmp/AusEFlux/src/')
# from _collect_training_data import extract_ec_gridded_data

## Collect training data from EC towers, and gridded data at pixel location of EC tower

This data is used for training the models.


In [None]:
# Paths of the OzFlux L6 monthly NetCDF files, one entry per flux-tower
# site. Each entry is passed to extract_ec_gridded_data() in the loop below.
# NOTE(review): presumably these are relative to the TERN THREDDS OzFlux
# catalogue root -- confirm in _collect_training_data.
suffixes = [
    'Emerald/2022_v2/L6/default/Emerald_L6_20110610_20131231_Monthly.nc',
    'GreatWesternWoodlands/2022_v2/L6/default/GreatWesternWoodlands_L6_20130101_20220601_Monthly.nc',
    'CowBay/2022_v2/L6/default/CowBay_L6_20090101_20220816_Monthly.nc',
    'Tumbarumba/2022_v2/L6/default/Tumbarumba_L6_20020107_20220530_Monthly.nc',
    'Whroo/2022_v2/L6/default/Whroo_L6_20111201_20220821_Monthly.nc',
    'WombatStateForest/2022_v2/L6/default/WombatStateForest_L6_20100120_20210529_Monthly.nc',
    'WallabyCreek/2022_v2/L6/default/WallabyCreek_L6_20050825_20130409_Monthly.nc',
    'RobsonCreek/2022_v2/L6/default/RobsonCreek_L6_20130801_20220816_Monthly.nc',
    'CapeTribulation/2022_v2/L6/default/CapeTribulation_L6_20100101_20181102_Monthly.nc',
    'AliceSpringsMulga/2022_v2/L6/default/AliceSpringsMulga_L6_20100903_20220806_Monthly.nc',
    'CalperumChowilla/2022_v2/L6/default/Calperum_L6_20100730_20220829_Monthly.nc',
    'DryRiver/2022_v2/L6/default/DryRiver_L6_20091025_20220820_Monthly.nc',
    'Litchfield/2022_v2/L6/default/Litchfield_L6_20150623_20220513_Monthly.nc',
    'SturtPlains/2022_v2/L6/default/SturtPlains_L6_20080828_20220513_Monthly.nc',
    'RiggsCreek/2022_v2/L6/default/RiggsCreek_L6_20110101_20170712_Monthly.nc', # dryland cropping
    'Otway/2022_v2/L6/default/Otway_L6_20070811_20110101_Monthly.nc', # pasture site
    'Yanco/2022_v2/L6/default/Yanco_L6_20130101_20220821_Monthly.nc', # soil site
    
    ## check the location of these ones below as they are new additions
    'Gingin/2022_v2/L6/default/Gingin_L6_20111013_20210614_Monthly.nc', # native Banksia woodland
    'LongreachMitchellGrassRangeland/2022_v2/L6/default/Longreach_L6_20181009_20220607_Monthly.nc',
    'Ridgefield/2022_v2/L6/default/Ridgefield_L6_20160101_20220821_Monthly.nc', # dryland agriculture
    'SilverPlains/2022_v2/L6/default/SilverPlains_L6_20200101_20211231_Monthly.nc', # Australian Mountain Research
    'Collie/2022_v2/L6/default/Collie_L6_20170804_20191111_Monthly.nc', # wandoo woodland
    'AdelaideRiver/2022_v2/L6/default/AdelaideRiver_L6_20071017_20090524_Monthly.nc', # savanna dominated by Eucalyptus
    'Warra/2022_v2/L6/default/Warra_L6_20130305_20210921_Monthly.nc', # Tasmania
    
    ### ----------Problematic sites--------
    # NOTE(review): several sites in this section are still active (not
    # commented out) despite the caveats noted beside them -- confirm that
    # including them is intentional.
    #'DalyPasture/2022_v2/L6/default/DalyPasture_L6_20080101_20130908_Monthly.nc', #fragmented landscape, pasture among native veg fragments
    #'DalyUncleared/2022_v1/L6/default/DalyUncleared_L6_20080101_20220217_Monthly.nc', #Woodland savanna LOTS OF UNCERTAINTY IN THIS SITE - Peter Isaac
    'CumberlandPlain/2022_v2/L6/default/CumberlandPlain_L6_20140101_20220820_Monthly.nc', # small remnant bushland block in ag district
    'SamfordEcologicalResearchFacility/2022_v2/L6/default/Samford_L6_20100602_20171231_Monthly.nc', # fragmented landscape, 5km resolution not good enough
    'Boyagin/2022_v2/L6/default/Boyagin_L6_20171020_20220821_Monthly.nc', # in remnant forest around crops, BORDERLINE MIGHT BE OKAY
    'TiTreeEast/2022_v1/L6/default/TiTreeEast_L6_20120718_20220117_Monthly.nc', 
    #'RedDirtMelonFarm/2021_v1/L6/default/RedDirtMelonFarm_L6_20110923_20130721_Monthly.nc', #small farm in the middle of savannah woodland
    'FoggDam/2022_v2/L6/default/FoggDam_L6_20060207_20081031_Monthly.nc', # wetland that floods seasonally
    #'Loxton/2022_v1/L6/default/Loxton_L6_20080819_20090609_Monthly.nc' # almond trees, 1 year of data
]

# Run the extraction for every site and collect the per-site results in
# `sites`, printing an in-place "NN/NN" progress counter.
# NOTE(review): extract_ec_gridded_data is not defined or imported in the
# visible cells (its import from _collect_training_data is commented out in
# the "Load modules" cell) -- restore that import before running this cell.
# NOTE(review): scale='1km' is requested here although the harmonised
# gridded data is written at 5 km -- confirm this is intended.
n_sites = len(suffixes)
sites = []
for position, suffix in enumerate(suffixes, start=1):
    print(" {:02}/{:02}\r".format(position, n_sites), end="")
    site_data = extract_ec_gridded_data(
        suffix,
        scale='1km',
        save_ec_data=False,
        verbose=False,
        add_comparisons=False,
        export_path='/g/data/os22/chad_tmp/AusEFlux/results/training_data/',
    )
    sites.append(site_data)
    