# Create gridded feature datasets

The goal here is to take the spatio-temporally harmonised files in `interim/` that were produced by `1_Spatiotemporal_harmonization.ipynb` and stack them into ready-to-use NetCDF files in the `5km` data folder. We want one NetCDF file per feature. Additionally, we create a few new features, for example vegetation fractions, anomalies, etc.


In [None]:
import os
import pandas as pd
import xarray as xr
import numpy as np
from odc.geo.xr import assign_crs

import warnings
warnings.simplefilter(action='ignore')

import sys
sys.path.append('/g/data/os22/chad_tmp/AusEFlux/src/')
from _utils import start_local_dask

In [None]:
# start_local_dask comes from the project's _utils module (not visible here);
# presumably it starts a local Dask cluster and returns its client -- confirm.
client = start_local_dask(mem_safety_margin='2Gb')
# Bare trailing expression: the notebook renders the object as the cell output.
client

In [None]:
# Input root: the harmonised interim files are read from here.
base = '/g/data/os22/chad_tmp/AusEFlux/data/interim/'

# Output root: the stacked 5 km files are written here.
results = '/g/data/os22/chad_tmp/AusEFlux/data/5km/'

# Entries under `base` that should be left out of the processing.
exclude = [
    '.ipynb_checkpoints',
    'kTavg',
    'Tmax',
    'Tmin',
    'WCF',
]

## Loop through folders and join interim files

In [None]:
# Every entry under `base` that is not listed in `exclude` is treated as a
# folder holding the .nc files of one feature.
folders = [i for i in os.listdir(base) if i not in exclude]

# NOTE(review): `folders[1:]` skips the first entry returned by os.listdir
# (whose order is arbitrary), and the `break` at the end of the loop body
# stops after a single folder. Both look like leftovers from an interactive
# run -- confirm before relying on this cell to build every feature.
for f in folders[1:]:
    print(f)
    files = [f'{base}{f}/{i}' for i in os.listdir(base+f) if i.endswith(".nc")]
    files.sort()

    # combine annual files into one file
    ds = xr.open_mfdataset(files)
    # keep the whole time axis in a single chunk; tile space in 250x250 blocks
    ds = ds.chunk(dict(time=-1, latitude=250, longitude=250))

    # Gapfill NDWI differently (has real gaps)
    if f=='NDWI':
        # separate into monthly climatology and anomalies
        ds_monthly = ds.groupby('time.month').mean()
        ds_anom = ds.groupby('time.month') - ds_monthly

        # fill linearly by max 2 steps
        ds_anom = ds_anom.interpolate_na(dim='time', method='linear', limit=2)

        # recombine anomalies and climatology
        ds = ds_anom.groupby('time.month') + ds_monthly
        # `Dataset.drop` is deprecated; `drop_vars` is the supported spelling
        ds = ds.drop_vars('month')

        # fill remaining gaps with climatology
        ds = ds.groupby("time.month").fillna(ds_monthly)

    # ensure no gaps in other datasets (there shouldn't be any)
    # this is just to be cautious
    else:
        ds_monthly = ds.groupby('time.month').mean()
        ds = ds.groupby("time.month").fillna(ds_monthly)

    # remove the helper 'month' coordinate, then load the result into memory
    # before writing one NetCDF file for this feature
    ds = ds.drop_vars('month').compute()
    ds.to_netcdf(results+f+'_5km_2003_2022.nc')

    break

## Create new features