# Generate an ensemble of gridded predictions

Using the models produced in `4_Generate_ensemble_of_models.ipynb`, we will generate an ensemble of predictions. From this ensemble we will produce an uncertainty envelope, and a median prediction.


In [None]:
import os
import sys
import warnings
import xarray as xr
import numpy as np
import pandas as pd
from joblib import load
from matplotlib import pyplot as plt
warnings.filterwarnings("ignore")

sys.path.append('/g/data/os22/chad_tmp/AusEFlux/src/')
from _prediction import collect_prediction_data, predict_xr, HiddenPrints
from _utils import start_local_dask, round_coords

In [None]:
# Start a local dask cluster via the project helper and keep a handle on its client.
# The bare `client` expression at the end of the cell displays the client summary.
client = start_local_dask(
    mem_safety_margin='2Gb',
)
client

## Analysis Parameters

In [None]:
# Variable whose ensemble of models is used for prediction; it also names the
# sub-folders read from and written to below.
model_var = 'NEE'

# Project root. The trailing slash is relied on by every f-string path built from it.
base = '/g/data/os22/chad_tmp/AusEFlux/'

# Start and end of the prediction period.
t1 = '2003'
t2 = '2022'

# Inputs: folder holding the fitted ensemble members, and the file listing the training variables.
models_folder = f'{base}results/models/ensemble/{model_var}/'
features_list = f'{base}results/variables.txt'

# Output: folder that receives one netCDF of predictions per ensemble member.
results_path = f'{base}results/predictions/ensemble/{model_var}/'

## Get paths to models

In [None]:
# Collect the file names (not full paths) of every saved model in `models_folder`.
# os.listdir returns entries in arbitrary order, so sort them: a stable order makes
# the prediction loop, and its skip-if-already-done behaviour, reproducible between runs.
model_list = sorted(file for file in os.listdir(models_folder) if file.endswith(".joblib"))

## Open predictor data

At 1 km resolution, we need to pull the gridded feature layers in as dask arrays and compute on each time-step individually, as the total memory requirements are very large. At 5 km resolution, it's better to load the entire feature layer data into memory (by uncommenting `data = data.compute()` in the cell below), as it speeds up predictions.

In [None]:
%%time
## open data
data = collect_prediction_data(data_path=f'{base}/data/5km/',
                             time_range=(t1,t2),
                             verbose=False,
                             export=False,
                             chunks=dict(time=1)
                             )

# data = data.compute()

## Create no-data & urban masks

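The commented-out code below creates the no-data and urban masks and saves them to disk. Run it once if the mask files don't exist yet; otherwise the masks are simply loaded from file.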

In [None]:
# One-off recipe (kept commented out): build the no-data mask, which is True wherever
# any of EVI, NDWI, VegH or SRAD is missing, and save it for the current time range.
# mask = data[['EVI', 'NDWI', 'VegH', 'SRAD']].to_array().isnull().any('variable')
# mask.compute().to_netcdf(f'{base}data/mask_5km_monthly_{t1}_{t2}.nc')

# #create an urban mask once, then next time load it.
# (recipe: reproject the 1 km urban mask onto the grid of `mask` using 'mode' resampling,
#  round the coordinates, cast to bool, rename latitude/longitude to y/x, then save)
# urban = xr.open_dataarray('/g/data/os22/chad_tmp/NEE_modelling/data/urban_mask_1km.nc')
# urban = urban.odc.reproject(mask.odc.geobox, resampling='mode')
# urban=round_coords(urban)
# urban.name='urban_mask'
# urban = urban.astype(bool).rename({'latitude':'y', 'longitude':'x'})
# urban.compute().to_netcdf(f'{base}data/urban_mask_5km.nc')

# Open the masks if already created.
# NOTE: both reads fail if the files do not exist yet; run the recipe above once first.
# The no-data mask file name depends on t1/t2, so changing the time range needs a new mask.
mask = xr.open_dataarray(f'{base}data/mask_5km_monthly_{t1}_{t2}.nc')
urban = xr.open_dataset(f'{base}data/urban_mask_5km.nc')['urban_mask']

### Index by variables and check variable order

In [None]:
# Column names of the training table, in file order, excluding the last column.
train_vars = pd.read_csv(features_list).columns.tolist()[:-1]

# 'site' is not a predictor; .remove() raises ValueError if the column is missing.
train_vars.remove('site')

# Drop the trailing three characters of every name so the names line up with the
# gridded layers (presumably a fixed suffix on the training columns; confirm).
train_vars = [col[:-3] for col in train_vars]

# Keep only the training variables, in training order.
data = data[train_vars]

# Fail loudly if the gridded variables are not exactly the training variables, in order.
if train_vars != list(data.data_vars):
    raise ValueError("Variables don't match")
print('Variables match, n:', len(data.data_vars))

## Predict

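Loop through each model and each time-step. Each time-step's prediction is masked with the no-data mask; once all time-steps for a model are predicted, they are joined along time, masked with the urban mask, and saved to a netCDF file. Models that already have an output file are skipped.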


In [None]:
# Predict every time-step with every ensemble member and write one netCDF per model.
#
# For each model: predict each time-step, mask no-data pixels, stack the time-steps,
# mask urban pixels, and save. The stack/mask/save happens ONCE per model, after all
# time-steps are predicted, so an interrupted run never leaves a truncated output
# file that the skip check below would then mistake for a finished model.

# Make sure the output folder exists before any (slow) prediction work starts.
os.makedirs(results_path, exist_ok=True)

n_steps = len(data.time)

# Loop through models
for m in model_list:
    name = m.split('.')[0]
    out_file = f'{results_path}{name}.nc'

    # An existing output file means this model was predicted in a previous run.
    if os.path.exists(out_file):
        print('skipping model '+name)
        continue

    print('Model: ', name)

    warnings.filterwarnings("ignore")
    # n_jobs=1: run the estimator itself single-threaded
    # (presumably because parallelism is handled elsewhere, e.g. by dask; confirm).
    model = load(models_folder+m).set_params(n_jobs=1)

    results = []
    # Loop through the time-steps
    for i in range(n_steps):
        # carriage return keeps the progress counter on a single line
        print("  {:03}/{:03}\r".format(i + 1, n_steps), end="")

        step = data.isel(time=i)

        # HiddenPrints wraps only the prediction call, to hide the helper's own printing.
        with HiddenPrints():
            warnings.filterwarnings("ignore")
            predicted = predict_xr(model,
                                   step,
                                   proba=False,
                                   clean=True,
                                   #chunk_size=875000, #this number is optimized to maximise pred speed.
                                   ).compute()

        # mask no-data areas (mask is True where data is missing, so keep where it is False)
        predicted = predicted.Predictions.where(~mask.isel(time=i))

        # add back time dim
        predicted['time'] = step.time.values

        # append to list
        results.append(predicted.astype('float32'))

    # join all time-steps together into a single DataArray named after the modelled variable
    ds = xr.concat(results, dim='time').sortby('time').rename(model_var).astype('float32')

    # mask urban areas
    ds = ds.where(urban!=1).astype('float32')

    # save results
    ds.to_netcdf(out_file)
    