# Exploring flux data and environmental covariables

TERN data: https://portal.tern.org.au/#/d0436eef

## Load modules

In [None]:
# !pip install dea-tools
# !pip install odc-algo==0.2.2
# !pip install mlforecast
# !pip install dask_ml==1.9.0

In [None]:
import xarray as xr
import numpy as np
import pandas as pd
# import seaborn as sb
from joblib import dump
from pprint import pprint
import multiprocessing

from datacube.utils.dask import start_local_dask
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score 
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, ShuffleSplit, KFold

import sys
sys.path.append('/g/data/os22/chad_tmp/NEE_modelling/')
# from preprocess_input_data import preprocess_data_insitu
from preprocess_input_data import preprocess_data_gridded

In [None]:
# Start the local Dask client via the datacube helper imported above.
# NOTE(review): mem_safety_margin='2Gb' presumably keeps ~2 GB of memory
# headroom free — confirm against the datacube documentation.
client = start_local_dask(mem_safety_margin='2Gb')
# Bare expression so the notebook displays the client as the cell output.
client

## Analysis Parameters

In [None]:
# Label for this model run; it is embedded in the filename of the saved model
# (see the "Save the model" cell).
model_name = 'AUS'

In [None]:
# Number of CPUs reported by the system; reused as n_jobs for the forests and
# grid searches further down.
ncpus = multiprocessing.cpu_count()
print(f'ncpus = {ncpus}')

## Prepare Data

In [None]:
# Load the monthly L6 flux-tower file of every site through
# preprocess_data_insitu. The two-letter names (aa..qq) are referenced
# individually by later cells, so they are kept unchanged.

# The import of this helper is commented out in the imports cell, which left
# every call below failing with NameError. Import it here so the cell runs.
from preprocess_input_data import preprocess_data_insitu

# ~ EBF
aa = preprocess_data_insitu('Tumbarumba/2021_v1/L6/default/Tumbarumba_L6_20020107_20191231_Monthly.nc')
bb = preprocess_data_insitu('CumberlandPlain/2022_v1/L6/default/CumberlandPlain_L6_20140101_20220101_Monthly.nc')
cc = preprocess_data_insitu('Whroo/2021_v1/L6/default/Whroo_L6_20111201_20210724_Monthly.nc')
dd = preprocess_data_insitu('WombatStateForest/2022_v1/L6/default/WombatStateForest_L6_20100120_20210529_Monthly.nc')
ee = preprocess_data_insitu('WallabyCreek/2022_v1/L6/default/WallabyCreek_L6_20050825_20130409_Monthly.nc')  # fire in 2010

# tropical forest
ff = preprocess_data_insitu('RobsonCreek/2022_v1/L6/default/RobsonCreek_L6_20130801_20211218_Monthly.nc')
gg = preprocess_data_insitu('CapeTribulation/2022_v1/L6/default/CapeTribulation_L6_20100101_20181102_Monthly.nc')

# savanna / woody savanna
hh = preprocess_data_insitu('AliceSpringsMulga/2022_v1/L6/default/AliceSpringsMulga_L6_20100903_20211231_Monthly.nc')
ii = preprocess_data_insitu('CalperumChowilla/2022_v1/L6/default/Calperum_L6_20100730_20220216_Monthly.nc')
jj = preprocess_data_insitu('DryRiver/2022_v1/L6/default/DryRiver_L6_20091025_20220218_Monthly.nc')
kk = preprocess_data_insitu('Litchfield/2021_v1/L6/default/Litchfield_L6_20150623_20210725_Monthly.nc')
ll = preprocess_data_insitu('Gingin/2021_v1/L6/default/Gingin_L6_20111013_20201231_Monthly.nc')

# grasslands
mm = preprocess_data_insitu('TiTreeEast/2022_v1/L6/default/TiTreeEast_L6_20120718_20220117_Monthly.nc')
nn = preprocess_data_insitu('SturtPlains/2021_v1/L6/default/SturtPlains_L6_20080828_20210724_Monthly.nc')
oo = preprocess_data_insitu('RiggsCreek/2022_v1/L6/default/RiggsCreek_L6_20110101_20170712_Monthly.nc')  # pasture
pp = preprocess_data_insitu('DalyPasture/2022_v1/L6/default/DalyPasture_L6_20080101_20130908_Monthly.nc')  # pasture
qq = preprocess_data_insitu('Otway/2021_v1/L6/default/Otway_L6_20070811_20110101_Monthly.nc')

# croplands

#soil site
# dd = preprocess_data(base, 'Yanco/2021_v1/L6/default/Yanco_L6_20130101_20210724_Monthly.nc')

#COLLIE SITE AT Level 5
# 'ozflux/Collie/2021_v1/L5/default/Collie_L5_20170804_20191111.nc'

### Write out predictor variables to text file

These will be used later to ensure the variables of the input Dataset are in the correct order.

In [None]:
# Record the predictor names (every column of `aa` after the first) so later
# steps can put their input variables in the same order.
col = list(aa.columns[1:])

# The context manager guarantees the file is closed even if a write fails.
# Each name is followed by a comma (including the last one), which is exactly
# the format that was written before.
with open("/g/data/os22/chad_tmp/NEE_modelling/results/variables.txt", "w") as textfile:
    textfile.writelines(element + "," for element in col)

## Modelling

In [None]:
# All site tables, in the order they were loaded. This order determines the
# row layout of the stacked arrays built in the next cell.
sites = [
    aa, bb, cc, dd, ee,
    ff, gg,
    hh, ii, jj, kk, ll,
    mm, nn, oo, pp, qq,
]

In [None]:
# Split every site table into predictors and the NEE_SOLO target, then stack
# all sites row-wise into one feature matrix `x` and one target vector `y`.
xx, yy = [], []
for site in sites:
    # df_var / df_nee keep the last site's values once the loop ends;
    # df_var.columns is reused later to label the feature-importance plot.
    df_var = site.drop(columns='NEE_SOLO')
    df_nee = site['NEE_SOLO']
    xx.append(df_var.to_numpy())
    yy.append(df_nee.to_numpy())

x = np.concatenate(xx)
y = np.concatenate(yy)

print(x.shape)

## Test model robustness with nested K-fold cross validation

In [None]:
# Fold counts for the nested cross-validation: the inner folds drive the grid
# search inside each outer fold, the outer folds give the performance estimate.
inner_cv_splits = outer_cv_splits = 5

# Fraction of samples held out in each ShuffleSplit partition used when
# optimising the final model.
test_size = 0.20

In [None]:
# Hyperparameter grid for GridSearchCV, based on the results of random search.
#
# 'auto' has been removed from max_features. For RandomForestRegressor it
# meant "use all features", i.e. the same model as None, so it only duplicated
# work in the search. scikit-learn deprecated the value in 1.1 and rejects it
# from 1.3 onwards.
param_grid = {
    'criterion': ["squared_error", "absolute_error"],
    'max_features': ['log2', None],
    'n_estimators': [200, 300, 400],
}

In [None]:
# Nested cross-validation: each outer fold holds out a test set, and a grid
# search with its own inner folds picks hyperparameters on the remaining data.
outer_cv = KFold(n_splits=outer_cv_splits, shuffle=True, random_state=0)

# Per-outer-fold scores; these lists are read by the summary cell below.
acc = []   # mean absolute error
rmse = []  # root mean squared error
r2 = []    # coefficient of determination

for i, (train_index, test_index) in enumerate(outer_cv.split(x, y), start=1):
    # Report progress against the configured fold count rather than a
    # hard-coded 5, so the message stays correct if outer_cv_splits changes.
    print(f"Working on {i}/{outer_cv_splits} outer cv split", end='\r')
    model = RandomForestRegressor(random_state=1, n_jobs=ncpus)

    # training and testing subsets for this outer fold
    X_tr, X_tt = x[train_index, :], x[test_index, :]
    y_tr, y_tt = y[train_index], y[test_index]

    # inner split on data within outer split
    inner_cv = KFold(n_splits=inner_cv_splits, shuffle=True, random_state=0)

    clf = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring='r2',
        n_jobs=ncpus,
        refit=True,
        cv=inner_cv.split(X_tr, y_tr),
    )
    clf.fit(X_tr, y_tr)

    # predict on the held-out outer fold using the refitted best model
    best_model = clf.best_estimator_
    pred = best_model.predict(X_tt)

    # evaluate the fold with several regression metrics
    r2.append(r2_score(y_tt, pred))
    acc.append(mean_absolute_error(y_tt, pred))
    rmse.append(np.sqrt(mean_squared_error(y_tt, pred)))

In [None]:
def _fmt(stat, scores):
    """Return stat(scores) rounded to two decimals, as text."""
    return str(round(stat(scores), 2))


# Mean and spread of each metric across the outer cross-validation folds
print("Mean MAE accuracy: " + _fmt(np.mean, acc))
print("Std dev of MAE accuracy: " + _fmt(np.std, acc))
print('\n')
print("Mean RMSE: " + _fmt(np.mean, rmse))
print("Std dev RMSE: " + _fmt(np.std, rmse))
print('\n')
print("Mean r2: " + _fmt(np.mean, r2))
print("Std dev r2: " + _fmt(np.std, r2))

## Optimize model

In [None]:
# Generate n_splits random train/test partitions of the full dataset
rs = ShuffleSplit(n_splits=outer_cv_splits, test_size=test_size, random_state=0)

# Instantiate the grid search. The forest is seeded with random_state=1, the
# same seed used for the other forests in this notebook, so the selected
# hyperparameters are reproducible between runs. With an unseeded forest the
# scores, and therefore best_params_, could change from run to run.
clf = GridSearchCV(
    RandomForestRegressor(random_state=1),
    param_grid,
    scoring='r2',
    verbose=1,
    cv=rs.split(x, y),
    n_jobs=ncpus,
)

clf.fit(x, y)

In [None]:
# Report the winning hyperparameter combination and its r2 score from the
# grid search.
best_params = clf.best_params_
best_r2 = round(clf.best_score_, 2)

print("The most accurate combination of tested parameters is: ")
pprint(best_params)
print('\n')
print("The r2 score using these parameters is: ")
print(best_r2)

## Fit on all data using best params

In [None]:
# Refit a forest on every sample, using the hyperparameters selected by the
# grid search.
model = RandomForestRegressor(
    random_state=1,
    n_jobs=ncpus,
    **clf.best_params_
)
model.fit(x, y)

In [None]:
# Bar chart of the fitted model's feature importances, smallest first.
importances = model.feature_importances_
order = np.argsort(importances)
# Column labels come from df_var, i.e. the last site processed when x was built.
feature_names = np.array(df_var.columns.values)

fig, ax = plt.subplots(figsize=(15, 4))
ax.bar(x=feature_names[order], height=importances[order])
ax.set_ylabel('Importance', labelpad=10)
ax.set_xlabel('Feature', labelpad=10)
fig.tight_layout()

## Save the model

In [None]:
# Persist the fitted model with joblib; the filename embeds model_name.
model_path = '/g/data/os22/chad_tmp/NEE_modelling/results/' + model_name + '_NEE_model.joblib'
dump(model, model_path)

## Predict on all the data

In [None]:
# In-sample check: predict the data the model was fitted on and report RMSE.
y_pred = model.predict(x)

mse = mean_squared_error(y, y_pred)
# NOTE: this rebinds `rmse`, which the cross-validation cell uses for its list
# of per-fold scores.
rmse = float(format(np.sqrt(mse), '.3f'))  # keep three decimal places
print("RMSE:", rmse)

### Compare at site 1

In [None]:
# `aa` is the first entry of `sites`, so its predictions are the first
# len(aa) rows of y_pred.
compare = pd.DataFrame({'NEE_pred': y_pred[:len(aa)]}, index=aa.index)
# NOTE(review): the observed series plotted here is 'NEE_LT', while the model
# above is trained on 'NEE_SOLO' — confirm this is the intended column.
aa.join(compare).plot(y=['NEE_LT', 'NEE_pred'], figsize=(11, 5))
plt.title('Prediction of NEE using RF Regressor');

### Compare at site 2

In [None]:
# `bb` is the second entry of `sites`, so its predictions follow directly
# after the len(aa) rows that belong to `aa`.
bb_start = len(aa)
bb_stop = bb_start + len(bb)
compare = pd.DataFrame({'NEE_pred': y_pred[bb_start:bb_stop]}, index=bb.index)
# NOTE(review): the observed series plotted here is 'NEE_LT', while the model
# above is trained on 'NEE_SOLO' — confirm this is the intended column.
bb.join(compare).plot(y=['NEE_LT', 'NEE_pred'], figsize=(11, 5))
plt.title('Prediction of NEE using RF Regressor');

---

## Leave one out: Predict, then compare on site not included in training data

In [None]:
# NOTE(review): `olo` is not defined anywhere in the visible notebook —
# presumably the table of the held-out site; it must be loaded before this
# cell can run.
# NOTE(review): the target here is 'NEE_LT', while the model above was trained
# with 'NEE_SOLO' as target — confirm which column is intended.
# These assignments rebind `x` and `y`, replacing the stacked training arrays.
x=olo.drop('NEE_LT', axis=1) # predictors
y=olo['NEE_LT'] # target

In [None]:
# Predict for the rows of `x` with the model fitted earlier in the notebook.
# NOTE(review): that model was fitted on every entry of `sites`; for a true
# leave-one-out test the held-out site must be excluded from it — confirm.
y_pred = model.predict(x)

In [None]:
# Plot the predictions next to the observed series for the `olo` site.
compare = pd.DataFrame({'NEE_pred': y_pred}, index=olo.index)
olo.join(compare).plot(y=['NEE_LT', 'NEE_pred'], figsize=(11, 5))
plt.title('Prediction of NEE using RF Regressor trained on other sites');

# RMSE of the predictions against `y`, kept to three decimal places.
mse = mean_squared_error(y, y_pred)
rmse = float(format(np.sqrt(mse), '.3f'))
print("RMSE:", rmse)

In [None]:
# import mlforecast as mlf
# from mlforecast.core import TimeSeries
# from mlforecast.forecast import Forecast
# from mlforecast.distributed.models.xgb import XGBForecast
# import dask.dataframe as dd
# from mlforecast.distributed.forecast import DistributedForecast
# from window_ops.expanding import expanding_mean
# from window_ops.rolling import rolling_mean

# from statsmodels.tsa.vector_ar.var_model import VAR
# from statsmodels.tsa.stattools import adfuller
# from statsmodels.tools.eval_measures import rmse
# from statsmodels.tsa.stattools import grangercausalitytests