# Evaluate and fit an ML model on the EC flux tower data


## Load modules

In [None]:
import os
import xarray as xr
import numpy as np
import pandas as pd
import seaborn as sb
from joblib import dump
import multiprocessing
from pprint import pprint
import matplotlib.pyplot as plt
from lightgbm import LGBMRegressor
import lightgbm as lgbm
import shap
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV, KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

import sys
sys.path.append('/g/data/os22/chad_tmp/dea-notebooks/Tools')
from dea_tools.classification import spatial_clusters

import warnings
warnings.filterwarnings("ignore")

## Analysis Parameters

In [None]:
# Label for this model run; it becomes part of the saved model's file name.
model_name: str = "AUS"
# Flux variable being modelled. It selects the target column
# ('<model_var>_SOLO_EC', or '<model_var>_EC' when set to 'ET') and is
# embedded in the names of the output figures and model file.
model_var: str = "NEE"

In [None]:
# Number of CPUs reported for this machine; passed as `n_jobs` to the
# estimators and grid searches further down.
ncpus = multiprocessing.cpu_count()
print(f'ncpus = {ncpus}')

## Prepare Data

In [None]:
# Directory holding the per-site training-data CSV files.
base = '/g/data/os22/chad_tmp/NEE_modelling/results/training_data/'
# Sort the listing: os.listdir returns entries in arbitrary order, which would
# otherwise let the row order of the assembled table differ between runs.
sites = sorted(os.listdir(base))

# One DataFrame per CSV, indexed by time and tagged with a site code.
td = []
for site in sites:
    # Only real CSV files; a substring test would also accept names such as
    # 'x.csv.bak'.
    if not site.endswith('.csv'):
        continue
    site_df = pd.read_csv(base + site, index_col='time', parse_dates=True)
    # The first five characters of the file name serve as the site identifier.
    site_df['site'] = site[0:5]
    td.append(site_df)

In [None]:
# Columns kept from each site's table, in the order the model will see them.
# 'site' is not a predictor: it is carried along so the cross-validation folds
# can be built per site, and is dropped before fitting.
# Currently disabled candidates: 'LAI_anom_RS', 'CO2_RS'.
variables = [
    # vegetation state and land cover
    'kNDVI_anom_RS',
    'FPAR_RS',
    'LST_RS',
    'tree_cover_RS',
    'nontree_cover_RS',
    'nonveg_cover_RS',
    'LST-Tair_RS',
    'TWI_RS',
    'NDWI_RS',
    # rainfall anomalies
    'rain_anom_RS',
    'rain_cml3_anom_RS',
    'rain_cml6_anom_RS',
    'rain_cml12_anom_RS',
    # radiation, vapour pressure deficit, temperature, soil carbon
    'srad_anom_RS',
    'vpd_RS',
    'tavg_anom_RS',
    'SOC_RS',
    # grouping key for cross-validation
    'site',
]

## Modelling

In [None]:
# Per-site predictor tables (xx) and the matching target series (yy).
xx, yy = [], []

# Column holding the flux being modelled: ET is stored as '<var>_EC', every
# other variable as '<var>_SOLO_EC'.
target_col = model_var + '_EC' if model_var == 'ET' else model_var + '_SOLO_EC'

for site_df in td:
    # Keep only rows that have no missing value in any column.
    complete = site_df.dropna()

    # Separate the carbon fluxes from the table, then keep just the chosen
    # columns (predictors plus the 'site' key).
    predictors = complete.drop(['NEE_SOLO_EC', 'GPP_SOLO_EC', 'ER_SOLO_EC'], axis=1)[variables]
    target = complete[target_col]

    # Drop the time index so every site's rows are numbered from zero; the
    # fold construction later works with these per-site row numbers.
    xx.append(predictors.reset_index(drop=True))
    yy.append(target.reset_index(drop=True))

# Stack all sites. The per-site row numbers are kept (no re-indexing), so
# index labels repeat across sites.
x = pd.concat(xx)
y = pd.concat(yy)

print(x.shape)

### Specify monotonic constraints for CO2

In [None]:
# m_con= ['1' if col == 'CO2_RS' else '0' for col in x.columns]

## Feature Selection

http://rasbt.github.io/mlxtend/user_guide/feature_selection/SequentialFeatureSelector/#sequentialfeatureselector-the-popular-forward-and-backward-feature-selection-approaches-including-floating-variants

## Test model robustness with time-series K-fold cross validation

* Need to understand parameters of LightGBM better to define a sensible `param_grid`, and prevent overfitting
* If `boosting` is set to `rf`, LightGBM behaves as a random forest. According to the documentation, RF mode requires `bagging_fraction` and `feature_fraction` to be smaller than 1


<img src="results/figs/cross_validation.png" width=700>

### Generate five sets of train-test indices

For each site, take a sequential block of test samples (using a time-series split); the remaining points on either side of the test block go into training. Each fold therefore contains test and training samples from every site.

In [None]:
# Build `n_folds` train/test index sets in which every fold holds data from
# every site. Within a site the test rows are one contiguous block taken from
# a time-series split; all other rows of that site (before AND after the test
# block) are used for training.
n_folds = 5

sites = x['site'].unique()
# Position of each row in the stacked table. The index labels of `x` repeat
# across sites, so this column is what lets per-site selections be mapped back
# to rows usable with `.iloc` on the full table.
x['original_index'] = np.arange(len(x))

# One list per fold, collecting an index array from each site.
train_folds = [[] for _ in range(n_folds)]
test_folds = [[] for _ in range(n_folds)]

tscv = TimeSeriesSplit(n_splits=n_folds)
for site in sites:
    df = x.loc[x['site'] == site]
    site_rows = df['original_index'].values
    site_positions = np.arange(len(df))

    for fold, (train_pos, test_pos) in enumerate(tscv.split(df)):
        # TimeSeriesSplit only trains on rows preceding the test block; add
        # the rows that follow it so no data is wasted. Computed on
        # positions, so it does not depend on the index labels of `df`.
        used = np.concatenate([train_pos, test_pos])
        left_over = np.setdiff1d(site_positions, used)
        train_pos = np.concatenate([train_pos, left_over])

        train_folds[fold].append(site_rows[train_pos])
        test_folds[fold].append(site_rows[test_pos])

# Merge the per-site pieces into one index array per fold.
train = [np.concatenate(parts) for parts in train_folds]
test = [np.concatenate(parts) for parts in test_folds]

# Check there are no train indices in the test indices.
for fold_train, fold_test in zip(train, test):
    assert not np.isin(fold_train, fold_test).any(), "train/test indices overlap"

# Remove the helper columns we no longer need; only predictors remain in `x`.
x = x.drop(['site', 'original_index'], axis=1)

### Write out predictor variables to text file

In [None]:
# Record the predictor names, in column order, as a single line in which every
# name is followed by a comma. The context manager guarantees the file is
# closed even if a write fails.
with open("/g/data/os22/chad_tmp/NEE_modelling/results/variables.txt", "w") as textfile:
    textfile.write("".join(element + "," for element in x.columns))

In [None]:
# Search space for the inner grid search of the robustness test below.
# Only `n_estimators` is active, with a single candidate value.
#
# Options that have been tried and are currently switched off:
#     'num_leaves':    [7, 14, 21, 28, 31, 50, 70]
#     'boosting_type': ['gbdt', 'dart']
#     'max_depth':     [-1, 3, 5, 10, 20]
#     'n_estimators':  [100, 200, 300]
# NOTE(review): the earlier drafts spelled the key as 'boosting_type ' with a
# trailing space, presumably a typo - use 'boosting_type' if re-enabling it.
param_grid = {'n_estimators': [100]}

In [None]:
# Per-fold scores collected during the outer cross-validation.
acc = []   # mean absolute error of each fold
rmse = []  # root-mean-square error of each fold
r2 = []    # coefficient of determination of each fold

n_outer = len(train)
for fold_no, (train_index, test_index) in enumerate(zip(train, test), start=1):
    print(f"Working on {fold_no}/{n_outer} outer cv split", end='\r')

    # Rows belonging to this outer fold.
    X_tr, y_tr = x.iloc[train_index, :], y.iloc[train_index]
    X_tt, y_tt = x.iloc[test_index, :], y.iloc[test_index]

    # Fresh estimator for every fold. (Monotone constraints for CO2 can be
    # passed here via `monotone_constraints` when that feature is enabled.)
    model = LGBMRegressor(random_state=1, n_jobs=ncpus)

    # Inner split, drawn from the outer-training rows only, used to pick
    # hyperparameters without touching the outer test rows.
    inner_cv = KFold(n_splits=2, shuffle=True, random_state=0)

    clf = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring='r2',
        n_jobs=ncpus,
        refit=True,
        cv=inner_cv.split(X_tr, y_tr),
    )
    clf.fit(X_tr, y_tr)

    # Predict the held-out rows with the refitted best model.
    best_model = clf.best_estimator_
    pred = best_model.predict(X_tt)

    # Score the fold with three metrics.
    r2.append(r2_score(y_tt, pred))
    acc.append(mean_absolute_error(y_tt, pred))
    rmse.append(np.sqrt(mean_squared_error(y_tt, pred)))

    # 1:1 plot for this fold: scatter of observed vs. predicted, fitted trend
    # (magenta) and the identity line (dashed black).
    fig, ax = plt.subplots(1, 1, figsize=(6, 6))
    sb.scatterplot(x=y_tt, y=pred, color="#338844", edgecolor="white", s=50, lw=1, alpha=0.5, ax=ax)
    sb.regplot(x=y_tt, y=pred, scatter=False, color='m', ax=ax)
    sb.regplot(x=y_tt, y=y_tt, color='black', scatter=False, line_kws={'linestyle': 'dashed'}, ax=ax)

    ax.set_xlabel('True')
    ax.set_ylabel('Prediction')
    plt.tight_layout()
    fig.savefig(f"/g/data/os22/chad_tmp/NEE_modelling/results/{fold_no}_{model_var}_lgbm.png")
    plt.close()

In [None]:
# Mean and standard deviation across the outer folds for each metric.
# Each entry is (label for the mean, label for the std dev, per-fold scores);
# `acc` holds the mean absolute error of each fold.
summary = [
    ("Mean MAE % accuracy: ", "Std dev of MAE % accuracy: ", acc),
    ("Mean RMSE: ", "Std dev RMSE: ", rmse),
    ("Mean r2: ", "Std dev r2: ", r2),
]

for position, (mean_label, std_label, scores) in enumerate(summary):
    if position > 0:
        print('\n')  # blank separator between metrics
    print(mean_label + str(round(np.mean(scores), 2)))
    print(std_label + str(round(np.std(scores), 2)))

## Optimize model

In [None]:
# Search space for the final hyperparameter optimisation
# (6 x 3 x 4 x 4 = 288 combinations).
param_grid = dict(
    num_leaves=[7, 14, 21, 28, 31, 50],
    min_data_in_leaf=[15, 20, 30],
    # boosting_type=['gbdt', 'dart'],  # currently switched off
    max_depth=[3, 5, 10, 20],
    n_estimators=[100, 200, 300, 400],
)

In [None]:
# Instantiate a grid search over `param_grid`, scored by r2 and evaluated on
# the per-site train/test folds built earlier (not on a generic splitter).
# To try random-forest mode instead, construct the estimator with
# boosting_type='rf', bagging_freq=1, bagging_fraction=0.8.
clf = GridSearchCV(
    estimator=LGBMRegressor(),
    param_grid=param_grid,
    scoring='r2',
    verbose=0,
    cv=list(zip(train, test)),
    n_jobs=ncpus,
)

# `callbacks=None` is forwarded to the estimator's own fit call.
clf.fit(x, y, callbacks=None)

In [None]:
# Report the winning hyperparameter combination and its score from the search.
best_params, best_r2 = clf.best_params_, clf.best_score_

print("The most accurate combination of tested parameters is: ")
pprint(best_params)
print('\n')
print("The r2 score using these parameters is: ")
print(round(best_r2, 2))

## Fit on all data using best params


In [None]:
# Final model: the best hyperparameters found by the grid search, trained on
# every available row. (Monotone constraints for CO2 can be added here via
# `monotone_constraints` when that feature is enabled.)
final_params = dict(clf.best_params_)
model = LGBMRegressor(**final_params)

model.fit(x, y)

## Save the model

In [None]:
# Persist the fitted model with joblib; the file name encodes the model name
# and the modelled variable.
model_path = f'/g/data/os22/chad_tmp/NEE_modelling/results/models/{model_name}_{model_var}_LGBM_model.joblib'
dump(model, model_path)

## Predict on all the data

In [None]:
# Predict the same rows the model was fitted on, so the error below is an
# in-sample figure rather than an estimate of out-of-sample skill.
y_pred = model.predict(x)

mse = mean_squared_error(y, y_pred)
rmse = float(f"{np.sqrt(mse):.3f}")  # keep three decimal places
print("RMSE:", rmse)

In [None]:
# 1:1 plot for the all-data fit: scatter of observed vs. predicted values,
# fitted trend (magenta) and the identity line (dashed black).
ax = sb.scatterplot(x=y, y=y_pred, color="#338844", edgecolor="white", s=50, lw=1, alpha=0.5)
sb.regplot(x=y, y=y_pred, scatter=False, color='m', ax=ax)
sb.regplot(x=y, y=y, color='black', scatter=False, line_kws={'linestyle': 'dashed'}, ax=ax)
ax.set_xlabel('True')
ax.set_ylabel('Prediction');

## Examine feature importance

SHAP (SHapley Additive exPlanations) is a game theoretic approach to explain the output of any machine learning model

https://github.com/slundberg/shap

In [None]:
# Explain the fitted model's predictions using SHAP: build an explainer for
# the model, then evaluate it on every row of the predictor table.
explainer = shap.Explainer(model)
shap_values = explainer(x)

# Summarise the SHAP values as a bar plot. Alternative views, currently
# switched off (the waterfall shows the first prediction's explanation):
# shap.plots.waterfall(shap_values[0])
# shap.plots.beeswarm(shap_values)
shap.plots.bar(shap_values)