# Modelling uncertainty in ML predictions

Quantile regression in this context doesn't work as an estimate of Aus-wide uncertainty.

Instead, we will attempt to model the uncertainty that comes from the training data and optimization algorithm.  To do this, we will generate 30 models. For each iteration, two site's entire time-series will be removed from the training data and both a  LGBM model will be fit on the remaining data.  This will result in 29 models that later we can use to make predictions with. The envelope of our predictions will inform our uncertainity


## Load modules

In [1]:
import os
import xarray as xr
import numpy as np
import pandas as pd
from joblib import dump
from scipy import stats
import matplotlib.pyplot as plt
from pprint import pprint
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor
import lightgbm as lgbm
from sklearn.model_selection import TimeSeriesSplit, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

import warnings
warnings.filterwarnings("ignore")

## Analysis Parameters

In [2]:
model_var = 'NEE'
features_list = '/g/data/os22/chad_tmp/NEE_modelling/results/feature_select_'+model_var+'.txt'

## Prepare Data

In [3]:
base = '/g/data/os22/chad_tmp/NEE_modelling/results/training_data/'
sites = os.listdir('/g/data/os22/chad_tmp/NEE_modelling/results/training_data/')

td = []
for site in sites:
    if '.csv' in site:
        xx = pd.read_csv(base+site, index_col='time', parse_dates=True)
        xx['site'] = site[0:5]
        td.append(xx)

ts = pd.concat(td).dropna() #we'll use this later

In [4]:
variables = [
            #'LAI_anom_RS',
             'kNDVI_anom_RS',
             'FPAR_RS',
             'LST_RS',
             'tree_cover_RS',
             'nontree_cover_RS',
             'nonveg_cover_RS',
             'LST-Tair_RS',
             'TWI_RS',
             'NDWI_RS',
             'rain_anom_RS',
             'rain_cml3_anom_RS',
             'rain_cml6_anom_RS',
             'rain_cml12_anom_RS',
             'srad_anom_RS',
             'vpd_RS',
             'tavg_anom_RS',
             'SOC_RS',
             'C4_percent_RS',
             'elevation_RS',
             'VegH_RS',
             'site'
            ]

In [5]:
xx = []
yy = []

for t in td:
    # t = t.drop(['Fluxcom_RS-Meteo_NEE', 'Fluxcom_RS_NEE', 'ThisStudy_NEE', 'Cable_NEE',
    #    'Fluxcom_RS_GPP', 'Fluxcom_RS-meteo_GPP', 'ThisStudy_GPP', 'Cable_GPP',
    #    'MODIS_GPP', 'GOSIF_GPP'], axis=1)  
    
    t = t.dropna()  # remove NaNS
    df = t.drop(['NEE_SOLO_EC','GPP_SOLO_EC','ER_SOLO_EC'], axis=1) # seperate carbon fluxes
    
    #df = df.filter(regex='RS') # only use remote sensing variables   
    df = df[variables]
    
    if model_var == 'ET':
        df_var=t[model_var+'_EC']
    else:
        df_var=t[[model_var+'_SOLO_EC', 'site']] # seperate out the variable we're modelling
    
    x = df.reset_index(drop=True)#.to_numpy()
    y = df_var.reset_index(drop=True)#.to_numpy()
    xx.append(x)
    yy.append(y)

x = pd.concat(xx)
y = pd.concat(yy)

print(x.shape)

(2744, 21)


### Remove features not selected in earlier fitting

In [6]:
train_vars = list(pd.read_csv(features_list))[0:-1]
train_vars=[i[:-3] for i in train_vars]
train_vars = [i+'_RS' for i in train_vars]
train_vars.append('site')

x = x[train_vars]

### Generate 30 iterations of models

For two regression methods (RF and LigtGBM), we remove two randomly selected sites from the training data

Then, we do the per site TSCV: For each site, grab a sequential set of test samples (time-series-split methods), the remaining points (either side of test samples) go into training.  A single K-fold contains test and training samples from every site.

A model is built and saved that is trained of 15 iterations of site removal


In [9]:
%%time
i=0
for m in range(1,16): # 15 iterations for each model
    print(" {:03}/{:03}\r".format(m, len(range(1,16))))
          
    #randomly select two sites to remove from dataset
    subset=np.random.choice(x['site'].unique(), size=2)
    x_n = x[~x.site.isin(subset)]
    y_n = y[~y.site.isin(subset)]

    sites_n = x_n['site'].unique()
    x_n['original_index'] = [i for i in range(0,len(x_n))]
    
    #build TSCV splits across all remaining sites
    train_1=[]
    train_2=[]
    train_3=[]
    train_4=[]
    train_5=[]

    test_1=[]
    test_2=[]
    test_3=[]
    test_4=[]
    test_5=[]

    for site_n in sites_n:
        df = x_n.loc[x_n['site'] == site_n]
        tscv = TimeSeriesSplit(n_splits=5)
        i=1
        for train, test in tscv.split(df):
            all_indices=np.concatenate([train,test])
            left_over = df.loc[~df.index.isin(all_indices)].index.values
            train = np.concatenate([train, left_over])
            if i==1:
                train_1.append(df.iloc[train]['original_index'].values)
                test_1.append(df.iloc[test]['original_index'].values)
            if i==2:
                train_2.append(df.iloc[train]['original_index'].values)
                test_2.append(df.iloc[test]['original_index'].values)
            if i==3:
                train_3.append(df.iloc[train]['original_index'].values)
                test_3.append(df.iloc[test]['original_index'].values)
            if i==4:
                train_4.append(df.iloc[train]['original_index'].values)
                test_4.append(df.iloc[test]['original_index'].values)
            if i==4:
                train_5.append(df.iloc[train]['original_index'].values)
                test_5.append(df.iloc[test]['original_index'].values)
            i+=1

    train_1 = np.concatenate(train_1)
    train_2 = np.concatenate(train_2)
    train_3 = np.concatenate(train_3)
    train_4 = np.concatenate(train_4)
    train_5 = np.concatenate(train_5)

    test_1 = np.concatenate(test_1)
    test_2 = np.concatenate(test_2)
    test_3 = np.concatenate(test_3)
    test_4 = np.concatenate(test_4)
    test_5 = np.concatenate(test_5)

    train = [train_1, train_2, train_3, train_4, train_5]
    test = [test_1, test_2, test_3, test_4, test_5]

    #check there are no train indices in the test indices
    for i,j in zip(train, test):
        assert (np.sum(np.isin(i,j)) == 0)

    #remove the columns we no longer need
    x_n = x_n.drop(['site', 'original_index'], axis=1)
    y_n = y_n.drop('site', axis=1)

    #loop through the two regression methods
    for regressor in [LGBMRegressor,
                      RandomForestRegressor]:

        if isinstance(regressor(), lgbm.sklearn.LGBMRegressor):
            m_name='_lgbm_'

            param_grid = {
                'num_leaves': stats.randint(5,40),
                'min_child_samples':stats.randint(10,30),
                'boosting_type': ['gbdt', 'dart'],
                'max_depth': stats.randint(5,25),
                'n_estimators': [300, 400, 500],
            }

        else:
            m_name='_rf_'

            param_grid = {
                'max_depth': stats.randint(5,25),
                'max_features': ['log2', None, "sqrt"],
                'n_estimators': [300,400,500]}

        print('  Model:', m_name)
        
        clf = RandomizedSearchCV(
               regressor(random_state=0, n_jobs=-1),
               param_grid,
               scoring='r2',
               verbose=0,
               n_iter=200,
               n_jobs=-1,
               cv=zip(train, test)
              )
        
        if m_name=='_rf_':
            clf.fit(x_n, y_n.values.ravel())
        
        else:
            clf.fit(x_n, y_n)
        
        print('  r2 score ', round(clf.best_score_, 2))

        #fit model and save
        model = regressor(**clf.best_params_)
        
        if m_name=='_rf_':
            model.fit(x_n, y_n.values.ravel())
        
        else:
            model.fit(x_n, y_n)

        dump(model, '/g/data/os22/chad_tmp/NEE_modelling/results/models_uncertainty/'+model_var+m_name+str(m)+'.joblib')

    i+=1 

 001/015
  Model: _lgbm_
  r2 score  0.59
  Model: _rf_
  r2 score  0.61
 002/015
  Model: _lgbm_
  r2 score  0.57
  Model: _rf_
  r2 score  0.57
 003/015
  Model: _lgbm_
  r2 score  0.58
  Model: _rf_
  r2 score  0.59
 004/015
  Model: _lgbm_
  r2 score  0.6
  Model: _rf_
  r2 score  0.6
 005/015
  Model: _lgbm_
  r2 score  0.6
  Model: _rf_
  r2 score  0.6
 006/015
  Model: _lgbm_
  r2 score  0.62
  Model: _rf_
  r2 score  0.62
 007/015
  Model: _lgbm_
  r2 score  0.62
  Model: _rf_
  r2 score  0.63
 008/015
  Model: _lgbm_
  r2 score  0.54
  Model: _rf_
  r2 score  0.53
 009/015
  Model: _lgbm_
  r2 score  0.61
  Model: _rf_
  r2 score  0.62
 010/015
  Model: _lgbm_
  r2 score  0.62
  Model: _rf_
  r2 score  0.62
 011/015
  Model: _lgbm_
  r2 score  0.58
  Model: _rf_
  r2 score  0.58
 012/015
  Model: _lgbm_
  r2 score  0.55
  Model: _rf_
  r2 score  0.54
 013/015
  Model: _lgbm_
  r2 score  0.62
  Model: _rf_
  r2 score  0.61
 014/015
  Model: _lgbm_
  r2 score  0.62
  Model: _rf_

In [8]:
    # if os.path.exists('/g/data/os22/chad_tmp/NEE_modelling/results/models_uncertainty/'+model_var+m_name+str(i)+'.joblib'):
    #     print('skipping iteration as model already exists')
    #     continue