# Notebook Objective
The aim of this notebook is to validate the XGB model when used to predict from data that was not part of the training dataset (hidden data).

In [1]:
# Import libraries
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# To work with numpy arrays
import numpy as np
# ML model building, training and testing
import sklearn
# Kfold split and scoring
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
# Build ML pipeline
from sklearn.pipeline import Pipeline
# Liner regression model building
from sklearn.linear_model import LinearRegression
# Polynomial regression
from sklearn.preprocessing import PolynomialFeatures
# sklearn included transformer
from sklearn.preprocessing import QuantileTransformer
# to split data into test and train
from sklearn.model_selection import train_test_split
# Scaling transform for PCA
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
# To run PCA feature reduction
from sklearn.decomposition import PCA
# XGBoost model
from xgboost import XGBRegressor,XGBClassifier
# Random forest regressor
from sklearn.ensemble import RandomForestRegressor
# for regression error calculation
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import accuracy_score
from sklearn.model_selection import learning_curve
# for regression r2 score calculation
from sklearn.metrics import r2_score
# for regression mse score calculation
from sklearn.metrics import mean_squared_error
# For hyperparameter tuning
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
# To transform the target variable into a normal distribution
from sklearn.compose import TransformedTargetRegressor
# visualisation
from matplotlib import pyplot

In [2]:
#import csv file
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [22]:
# Defining a function to train and test models
def train_test_eval_model(df, reg, features, scale = False, test_size= 0.25, n_pca = None, random_state = 42
                     #, early_stop = False
                     ):
  # takes as an input df: dataframe with all independent variables and the output variable. Output variable must be the last column.
  #                   reg: model constructor, example: reg = LinearRegression()
  #                   features: series of features from the dataset to be used for train/test, compute this using X.columns
  #                   scale = Option to scale features prior to training, default = False
  #                   test_size = float, percentage of data to be used for testing. Default = 0.2 (20%)
  #                   n_pca = int, number of principal components to reduce the independent varibles to. Default = None (no pca is done). Recommended to use scale = True if using pca.
  #                   random_state = int, number to set random split. Defeault = 42.
  print(type(reg).__name__)

  X = df.iloc[:,:-1] # Select all independent variables as X
  y = df.iloc[:,-1]  # Select the last variable in the dataframe as the target variable y

  # Splitting the data into test (25%) and train (75%)
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = test_size, random_state= random_state)

  # Normalize independent variables (both train and test) if required
  if scale == True:
    scaler = StandardScaler()
    fit_scaler = scaler.fit(X_train)
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
  else:
    fit_scaler = None

  # Run PCA if required. Recommended to normalize if PCA will be used
  if n_pca is not None:
    pca = PCA(n_components= n_pca)
    X_train = pca.fit_transform(X_train)
    X_test = pca.transform(X_test)

  # Features into a dataframe
  X_train_df= pd.DataFrame(X_train)
  X_test_df= pd.DataFrame(X_test)

  #if early_stop is not False:
  #    reg.fit(X_train_df,y_train, eval_set = [(X_train, y_train), (X_test, y_test)])
  #else:
  #  reg.fit(X_train_df,y_train)

  eval_set = [(X_train_df, y_train), (X_test_df, y_test)]
  #eval_set = [(X_train, y_train), (X_test, y_test)]

  estimator = reg.fit(X_train_df,y_train, eval_metric=[
      #"error", # use for classification
      #'log_loss' # use for classification
      "rmse",
      'mae'
      ], eval_set = eval_set)

  y_pred_reg = reg.predict(X_test_df)

  results = {}
  r2 = r2_score(y_test, y_pred_reg)
  n = len(y_test)
  k = len(features)
  adj_r2_score = 1-(((1-r2)*(n-1))/(n-k-1))
  mae = mean_absolute_error(y_test,y_pred_reg)
  mse = mean_squared_error(y_test,y_pred_reg)

  print("Features used for model training = ", features)
  print("Number of principal components used for model training = ", n_pca)
  print("MAE = ", mae)
  print("MSE = ", mse)
  print("R2 Score = ", r2 )
  print("Adjusted R2 Score = ", adj_r2_score)

  results['estimator'] = estimator
  #results['scaler'] = fit_scaler
  results['X_train'] = X_train
  results['y_train'] = y_train
  results['X_test'] = X_test
  results['y_test'] = y_test
  results['y_pred_reg'] = y_pred_reg
  results['MAE'] = mae
  results['MSE'] = mse
  results['R2 Score'] = r2
  results['Adjusted R2 Score'] = adj_r2_score
  results['name'] = type(reg).__name__
  results['scaler'] = fit_scaler

  return results

## Importing the data of the full Three Rivers 2022 study area

In [24]:
train_df_val = pd.read_csv('/content/drive/threerivers_2022.csv')

In [None]:
#train_df_val.head()

In [None]:
train_df_val.head()

Unnamed: 0,aspect,slope,NDMI,MSI,SR_B5,SAVI,EVI,NDVI,ST_B10,MNDWI,SR_B7,SR_B6,MSAVI,smi
0,148.0,34.0,-0.163047,1.389619,0.23659,0.177631,0.163218,0.27839,315.953187,-0.55147,0.24924,0.32877,0.156521,0.083039
1,128.0,32.0,-0.154491,1.365439,0.230045,0.21535,0.204381,0.354969,316.144596,-0.589347,0.227158,0.314112,0.189767,0.075184
2,116.0,30.0,-0.148676,1.349282,0.23549,0.210712,0.197411,0.340363,315.823302,-0.576456,0.225425,0.317742,0.186172,0.079032
3,105.0,32.0,-0.150462,1.354221,0.235545,0.205507,0.19137,0.330471,315.324271,-0.580547,0.223197,0.31898,0.181468,0.090574
4,105.0,32.0,-0.124275,1.283822,0.235737,0.225911,0.214676,0.369313,314.794478,-0.581383,0.206477,0.302645,0.200023,0.105643


In [25]:
# Droppping some unneeded columns
train_df_val = train_df_val[['aspect','slope','NDMI', 'MSI'
                                ,'SR_B5','SAVI','EVI','NDVI','ST_B10'
                                ,'MNDWI','SR_B7','SR_B6','MSAVI','smi']]

In [26]:
# Splitting independant and target variables
X = train_df_val.drop('smi',1)
y = train_df_val['smi']

  X = train_df_val.drop('smi',1)


In [None]:
X.head()

Unnamed: 0,aspect,slope,NDMI,MSI,SR_B5,SAVI,EVI,NDVI,ST_B10,MNDWI,SR_B7,SR_B6,MSAVI
0,148.0,34.0,-0.163047,1.389619,0.23659,0.177631,0.163218,0.27839,315.953187,-0.55147,0.24924,0.32877,0.156521
1,128.0,32.0,-0.154491,1.365439,0.230045,0.21535,0.204381,0.354969,316.144596,-0.589347,0.227158,0.314112,0.189767
2,116.0,30.0,-0.148676,1.349282,0.23549,0.210712,0.197411,0.340363,315.823302,-0.576456,0.225425,0.317742,0.186172
3,105.0,32.0,-0.150462,1.354221,0.235545,0.205507,0.19137,0.330471,315.324271,-0.580547,0.223197,0.31898,0.181468
4,105.0,32.0,-0.124275,1.283822,0.235737,0.225911,0.214676,0.369313,314.794478,-0.581383,0.206477,0.302645,0.200023


## Model training, testing and validation

#### 1- Using Kfold split:
The model is trained on 9/10 parts of the data (K = 10) and tested on the 10th part of the data. Prediction accuracy is measured by r2 for each fold.

In [27]:
# Creating a cross validation object
cv = KFold(n_splits=10, random_state=0, shuffle=True)

In [28]:
# Creating an XGBoost model (best parameters so far)
#build multiple linear regression model
xgb_reg = XGBRegressor(
      max_depth= 10
    , learning_rate= 0.1 # learning speed
    , n_estimators = 100 # the number of runs XGBoost will try to learn
    , colsample_bylevel = 0.8 # sample of columns to be used to avoid overfitting
    #, gamma=0.1
    #, early_stopping_rounds = 10
    )

In [29]:
# Estimating r2 score for each k-fold CV to evaluate the model accuracy
scores = cross_val_score(xgb_reg, X, y, scoring='r2',
                         cv=cv, n_jobs=-1, verbose = 5)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   37.2s finished


In [30]:
scores

array([0.98203061, 0.98355169, 0.9831965 , 0.98242982, 0.98231374,
       0.98262088, 0.98298449, 0.98397617, 0.98347316, 0.9837771 ])

In [31]:
scores.mean()

0.9830354161015757

#### 2- Using part of the data (25%) as a hidden/holdout dataset:
The Three Rivers 2022 data is split into 75% training/testing and 25% percent is for hidden validation.
The 75% of the data which is used for training/testing is split as 75% training and 25% testing.
*   Full data length = 100%
*   Training data length = 100% * 75% * 75%
*   Testing data length = 100% * 75% * 25%
*   Validation (hidden) data length = 100% * 25%

In [32]:
# Getting the top 80% rows to be used as training/validation data
ThreeRivers_2022_train_df = train_df_val.iloc[:int(len(train_df_val)*0.75), :]

In [33]:
# Getting the bottom 20% rows to be used as the hidden testing data
ThreeRivers_2022_test_df = train_df_val.iloc[int(len(train_df_val)*0.75):, :]

In [34]:
# Confirming that the data was split correctly 80% : 20%
len(ThreeRivers_2022_train_df) + len(ThreeRivers_2022_test_df) - len(train_df_val)

0

In [35]:
len(ThreeRivers_2022_test_df) / len(ThreeRivers_2022_train_df)

0.3333333333333333

In [36]:
# Getting the column names of the training data (required by the training function)
features = list(ThreeRivers_2022_train_df.columns)[:-1]

In [37]:
results_train_noPCA = train_test_eval_model(ThreeRivers_2022_train_df, xgb_reg, features = features, scale = False, test_size= 0.25, n_pca = None)

XGBRegressor
[0]	validation_0-rmse:0.16559	validation_0-mae:0.13153	validation_1-rmse:0.16685	validation_1-mae:0.13220
[1]	validation_0-rmse:0.14966	validation_0-mae:0.11884	validation_1-rmse:0.15089	validation_1-mae:0.11955
[2]	validation_0-rmse:0.13530	validation_0-mae:0.10741	validation_1-rmse:0.13651	validation_1-mae:0.10815
[3]	validation_0-rmse:0.12240	validation_0-mae:0.09714	validation_1-rmse:0.12362	validation_1-mae:0.09792
[4]	validation_0-rmse:0.11083	validation_0-mae:0.08794	validation_1-rmse:0.11206	validation_1-mae:0.08877




[5]	validation_0-rmse:0.10043	validation_0-mae:0.07966	validation_1-rmse:0.10171	validation_1-mae:0.08056
[6]	validation_0-rmse:0.09108	validation_0-mae:0.07223	validation_1-rmse:0.09243	validation_1-mae:0.07320
[7]	validation_0-rmse:0.08271	validation_0-mae:0.06556	validation_1-rmse:0.08416	validation_1-mae:0.06666
[8]	validation_0-rmse:0.07520	validation_0-mae:0.05959	validation_1-rmse:0.07681	validation_1-mae:0.06084
[9]	validation_0-rmse:0.06846	validation_0-mae:0.05423	validation_1-rmse:0.07020	validation_1-mae:0.05562
[10]	validation_0-rmse:0.06245	validation_0-mae:0.04944	validation_1-rmse:0.06435	validation_1-mae:0.05099
[11]	validation_0-rmse:0.05706	validation_0-mae:0.04515	validation_1-rmse:0.05916	validation_1-mae:0.04689
[12]	validation_0-rmse:0.05223	validation_0-mae:0.04131	validation_1-rmse:0.05455	validation_1-mae:0.04326
[13]	validation_0-rmse:0.04794	validation_0-mae:0.03791	validation_1-rmse:0.05048	validation_1-mae:0.04005
[14]	validation_0-rmse:0.04411	validation_

In [38]:
# Model prediction accuracy on the validation data
results_train_noPCA['R2 Score']

0.9825933644407593

In [39]:
# Getting model prediction from hidden (holdout) part of the data
X_hidden = ThreeRivers_2022_test_df.drop('smi',1)
y_hidden = ThreeRivers_2022_test_df['smi']
y_pred_hidden = results_train_noPCA['estimator'].predict(X_hidden)

  X_hidden = ThreeRivers_2022_test_df.drop('smi',1)


In [40]:
# Model prediction accuracy on the hidden test data
hidden_test_r2 = r2_score(y_hidden,y_pred_hidden)
print('r2 score for the hidden test prediction', hidden_test_r2)

r2 score for the hidden test prediction 0.9684379267763817


#### 3- Training the model on Barstow 2017 data and testing on hidden 2022 data

In [5]:
updated_barstow_2017 = pd.read_csv('/content/drive/barstow_2017.csv')

In [6]:
updated_barstow_2022 = pd.read_csv('/content/drive/barstow_lg_2022.csv')

In [7]:
# Droppping some unneeded columns
updated_barstow_2017 = updated_barstow_2017[['aspect','slope','NDMI'
                                ,'SR_B5','SAVI','EVI','NDVI','ST_B10'
                                ,'MNDWI','SR_B7','SR_B6','MSAVI','smi']]
updated_barstow_2022 = updated_barstow_2022[['aspect','slope','NDMI'
                                ,'SR_B5','SAVI','EVI','NDVI','ST_B10'
                                ,'MNDWI','SR_B7','SR_B6','MSAVI','smi']]
X_2017 = updated_barstow_2017.drop('smi',1)
y_2017 = updated_barstow_2017['smi']
X_2022 = updated_barstow_2022.drop('smi',1)
y_2022 = updated_barstow_2022['smi']

  X_2017 = updated_barstow_2017.drop('smi',1)
  X_2022 = updated_barstow_2022.drop('smi',1)


In [8]:
features = list(updated_barstow_2017.columns)[:-1]

In [41]:
training_results_barstow_2017 = train_test_eval_model(updated_barstow_2017, xgb_reg, features = features, scale = False, test_size= 0.25, n_pca = None)

XGBRegressor




[0]	validation_0-rmse:0.06727	validation_0-mae:0.04985	validation_1-rmse:0.06745	validation_1-mae:0.04992
[1]	validation_0-rmse:0.06242	validation_0-mae:0.04623	validation_1-rmse:0.06262	validation_1-mae:0.04631
[2]	validation_0-rmse:0.05818	validation_0-mae:0.04306	validation_1-rmse:0.05842	validation_1-mae:0.04315
[3]	validation_0-rmse:0.05449	validation_0-mae:0.04031	validation_1-rmse:0.05475	validation_1-mae:0.04040
[4]	validation_0-rmse:0.05123	validation_0-mae:0.03787	validation_1-rmse:0.05152	validation_1-mae:0.03797
[5]	validation_0-rmse:0.04839	validation_0-mae:0.03574	validation_1-rmse:0.04871	validation_1-mae:0.03585
[6]	validation_0-rmse:0.04588	validation_0-mae:0.03388	validation_1-rmse:0.04624	validation_1-mae:0.03401
[7]	validation_0-rmse:0.04369	validation_0-mae:0.03225	validation_1-rmse:0.04410	validation_1-mae:0.03239
[8]	validation_0-rmse:0.04177	validation_0-mae:0.03082	validation_1-rmse:0.04223	validation_1-mae:0.03097
[9]	validation_0-rmse:0.04014	validation_0-mae

In [42]:
# Model prediction accuracy on the validation data
training_results_barstow_2017['R2 Score']

0.8459780873676558

In [43]:
# Getting model prediction from hidden (holdout) part of the data
y_pred_2022 = training_results_barstow_2017['estimator'].predict(X_2022)

In [44]:
# Model prediction accuracy on the hidden (holdout) part of the data
r2_2022 = r2_score(y_2022, y_pred_2022)
print('r2 score for the hidden test prediction', r2_2022)

r2 score for the hidden test prediction -3.0106336917920276


https://stackoverflow.com/questions/23036866/scikit-learn-is-returning-coefficient-of-determination-r2-values-less-than-1
Since we will train the model on a dataset then attmpt to get predictions on another dataset, the training data needs to be scaled as part of the pre-processing prior to the model training. Scaling will cause the data to have a mean of 0 and a standard deviation of 1. The reason why this is required is that the regression model will train by calculating and reducing the errors between the mean of the data points and the best fit trend. The mean of the training data will differ from that of the test data (since it is a different dataset). This will lead to a r2 value which is outside of the [-1:1] range

#### Reattempt method 3 while using a standard scaler

In [45]:
training_results_barstow_2017_scale = train_test_eval_model(updated_barstow_2017, xgb_reg, features = features, scale = True, test_size= 0.25, n_pca = None)

XGBRegressor




[0]	validation_0-rmse:0.06727	validation_0-mae:0.04985	validation_1-rmse:0.06745	validation_1-mae:0.04992
[1]	validation_0-rmse:0.06242	validation_0-mae:0.04623	validation_1-rmse:0.06262	validation_1-mae:0.04631
[2]	validation_0-rmse:0.05818	validation_0-mae:0.04306	validation_1-rmse:0.05842	validation_1-mae:0.04315
[3]	validation_0-rmse:0.05449	validation_0-mae:0.04031	validation_1-rmse:0.05475	validation_1-mae:0.04040
[4]	validation_0-rmse:0.05123	validation_0-mae:0.03787	validation_1-rmse:0.05152	validation_1-mae:0.03797
[5]	validation_0-rmse:0.04839	validation_0-mae:0.03574	validation_1-rmse:0.04871	validation_1-mae:0.03585
[6]	validation_0-rmse:0.04588	validation_0-mae:0.03388	validation_1-rmse:0.04624	validation_1-mae:0.03401
[7]	validation_0-rmse:0.04369	validation_0-mae:0.03225	validation_1-rmse:0.04410	validation_1-mae:0.03239
[8]	validation_0-rmse:0.04177	validation_0-mae:0.03081	validation_1-rmse:0.04223	validation_1-mae:0.03097
[9]	validation_0-rmse:0.04014	validation_0-mae

In [46]:
training_results_barstow_2017_scale['R2 Score']

0.8456761950110466

In [47]:
# Applying standard scalar to the 2022 data X features. The scaler should be fit to the training data (since the testing data is hidden)
scaled_X_2022 = training_results_barstow_2017_scale['scaler'].transform(X_2022.values)



In [48]:
# Getting model prediction from hidden (holdout) part of the data
y_pred_2022_scale = training_results_barstow_2017_scale['estimator'].predict(scaled_X_2022)

In [50]:
# Model prediction accuracy on the hidden 2022 test data
hidden_2022_test_r2_scaled = r2_score(y_2022,y_pred_2022_scale)
hidden_2022_test_r2_scaled

-3.017788476297018

In [51]:
hidden_2022_test_mae = mean_absolute_error(y_2022,y_pred_2022_scale)
hidden_2022_test_mae

0.14746839935350964

In [52]:
hidden_2022_test_mse = mean_squared_error(y_2022,y_pred_2022_scale)
hidden_2022_test_mse

0.025725778376965693

#### 4- Train model on ThreeRivers22 data and test on ThreeRivers21 data (with and without scaling)

In [73]:
train_df = pd.read_csv('/content/drive/threerivers_2022.csv')

In [74]:
train_df = train_df[['aspect','slope','NDMI'
                                 ,'SR_B5','SAVI','EVI','NDVI','ST_B10'
                                 ,'MNDWI','SR_B7','SR_B6','MSAVI','smi']]

In [75]:
test_df = pd.read_csv('/content/drive/threerivers_2021.csv')

In [76]:
test_df = test_df[['aspect','slope','NDMI'
                                 ,'SR_B5','SAVI','EVI','NDVI','ST_B10'
                                 ,'MNDWI','SR_B7','SR_B6','MSAVI','smi']]

In [77]:
X_2022 = train_df.drop('smi',1)
y_2022 = train_df['smi']
X_2021 = test_df.drop('smi',1)
y_2021 = test_df['smi']

  X_2022 = train_df.drop('smi',1)
  X_2021 = test_df.drop('smi',1)


In [81]:
features = list(train_df.columns)[:-1]

In [79]:
training_results_threerivers_2022 = train_test_eval_model(train_df, xgb_reg, features = features, scale = False, test_size= 0.25, n_pca = None)

XGBRegressor
[0]	validation_0-rmse:0.17064	validation_0-mae:0.13990	validation_1-rmse:0.16922	validation_1-mae:0.13824
[1]	validation_0-rmse:0.15413	validation_0-mae:0.12631	validation_1-rmse:0.15290	validation_1-mae:0.12488
[2]	validation_0-rmse:0.13931	validation_0-mae:0.11411	validation_1-rmse:0.13828	validation_1-mae:0.11290
[3]	validation_0-rmse:0.12598	validation_0-mae:0.10312	validation_1-rmse:0.12518	validation_1-mae:0.10217




[4]	validation_0-rmse:0.11403	validation_0-mae:0.09326	validation_1-rmse:0.11345	validation_1-mae:0.09254
[5]	validation_0-rmse:0.10330	validation_0-mae:0.08441	validation_1-rmse:0.10295	validation_1-mae:0.08390
[6]	validation_0-rmse:0.09365	validation_0-mae:0.07645	validation_1-rmse:0.09352	validation_1-mae:0.07617
[7]	validation_0-rmse:0.08502	validation_0-mae:0.06933	validation_1-rmse:0.08512	validation_1-mae:0.06928
[8]	validation_0-rmse:0.07726	validation_0-mae:0.06293	validation_1-rmse:0.07758	validation_1-mae:0.06309
[9]	validation_0-rmse:0.07031	validation_0-mae:0.05720	validation_1-rmse:0.07090	validation_1-mae:0.05761
[10]	validation_0-rmse:0.06409	validation_0-mae:0.05207	validation_1-rmse:0.06498	validation_1-mae:0.05275
[11]	validation_0-rmse:0.05856	validation_0-mae:0.04750	validation_1-rmse:0.05971	validation_1-mae:0.04842
[12]	validation_0-rmse:0.05361	validation_0-mae:0.04343	validation_1-rmse:0.05507	validation_1-mae:0.04460
[13]	validation_0-rmse:0.04920	validation_0

In [84]:
trained_model = training_results_threerivers_2022['estimator']

In [85]:
# splitting the test data to separate dependant and indepenedant variables .. will use y for a hidden prediction
X = test_df.drop('smi', axis = 1)
y = test_df['smi']

Prediction with no scaling in preprocessing

In [86]:
y_pred_threerivers21 = trained_model.predict(X)

In [87]:
r2_threerivers21 = r2_score(y, y_pred_threerivers21)
r2_threerivers21

0.6726809150054167

In [88]:
# Train/test predictions with scaling
train_results_threerivers22_scale = train_test_eval_model(train_df, xgb_reg, features = features, scale = True, test_size= 0.2, n_pca = None)

XGBRegressor




[0]	validation_0-rmse:0.17056	validation_0-mae:0.13989	validation_1-rmse:0.16912	validation_1-mae:0.13794
[1]	validation_0-rmse:0.15406	validation_0-mae:0.12631	validation_1-rmse:0.15284	validation_1-mae:0.12465
[2]	validation_0-rmse:0.13924	validation_0-mae:0.11410	validation_1-rmse:0.13821	validation_1-mae:0.11270
[3]	validation_0-rmse:0.12592	validation_0-mae:0.10312	validation_1-rmse:0.12509	validation_1-mae:0.10198
[4]	validation_0-rmse:0.11397	validation_0-mae:0.09326	validation_1-rmse:0.11336	validation_1-mae:0.09236
[5]	validation_0-rmse:0.10325	validation_0-mae:0.08440	validation_1-rmse:0.10284	validation_1-mae:0.08372
[6]	validation_0-rmse:0.09361	validation_0-mae:0.07645	validation_1-rmse:0.09345	validation_1-mae:0.07602
[7]	validation_0-rmse:0.08498	validation_0-mae:0.06933	validation_1-rmse:0.08505	validation_1-mae:0.06913
[8]	validation_0-rmse:0.07723	validation_0-mae:0.06295	validation_1-rmse:0.07757	validation_1-mae:0.06298
[9]	validation_0-rmse:0.07029	validation_0-mae

In [89]:
trained_model_scale = train_results_threerivers22_scale['estimator']

In [90]:
scaled_X = train_results_threerivers22_scale['scaler'].transform(X.values)



Prediction with scaling in preprocessing

In [91]:
y_pred_threerivers21_scale = trained_model_scale.predict(scaled_X)

In [92]:
r2_threerivers21_scale = r2_score(y, y_pred_threerivers21_scale)
r2_threerivers21_scale

0.6728824954203343

###5- Train model on ThreeRivers22 data and test on Mariposa21 data

In [105]:
test_df = pd.read_csv('/content/drive/mariposa_2021.csv')

In [106]:
test_df = test_df[['aspect','slope','NDMI'
                                 ,'SR_B5','SAVI','EVI','NDVI','ST_B10'
                                 ,'MNDWI','SR_B7','SR_B6','MSAVI','smi']]

In [107]:
X = test_df.drop('smi', axis = 1)
y = test_df['smi']

In [108]:
training_results_threerivers_2022 = train_test_eval_model(train_df, xgb_reg, features = features, scale = False, test_size= 0.25, n_pca = None)

XGBRegressor
[0]	validation_0-rmse:0.17064	validation_0-mae:0.13990	validation_1-rmse:0.16922	validation_1-mae:0.13824
[1]	validation_0-rmse:0.15413	validation_0-mae:0.12631	validation_1-rmse:0.15290	validation_1-mae:0.12488
[2]	validation_0-rmse:0.13931	validation_0-mae:0.11411	validation_1-rmse:0.13828	validation_1-mae:0.11290
[3]	validation_0-rmse:0.12598	validation_0-mae:0.10312	validation_1-rmse:0.12518	validation_1-mae:0.10217




[4]	validation_0-rmse:0.11403	validation_0-mae:0.09326	validation_1-rmse:0.11345	validation_1-mae:0.09254
[5]	validation_0-rmse:0.10330	validation_0-mae:0.08441	validation_1-rmse:0.10295	validation_1-mae:0.08390
[6]	validation_0-rmse:0.09365	validation_0-mae:0.07645	validation_1-rmse:0.09352	validation_1-mae:0.07617
[7]	validation_0-rmse:0.08502	validation_0-mae:0.06933	validation_1-rmse:0.08512	validation_1-mae:0.06928
[8]	validation_0-rmse:0.07726	validation_0-mae:0.06293	validation_1-rmse:0.07758	validation_1-mae:0.06309
[9]	validation_0-rmse:0.07031	validation_0-mae:0.05720	validation_1-rmse:0.07090	validation_1-mae:0.05761
[10]	validation_0-rmse:0.06409	validation_0-mae:0.05207	validation_1-rmse:0.06498	validation_1-mae:0.05275
[11]	validation_0-rmse:0.05856	validation_0-mae:0.04750	validation_1-rmse:0.05971	validation_1-mae:0.04842
[12]	validation_0-rmse:0.05361	validation_0-mae:0.04343	validation_1-rmse:0.05507	validation_1-mae:0.04460
[13]	validation_0-rmse:0.04920	validation_0

Prediction with no scaling in preprocessing

In [109]:
y_pred_mariposa21 = trained_model.predict(X)

In [110]:
r2_Mariposa21 = r2_score(y, y_pred_mariposa21)
r2_Mariposa21

-1.297874833163192

###6- Train model on  Mariposa21 and test on  Mariposa22

In [111]:
train_df = pd.read_csv('/content/drive/mariposa_2021.csv')
test_df = pd.read_csv('/content/drive/mariposa_2022.csv')

In [112]:
train_df = train_df[['aspect', 'slope', 'EVI', 'MNDWI', 'MSAVI', 'NDMI','NDVI', 'SAVI', 'SR_B5', 'SR_B6', 'SR_B7','ST_B10','smi']]
test_df = test_df[['aspect', 'slope', 'EVI', 'MNDWI', 'MSAVI', 'NDMI','NDVI', 'SAVI', 'SR_B5', 'SR_B6', 'SR_B7','ST_B10','smi']]

In [113]:
# splitting train_df to train and val
X = train_df.drop('smi', axis = 1)
y = train_df['smi']

In [114]:
features = list(X.columns)

In [115]:
train_results_mariposa21 = train_test_eval_model(train_df, xgb_reg, features = features, scale = False, test_size= 0.25, n_pca = None)

XGBRegressor
[0]	validation_0-rmse:0.12168	validation_0-mae:0.09370	validation_1-rmse:0.12302	validation_1-mae:0.09507
[1]	validation_0-rmse:0.11238	validation_0-mae:0.08636	validation_1-rmse:0.11409	validation_1-mae:0.08794
[2]	validation_0-rmse:0.10412	validation_0-mae:0.07986	validation_1-rmse:0.10627	validation_1-mae:0.08169
[3]	validation_0-rmse:0.09688	validation_0-mae:0.07416	validation_1-rmse:0.09941	validation_1-mae:0.07623




[4]	validation_0-rmse:0.09043	validation_0-mae:0.06910	validation_1-rmse:0.09342	validation_1-mae:0.07146
[5]	validation_0-rmse:0.08476	validation_0-mae:0.06465	validation_1-rmse:0.08820	validation_1-mae:0.06733
[6]	validation_0-rmse:0.07976	validation_0-mae:0.06072	validation_1-rmse:0.08368	validation_1-mae:0.06371
[7]	validation_0-rmse:0.07535	validation_0-mae:0.05725	validation_1-rmse:0.07981	validation_1-mae:0.06058
[8]	validation_0-rmse:0.07148	validation_0-mae:0.05418	validation_1-rmse:0.07644	validation_1-mae:0.05781
[9]	validation_0-rmse:0.06812	validation_0-mae:0.05149	validation_1-rmse:0.07361	validation_1-mae:0.05543
[10]	validation_0-rmse:0.06514	validation_0-mae:0.04911	validation_1-rmse:0.07115	validation_1-mae:0.05335
[11]	validation_0-rmse:0.06257	validation_0-mae:0.04703	validation_1-rmse:0.06909	validation_1-mae:0.05159
[12]	validation_0-rmse:0.06027	validation_0-mae:0.04517	validation_1-rmse:0.06737	validation_1-mae:0.05008
[13]	validation_0-rmse:0.05823	validation_0

In [116]:
trained_model = train_results_mariposa21['estimator']

In [117]:
X = test_df.drop('smi', axis = 1)
y = test_df['smi']

In [118]:
y_pred_mariposa22 = trained_model.predict(X)

In [119]:
r2_Mariposa22 = r2_score(y, y_pred_mariposa22)
r2_Mariposa22

0.7458687065063063