<a href="https://colab.research.google.com/github/belalabouzaid/siads699_team13_collab/blob/main/Notebooks/5b-XGB_Model_Train_Test_Validate.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Notebook Objective
The aim of this notebook is to validate the XGB model when used to predict from data that was not part of the training dataset (hidden data).

In [None]:
# Import libraries
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# To work with numpy arrays
import numpy as np
# ML model building, training and testing
import sklearn
# Kfold split and scoring
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
# Build ML pipeline
from sklearn.pipeline import Pipeline
# Linear regression model building
from sklearn.linear_model import LinearRegression
# Polynomial regression
from sklearn.preprocessing import PolynomialFeatures
# sklearn included transformer
from sklearn.preprocessing import QuantileTransformer
# to split data into test and train
from sklearn.model_selection import train_test_split
# Scaling transform for PCA
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
# To run PCA feature reduction
from sklearn.decomposition import PCA
# XGBoost model
from xgboost import XGBRegressor,XGBClassifier
# Random forest regressor
from sklearn.ensemble import RandomForestRegressor
# for regression error calculation
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import accuracy_score
from sklearn.model_selection import learning_curve
# for regression r2 score calculation
from sklearn.metrics import r2_score
# for regression mse score calculation
from sklearn.metrics import mean_squared_error
# For hyperparameter tuning
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
# To transform the target variable into a normal distribution
from sklearn.compose import TransformedTargetRegressor
# visualisation
from matplotlib import pyplot

In [None]:
# Defining a function to train and test models
def train_test_eval_model(df, reg, features, scale=False, test_size=0.25, n_pca=None, random_state=42):
  """Split ``df`` into train/test parts, fit ``reg`` and report hold-out metrics.

  Parameters
  ----------
  df : pandas.DataFrame
      Independent variables followed by the target variable; the target must
      be the LAST column.
  reg : estimator
      Model instance exposing ``fit``/``predict``, e.g. ``XGBRegressor()`` or
      ``LinearRegression()``. It is fitted in place.
  features : sequence
      Feature names (e.g. ``X.columns``). Only printed for reference; the
      model always uses every column of ``df`` except the last one.
  scale : bool, default False
      Standardise the features; the scaler is fitted on the training part only.
  test_size : float, default 0.25
      Fraction of the rows held out for testing.
  n_pca : int or None, default None
      Number of principal components to reduce the features to. ``None``
      skips PCA. ``scale=True`` is recommended when PCA is used.
  random_state : int, default 42
      Seed for the train/test split.

  Returns
  -------
  dict
      ``estimator`` (the object returned by ``reg.fit``; for scikit-learn and
      XGBoost estimators that is ``reg`` itself, so calling this function
      again with the same ``reg`` refits the previously returned model),
      ``scaler`` and ``pca`` (fitted transformers or ``None``), ``X_train``,
      ``y_train``, ``X_test``, ``y_test``, ``y_pred_reg`` (test predictions),
      ``MAE``, ``MSE``, ``RMSE``, ``R2 test Score``, ``R2 train Score``,
      ``Adjusted test R2 Score`` and ``name`` (class name of ``reg``).
  """
  model_name = type(reg).__name__
  print(model_name)

  X = df.iloc[:, :-1]  # every column but the last is an independent variable
  y = df.iloc[:, -1]   # the last column is the target

  X_train, X_test, y_train, y_test = train_test_split(
      X, y, test_size=test_size, random_state=random_state)

  # Standardise using statistics of the training part only (single fit).
  fit_scaler = None
  if scale:
    fit_scaler = StandardScaler()
    X_train = fit_scaler.fit_transform(X_train)
    X_test = fit_scaler.transform(X_test)

  # Optional dimensionality reduction, also fitted on the training part only.
  fit_pca = None
  if n_pca is not None:
    fit_pca = PCA(n_components=n_pca)
    X_train = fit_pca.fit_transform(X_train)
    X_test = fit_pca.transform(X_test)

  X_train_df = pd.DataFrame(X_train)
  X_test_df = pd.DataFrame(X_test)

  if isinstance(reg, (XGBRegressor, XGBClassifier)):
    # Passing `eval_metric` to fit() is deprecated since XGBoost 1.6 and
    # rejected by later releases, so it is configured on the estimator instead.
    reg.set_params(eval_metric=["rmse", "mae"])
    # Order matters for the training log: validation_0 = train, validation_1 = test.
    eval_set = [(X_train_df, y_train), (X_test_df, y_test)]
    estimator = reg.fit(X_train_df, y_train, eval_set=eval_set)
  else:
    # Plain scikit-learn estimators do not accept eval_set / eval_metric.
    estimator = reg.fit(X_train_df, y_train)

  y_pred_reg_test = reg.predict(X_test_df)
  y_pred_reg_train = reg.predict(X_train_df)

  r2_test = r2_score(y_test, y_pred_reg_test)
  r2_train = r2_score(y_train, y_pred_reg_train)

  # Adjusted R2 must use the number of predictors the model was actually
  # fitted on (this differs from len(features) when PCA is applied).
  n = len(y_test)
  k = X_test_df.shape[1]
  dof = n - k - 1
  if dof > 0:
    adj_r2_score = 1 - ((1 - r2_test) * (n - 1)) / dof
  else:
    adj_r2_score = float("nan")  # undefined with too few test rows

  mae = mean_absolute_error(y_test, y_pred_reg_test)
  mse = mean_squared_error(y_test, y_pred_reg_test)
  rmse = np.sqrt(mse)

  print("Features used for model training = ", features)
  print("Number of principal components used for model training = ", n_pca)
  print("Min predicted value = ", np.min(y_pred_reg_test))
  print("Max predicted value = ", np.max(y_pred_reg_test))
  print("MAE = ", mae)
  print("MSE = ", mse)
  print("RMSE = ", rmse)
  print("R2 test Score = ", r2_test )
  print("R2 train Score = ", r2_train )
  print("Adjusted test R2 Score = ", adj_r2_score)

  results = {
      'estimator': estimator,
      'scaler': fit_scaler,
      'pca': fit_pca,
      'X_train': X_train,
      'y_train': y_train,
      'X_test': X_test,
      'y_test': y_test,
      'y_pred_reg': y_pred_reg_test,
      'MAE': mae,
      'MSE': mse,
      'RMSE': rmse,
      'R2 test Score': r2_test,
      'R2 train Score': r2_train,
      'Adjusted test R2 Score': adj_r2_score,
      'name': model_name,
  }

  return results

In [None]:
# XGBoost regressor configured with the best hyperparameters found so far.
# Currently disabled options: gamma=0.1, early_stopping_rounds=10
xgb_reg = XGBRegressor(
    objective='reg:squarederror',
    n_estimators=100,       # number of boosting rounds
    learning_rate=0.1,      # step-size shrinkage applied at each round
    max_depth=10,           # maximum depth of each tree
    colsample_bylevel=0.7,  # fraction of columns sampled per tree level, to limit overfitting
)

# Training Data: ThreeRivers 22

## Train model on ThreeRivers22 data and test on ThreeRivers21 data

In [None]:
# Load the ThreeRivers 2022 dataset (used for training) from the project repository
train_df = pd.read_csv('https://github.com/belalabouzaid/siads699_team13_collab/raw/main/Data/threerivers_2022.csv')

In [None]:
# Keep the 13 predictor columns plus the target. 'smi' must stay last because
# train_test_eval_model treats the final column of the frame as the target.
train_df = train_df.loc[:, [
    'aspect', 'slope', 'NDMI', 'MSI',
    'SR_B5', 'SAVI', 'EVI', 'NDVI', 'ST_B10',
    'MNDWI', 'SR_B7', 'SR_B6', 'MSAVI',
    'smi',
]]

In [None]:
# Load the ThreeRivers 2021 dataset; it is never passed to training and is only used for prediction
test_df = pd.read_csv('https://github.com/belalabouzaid/siads699_team13_collab/raw/main/Data/threerivers_2021.csv')

In [None]:
# Restrict the hidden data to the same columns, in the same order, as train_df
test_df = test_df.loc[:, [
    'aspect', 'slope', 'NDMI', 'MSI',
    'SR_B5', 'SAVI', 'EVI', 'NDVI', 'ST_B10',
    'MNDWI', 'SR_B7', 'SR_B6', 'MSAVI',
    'smi',
]]

In [None]:
# Predictors / target ('smi') for the 2022 training year
y_2022 = train_df['smi']
X_2022 = train_df.drop(columns=['smi'])
# Predictors / target ('smi') for the 2021 hidden year
y_2021 = test_df['smi']
X_2021 = test_df.drop(columns=['smi'])

In [None]:
# Feature names = every column except the last one (the 'smi' target)
features = train_df.columns[:-1].tolist()

In [None]:
# Fit and evaluate xgb_reg on a 75/25 split of the ThreeRivers 2022 frame (no scaling, no PCA)
training_results_threerivers_2022 = train_test_eval_model(
    train_df,
    xgb_reg,
    features=features,
    scale=False,
    test_size=0.25,
    n_pca=None,
)

XGBRegressor




[0]	validation_0-rmse:0.03938	validation_0-mae:0.03031	validation_1-rmse:0.03984	validation_1-mae:0.03024
[1]	validation_0-rmse:0.03607	validation_0-mae:0.02759	validation_1-rmse:0.03668	validation_1-mae:0.02760
[2]	validation_0-rmse:0.03313	validation_0-mae:0.02516	validation_1-rmse:0.03390	validation_1-mae:0.02527
[3]	validation_0-rmse:0.03049	validation_0-mae:0.02299	validation_1-rmse:0.03145	validation_1-mae:0.02321
[4]	validation_0-rmse:0.02814	validation_0-mae:0.02106	validation_1-rmse:0.02928	validation_1-mae:0.02139
[5]	validation_0-rmse:0.02604	validation_0-mae:0.01934	validation_1-rmse:0.02739	validation_1-mae:0.01979
[6]	validation_0-rmse:0.02417	validation_0-mae:0.01782	validation_1-rmse:0.02573	validation_1-mae:0.01839
[7]	validation_0-rmse:0.02252	validation_0-mae:0.01648	validation_1-rmse:0.02427	validation_1-mae:0.01717
[8]	validation_0-rmse:0.02106	validation_0-mae:0.01530	validation_1-rmse:0.02297	validation_1-mae:0.01611
[9]	validation_0-rmse:0.01974	validation_0-mae

In [None]:
# Fitted estimator returned by the ThreeRivers 2022 training run
trained_model = training_results_threerivers_2022['estimator']

In [None]:
# Split the hidden ThreeRivers 2021 data into independent variables (X) and the
# dependent variable (y); y is only used to score the hidden-data predictions.
y = test_df['smi']
X = test_df.drop(columns=['smi'])

In [None]:
# Predict 'smi' for the ThreeRivers 2021 rows with the ThreeRivers22-trained model
y_pred_threerivers21 = trained_model.predict(X)

In [None]:
# R2 of the ThreeRivers22-trained model on the ThreeRivers 2021 data
r2_threerivers21 = r2_score(y_true=y, y_pred=y_pred_threerivers21)
r2_threerivers21

0.7655014386797723

In [None]:
# Error metrics: trained on ThreeRivers22 (TR22), validated on ThreeRivers21 (TR21)
val_TR22_TR21_mae = mean_absolute_error(y_true=y_2021, y_pred=y_pred_threerivers21)
val_TR22_TR21_mse = mean_squared_error(y_true=y_2021, y_pred=y_pred_threerivers21)
# RMSE is the square root of the MSE
val_TR22_TR21_rmse = np.sqrt(val_TR22_TR21_mse)

In [None]:
# Display the RMSE for train = ThreeRivers22, validation = ThreeRivers21
val_TR22_TR21_rmse

0.016971943399630627

## Train model on ThreeRivers22 data and test on Mariposa21 data

In [None]:
# Load the Mariposa 2021 dataset as the new hidden data (replaces the ThreeRivers 2021 test_df)
test_df = pd.read_csv('https://github.com/belalabouzaid/siads699_team13_collab/raw/main/Data/mariposa_2021.csv')

In [None]:
# Restrict the Mariposa 2021 data to the 13 predictor columns plus the 'smi' target
test_df = test_df.loc[:, [
    'aspect', 'slope', 'NDMI', 'MSI',
    'SR_B5', 'SAVI', 'EVI', 'NDVI', 'ST_B10',
    'MNDWI', 'SR_B7', 'SR_B6', 'MSAVI',
    'smi',
]]

In [None]:
# Split the hidden Mariposa 2021 data into predictors (X) and the 'smi' target (y)
y = test_df['smi']
X = test_df.drop(columns=['smi'])

In [None]:
# Re-fit xgb_reg on the ThreeRivers 2022 frame (train_df has not changed since the first run)
training_results_threerivers_2022 = train_test_eval_model(
    train_df, xgb_reg, features=features, scale=False, test_size=0.25, n_pca=None)
# Rebind trained_model to this run's estimator so the prediction below does not
# silently rely on a variable assigned in an earlier section of the notebook.
trained_model = training_results_threerivers_2022['estimator']

XGBRegressor




[0]	validation_0-rmse:0.03938	validation_0-mae:0.03031	validation_1-rmse:0.03984	validation_1-mae:0.03024
[1]	validation_0-rmse:0.03607	validation_0-mae:0.02759	validation_1-rmse:0.03668	validation_1-mae:0.02760
[2]	validation_0-rmse:0.03313	validation_0-mae:0.02516	validation_1-rmse:0.03390	validation_1-mae:0.02527
[3]	validation_0-rmse:0.03049	validation_0-mae:0.02299	validation_1-rmse:0.03145	validation_1-mae:0.02321
[4]	validation_0-rmse:0.02814	validation_0-mae:0.02106	validation_1-rmse:0.02928	validation_1-mae:0.02139
[5]	validation_0-rmse:0.02604	validation_0-mae:0.01934	validation_1-rmse:0.02739	validation_1-mae:0.01979
[6]	validation_0-rmse:0.02417	validation_0-mae:0.01782	validation_1-rmse:0.02573	validation_1-mae:0.01839
[7]	validation_0-rmse:0.02252	validation_0-mae:0.01648	validation_1-rmse:0.02427	validation_1-mae:0.01717
[8]	validation_0-rmse:0.02106	validation_0-mae:0.01530	validation_1-rmse:0.02297	validation_1-mae:0.01611
[9]	validation_0-rmse:0.01974	validation_0-mae

In [None]:
# Predict 'smi' for the Mariposa 2021 rows with the ThreeRivers22-trained model
y_pred_mariposa21 = trained_model.predict(X)

In [None]:
# R2 of the ThreeRivers22-trained model on the Mariposa 2021 data
r2_Mariposa21 = r2_score(y_true=y, y_pred=y_pred_mariposa21)
r2_Mariposa21

0.28933526722082037

In [None]:
# Error metrics: trained on ThreeRivers22 (TR22), validated on Mariposa21 (M21)
val_TR22_M21_mae = mean_absolute_error(y_true=y, y_pred=y_pred_mariposa21)
val_TR22_M21_mse = mean_squared_error(y_true=y, y_pred=y_pred_mariposa21)
# RMSE is the square root of the MSE
val_TR22_M21_rmse = np.sqrt(val_TR22_M21_mse)

In [None]:
# Display the RMSE for train = ThreeRivers22, validation = Mariposa21
val_TR22_M21_rmse

0.11161015324497962

# Training Data: Mariposa 21

## Train model on Mariposa21 and test on Mariposa22

In [None]:
# Mariposa 2021 is the training data for this section
train_df = pd.read_csv('https://github.com/belalabouzaid/siads699_team13_collab/raw/main/Data/mariposa_2021.csv')
# Mariposa 2022 is the hidden data used only for prediction
test_df = pd.read_csv('https://github.com/belalabouzaid/siads699_team13_collab/raw/main/Data/mariposa_2022.csv')

In [None]:
# Keep the 13 predictor columns plus the target in both frames. 'smi' must stay
# last in train_df because train_test_eval_model treats the final column as the target.
train_df = train_df.loc[:, [
    'aspect', 'slope', 'NDMI', 'MSI',
    'SR_B5', 'SAVI', 'EVI', 'NDVI', 'ST_B10',
    'MNDWI', 'SR_B7', 'SR_B6', 'MSAVI',
    'smi',
]]
test_df = test_df.loc[:, [
    'aspect', 'slope', 'NDMI', 'MSI',
    'SR_B5', 'SAVI', 'EVI', 'NDVI', 'ST_B10',
    'MNDWI', 'SR_B7', 'SR_B6', 'MSAVI',
    'smi',
]]

In [None]:
# Separate the Mariposa 2021 training frame into predictors (X) and the 'smi' target (y).
# The train/test split itself happens inside train_test_eval_model.
y = train_df['smi']
X = train_df.drop(columns=['smi'])

In [None]:
# Names of the predictor columns
features = X.columns.tolist()

In [None]:
# Fit and evaluate xgb_reg on a 75/25 split of the Mariposa 2021 frame (no scaling, no PCA)
train_results_mariposa21 = train_test_eval_model(
    train_df,
    xgb_reg,
    features=features,
    scale=False,
    test_size=0.25,
    n_pca=None,
)

XGBRegressor




[0]	validation_0-rmse:0.12168	validation_0-mae:0.09371	validation_1-rmse:0.12303	validation_1-mae:0.09507
[1]	validation_0-rmse:0.11239	validation_0-mae:0.08638	validation_1-rmse:0.11406	validation_1-mae:0.08793
[2]	validation_0-rmse:0.10414	validation_0-mae:0.07990	validation_1-rmse:0.10625	validation_1-mae:0.08168
[3]	validation_0-rmse:0.09691	validation_0-mae:0.07419	validation_1-rmse:0.09938	validation_1-mae:0.07620
[4]	validation_0-rmse:0.09046	validation_0-mae:0.06910	validation_1-rmse:0.09342	validation_1-mae:0.07144
[5]	validation_0-rmse:0.08481	validation_0-mae:0.06465	validation_1-rmse:0.08819	validation_1-mae:0.06729
[6]	validation_0-rmse:0.07978	validation_0-mae:0.06070	validation_1-rmse:0.08365	validation_1-mae:0.06365
[7]	validation_0-rmse:0.07539	validation_0-mae:0.05723	validation_1-rmse:0.07978	validation_1-mae:0.06051
[8]	validation_0-rmse:0.07149	validation_0-mae:0.05412	validation_1-rmse:0.07643	validation_1-mae:0.05775
[9]	validation_0-rmse:0.06809	validation_0-mae

In [None]:
# Fitted estimator returned by the Mariposa 2021 training run
trained_model = train_results_mariposa21['estimator']

In [None]:
# Split the hidden Mariposa 2022 data into predictors (X) and the 'smi' target (y)
y = test_df['smi']
X = test_df.drop(columns=['smi'])

In [None]:
# Predict 'smi' for the Mariposa 2022 rows with the Mariposa21-trained model
y_pred_mariposa22 = trained_model.predict(X)

In [None]:
# R2 of the Mariposa21-trained model on the Mariposa 2022 data
r2_Mariposa22 = r2_score(y_true=y, y_pred=y_pred_mariposa22)
r2_Mariposa22

0.7386435345667016

In [None]:
# Error metrics: trained on Mariposa21 (M21), validated on Mariposa22 (M22)
val_M21_M22_mae = mean_absolute_error(y_true=y, y_pred=y_pred_mariposa22)
val_M21_M22_mse = mean_squared_error(y_true=y, y_pred=y_pred_mariposa22)
# RMSE is the square root of the MSE
val_M21_M22_rmse = np.sqrt(val_M21_M22_mse)

In [None]:
# Display the RMSE for train = Mariposa21, validation = Mariposa22
val_M21_M22_rmse

0.05970099511990619

## Train model on Mariposa21 and test on Three Rivers 22

In [None]:
# Mariposa 2021 is again the training data
train_df = pd.read_csv('https://github.com/belalabouzaid/siads699_team13_collab/raw/main/Data/mariposa_2021.csv')
# ThreeRivers 2022 is the hidden data used only for prediction in this section
test_df = pd.read_csv('https://github.com/belalabouzaid/siads699_team13_collab/raw/main/Data/threerivers_2022.csv')

In [None]:
# Keep the 13 predictor columns plus the target in both frames. 'smi' must stay
# last in train_df because train_test_eval_model treats the final column as the target.
train_df = train_df.loc[:, [
    'aspect', 'slope', 'NDMI', 'MSI',
    'SR_B5', 'SAVI', 'EVI', 'NDVI', 'ST_B10',
    'MNDWI', 'SR_B7', 'SR_B6', 'MSAVI',
    'smi',
]]
test_df = test_df.loc[:, [
    'aspect', 'slope', 'NDMI', 'MSI',
    'SR_B5', 'SAVI', 'EVI', 'NDVI', 'ST_B10',
    'MNDWI', 'SR_B7', 'SR_B6', 'MSAVI',
    'smi',
]]

In [None]:
# Separate the Mariposa 2021 training frame into predictors (X) and the 'smi' target (y).
# The train/test split itself happens inside train_test_eval_model.
y = train_df['smi']
X = train_df.drop(columns=['smi'])

In [None]:
# Names of the predictor columns
features = X.columns.tolist()

In [None]:
# Fit and evaluate xgb_reg on a 75/25 split of the Mariposa 2021 frame (no scaling, no PCA)
train_results_mariposa21 = train_test_eval_model(
    train_df,
    xgb_reg,
    features=features,
    scale=False,
    test_size=0.25,
    n_pca=None,
)

XGBRegressor
[0]	validation_0-rmse:0.12168	validation_0-mae:0.09371	validation_1-rmse:0.12303	validation_1-mae:0.09507
[1]	validation_0-rmse:0.11239	validation_0-mae:0.08638	validation_1-rmse:0.11406	validation_1-mae:0.08793
[2]	validation_0-rmse:0.10414	validation_0-mae:0.07990	validation_1-rmse:0.10625	validation_1-mae:0.08168




[3]	validation_0-rmse:0.09691	validation_0-mae:0.07419	validation_1-rmse:0.09938	validation_1-mae:0.07620
[4]	validation_0-rmse:0.09046	validation_0-mae:0.06910	validation_1-rmse:0.09342	validation_1-mae:0.07144
[5]	validation_0-rmse:0.08481	validation_0-mae:0.06465	validation_1-rmse:0.08819	validation_1-mae:0.06729
[6]	validation_0-rmse:0.07978	validation_0-mae:0.06070	validation_1-rmse:0.08365	validation_1-mae:0.06365
[7]	validation_0-rmse:0.07539	validation_0-mae:0.05723	validation_1-rmse:0.07978	validation_1-mae:0.06051
[8]	validation_0-rmse:0.07149	validation_0-mae:0.05412	validation_1-rmse:0.07643	validation_1-mae:0.05775
[9]	validation_0-rmse:0.06809	validation_0-mae:0.05141	validation_1-rmse:0.07355	validation_1-mae:0.05533
[10]	validation_0-rmse:0.06513	validation_0-mae:0.04906	validation_1-rmse:0.07111	validation_1-mae:0.05327
[11]	validation_0-rmse:0.06252	validation_0-mae:0.04696	validation_1-rmse:0.06905	validation_1-mae:0.05152
[12]	validation_0-rmse:0.06021	validation_0-

In [None]:
# Fitted estimator returned by the Mariposa 2021 training run
trained_model = train_results_mariposa21['estimator']

In [None]:
# Split the hidden ThreeRivers 2022 data into predictors (X) and the 'smi' target (y)
y = test_df['smi']
X = test_df.drop(columns=['smi'])

In [None]:
# Predict 'smi' for the ThreeRivers 2022 rows with the Mariposa21-trained model
y_pred_threerivers22 = trained_model.predict(X)

In [None]:
# R2 of the Mariposa21-trained model on the ThreeRivers 2022 data.
# r2_score is negative when the predictions fit worse than always predicting the mean of y.
r2_threeriveres22 = r2_score(y_true=y, y_pred=y_pred_threerivers22)
r2_threeriveres22

-4.831797454666241

In [None]:
# Error metrics: trained on Mariposa21 (M21), validated on ThreeRivers22 (TR22)
val_M21_TR22_mae = mean_absolute_error(y_true=y, y_pred=y_pred_threerivers22)
val_M21_TR22_mse = mean_squared_error(y_true=y, y_pred=y_pred_threerivers22)
# RMSE is the square root of the MSE
val_M21_TR22_rmse = np.sqrt(val_M21_TR22_mse)

In [None]:
# Display the RMSE for train = Mariposa21, validation = ThreeRivers22
val_M21_TR22_rmse

0.10416867012074704