In [None]:
# Importing necessary libraries
import os
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.model_selection import RepeatedKFold
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import r2_score
from yellowbrick.regressor import prediction_error
print("Make sure to install shap as: !pip install shap")
!pip install shap
import shap

In [None]:
# Check whether shap can be imported by the interpreter running this notebook.
import importlib.util


def is_package_installed(package_name):
    """Return True when `package_name` can be located by the running interpreter.

    Looks the module up by its exact import name, so packages whose names
    merely contain the text (for example "shapely" when asking for "shap")
    are not mistaken for it.

    Parameters
    ----------
    package_name : str
        Top-level import name of the package to look for.

    Returns
    -------
    bool
        True if an import spec is found for the name, False otherwise.
    """
    return importlib.util.find_spec(package_name) is not None


# Flag kept under its original name: 1 when shap is available, 0 otherwise
c = 1 if is_package_installed("shap") else 0

# Checking the value of c
if c == 1:
    print("Shap Installed")
else:
    print("Shap is not installed")

Shap Installed


In [None]:

# Collect the dataset location, the resolution label and the training file from the user.
# Answers are stripped so that a stray space or newline typed at the prompt
# does not end up inside a path or an output file name.

# Directory that holds the dataset files
home = input("Enter the home directory where the datasets are located (example:/content/ARM_ERA_12)").strip()

# Asking the user for the resolution of the dataset.
# Stored as a one-element list; the label is read back as resolution[0].
resolution = [input("Enter the name/resolution of the dataset(example: 1000m ): ").strip()]

# Asking for the name of the training dataset (e.g. 1000m_dataV1_train.csv)
# and constructing the full path
train_dataset_name = input("Enter the name of the training dataset (including file extension - example: 1000m_dataV1_train.csv): ").strip()
train_file = os.path.join(home, train_dataset_name)

# Prompting until the path points at an existing regular file.
# isfile (not exists) is used so that an empty answer, which resolves to the
# home directory itself, is rejected here instead of failing later in read_csv.
while not os.path.isfile(train_file):
    print("The specified training file does not exist. Please enter the correct dataset  with the correct format.")
    train_dataset_name = input("Enter the name of the training dataset (including file extension): ").strip()
    train_file = os.path.join(home, train_dataset_name)



Enter the home directory where the datasets are located (example:/content/ARM_ERA_12)/content/Airmoss
Enter the name/resolution of the dataset(example: 1000m ): 1400m
Enter the name of the training dataset (including file extension - example: 1000m_dataV1_train.csv): A4_1400m_processed_train.csv
The specified training file does not exist. Please enter the correct dataset  with the correct format.
Enter the name of the training dataset (including file extension): A4_1400m_processed__train.csv


In [None]:
# Asking for the name of the testing dataset (e.g. 1000m_dataV1_test.csv)
# and constructing the full path inside the `home` directory.
# The answer is stripped so stray whitespace typed at the prompt is ignored.
test_dataset_name = input("Enter the name of the testing dataset (including file extension - example: 1000m_dataV1_test.csv): ").strip()
test_file = os.path.join(home, test_dataset_name)

# Prompting until the path points at an existing regular file.
# isfile (not exists) is used so that an empty answer, which resolves to the
# home directory itself, is rejected here instead of failing later in read_csv.
while not os.path.isfile(test_file):
    print("The specified testing file does not exist. Please enter the correct dataset name with the correct format.")
    test_dataset_name = input("Enter the name of the testing dataset (including file extension - example: 1000m_dataV1_test.csv): ").strip()
    test_file = os.path.join(home, test_dataset_name)

Enter the name of the testing dataset (including file extension - example: 1000m_dataV1_test.csv): A4_1400m_processed_test.csv
The specified testing file does not exist. Please enter the correct dataset name with the correct format.
Enter the name of the testing dataset (including file extension - example: 1000m_dataV1_test.csv): A4_1400m_processed__test.csv


In [None]:
# Load the training CSV, keeping only the target, the predictors and the
# identifying columns (page name and date).
train_columns = [
    'SMERGE', 'Date', 'Temp', 'PageName', 'LAI', 'Albedo', 'NDVI',
    'Clay', 'Sand', 'Silt', 'Slope', 'Elevation', 'Ascept',
]
train_data = pd.read_csv(train_file, usecols=train_columns)

# Hold on to the page-name and date columns as they were read from the file
train_page = train_data['PageName']
train_date = train_data['Date']


In [None]:
# Load the testing CSV, keeping only the target, the predictors and the
# identifying columns (page name and date).
test_columns = [
    'SMERGE', 'Date', 'Temp', 'PageName', 'LAI', 'Albedo', 'NDVI',
    'Clay', 'Sand', 'Silt', 'Slope', 'Elevation', 'Ascept',
]
test_data = pd.read_csv(test_file, usecols=test_columns)

# Hold on to columns as they were read from the file.
# test_page is a one-column DataFrame (double brackets); the others are Series.
test_page = test_data[['PageName']]
test_ndvi = test_data['NDVI']
test_dates = test_data['Date']


In [None]:
# Processing 'Date' column for training and testing: parse the month/day/year
# strings and store them as 64-bit integers so they can be used as a numeric feature.
#
# The raw strings are taken from train_date / test_dates (saved when the files
# were read) rather than from the 'Date' columns themselves, so this cell can be
# re-run after the columns have already been converted to integers.
#
# The dtype is spelled out as 'int64' so the result does not depend on the
# platform's default integer width.
DATE_FORMAT = "%m/%d/%Y"
train_data['Date'] = pd.to_datetime(train_date, format=DATE_FORMAT).astype('int64')
test_data['Date'] = pd.to_datetime(test_dates, format=DATE_FORMAT).astype('int64')

In [None]:
# Split the training frame into the target (SMERGE) and the predictor columns
train_features = [
    'Clay', 'Sand', 'Silt', 'Elevation', 'Ascept', 'Slope',
    'NDVI', 'Date', 'LAI', 'Albedo', 'Temp',
]
y_train = train_data['SMERGE']
x_train = train_data[train_features]

# Show the full training frame for a visual check
print(train_data)

In [None]:
# Split the testing frame into the target (SMERGE) and the predictor columns
test_features = [
    'Clay', 'Sand', 'Silt', 'Elevation', 'Ascept', 'Slope',
    'NDVI', 'Date', 'LAI', 'Albedo', 'Temp',
]
y_test = test_data['SMERGE']
x_test = test_data[test_features]

# Show the full testing frame for a visual check
print(test_data)

In [None]:
# Train an XGBoost regressor on the training split, evaluate it on the testing
# split, export the predictions, and compute SHAP variable importances.

# Configuring the XGBoost model
# NOTE(review): 'gpu_hist' needs a GPU-enabled XGBoost; newer XGBoost releases
# appear to prefer tree_method='hist' with device='cuda' - confirm against the
# installed version before changing.
model = XGBRegressor(verbosity=1, n_estimators=500, max_depth=10, tree_method='gpu_hist')
# Cross-validation splitter; it is created here but not used in this cell
# (the model is fit once on the full training split).
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

# Compiling and training the model
model.fit(x_train, y_train)

# Predicting and evaluating the model with the trained model
h = model.predict(x_test)
# NOTE(review): for scikit-learn style regressors score() is R^2, so this value
# is expected to match the R-Squared printed below - confirm.
accuracy = model.score(x_test, y_test)
print('Accuracy:', accuracy)
MSE = mse(y_test, h)
RMSE = np.sqrt(MSE)
R_squared = r2_score(y_test, h)
print("\nRMSE: ", np.round(RMSE, 2))
print()
print("R-Squared: ", np.round(R_squared, 2))

# Preparing the output dataframe.
# Work on a copy so that re-running this cell does not assign into a column
# slice left over from a previous run.
output_columns = ["Clay", "Sand", "Elevation", "Ascept", "NDVI", "Date", "LAI",
                  "Albedo", "PageName", "SMERGE", "ML_"]
predictions = test_data.copy()
predictions['Date'] = test_dates      # restore the date strings as read from the file
predictions['ML_'] = h                # model predictions
predictions['PageName'] = test_page
test_data = predictions[output_columns]

# Making the Prediction Error Plot
print("\nPrediction Error Plot")
error_plot = prediction_error(model, x_train, y_train, x_test, y_test)
print(error_plot)

# Saving the output dataframe to a CSV file inside the dataset directory.
# os.path.join supplies the separator between the directory and the file name;
# plain string concatenation would write "<home><resolution>.csv" next to the
# directory instead of inside it.
test_data.to_csv(os.path.join(home, resolution[0] + ".csv"))

# Generating variable importance plots.
# SHAP values are computed on a random sample of at most 1000 training rows;
# the cap at len(x_train) keeps sample() from raising on smaller datasets.
shap_sample_size = min(1000, len(x_train))
x_sampled = x_train.sample(shap_sample_size, random_state=10)
explainer = shap.Explainer(model, x_sampled)
shap_values = explainer(x_sampled)
# Show one bar per feature column
shap.plots.bar(shap_values, max_display=x_sampled.shape[1])
save_shap = input("Would you like to save the variable importance plot?: y/n")
# Accept "y" regardless of case or surrounding whitespace
if save_shap.strip().lower() == 'y':
    # Saving variable importance: mean absolute SHAP value per feature,
    # sorted from most to least important
    mean_shap = np.abs(shap_values.values).mean(axis=0)
    shap_pd = pd.DataFrame(mean_shap, index=x_sampled.columns).sort_values(by=[0], ascending=False)
    shap_pd.to_csv(os.path.join(home, resolution[0] + '_SHAP.csv'))