In [None]:
# Install SHAP if not already in library
#pip install shap
# `os` is part of the Python standard library, so it needs no installation

# Importing the necessary libraries
import os
import io
import shap
import subprocess
import pandas as pd
import numpy as np
from numpy import mean, std
from sklearn.datasets import make_regression
from sklearn import preprocessing
import matplotlib.pyplot as plt
from sklearn import datasets, ensemble
from sklearn.metrics import accuracy_score
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_error as mse

def _prompt_for_dataset(directory, kind):
    """Ask for a dataset file name until it names an existing file.

    Args:
        directory: Directory the file name is joined onto.
        kind: Word inserted into the prompt and the error message
            ("testing" or "training").

    Returns:
        A ``(file_name, full_path)`` tuple for the accepted answer.
    """
    prompt = "Enter the name of the " + kind + " dataset (including file extensions): "
    file_name = input(prompt)
    full_path = os.path.join(directory, file_name)
    # isfile (not exists) so that an empty answer, which joins to the
    # directory itself, is rejected here instead of failing later in read_csv.
    while not os.path.isfile(full_path):
        print("The specified " + kind + " file does not exist. Enter the correct dataset name.")
        file_name = input(prompt)
        full_path = os.path.join(directory, file_name)
    return file_name, full_path


# Asking the user for the resolution of the dataset
# (used further down as the prefix of the output CSV file names)
resolution = input("Enter the resolution of the dataset: ")
## e.g. Enter the resolution of the dataset: 1000m

# Asking for the directory that holds the training and testing datasets
home_directory = input("Enter the home directory where the datasets are located: ")
## e.g. Enter the home directory where the datasets are located: /content/ARM_1/

# Asking for the name of the testing dataset, re-prompting until the file exists
## e.g. Enter the name of the testing dataset: ERA1_2ARM_1000m_dataV1_test.csv
test_dataset_name, test_file = _prompt_for_dataset(home_directory, "testing")

# Asking for the name of the training dataset, re-prompting until the file exists
## e.g. Enter the name of the training dataset: ERA1_2ARM_1000m_dataV1_train.csv
train_dataset_name, train_file = _prompt_for_dataset(home_directory, "training")

# Columns requested from both CSV files.
_CSV_COLUMNS = ['SMERGE', 'Date', 'PageName', 'LAI', 'Albedo', 'NDVI', 'Clay',
                'Sand', 'Silt', 'Slope', 'Elevation', 'Ascept', 'Temp']

# Columns kept in the modelling frames, in the order they are stored.
_MODEL_COLUMNS = ['Clay', 'Sand', 'Silt', 'Elevation', 'Ascept', 'Slope',
                  'NDVI', 'SMERGE', 'Date', 'LAI', 'Albedo', 'Temp']


def _select_model_columns(raw):
    """Return the modelling columns of *raw* with 'LAI' renamed to 'Lai'.

    Columns are renamed by name rather than by overwriting the whole label
    list, so a column can never end up carrying another column's label
    (previously the training frame had 'Slope' and 'Ascept' swapped).
    """
    return raw.loc[:, _MODEL_COLUMNS].rename(columns={'LAI': 'Lai'})


# Reading the test data (parsed once; 'PageName' and 'Date' are reused below)
raw_test = pd.read_csv(test_file, usecols=_CSV_COLUMNS)
test_data = _select_model_columns(raw_test)

# Reading the training data
train_data = _select_model_columns(pd.read_csv(train_file, usecols=_CSV_COLUMNS))

# Keeping the original 'PageName' and 'Date' columns of the testing file as
# single-column DataFrames so they can be written back to the output later
page = raw_test[['PageName']]
date = raw_test[['Date']]

# Processing the 'Date' column of the testing and training frames:
# parse the month/day/year strings and store them as 64-bit integers so the
# regressor receives a numeric feature. The dtype is spelled out as 'int64'
# because plain `int` maps to a 32-bit integer on some platforms, which
# pandas refuses for datetime data.
for frame in (test_data, train_data):
    frame['Date'] = pd.to_datetime(frame['Date'], format="%m/%d/%Y").astype('int64')

# Splitting each frame into the target ('SMERGE') and the predictor columns.
# The predictor list is declared once so that the training and testing
# feature frames always share the same columns in the same order.
target_column = ['SMERGE']
feature_columns = ['Clay', 'Sand', 'Silt', 'Elevation', 'Ascept', 'Slope',
                   'Lai', 'NDVI', 'Albedo', "Temp", "Date"]

y_test = test_data[target_column]
x_test = test_data[feature_columns]
y_train = train_data[target_column]
x_train = train_data[feature_columns]

# Keyword arguments that are unpacked into the Gradient Boosting Regressor
# constructor further down
params = dict(
    n_estimators=200,
    max_depth=10,
    min_samples_split=4,
    learning_rate=0.3,
    loss="squared_error",
    verbose=1,
)

# NumPy-array versions of the training features, training target and
# testing features
x_train_nu, y_train_nu, x_test_nu = (
    frame.to_numpy() for frame in (x_train, y_train, x_test)
)

    ### GRADIENT BOOSTING REGRESSION
# Configuring and fitting the Gradient Boosting Regressor.
# The target frame is flattened to 1-D before fitting.
reg = ensemble.GradientBoostingRegressor(**params)
reg.fit(x_train, y_train.values.ravel())

# Predicting the test set once, on the DataFrame: the model was fitted on a
# DataFrame, so predicting on a bare NumPy array makes scikit-learn warn about
# missing feature names. The same predictions feed both the MSE and the output.
h = reg.predict(x_test)

# NOTE: this rebinds `mse`, which the import block also uses as an alias for
# mean_squared_error; the name is kept because it is part of the script's state.
mse = mean_squared_error(y_test, h)
print("The mean squared error (MSE) on the set: {:.4f}".format(mse))

# Preparing the output dataframe: restore the original 'Date' strings (the
# column was converted to integers for modelling), then add the predictions
# and the page names.
test_data['Date'] = date['Date']
test_data['ML'] = h
test_data['PageName'] = page['PageName']

# Saving the output dataframe to a CSV file named after the resolution
test_data.to_csv(resolution + ".csv", index=False)

## Using SHAP
# Generating variable importance plots.
# The regressor fitted above is explained as-is: refitting it here would
# repeat the whole training run and could yield a model that differs from the
# one whose predictions were exported.
my_model = reg

# Sampling 1000 training rows (with replacement, fixed seed) to serve both as
# the explainer's background data and as the rows being explained
X_sampled = x_train.sample(1000, random_state=10, replace=True)

explainer = shap.Explainer(my_model, X_sampled)
shap_values = explainer(X_sampled)
# Show every feature in the bar plot
shap.plots.bar(shap_values, max_display=X_sampled.shape[1])

# Saving the mean absolute SHAP value of each feature, largest first, to CSV
mean_shap = np.abs(shap_values.values).mean(axis=0)
shap_pd = pd.DataFrame(mean_shap, index=X_sampled.columns).sort_values(by=[0], ascending=False)
shap_pd.to_csv(resolution + '_SHAP.csv')