# GeoSaskatoon 2023 - Random Forest Regression Model For Investigating The Impact of Data Quantity on Model Results

In [1]:
# import packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats
import math

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import mean_squared_error

from openpyxl import load_workbook

In [2]:
# get data
data = pd.read_excel('SWedge_MC_Results.xlsx',sheet_name = "Probabilistic Values", engine='openpyxl')
data.head()

Unnamed: 0,Wedge ID,Safety Factor,Wedge Volume (m3),Wedge Weight (MN),Plunge Line of Intersection (°),Trend Line of Intersection (°),Length Line of Intersection (m),Plunge Line of Intersection Basal-Joint1 (°),Trend Line of Intersection Basal-Joint1 (°),Length Line of Intersection Basal-Joint1 (m),...,Water Pressure Joint 2 (MPa),Water Pressure Basal Joint (MPa),Water Pressure Tension Crack (MPa),Ponded Water Depth (m),Seismic Alpha,Seismic Plunge (°),Seismic Trend (°),Maximum Persistence Joint 1 (m),Maximum Persistence Joint 2 (m),Maximum Persistence Basal Joint (m)
0,MC_TEST1 [0],2.265384,55008.96407,1485.24203,14.278116,139.781918,121.640252,,,,...,,,,,,,,0,0,
1,MC_TEST1 [1],1.426193,8320.062879,224.641698,25.807237,204.085585,68.910893,,,,...,,,,,,,,0,0,
2,MC_TEST1 [2],0.601306,3830.020343,103.410549,44.079254,187.506292,43.124966,,,,...,,,,,,,,0,0,
3,MC_TEST1 [3],1.377451,43551.65033,1175.894559,23.948066,160.957674,73.908297,,,,...,,,,,,,,0,0,
4,MC_TEST1 [4],0.665237,1249.835631,33.745562,44.418988,170.346985,42.863261,,,,...,,,,,,,,0,0,


In [3]:
print(np.shape(data))

(10000, 87)


In [4]:
# get data specifically for the modelling (i.e., the inputs and output)
data_model = data[["Safety Factor","Dip of Joint 1 (°)","Dip Direction of Joint 1 (°)","Dip of Joint 2 (°)","Dip Direction of Joint 2 (°)","Dip of Slope (°)","Dip Direction of Slope (°)","Friction Angle of Joint 1 (°)","Cohesion of Joint 1 (MPa)","Friction Angle of Joint 2 (°)","Cohesion of Joint 2 (MPa)"]]
print(np.shape(data_model))
data_model.head()

(10000, 11)


Unnamed: 0,Safety Factor,Dip of Joint 1 (°),Dip Direction of Joint 1 (°),Dip of Joint 2 (°),Dip Direction of Joint 2 (°),Dip of Slope (°),Dip Direction of Slope (°),Friction Angle of Joint 1 (°),Cohesion of Joint 1 (MPa),Friction Angle of Joint 2 (°),Cohesion of Joint 2 (MPa)
0,2.265384,14.403121,147.496379,88.313556,229.352606,54.231767,180.626149,30.190129,0,30.386605,0
1,1.426193,44.041745,144.087352,39.674372,258.423834,51.704935,175.028804,27.795316,0,31.380021,0
2,0.601306,50.52651,150.39896,44.328675,195.057419,59.749701,173.30697,32.599911,0,29.044598,0
3,1.377451,24.058483,155.12758,59.300962,235.667791,65.899231,175.758059,30.917626,0,30.507236,0
4,0.665237,49.531803,137.066259,49.599646,203.836248,54.313387,185.270114,28.79724,0,32.39443,0


In [5]:
# remove any realizations that are not kinematically possible
data_model =  data_model.dropna()
print(np.shape(data_model))

(9780, 11)


## Learning curves for different train_test_split random_states

Constant random_state (=123) for random forest, variable random_state (=0, 1, 42, 123) for train_test_split

Note that minimal pre-processing (aka normalizing/standardizing data and removing collinear inputs) is performed because they aren't necessary for random forests

In [6]:
# code generating RF models for varying dataset sizes and train/test splits
# performance metrics: R2 and RMSE
num_samples = []
R2_train_subsample_list = []
rmse_train_subsample_list = []
mape_train_subsample_list = []
R2_test_subsample_list = []
rmse_test_subsample_list = []
mape_test_subsample_list = []

random_state_val = [0,1,42,123]

for x in range(0,4):

    for i in range(100, 5000, 50):

        num_samples.append(i)

        # get subsample of data
        data_subsample = data_model.sample(n = i,random_state = 1)

        train_subsample, test_subsample = train_test_split(data_subsample, test_size=0.2, random_state=random_state_val[x])

        x_train_subsample = train_subsample[["Dip of Joint 1 (°)","Dip Direction of Joint 1 (°)","Dip of Joint 2 (°)","Dip Direction of Joint 2 (°)","Dip of Slope (°)","Dip Direction of Slope (°)","Friction Angle of Joint 1 (°)","Cohesion of Joint 1 (MPa)","Friction Angle of Joint 2 (°)","Cohesion of Joint 2 (MPa)"]]
        y_train_subsample = train_subsample[["Safety Factor"]]

        x_test_subsample = test_subsample[["Dip of Joint 1 (°)","Dip Direction of Joint 1 (°)","Dip of Joint 2 (°)","Dip Direction of Joint 2 (°)","Dip of Slope (°)","Dip Direction of Slope (°)","Friction Angle of Joint 1 (°)","Cohesion of Joint 1 (MPa)","Friction Angle of Joint 2 (°)","Cohesion of Joint 2 (MPa)"]]
        y_test_subsample = test_subsample[["Safety Factor"]]

        # train RF model
        rf_subsample = RandomForestRegressor(random_state = 123)
        model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
        ypred_RF_subsample = rf_subsample.predict(x_train_subsample)
        #ypred_RF_subsample = np.reshape(ypred_RF_subsample,(len(ypred_RF_subsample),1))

        R2_train_subsample = rf_subsample.score(x_train_subsample,y_train_subsample)
        rmse_train_subsample = math.sqrt(mean_squared_error(y_train_subsample,ypred_RF_subsample))
        mape_train_subsample = mean_absolute_percentage_error(y_train_subsample,ypred_RF_subsample)

        # append training R2, rmse, and mape to their respective lists
        R2_train_subsample_list.append(R2_train_subsample)
        rmse_train_subsample_list.append(rmse_train_subsample)
        mape_train_subsample_list.append(mape_train_subsample)

        # testing the RF model
        # predict y_test and find RMSE
        ypred_test_RF_subsample = rf_subsample.predict(x_test_subsample)
        #ypred_test_RF_subsample = np.reshape(ypred_test_RF_subsample,(len(ypred_test_RF_subsample),1))

        rmse_test_subsample = math.sqrt(mean_squared_error(y_test_subsample,ypred_test_RF_subsample))
        R2_test_subsample = rf_subsample.score(x_test_subsample,y_test_subsample)
        mape_test_subsample = mean_absolute_percentage_error(y_test_subsample,ypred_test_RF_subsample)

        # append testing R2, rmse, and mape to their respective lists
        R2_test_subsample_list.append(R2_test_subsample)
        rmse_test_subsample_list.append(rmse_test_subsample)
        mape_test_subsample_list.append(mape_test_subsample)


  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x

  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x

  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x

  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x

  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x

  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x

  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x

  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x

  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x

  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x

  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x

  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x

  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x

  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
  model_subsample = rf_subsample.fit(x

In [7]:
# convert the lists into dataframes
num_samples = pd.DataFrame(num_samples)
R2_train_subsample_list = pd.DataFrame(R2_train_subsample_list)
rmse_train_subsample_list = pd.DataFrame(rmse_train_subsample_list)
mape_train_subsample_list = pd.DataFrame(mape_train_subsample_list)
R2_test_subsample_list = pd.DataFrame(R2_test_subsample_list)
rmse_test_subsample_list = pd.DataFrame(rmse_test_subsample_list)
mape_test_subsample_list = pd.DataFrame(mape_test_subsample_list)

In [8]:
# concatenate the dataframes together
rslts = pd.concat([num_samples,R2_train_subsample_list,rmse_train_subsample_list,mape_train_subsample_list,R2_test_subsample_list,rmse_test_subsample_list,mape_test_subsample_list],axis=1)

In [9]:
print(np.shape(rslts))

# rows 0 - 97: random_state = 0
# rows 98 - 195: random_state = 1
# rows 196 - 293: random_state = 42
# rows 294 - 391: random_state = 123
# see Excel sheet for data post-processing

(392, 7)


In [10]:
# output results Excel
book = load_workbook('GeoSaskatoon 2023 ML Results.xlsx')
writer = pd.ExcelWriter('GeoSaskatoon 2023 ML Results.xlsx', engine='openpyxl') 
writer.book = book
writer.sheets = dict((ws.title, ws) for ws in book.worksheets)
rslts.to_excel(writer, 'ML OUTPUT')
writer.save()
writer.close()

In [11]:
# double check 
data_subsample = data_model.sample(n = 400,random_state = 1)

train_subsample, test_subsample = train_test_split(data_subsample, test_size=0.2, random_state= 42)

x_train_subsample = train_subsample[["Dip of Joint 1 (°)","Dip Direction of Joint 1 (°)","Dip of Joint 2 (°)","Dip Direction of Joint 2 (°)","Dip of Slope (°)","Dip Direction of Slope (°)","Friction Angle of Joint 1 (°)","Cohesion of Joint 1 (MPa)","Friction Angle of Joint 2 (°)","Cohesion of Joint 2 (MPa)"]]
y_train_subsample = train_subsample[["Safety Factor"]]

x_test_subsample = test_subsample[["Dip of Joint 1 (°)","Dip Direction of Joint 1 (°)","Dip of Joint 2 (°)","Dip Direction of Joint 2 (°)","Dip of Slope (°)","Dip Direction of Slope (°)","Friction Angle of Joint 1 (°)","Cohesion of Joint 1 (MPa)","Friction Angle of Joint 2 (°)","Cohesion of Joint 2 (MPa)"]]
y_test_subsample = test_subsample[["Safety Factor"]]

# train RF model
rf_subsample = RandomForestRegressor(random_state = 123)
model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)
ypred_RF_subsample = rf_subsample.predict(x_train_subsample)

# testing the RF model
# predict y_test and find RMSE
ypred_test_RF_subsample = rf_subsample.predict(x_test_subsample)
#ypred_test_RF_subsample = np.reshape(ypred_test_RF_subsample,(len(ypred_test_RF_subsample),1))

rmse_test_subsample = math.sqrt(mean_squared_error(y_test_subsample,ypred_test_RF_subsample))
R2_test_subsample = rf_subsample.score(x_test_subsample,y_test_subsample)
mape_test_subsample = mean_absolute_percentage_error(y_test_subsample,ypred_test_RF_subsample)

print(R2_test_subsample)
print(rmse_test_subsample)

  model_subsample = rf_subsample.fit(x_train_subsample,y_train_subsample)


0.8842448031755211
0.8113617752860339
