# Preliminary settings

In [55]:
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np
import joblib
import pandas as pd
import os 

# Silence warnings for the rest of the session.
# NOTE: this filter is not limited to sklearn -- with no category or module
# argument it ignores every warning, including deprecation notices raised by
# pandas or numpy.
import warnings
warnings.filterwarnings('ignore')

In [56]:
# All working folders are resolved relative to the project root
PROJECT_ROOT_DIR = "."

# Folder the input data set is read from
DATA_PATH = os.path.join(PROJECT_ROOT_DIR, "datasets")
# Folder the prediction results are written to
RESULT_PATH = os.path.join(PROJECT_ROOT_DIR, "results")
# Folder the trained model is loaded from
MODEL_PATH = os.path.join(PROJECT_ROOT_DIR, "models")

# Create each folder if it is missing; an existing folder is left untouched
for _folder in (DATA_PATH, RESULT_PATH, MODEL_PATH):
    os.makedirs(_folder, exist_ok=True)

# Data reading and preprocessing

In [57]:
# Full path of the Excel workbook that holds the input samples
data = os.path.join(DATA_PATH, "dataset2.xlsx")

# Load the workbook into a DataFrame and show it as the cell output
df1 = pd.read_excel(io=data)
df1

Unnamed: 0,SiO2,TiO2,Al2O3,FeO*,MgO,CaO,Na2O,K2O,P2O5,Rb,...,La,Ce,Nd,Sm,Eu,Yb,Hf,Pb,Th,U
0,50.7849,1.26603,14.7489,8.85043,8.57204,12.4174,2.69378,0.070098,0.124174,1.02000,...,2.52000,7.9100,8.0200,2.81000,1.04000,2.71000,1.99000,0.332000,0.132000,0.033000
1,51.2255,1.35886,15.1206,8.78775,8.03930,11.6706,2.87478,0.170470,0.135165,4.44000,...,3.55000,10.4500,9.3300,3.06000,1.10000,2.90000,2.38000,0.767000,0.497000,0.131000
2,50.7967,1.41358,14.8134,9.40325,8.72019,11.4585,2.78884,0.046481,0.117212,1.11000,...,1.89000,7.1800,8.5700,3.15000,1.12000,3.12000,2.28000,0.315000,0.117000,0.023000
3,50.8165,1.22237,15.4801,8.24114,8.17716,11.6460,2.77904,0.445845,0.168941,12.34000,...,7.88000,18.8800,12.5600,3.35000,1.17000,2.14000,2.21000,1.313000,1.441000,0.379000
4,50.8683,1.17347,15.4212,8.69922,8.77606,11.9609,2.38612,0.089985,0.130428,1.35000,...,2.66000,8.2900,8.0000,2.74000,0.94000,2.83000,2.04000,0.348000,0.175000,0.049000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2199,50.9323,1.69100,15.2455,9.86600,6.94400,10.9220,2.92400,0.124000,0.193561,1.09764,...,3.71728,11.8893,10.8870,3.64576,1.33667,3.31888,2.76534,0.533924,0.159052,0.069925
2200,49.8800,1.63000,15.3000,8.51000,7.64000,10.9600,2.70000,0.820000,0.290000,22.87000,...,20.95000,41.1000,20.1100,4.52000,1.45000,2.55000,2.71000,1.279000,2.534000,0.617000
2201,51.4630,1.36600,15.8350,8.37900,7.49130,11.1810,2.83300,0.290000,0.149903,6.16175,...,6.64117,16.7446,10.7736,3.13623,1.20613,2.46150,2.14591,0.687221,0.609537,0.210778
2202,50.9203,1.11180,14.2816,11.22910,8.00494,11.6536,2.19327,0.042449,0.090969,0.91000,...,1.45600,4.9040,5.6910,2.32300,0.87900,3.07800,1.64800,0.382000,0.076000,0.027000


In [58]:
# Append one ratio column per (numerator, denominator) pair.
# Each new column is named "<numerator>/<denominator>" and is added to df1
# in the order listed here.
for _num, _den in (
    ('La', 'Sm'),
    ('Ce', 'Pb'),
    ('Ba', 'Nb'),
    ('Sm', 'Yb'),
    ('Nb', 'U'),
    ('Rb', 'Nb'),
):
    df1[_num + '/' + _den] = df1[_num] / df1[_den]
df1

Unnamed: 0,SiO2,TiO2,Al2O3,FeO*,MgO,CaO,Na2O,K2O,P2O5,Rb,...,Hf,Pb,Th,U,La/Sm,Ce/Pb,Ba/Nb,Sm/Yb,Nb/U,Rb/Nb
0,50.7849,1.26603,14.7489,8.85043,8.57204,12.4174,2.69378,0.070098,0.124174,1.02000,...,1.99000,0.332000,0.132000,0.033000,0.896797,23.825301,8.181818,1.036900,53.333333,0.579545
1,51.2255,1.35886,15.1206,8.78775,8.03930,11.6706,2.87478,0.170470,0.135165,4.44000,...,2.38000,0.767000,0.497000,0.131000,1.160131,13.624511,12.086957,1.055172,17.557252,1.930435
2,50.7967,1.41358,14.8134,9.40325,8.72019,11.4585,2.78884,0.046481,0.117212,1.11000,...,2.28000,0.315000,0.117000,0.023000,0.600000,22.793651,10.916667,1.009615,52.173913,0.925000
3,50.8165,1.22237,15.4801,8.24114,8.17716,11.6460,2.77904,0.445845,0.168941,12.34000,...,2.21000,1.313000,1.441000,0.379000,2.352239,14.379284,28.055077,1.565421,15.329815,2.123924
4,50.8683,1.17347,15.4212,8.69922,8.77606,11.9609,2.38612,0.089985,0.130428,1.35000,...,2.04000,0.348000,0.175000,0.049000,0.970803,23.821839,8.285714,0.968198,35.714286,0.771429
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2199,50.9323,1.69100,15.2455,9.86600,6.94400,10.9220,2.92400,0.124000,0.193561,1.09764,...,2.76534,0.533924,0.159052,0.069925,1.019617,22.267776,3.581479,1.098491,42.036408,0.373423
2200,49.8800,1.63000,15.3000,8.51000,7.64000,10.9600,2.70000,0.820000,0.290000,22.87000,...,2.71000,1.279000,2.534000,0.617000,4.634956,32.134480,9.871044,1.772549,48.136143,0.770034
2201,51.4630,1.36600,15.8350,8.37900,7.49130,11.1810,2.83300,0.290000,0.149903,6.16175,...,2.14591,0.687221,0.609537,0.210778,2.117565,24.365670,6.522988,1.274113,44.991413,0.649754
2202,50.9203,1.11180,14.2816,11.22910,8.00494,11.6536,2.19327,0.042449,0.090969,0.91000,...,1.64800,0.382000,0.076000,0.027000,0.626776,12.837696,7.620910,0.754711,46.407407,0.726257


In [59]:
# Feature matrix passed to the model.
# Take an independent copy rather than a second name for df1: df1 later
# receives the predicted 'H2O_P' column, and an alias would silently add that
# column to the features as well, breaking any re-run of the prediction cell.
X_all = df1.copy()
X_all

Unnamed: 0,SiO2,TiO2,Al2O3,FeO*,MgO,CaO,Na2O,K2O,P2O5,Rb,...,Hf,Pb,Th,U,La/Sm,Ce/Pb,Ba/Nb,Sm/Yb,Nb/U,Rb/Nb
0,50.7849,1.26603,14.7489,8.85043,8.57204,12.4174,2.69378,0.070098,0.124174,1.02000,...,1.99000,0.332000,0.132000,0.033000,0.896797,23.825301,8.181818,1.036900,53.333333,0.579545
1,51.2255,1.35886,15.1206,8.78775,8.03930,11.6706,2.87478,0.170470,0.135165,4.44000,...,2.38000,0.767000,0.497000,0.131000,1.160131,13.624511,12.086957,1.055172,17.557252,1.930435
2,50.7967,1.41358,14.8134,9.40325,8.72019,11.4585,2.78884,0.046481,0.117212,1.11000,...,2.28000,0.315000,0.117000,0.023000,0.600000,22.793651,10.916667,1.009615,52.173913,0.925000
3,50.8165,1.22237,15.4801,8.24114,8.17716,11.6460,2.77904,0.445845,0.168941,12.34000,...,2.21000,1.313000,1.441000,0.379000,2.352239,14.379284,28.055077,1.565421,15.329815,2.123924
4,50.8683,1.17347,15.4212,8.69922,8.77606,11.9609,2.38612,0.089985,0.130428,1.35000,...,2.04000,0.348000,0.175000,0.049000,0.970803,23.821839,8.285714,0.968198,35.714286,0.771429
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2199,50.9323,1.69100,15.2455,9.86600,6.94400,10.9220,2.92400,0.124000,0.193561,1.09764,...,2.76534,0.533924,0.159052,0.069925,1.019617,22.267776,3.581479,1.098491,42.036408,0.373423
2200,49.8800,1.63000,15.3000,8.51000,7.64000,10.9600,2.70000,0.820000,0.290000,22.87000,...,2.71000,1.279000,2.534000,0.617000,4.634956,32.134480,9.871044,1.772549,48.136143,0.770034
2201,51.4630,1.36600,15.8350,8.37900,7.49130,11.1810,2.83300,0.290000,0.149903,6.16175,...,2.14591,0.687221,0.609537,0.210778,2.117565,24.365670,6.522988,1.274113,44.991413,0.649754
2202,50.9203,1.11180,14.2816,11.22910,8.00494,11.6536,2.19327,0.042449,0.090969,0.91000,...,1.64800,0.382000,0.076000,0.027000,0.626776,12.837696,7.620910,0.754711,46.407407,0.726257


# Water content prediction

In [60]:
# Path of the trained model file "Established_RFR_model.pkl"
model = os.path.join(MODEL_PATH, "Established_RFR_model.pkl")

# Deserialize the model with joblib.
# NOTE(review): joblib files are pickle-based, so only load model files that
# come from a trusted source.
model_rf = joblib.load(filename=model)

In [61]:
# Run the loaded model on the feature matrix and keep the predictions as a
# float array. The conversion is assigned back to result_rf: a bare
# np.array(...) call builds a converted copy and immediately discards it.
result_rf = np.asarray(model_rf.predict(X_all), dtype=float)
result_rf

array([0.18057237, 0.26997202, 0.25281291, ..., 0.41894113, 0.21778633,
       0.20218771])

In [62]:
# Store the predicted H2O contents in the input dataset as column 'H2O_P'.
# Plain column assignment appends the column at the end when it is new and
# overwrites it when it already exists, so this cell can be re-run safely
# (DataFrame.insert raises ValueError if the column is already present).
df1['H2O_P'] = result_rf
df1

Unnamed: 0,SiO2,TiO2,Al2O3,FeO*,MgO,CaO,Na2O,K2O,P2O5,Rb,...,Pb,Th,U,La/Sm,Ce/Pb,Ba/Nb,Sm/Yb,Nb/U,Rb/Nb,H2O_P
0,50.7849,1.26603,14.7489,8.85043,8.57204,12.4174,2.69378,0.070098,0.124174,1.02000,...,0.332000,0.132000,0.033000,0.896797,23.825301,8.181818,1.036900,53.333333,0.579545,0.180572
1,51.2255,1.35886,15.1206,8.78775,8.03930,11.6706,2.87478,0.170470,0.135165,4.44000,...,0.767000,0.497000,0.131000,1.160131,13.624511,12.086957,1.055172,17.557252,1.930435,0.269972
2,50.7967,1.41358,14.8134,9.40325,8.72019,11.4585,2.78884,0.046481,0.117212,1.11000,...,0.315000,0.117000,0.023000,0.600000,22.793651,10.916667,1.009615,52.173913,0.925000,0.252813
3,50.8165,1.22237,15.4801,8.24114,8.17716,11.6460,2.77904,0.445845,0.168941,12.34000,...,1.313000,1.441000,0.379000,2.352239,14.379284,28.055077,1.565421,15.329815,2.123924,0.501754
4,50.8683,1.17347,15.4212,8.69922,8.77606,11.9609,2.38612,0.089985,0.130428,1.35000,...,0.348000,0.175000,0.049000,0.970803,23.821839,8.285714,0.968198,35.714286,0.771429,0.204247
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2199,50.9323,1.69100,15.2455,9.86600,6.94400,10.9220,2.92400,0.124000,0.193561,1.09764,...,0.533924,0.159052,0.069925,1.019617,22.267776,3.581479,1.098491,42.036408,0.373423,0.261334
2200,49.8800,1.63000,15.3000,8.51000,7.64000,10.9600,2.70000,0.820000,0.290000,22.87000,...,1.279000,2.534000,0.617000,4.634956,32.134480,9.871044,1.772549,48.136143,0.770034,1.026779
2201,51.4630,1.36600,15.8350,8.37900,7.49130,11.1810,2.83300,0.290000,0.149903,6.16175,...,0.687221,0.609537,0.210778,2.117565,24.365670,6.522988,1.274113,44.991413,0.649754,0.418941
2202,50.9203,1.11180,14.2816,11.22910,8.00494,11.6536,2.19327,0.042449,0.090969,0.91000,...,0.382000,0.076000,0.027000,0.626776,12.837696,7.620910,0.754711,46.407407,0.726257,0.217786


In [63]:
# Write the dataset, including the predicted column, to an Excel workbook in
# the results folder.
result = os.path.join(RESULT_PATH, "result.xlsx")
# The second argument of to_excel is the sheet name, not a file name. It is
# passed by keyword because positional use is deprecated in recent pandas.
# The sheet name itself is kept unchanged so the output file stays the same.
df1.to_excel(result, sheet_name="result.xlsx")