### Optimisation of Gradient Boosting Regressor in Artificial Neural Network for the Prediction of Specific Heat Capacity using a Stacked Model approach and Group Contributions

In [None]:
# Numpy (manipulation of arrays)
import numpy as np
# Pandas (manipulation of databases)
import pandas as pd
# Matplotlib (plotting library)
import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator, FormatStrFormatter

# Import scikit learn classifier and regressor.
import sklearn
from sklearn.neural_network import MLPClassifier
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import LinearRegression

# Import the random forest regressor
from sklearn.ensemble import RandomForestRegressor

# Import Gradient Boosting Regressor
from sklearn.ensemble import GradientBoostingRegressor

#Import Support Vector Regressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsClassifier

#Import grid search to finetune the hyperparameters
from sklearn.model_selection import GridSearchCV

# Tool for splitting sets. Needed to split training and test data
from sklearn.model_selection import train_test_split
# Tool for splitting sets. Used to apply kFold validation on the data sets
from sklearn.model_selection import KFold 

# Module to normalized data
from sklearn import preprocessing
# Module to standardise data
from sklearn.preprocessing import StandardScaler

# Tool for calculation of the mean square error (mse) and mean absolut error (mae)
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Tool for calculating R2 and Cross Validation score
from sklearn.metrics import r2_score 
from sklearn.model_selection import cross_val_score

In [None]:
# Columns to load from each sheet of the workbook: temperature, density,
# the heat capacity (Cp) and the group columns CH4 ... C
columns = [
    'Temperature (K)', 'Density (kg/m3)', 'Cp (J/g*K)',
    'CH4', 'CH3', 'CH2', 'CH', 'C',
]

# Read the database. sheet_name=None loads every sheet, so the result is a
# collection of per-sheet DataFrames rather than a single DataFrame
d = pd.read_excel('databasegc.xlsx', sheet_name=None, header=0, usecols=columns)

# Collate the sheets into one DataFrame with a fresh, continuous row index
df = pd.concat(d.values(), ignore_index=True)

# Display the combined database
df

In [None]:
# Summary statistics of the database (means, standard deviations, etc.),
# transposed so that each database column becomes one row of the summary
df.describe().T

In [None]:
# Target column (model output): the specific heat capacity, Cp.
# Stored as a one-element list of column names.
target = ['Cp (J/g*K)']

# Show the selected target
target

In [None]:
# Feature columns (model inputs), in the exact order they are fed to the models
features = [
    'Temperature (K)',
    'Density (kg/m3)',
    'CH4',
    'CH3',
    'CH2',
    'CH',
    'C',
]

# Show the selected features
features

In [None]:
# Pull the input (feature) and output (target) values out of the database as arrays
x_data = df[features].values
y_data = df[target].values

# Shuffled 75:25 train/test split; the fixed random_state makes the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    x_data,
    y_data,
    test_size=0.25,
    random_state=40,
    shuffle=True,
)

# Report the shape of the training inputs, then of the test inputs
for split_inputs in (x_train, x_test):
    print(split_inputs.shape)

In [None]:
# Define the data scaler
scaler = StandardScaler()

# Fit the scaler on the training inputs only and standardise them
x_train_scaled = scaler.fit_transform(x_train)

# Standardise the test inputs using the mean and standard deviation fitted on x_train
x_test_scaled = scaler.transform(x_test)

# Hyperparameter grid for the Gradient Boosting regressor (3 x 3 x 3 = 27 combinations)
param_grid = {'n_estimators': [100, 500, 1000],
              'learning_rate': [0.1, 0.05, 0.01],
              'max_depth': [5, 10, 50]}

# Seed shared by every Gradient Boosting model built in this cell, so that the
# grid search and the per-combination refits below give the same numbers on
# every Restart & Run All
GBR_RANDOM_STATE = 40

# Create the Gradient Boosting regressor and tune it with a 5-fold cross-validated
# grid search. scoring='r2' is what a regressor uses by default; it is spelled out
# because the plots treat mean_test_score as an R2 value. n_jobs=-1 runs the
# independent fits on all available cores.
GBR = GradientBoostingRegressor(random_state=GBR_RANDOM_STATE)
grid_search = GridSearchCV(GBR, param_grid=param_grid, scoring='r2', cv=5, n_jobs=-1)
grid_search.fit(x_train_scaled, y_train.ravel())

# Test-set predictions of the best combination (GridSearchCV refits it on the
# whole training set by default)
GBR_preds = grid_search.predict(x_test_scaled)

# Print R2 and MSE on the test set for each combination.
# NOTE: these test-set scores are for reporting only; choosing hyperparameters
# from them would leak the test set into model selection. best_params_ below is
# chosen from the cross-validation scores.
results = grid_search.cv_results_
mse_values = []
for params in results['params']:
    # Refit this combination on the full training set with the shared seed
    model = GradientBoostingRegressor(random_state=GBR_RANDOM_STATE, **params)
    model.fit(x_train_scaled, y_train.ravel())
    y_pred = model.predict(x_test_scaled)
    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    mse_values.append(mse)
    print("Parameters:", params)
    print("R2:", r2)
    print("MSE:", mse)
    print()

# Print the best parameters (highest mean cross-validated score)
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

In [None]:
# Mean cross-validated score of each combination, taken from the grid search results
# (this is R2 when the grid search uses the regressor's default scorer)
r2_scores = results['mean_test_score']

# Create a list of parameter combinations (string form of each parameter dict)
param_combinations = [str(params) for params in results['params']]

# Extract the numeric parameter values straight from the parameter dicts;
# there is no need to convert to a string and eval() it back
param_values = [list(params.values()) for params in results['params']]

# Tick labels: one string per combination, values in the dict's own key order
tick_labels = [str(values) for values in param_values]

# Plot R2 scores
plt.figure(figsize=(12, 8))
plt.plot(range(len(r2_scores)), r2_scores, marker='o', color='purple')
plt.xlabel('Parameter Combinations')
plt.ylabel('R2 Score')
plt.title('R2 Score for Each Parameter Combination')
plt.xticks(range(len(r2_scores)), tick_labels, rotation='vertical', fontsize=8)
plt.tight_layout()
plt.show()

# Bar plot of the MSE values collected in mse_values, one bar per combination
plt.figure(figsize=(12, 8))
bars = plt.bar(range(len(mse_values)), mse_values, color='dodgerblue')

# Customizing the bar plot
plt.xlabel('Parameter Combinations')
plt.ylabel('MSE Score')
plt.title('MSE Score for Each Parameter Combination')
plt.xticks(range(len(mse_values)), tick_labels, rotation='vertical', fontsize=8)
plt.tight_layout()

plt.show()