In [32]:
# Shriya Sharma
# IIT Delhi, 2024
# ML model evaluation for formic acid dataset

In [31]:
# data manipulation libraries
import numpy as np
import pandas as pd
# sklearn ML libraries
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold as kf
from sklearn.model_selection import cross_val_score
# evaluation metrics
from sklearn.metrics import r2_score as r2
from sklearn.metrics import root_mean_squared_error as rmse

Dataset import and pre-processing

In [27]:
#dataset
df = pd.read_csv("dataset.csv")

#variables
columns_reqd = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
X = df[df.columns[columns_reqd]]
check_nan = X.isnull().values.any()

#target
columns_reqd = [12, 13]
y = df[df.columns[columns_reqd]]
check_nan = y.isnull().values.any()

#train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 49)

#scaling
scaler_x = preprocessing.StandardScaler()
X_train_scaled = scaler_x.fit_transform(X_train)
X_test_scaled = scaler_x.transform(X_test)

scaler_y = preprocessing.StandardScaler()
y_train_scaled = scaler_y.fit_transform(y_train)
y_test_scaled = scaler_y.transform(y_test)

Grid search

In [17]:
#grid search for random forest

# grid of parameters
param_grid = {
    'n_estimators': [40, 60, 80, 100, 120],
    'criterion': ['squared_error'],
    'min_samples_split': [2, 4, 6],
    'max_features': ['sqrt'],
    'random_state': [16],
    'max_depth' : [5, 7, 9, 11, 13]}

rf = RandomForestRegressor(bootstrap = False) #random forest regressor
cv = kf(n_splits = 5, shuffle = True, random_state = 5) # definition of 5-fold cross validation
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, cv = cv, n_jobs = -1, verbose = 2) # grid search on 5-fold cv
grid_search.fit(X_train_scaled, y_train_scaled) # fit

Fitting 5 folds for each of 75 candidates, totalling 375 fits


In [18]:
#best parameters from grid search
grid_search.best_params_

{'criterion': 'squared_error',
 'max_depth': 9,
 'max_features': 'sqrt',
 'min_samples_split': 6,
 'n_estimators': 120,
 'random_state': 16}

Model Train (based on grid search)

In [28]:
# Train & test on the best estimator

rf = grid_search.best_estimator_

rf.fit(X_train_scaled, y_train_scaled)
y_pred = rf.predict(X_train_scaled)
score_train = r2(y_pred, y_train_scaled)
score_test = rf.score(X_test_scaled, y_test_scaled)

print("Model results without CV")
print('Train R2: {:0.4f}'.format(score_train))
print('Test R2: {:0.4f}'.format(score_test))

Model results without CV
Train R2: 0.8257
Test R2: 0.7909


Model evaluation metrics

In [29]:
# R2 and RMSE calculation
y_test_scaled_pred = rf.predict(X_test_scaled)
y_train_scaled_pred = rf.predict(X_train_scaled)

y_test_original_pred = scaler_y.inverse_transform(y_test_scaled_pred)
y_train_original_pred = scaler_y.inverse_transform(y_train_scaled_pred)

ton_test = y_test_original_pred[:, 0]
ton_train = y_train_original_pred[:, 0]

yield_test = y_test_original_pred[:, 1]
yield_train = y_train_original_pred[:, 1]

r2_train_ton = r2(pd.Series.to_numpy(y_train.iloc[:, 0]), ton_train)
r2_train_yield =  r2(pd.Series.to_numpy(y_train.iloc[:, 1]), yield_train)
r2_test_ton = r2(pd.Series.to_numpy(y_test.iloc[:, 0]), ton_test)
r2_test_yield = r2(pd.Series.to_numpy(y_test.iloc[:, 1]), yield_test)

print("Train R2 ton without CV: {:0.4f}".format(r2_train_ton))
print("Train R2 yield without CV: {:0.4f}".format(r2_train_yield))
print("Test R2 ton without CV: {:0.4f}".format(r2_test_ton))
print("Test R2 yield without CV: {:0.4f}".format(r2_test_yield))

rmse_train_ton = rmse(pd.Series.to_numpy(y_train.iloc[:, 0]), ton_train)
rmse_train_yield =  rmse(pd.Series.to_numpy(y_train.iloc[:, 1]), yield_train)
rmse_test_ton = rmse(pd.Series.to_numpy(y_test.iloc[:, 0]), ton_test)
rmse_test_yield = rmse(pd.Series.to_numpy(y_test.iloc[:, 1]), yield_test)

print()
print("Train rmse ton without CV: {:0.4f}".format(rmse_train_ton))
print("Train rmse yield without CV: {:0.4f}".format(rmse_train_yield))
print("Test rmse ton without CV: {:0.4f}".format(rmse_test_ton))
print("Test rmse yield without CV: {:0.4f}".format(rmse_test_yield))

Train R2 ton without CV: 0.8527
Train R2 yield without CV: 0.8976
Test R2 ton without CV: 0.7444
Test R2 yield without CV: 0.8374

Train rmse ton without CV: 1429.4210
Train rmse yield without CV: 6.9805
Test rmse ton without CV: 1915.0907
Test rmse yield without CV: 8.2050
