In [None]:
import numpy as np
import pandas as pd
import time
import os
from sklearn.ensemble import AdaBoostRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import (
    r2_score,
    mean_absolute_error,
    mean_squared_error,
    mean_absolute_percentage_error,
    explained_variance_score,
    max_error,
)
from joblib import dump

In [None]:
# Notebook configuration: everything a reader might want to tune lives here.
RANDOM_SEED = 32  # reused for the train/test split and for every AdaBoost model
SHEET_NAME = "Sheet3"
DATA_PATH = "/kaggle/input/permeabilitia/test data on new field.xlsx"
# Keep the trailing slash: artifact paths below are built by plain string concatenation.
OUTPUT_DIR = "/kaggle/working/output/"

# Create the artifact directory up front; a no-op when it already exists.
os.makedirs(name=OUTPUT_DIR, exist_ok=True)

In [None]:
# Read the configured worksheet of the Excel workbook into a DataFrame
df = pd.read_excel(io=DATA_PATH, sheet_name=SHEET_NAME)

In [None]:
# Every column except the last is a feature; the last column is the target.
X, y = df.iloc[:, :-1], df.iloc[:, -1]

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
split_parts = train_test_split(X, y, random_state=RANDOM_SEED, test_size=0.2)
X_train, X_test, y_train, y_test = split_parts

In [None]:
def convert_time(seconds):
    """Format a duration given in seconds as ``H:MM:SS``.

    The input is truncated to a whole number of seconds with ``int``.
    Minutes and seconds are zero-padded to two digits; hours are neither
    padded nor wrapped at 24.
    """
    whole_seconds = int(seconds)
    hours, remainder = divmod(whole_seconds, 3600)
    minutes, secs = divmod(remainder, 60)
    return f"{hours:d}:{minutes:02d}:{secs:02d}"

In [None]:
# Search space for AdaBoostRegressor: 7 x 9 x 3 = 189 candidate settings.
# The value order is kept as written because it fixes the row order of the
# grid-search results.
param_grid = dict(
    n_estimators=[5, 15, 35, 50, 10, 20, 30],
    learning_rate=[0.1, 0.5, 0.01, 0.03, 0.05, 0.001, 0.003, 0.005, 0.008],
    loss=["linear", "square", "exponential"],
)

In [None]:
# Exhaustive grid search over param_grid with 10-fold cross-validation.
# Both negative MSE and R2 are recorded for every candidate; refit="r2"
# makes R2 the metric that picks the best candidate.
regr = GridSearchCV(
    AdaBoostRegressor(random_state=RANDOM_SEED),
    param_grid,
    cv=10,
    scoring=["neg_mean_squared_error", "r2"],
    refit="r2",
    verbose=1,
)

# Time the search with a monotonic clock: time.time() follows the system
# wall clock and can jump (e.g. on an NTP adjustment), which would distort
# the reported duration. start_time / end_time are therefore only
# meaningful as a difference, not as timestamps.
start_time = time.perf_counter()
regr.fit(X_train, y_train)
end_time = time.perf_counter()

In [None]:
# Persist a compact view of the search: one row per candidate, holding the
# searched hyper-parameters and the mean cross-validated scores.
_result_columns = [
    "param_n_estimators",
    "param_learning_rate",
    "param_loss",
    "mean_test_r2",
    "mean_test_neg_mean_squared_error",
]
_cv_results_frame = pd.DataFrame(regr.cv_results_)
grid_results = _cv_results_frame[_result_columns]
grid_results.to_csv(f"{OUTPUT_DIR}performance_results_adaboost_gridsearchCV.csv", index=False)

In [None]:
# Train the final model on the training split with the hyper-parameters
# chosen by the grid search.
best_params = regr.best_params_

# Pass the whole best_params dict instead of naming each key, so a
# hyper-parameter later added to param_grid is not silently dropped here.
# The seed is only a default: a searched random_state, if any, wins.
_final_params = {"random_state": RANDOM_SEED, **best_params}
regressor = AdaBoostRegressor(**_final_params)
regressor.fit(X_train, y_train)

In [None]:
# Predict on the training split, the held-out split and the full dataset (in that order)
pred_train, pred_test, pred_full = (
    regressor.predict(features) for features in (X_train, X_test, X)
)

In [None]:
# Write observed vs. predicted values to one workbook, one sheet per data split
_prediction_sheets = {
    "training": pd.DataFrame({"y_train": y_train, "y_train_pred": pred_train}),
    "testing": pd.DataFrame({"y_test": y_test, "y_test_pred": pred_test}),
    "full_data": pd.DataFrame({"y": y, "y_pred": pred_full}),
}
with pd.ExcelWriter(f"{OUTPUT_DIR}predicted_adaboost_model.xlsx") as writer:
    for _sheet, _frame in _prediction_sheets.items():
        _frame.to_excel(writer, sheet_name=_sheet, index=False)

In [None]:
# Metrics calculation
def calculate_metrics(y_true, y_pred):
    """Compute a fixed set of regression metrics for one set of predictions.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, in the same order as ``y_true``.

    Returns
    -------
    dict
        Metric name -> value, in this order: "R2", "MAE", "MSE", "MAPE",
        "Explained Variance", "Max Error", "Min Error". "Min Error" is the
        smallest absolute residual.
    """
    # Compare element by element, like the sklearn metrics below. Subtracting
    # two pandas objects directly would align them on index labels instead,
    # which gives a wrong minimum when the indexes differ.
    abs_residuals = np.abs(np.asarray(y_true) - np.asarray(y_pred))
    return {
        "R2": r2_score(y_true, y_pred),
        "MAE": mean_absolute_error(y_true, y_pred),
        "MSE": mean_squared_error(y_true, y_pred),
        "MAPE": mean_absolute_percentage_error(y_true, y_pred),
        "Explained Variance": explained_variance_score(y_true, y_pred),
        "Max Error": max_error(y_true, y_pred),
        "Min Error": abs_residuals.min(),
    }

# Evaluate the final model on each data split.
_split_metrics = {
    "Training": calculate_metrics(y_train, pred_train),
    "Testing": calculate_metrics(y_test, pred_test),
    "Full Dataset": calculate_metrics(y, pred_full),
}

# Take the row labels from the metric dict itself rather than from a
# hand-maintained copy, so labels and values cannot drift out of sync.
_metric_names = list(_split_metrics["Training"])
metrics = {
    "Metric": _metric_names,
    **{
        split: [values[name] for name in _metric_names]
        for split, values in _split_metrics.items()
    },
}

# One row per metric, one column per data split.
metrics_df = pd.DataFrame(metrics)
metrics_df.to_csv(f"{OUTPUT_DIR}performance_adaboost_metrics.csv", index=False)

In [None]:
# Persist the fitted model next to the other artifacts
_model_path = f"{OUTPUT_DIR}trained_adaboost_model.joblib"
dump(regressor, _model_path)

# Report how long the grid search took and which hyper-parameters it chose
_elapsed_seconds = end_time - start_time
print(f"Execution Time: {convert_time(_elapsed_seconds)}")
print(f"Best Parameters: {best_params}")