In [1]:

import warnings
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error, root_mean_squared_error
from xgboost import XGBRegressor
import optuna
import mlflow
import mlflow.xgboost

In [3]:
# The three dataset splits live in the same directory. Keeping that directory in
# one constant means relocating the data is a one-line change instead of three.
# NOTE(review): this is still a machine-specific absolute path — switch to a
# project-relative location before sharing the notebook.
DATA_DIR = "/Users/mac/Documents/MySchoolDocs/PERSONAL/JOB APPLICATIONS/ENERGY_PROJECT/data/final_datasets"

train = pd.read_csv(f"{DATA_DIR}/train_df.csv")
# NOTE: `eval` shadows the Python builtin of the same name. The name is kept
# because the feature/target split further down reads the frame under it.
eval = pd.read_csv(f"{DATA_DIR}/eval_df.csv")
holdout = pd.read_csv(f"{DATA_DIR}/holdout_df.csv")

In [4]:
# Columns removed from every feature matrix; `target_strain_index` is the label.
NON_FEATURE_COLS = ["Date", "Strain_Index", "target_strain_index"]
TARGET_COL = "target_strain_index"

# Each split yields a (features, label) pair.
# Naming note: the `*_test` pair is built from the `eval` frame and the
# `*_valid` pair from the `holdout` frame.
X_train, y_train = train.drop(columns=NON_FEATURE_COLS), train[TARGET_COL]

X_test, y_test = eval.drop(columns=NON_FEATURE_COLS), eval[TARGET_COL]

X_valid, y_valid = holdout.drop(columns=NON_FEATURE_COLS), holdout[TARGET_COL]

In [5]:
def _mape(y_true, y_pred, eps=1e-6):
    """Return the mean absolute percentage error, in percent, as a plain float.

    ``|y_true|`` is clipped from below at ``eps`` so that zero-valued targets do
    not cause a division by zero.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    denom = np.clip(np.abs(y_true), eps, None)
    return float(np.mean(np.abs((y_true - y_pred) / denom)) * 100)


def _smape(y_true, y_pred, eps=1e-6):
    """Return the symmetric MAPE, in percent, as a plain float.

    ``eps`` is added to the denominator so that a pair of zeros does not cause
    a division by zero.
    """
    # Coerce to float arrays, mirroring _mape, so both metrics accept the same inputs.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    denom = np.abs(y_true) + np.abs(y_pred) + eps
    return float(100 * np.mean(2 * np.abs(y_pred - y_true) / denom))


def objective(trial):
    """Optuna objective: fit one XGBoost candidate and score it.

    Hyperparameters are sampled from ``trial``, the model is fitted on the
    notebook-level ``X_train`` / ``y_train`` and scored on ``X_test`` /
    ``y_test``. The sampled parameters and the four metrics are logged to a
    nested MLflow run.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    tuple of float
        ``(rmse, mae, mape, smape)`` on the ``X_test`` / ``y_test`` split.

    Notes
    -----
    NOTE(review): returning four values makes this a multi-objective objective,
    so the study presumably needs four ``directions`` — the study creation is
    not part of this cell; confirm.
    """
    # Sampling order is kept stable so seeded samplers reproduce the same trials.
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 200, 1000),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "gamma": trial.suggest_float("gamma", 0.0, 5.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 10.0, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 10.0, log=True),
        # Fixed (not tuned) settings.
        "random_state": 42,
        "n_jobs": -1,
        "tree_method": "hist",
    }

    with mlflow.start_run(nested=True):
        model = XGBRegressor(**params)
        model.fit(X_train, y_train)

        y_pred = model.predict(X_test)

        # Every metric is a plain Python float, so logging and the return value
        # never mix numpy scalars with builtin floats.
        metrics = {
            "rmse": float(root_mean_squared_error(y_test, y_pred)),
            "mae": float(mean_absolute_error(y_test, y_pred)),
            "mape": _mape(y_test, y_pred),
            "smape": _smape(y_test, y_pred),
        }

        # Log hyperparameters + metrics
        mlflow.log_params(params)
        mlflow.log_metrics(metrics)

    return metrics["rmse"], metrics["mae"], metrics["mape"], metrics["smape"]