# # Table of Contents
# 1. [Importing Libraries](#import-libraries)
# 2. [Hyperparam Tuning Grid](#hyperparam-tuning-grid)

# # Importing Libraries <a id="import-libraries"></a>

In [1]:
import os
from pathlib import Path

import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score

from loguru import logger

In [2]:
# Location of the processed feature table. Set the SALARY_DATA_PATH environment
# variable to run this notebook on another machine; the fallback is the original
# author's local path, so existing runs behave exactly as before.
DATA_PATH = Path(
    os.environ.get(
        "SALARY_DATA_PATH",
        'C:/Users/Usuario/Documents/prueba_pwc/predictive_salary_model/data/processed/dataset_features.csv',
    )
)
df = pd.read_csv(DATA_PATH)

# # Hyperparam Tuning Grid <a id="hyperparam-tuning-grid"></a>

In [4]:
def hyperparam_tuning_salary(
    df: pd.DataFrame,
    *,
    param_grid=None,
    test_size: float = 0.2,
    cv: int = 5,
    random_state: int = 42,
) -> None:
    """Grid-search a RandomForestRegressor on ``Salary`` and log the outcome.

    The frame is split into train/test partitions, every combination in
    ``param_grid`` is cross-validated on the training partition (scored by
    negative mean absolute error), and the best estimator is then evaluated
    on the held-out test partition. Results are only logged; nothing is
    returned.

    Parameters
    ----------
    df:
        Feature table containing a ``Salary`` column (the target). A
        ``Salary_log`` column, if present, is excluded from the features.
    param_grid:
        Grid passed to ``GridSearchCV``. When ``None`` the notebook's original
        grid over ``n_estimators``, ``max_depth`` and ``min_samples_split``
        is used.
    test_size:
        Fraction of rows held out for the final test evaluation.
    cv:
        Number of cross-validation folds used by the grid search.
    random_state:
        Seed used for both the train/test split and the random forest.

    Raises
    ------
    KeyError
        If ``df`` has no ``Salary`` column.
    ValueError
        If no feature columns remain once the target columns are removed.
    """
    if "Salary" not in df.columns:
        raise KeyError("Column 'Salary' not found in df; it is required as the target.")

    # `Salary_log` (presumably a transform of the target) is dropped as well,
    # when present, so it cannot be used as a predictor.
    X = df.drop(columns=["Salary", "Salary_log"], errors="ignore")
    y = df["Salary"]

    # Fail fast with a clear message instead of letting the fit fail later.
    if X.shape[1] == 0:
        raise ValueError("No feature columns left after removing 'Salary'/'Salary_log'.")

    if param_grid is None:
        param_grid = {
            "n_estimators": [50, 100, 200],
            "max_depth": [None, 10, 20],
            "min_samples_split": [2, 5],
        }

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    grid_search = GridSearchCV(
        estimator=RandomForestRegressor(random_state=random_state),
        param_grid=param_grid,
        scoring="neg_mean_absolute_error",
        cv=cv,
        n_jobs=-1,
        verbose=2,
    )

    logger.info("Iniciando GridSearchCV para RandomForest (Salary).")
    # Only the training partition is searched; the test partition stays unseen.
    grid_search.fit(X_train, y_train)

    best_params = grid_search.best_params_
    best_score = grid_search.best_score_  # negative MAE: closer to 0 is better
    logger.info(f"Mejores parámetros: {best_params}")
    logger.info(f"Mejor score (neg MAE): {best_score}")

    # Evaluate the best estimator found by the search on the held-out rows.
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    logger.info(f"En test, MAE: {mae:.2f}, R²: {r2:.2f}")


# Run the grid search on the loaded feature table; the best parameters and the
# test-set metrics are reported through the logger output that follows.
hyperparam_tuning_salary(df)

[32m2025-01-19 11:37:48.973[0m | [1mINFO    [0m | [36m__main__[0m:[36mhyperparam_tuning_salary[0m:[36m25[0m - [1mIniciando GridSearchCV para RandomForest (Salary).[0m


Fitting 5 folds for each of 18 candidates, totalling 90 fits


[32m2025-01-19 11:37:51.519[0m | [1mINFO    [0m | [36m__main__[0m:[36mhyperparam_tuning_salary[0m:[36m30[0m - [1mMejores parámetros: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 200}[0m
[32m2025-01-19 11:37:51.520[0m | [1mINFO    [0m | [36m__main__[0m:[36mhyperparam_tuning_salary[0m:[36m31[0m - [1mMejor score (neg MAE): -10437.788967735862[0m
[32m2025-01-19 11:37:51.528[0m | [1mINFO    [0m | [36m__main__[0m:[36mhyperparam_tuning_salary[0m:[36m37[0m - [1mEn test, MAE: 10570.46, R²: 0.88[0m


* GridSearchCV identified {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 200} as the best parameters for RandomForest (Salary), with MAE ~$10.57k and R²=0.88 on the test set.
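* Conclusion: the tuned model's test MAE (~$10.57k) is close to its cross-validated MAE (~$10.44k), so the error is stable at roughly $10.5k. Note that this notebook does not evaluate a default-settings RandomForest, so the improvement over default settings still needs to be confirmed against a baseline run.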