# # Table of Contents
# 1. [Importing Libraries](#import-libraries)
# 2. [Hyperparam Tuning Optuna](#hyperparam-tuning-optuna)

# # Importing Libraries <a id="import-libraries"></a>

In [1]:
import optuna
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_absolute_error

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
df = pd.read_csv('C:/Users/Usuario/Documents/prueba_pwc/predictive_salary_model/data/processed/dataset_features.csv')

# # Hyperparam Tuning Optuna <a id="hyperparam-tuning-optuna"></a>

In [25]:
def objective(trial):

    n_estimators = trial.suggest_int("n_estimators", 50, 300, step=50)
    max_depth = trial.suggest_categorical("max_depth", [None, 10, 20, 30])
    min_samples_split = trial.suggest_int("min_samples_split", 2, 10)
    min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 10)

    rf = RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=42,
        n_jobs=-1
    )

    scores = cross_val_score(
        rf,
        X, 
        y,
        scoring="neg_mean_absolute_error",
        cv=5,
        n_jobs=-1
    )
    mae = -np.mean(scores) 
    return mae

def run_optuna_search(df, n_trials=20):

    global X, y 
  
    X = df.drop(columns=["Salary"], errors="ignore")
    y = df["Salary"]

    study = optuna.create_study(direction="minimize") 
    study.optimize(objective, n_trials=n_trials, show_progress_bar=True)

    print("Number of finished trials:", len(study.trials))
    print("Best trial:")
    trial = study.best_trial

    print(f"  Value (MAE): {trial.value:.2f}")
    print("  Params: ")
    for key, value in trial.params.items():
        print(f"    {key}: {value}")

    best_params = trial.params
    rf_best = RandomForestRegressor(
        n_estimators=best_params["n_estimators"],
        max_depth=best_params["max_depth"],
        min_samples_split=best_params["min_samples_split"],
        random_state=42,
        n_jobs=-1
    )
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    rf_best.fit(X_train, y_train)
    preds = rf_best.predict(X_test)
    final_mae = mean_absolute_error(y_test, preds)
    print(f"MAE en test con hiperparámetros óptimos: {final_mae:.2f}")

In [26]:
run_optuna_search(df, n_trials=50)

[I 2025-01-19 12:58:18,918] A new study created in memory with name: no-name-b72a1e7f-2e8d-41df-bf19-9d51a21acfb8
Best trial: 0. Best value: 1359.75:   2%|▏         | 1/50 [00:00<00:21,  2.32it/s]

[I 2025-01-19 12:58:19,349] Trial 0 finished with value: 1359.7466593315216 and parameters: {'n_estimators': 300, 'max_depth': 20, 'min_samples_split': 6, 'min_samples_leaf': 6}. Best is trial 0 with value: 1359.7466593315216.


Best trial: 1. Best value: 1030.25:   4%|▍         | 2/50 [00:00<00:20,  2.36it/s]

[I 2025-01-19 12:58:19,769] Trial 1 finished with value: 1030.251314143125 and parameters: {'n_estimators': 300, 'max_depth': 20, 'min_samples_split': 6, 'min_samples_leaf': 4}. Best is trial 1 with value: 1030.251314143125.


Best trial: 1. Best value: 1030.25:   6%|▌         | 3/50 [00:01<00:18,  2.49it/s]

[I 2025-01-19 12:58:20,144] Trial 2 finished with value: 1181.934374751536 and parameters: {'n_estimators': 250, 'max_depth': 20, 'min_samples_split': 6, 'min_samples_leaf': 5}. Best is trial 1 with value: 1030.251314143125.


Best trial: 1. Best value: 1030.25:   8%|▊         | 4/50 [00:01<00:17,  2.65it/s]

[I 2025-01-19 12:58:20,483] Trial 3 finished with value: 1628.3672468504942 and parameters: {'n_estimators': 200, 'max_depth': 30, 'min_samples_split': 2, 'min_samples_leaf': 7}. Best is trial 1 with value: 1030.251314143125.


Best trial: 1. Best value: 1030.25:  10%|█         | 5/50 [00:01<00:15,  2.93it/s]

[I 2025-01-19 12:58:20,762] Trial 4 finished with value: 1627.8217753897661 and parameters: {'n_estimators': 150, 'max_depth': 10, 'min_samples_split': 7, 'min_samples_leaf': 7}. Best is trial 1 with value: 1030.251314143125.


Best trial: 5. Best value: 815.907:  12%|█▏        | 6/50 [00:02<00:14,  2.96it/s]

[I 2025-01-19 12:58:21,091] Trial 5 finished with value: 815.9070566818826 and parameters: {'n_estimators': 200, 'max_depth': None, 'min_samples_split': 10, 'min_samples_leaf': 1}. Best is trial 5 with value: 815.9070566818826.


Best trial: 7. Best value: 704.538:  16%|█▌        | 8/50 [00:02<00:11,  3.69it/s]

[I 2025-01-19 12:58:21,338] Trial 6 finished with value: 1061.4302987374354 and parameters: {'n_estimators': 100, 'max_depth': None, 'min_samples_split': 7, 'min_samples_leaf': 4}. Best is trial 5 with value: 815.9070566818826.
[I 2025-01-19 12:58:21,531] Trial 7 finished with value: 704.5381515531527 and parameters: {'n_estimators': 100, 'max_depth': 30, 'min_samples_split': 7, 'min_samples_leaf': 1}. Best is trial 7 with value: 704.5381515531527.


Best trial: 7. Best value: 704.538:  20%|██        | 10/50 [00:02<00:09,  4.44it/s]

[I 2025-01-19 12:58:21,717] Trial 8 finished with value: 2209.9522704378114 and parameters: {'n_estimators': 100, 'max_depth': 30, 'min_samples_split': 6, 'min_samples_leaf': 9}. Best is trial 7 with value: 704.5381515531527.
[I 2025-01-19 12:58:21,899] Trial 9 finished with value: 1061.4302987374374 and parameters: {'n_estimators': 100, 'max_depth': 10, 'min_samples_split': 8, 'min_samples_leaf': 4}. Best is trial 7 with value: 704.5381515531527.


Best trial: 10. Best value: 670.945:  24%|██▍       | 12/50 [00:03<00:06,  5.60it/s]

[I 2025-01-19 12:58:22,031] Trial 10 finished with value: 670.9450879450873 and parameters: {'n_estimators': 50, 'max_depth': 30, 'min_samples_split': 4, 'min_samples_leaf': 1}. Best is trial 10 with value: 670.9450879450873.
[I 2025-01-19 12:58:22,167] Trial 11 finished with value: 689.4876362076358 and parameters: {'n_estimators': 50, 'max_depth': 30, 'min_samples_split': 3, 'min_samples_leaf': 1}. Best is trial 10 with value: 670.9450879450873.


Best trial: 10. Best value: 670.945:  28%|██▊       | 14/50 [00:03<00:05,  6.30it/s]

[I 2025-01-19 12:58:22,320] Trial 12 finished with value: 714.2429305007321 and parameters: {'n_estimators': 50, 'max_depth': 30, 'min_samples_split': 3, 'min_samples_leaf': 2}. Best is trial 10 with value: 670.9450879450873.
[I 2025-01-19 12:58:22,451] Trial 13 finished with value: 714.2429305007305 and parameters: {'n_estimators': 50, 'max_depth': 30, 'min_samples_split': 4, 'min_samples_leaf': 2}. Best is trial 10 with value: 670.9450879450873.


Best trial: 10. Best value: 670.945:  30%|███       | 15/50 [00:03<00:05,  6.65it/s]

[I 2025-01-19 12:58:22,583] Trial 14 finished with value: 714.2429305007299 and parameters: {'n_estimators': 50, 'max_depth': 30, 'min_samples_split': 4, 'min_samples_leaf': 2}. Best is trial 10 with value: 670.9450879450873.


Best trial: 10. Best value: 670.945:  34%|███▍      | 17/50 [00:04<00:05,  6.21it/s]

[I 2025-01-19 12:58:22,842] Trial 15 finished with value: 912.1872656662138 and parameters: {'n_estimators': 150, 'max_depth': 30, 'min_samples_split': 4, 'min_samples_leaf': 3}. Best is trial 10 with value: 670.9450879450873.
[I 2025-01-19 12:58:22,951] Trial 16 finished with value: 2515.770833243915 and parameters: {'n_estimators': 50, 'max_depth': 30, 'min_samples_split': 2, 'min_samples_leaf': 10}. Best is trial 10 with value: 670.9450879450873.


Best trial: 10. Best value: 670.945:  38%|███▊      | 19/50 [00:04<00:05,  5.75it/s]

[I 2025-01-19 12:58:23,215] Trial 17 finished with value: 695.5336236236241 and parameters: {'n_estimators': 150, 'max_depth': None, 'min_samples_split': 4, 'min_samples_leaf': 1}. Best is trial 10 with value: 670.9450879450873.
[I 2025-01-19 12:58:23,348] Trial 18 finished with value: 939.3400657184359 and parameters: {'n_estimators': 50, 'max_depth': 10, 'min_samples_split': 3, 'min_samples_leaf': 3}. Best is trial 10 with value: 670.9450879450873.


Best trial: 10. Best value: 670.945:  40%|████      | 20/50 [00:04<00:05,  5.65it/s]

[I 2025-01-19 12:58:23,532] Trial 19 finished with value: 930.8140077899945 and parameters: {'n_estimators': 100, 'max_depth': 30, 'min_samples_split': 5, 'min_samples_leaf': 3}. Best is trial 10 with value: 670.9450879450873.


Best trial: 10. Best value: 670.945:  42%|████▏     | 21/50 [00:05<00:07,  4.09it/s]

[I 2025-01-19 12:58:23,933] Trial 20 finished with value: 690.5536988416987 and parameters: {'n_estimators': 250, 'max_depth': 30, 'min_samples_split': 3, 'min_samples_leaf': 1}. Best is trial 10 with value: 670.9450879450873.


Best trial: 10. Best value: 670.945:  44%|████▍     | 22/50 [00:05<00:09,  3.11it/s]

[I 2025-01-19 12:58:24,437] Trial 21 finished with value: 690.5536988416985 and parameters: {'n_estimators': 250, 'max_depth': 30, 'min_samples_split': 3, 'min_samples_leaf': 1}. Best is trial 10 with value: 670.9450879450873.


Best trial: 10. Best value: 670.945:  46%|████▌     | 23/50 [00:05<00:09,  2.87it/s]

[I 2025-01-19 12:58:24,848] Trial 22 finished with value: 701.8786124450535 and parameters: {'n_estimators': 250, 'max_depth': 30, 'min_samples_split': 2, 'min_samples_leaf': 2}. Best is trial 10 with value: 670.9450879450873.


Best trial: 10. Best value: 670.945:  48%|████▊     | 24/50 [00:06<00:09,  2.88it/s]

[I 2025-01-19 12:58:25,192] Trial 23 finished with value: 707.8925749775752 and parameters: {'n_estimators': 200, 'max_depth': 30, 'min_samples_split': 5, 'min_samples_leaf': 1}. Best is trial 10 with value: 670.9450879450873.


Best trial: 10. Best value: 670.945:  50%|█████     | 25/50 [00:06<00:09,  2.73it/s]

[I 2025-01-19 12:58:25,604] Trial 24 finished with value: 701.8786124450536 and parameters: {'n_estimators': 250, 'max_depth': 30, 'min_samples_split': 3, 'min_samples_leaf': 2}. Best is trial 10 with value: 670.9450879450873.


Best trial: 10. Best value: 670.945:  52%|█████▏    | 26/50 [00:07<00:09,  2.49it/s]

[I 2025-01-19 12:58:26,086] Trial 25 finished with value: 909.3444973524884 and parameters: {'n_estimators': 300, 'max_depth': 30, 'min_samples_split': 5, 'min_samples_leaf': 3}. Best is trial 10 with value: 670.9450879450873.


Best trial: 27. Best value: 661.494:  56%|█████▌    | 28/50 [00:07<00:06,  3.38it/s]

[I 2025-01-19 12:58:26,353] Trial 26 finished with value: 1185.554487901267 and parameters: {'n_estimators': 150, 'max_depth': 20, 'min_samples_split': 3, 'min_samples_leaf': 5}. Best is trial 10 with value: 670.9450879450873.
[I 2025-01-19 12:58:26,496] Trial 27 finished with value: 661.4936936936938 and parameters: {'n_estimators': 50, 'max_depth': 10, 'min_samples_split': 2, 'min_samples_leaf': 1}. Best is trial 27 with value: 661.4936936936938.


Best trial: 27. Best value: 661.494:  60%|██████    | 30/50 [00:07<00:04,  4.46it/s]

[I 2025-01-19 12:58:26,615] Trial 28 finished with value: 2063.599929125309 and parameters: {'n_estimators': 50, 'max_depth': 10, 'min_samples_split': 2, 'min_samples_leaf': 8}. Best is trial 27 with value: 661.4936936936938.
[I 2025-01-19 12:58:26,798] Trial 29 finished with value: 1371.7859632303978 and parameters: {'n_estimators': 100, 'max_depth': 10, 'min_samples_split': 2, 'min_samples_leaf': 6}. Best is trial 27 with value: 661.4936936936938.


Best trial: 27. Best value: 661.494:  64%|██████▍   | 32/50 [00:08<00:03,  5.76it/s]

[I 2025-01-19 12:58:26,929] Trial 30 finished with value: 714.2429305007303 and parameters: {'n_estimators': 50, 'max_depth': 10, 'min_samples_split': 4, 'min_samples_leaf': 2}. Best is trial 27 with value: 661.4936936936938.
[I 2025-01-19 12:58:27,049] Trial 31 finished with value: 689.4876362076365 and parameters: {'n_estimators': 50, 'max_depth': 10, 'min_samples_split': 3, 'min_samples_leaf': 1}. Best is trial 27 with value: 661.4936936936938.


Best trial: 27. Best value: 661.494:  68%|██████▊   | 34/50 [00:08<00:02,  5.97it/s]

[I 2025-01-19 12:58:27,167] Trial 32 finished with value: 661.4936936936938 and parameters: {'n_estimators': 50, 'max_depth': 10, 'min_samples_split': 2, 'min_samples_leaf': 1}. Best is trial 27 with value: 661.4936936936938.
[I 2025-01-19 12:58:27,359] Trial 33 finished with value: 930.8140077899945 and parameters: {'n_estimators': 100, 'max_depth': 10, 'min_samples_split': 2, 'min_samples_leaf': 3}. Best is trial 27 with value: 661.4936936936938.


Best trial: 27. Best value: 661.494:  72%|███████▏  | 36/50 [00:08<00:02,  5.90it/s]

[I 2025-01-19 12:58:27,492] Trial 34 finished with value: 714.2429305007317 and parameters: {'n_estimators': 50, 'max_depth': 20, 'min_samples_split': 2, 'min_samples_leaf': 2}. Best is trial 27 with value: 661.4936936936938.
[I 2025-01-19 12:58:27,690] Trial 35 finished with value: 691.4126820326813 and parameters: {'n_estimators': 50, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 1}. Best is trial 27 with value: 661.4936936936938.


Best trial: 27. Best value: 661.494:  76%|███████▌  | 38/50 [00:09<00:01,  6.16it/s]

[I 2025-01-19 12:58:27,877] Trial 36 finished with value: 1056.96780088498 and parameters: {'n_estimators': 100, 'max_depth': 20, 'min_samples_split': 10, 'min_samples_leaf': 4}. Best is trial 27 with value: 661.4936936936938.
[I 2025-01-19 12:58:28,010] Trial 37 finished with value: 714.2429305007303 and parameters: {'n_estimators': 50, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 2}. Best is trial 27 with value: 661.4936936936938.


Best trial: 27. Best value: 661.494:  80%|████████  | 40/50 [00:09<00:01,  5.66it/s]

[I 2025-01-19 12:58:28,204] Trial 38 finished with value: 683.1387129987127 and parameters: {'n_estimators': 100, 'max_depth': 10, 'min_samples_split': 3, 'min_samples_leaf': 1}. Best is trial 27 with value: 661.4936936936938.
[I 2025-01-19 12:58:28,392] Trial 39 finished with value: 1199.5039764721748 and parameters: {'n_estimators': 100, 'max_depth': 10, 'min_samples_split': 4, 'min_samples_leaf': 5}. Best is trial 27 with value: 661.4936936936938.


Best trial: 27. Best value: 661.494:  84%|████████▍ | 42/50 [00:09<00:01,  5.97it/s]

[I 2025-01-19 12:58:28,587] Trial 40 finished with value: 1693.5961854881316 and parameters: {'n_estimators': 100, 'max_depth': 10, 'min_samples_split': 2, 'min_samples_leaf': 7}. Best is trial 27 with value: 661.4936936936938.
[I 2025-01-19 12:58:28,720] Trial 41 finished with value: 689.487636207636 and parameters: {'n_estimators': 50, 'max_depth': 10, 'min_samples_split': 3, 'min_samples_leaf': 1}. Best is trial 27 with value: 661.4936936936938.


Best trial: 27. Best value: 661.494:  88%|████████▊ | 44/50 [00:10<00:00,  6.55it/s]

[I 2025-01-19 12:58:28,865] Trial 42 finished with value: 689.4876362076358 and parameters: {'n_estimators': 50, 'max_depth': 10, 'min_samples_split': 3, 'min_samples_leaf': 1}. Best is trial 27 with value: 661.4936936936938.
[I 2025-01-19 12:58:28,999] Trial 43 finished with value: 720.08501682261 and parameters: {'n_estimators': 50, 'max_depth': 10, 'min_samples_split': 8, 'min_samples_leaf': 2}. Best is trial 27 with value: 661.4936936936938.


Best trial: 44. Best value: 654.793:  90%|█████████ | 45/50 [00:10<00:00,  5.91it/s]

[I 2025-01-19 12:58:29,207] Trial 44 finished with value: 654.7927927927929 and parameters: {'n_estimators': 100, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 1}. Best is trial 44 with value: 654.7927927927929.


Best trial: 44. Best value: 654.793:  92%|█████████▏| 46/50 [00:10<00:00,  5.02it/s]

[I 2025-01-19 12:58:29,476] Trial 45 finished with value: 671.8264264264265 and parameters: {'n_estimators': 150, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 1}. Best is trial 44 with value: 654.7927927927929.


Best trial: 44. Best value: 654.793:  94%|█████████▍| 47/50 [00:10<00:00,  4.56it/s]

[I 2025-01-19 12:58:29,743] Trial 46 finished with value: 696.9841983283306 and parameters: {'n_estimators': 150, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 2}. Best is trial 44 with value: 654.7927927927929.


Best trial: 44. Best value: 654.793:  96%|█████████▌| 48/50 [00:11<00:00,  4.20it/s]

[I 2025-01-19 12:58:30,024] Trial 47 finished with value: 1047.2164027038707 and parameters: {'n_estimators': 150, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 4}. Best is trial 44 with value: 654.7927927927929.


Best trial: 44. Best value: 654.793:  98%|█████████▊| 49/50 [00:11<00:00,  4.35it/s]

[I 2025-01-19 12:58:30,234] Trial 48 finished with value: 905.0237889469856 and parameters: {'n_estimators': 100, 'max_depth': None, 'min_samples_split': 9, 'min_samples_leaf': 3}. Best is trial 44 with value: 654.7927927927929.


Best trial: 44. Best value: 654.793: 100%|██████████| 50/50 [00:11<00:00,  4.28it/s]

[I 2025-01-19 12:58:30,596] Trial 49 finished with value: 712.75747911748 and parameters: {'n_estimators': 200, 'max_depth': None, 'min_samples_split': 6, 'min_samples_leaf': 1}. Best is trial 44 with value: 654.7927927927929.
Number of finished trials: 50
Best trial:
  Value (MAE): 654.79
  Params: 
    n_estimators: 100
    max_depth: None
    min_samples_split: 2
    min_samples_leaf: 1
MAE en test con hiperparámetros óptimos: 470.00



