# # Table of Contents
# 1. [Importing Libraries](#import-libraries)
# 2. [Advanced Validation](#advanced-validation)

# # Importing Libraries <a id="import-libraries"></a>

In [1]:
import os

import numpy as np
import pandas as pd

from sklearn.model_selection import RepeatedKFold, cross_val_score, KFold, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

from loguru import logger

In [2]:
# Location of the processed feature table. Set the SALARY_DATASET_PATH environment
# variable to run the notebook on another machine; when it is unset the original
# author's local path is used, so existing behaviour is unchanged.
DATASET_PATH = os.environ.get(
    "SALARY_DATASET_PATH",
    "C:/Users/Usuario/Documents/prueba_pwc/predictive_salary_model/data/processed/dataset_features.csv",
)
df = pd.read_csv(DATASET_PATH)

# # Advanced Validation <a id="advanced-validation"></a>

In [3]:
def repeated_kfold_salary(df, n_splits=5, n_repeats=3, random_state=42):
    """Evaluate a default random forest on ``Salary`` with repeated k-fold CV.

    Every column other than ``Salary`` and ``Salary_log`` is used as a feature.
    The mean and standard deviation of the per-fold MSE are written to the
    logger, and the per-fold values are returned so they can be reused.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table; must contain a ``Salary`` column.
    n_splits : int, default 5
        Number of folds in each repetition.
    n_repeats : int, default 3
        Number of times the k-fold split is repeated.
    random_state : int, default 42
        Seed used for both the splitter and the forest.

    Returns
    -------
    numpy.ndarray
        One MSE value per evaluated fold (``n_splits * n_repeats`` entries).

    Raises
    ------
    KeyError
        If ``df`` has no ``Salary`` column.
    """
    # Fail early with a readable message instead of a bare KeyError from indexing.
    if "Salary" not in df.columns:
        raise KeyError("Column 'Salary' is required to run the evaluation.")

    # Salary_log is presumably a transform of the target (TODO confirm); either way it
    # is excluded from the features, and errors="ignore" tolerates its absence.
    X = df.drop(columns=["Salary", "Salary_log"], errors="ignore")
    y = df["Salary"]

    logger.info("Usando RepeatedKFold para evaluar MSE en SALARY.")
    rkf = RepeatedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=random_state)
    rf = RandomForestRegressor(random_state=random_state)
    # The scorer returns negated MSE ("greater is better"), so flip the sign back.
    scores = cross_val_score(rf, X, y, scoring='neg_mean_squared_error', cv=rkf, n_jobs=-1)
    mse_array = -scores
    logger.info(f"RepeatedKFold MSE: {np.mean(mse_array):.2f} ± {np.std(mse_array):.2f}")
    return mse_array

In [4]:
# Run the repeated k-fold evaluation on the loaded frame; the summary goes to the logger.
repeated_kfold_salary(df=df)

[32m2025-01-18 18:38:54.906[0m | [1mINFO    [0m | [36m__main__[0m:[36mrepeated_kfold_salary[0m:[36m6[0m - [1mUsando RepeatedKFold para evaluar MSE en SALARY.[0m
[32m2025-01-18 18:38:58.607[0m | [1mINFO    [0m | [36m__main__[0m:[36mrepeated_kfold_salary[0m:[36m11[0m - [1mRepeatedKFold MSE: 250877583.04 ± 68876610.16[0m


In [5]:
def nested_cv_salary(df, param_grid=None, outer_splits=5, inner_splits=3, random_state=42):
    """Estimate the MAE of a tuned random forest on ``Salary`` with nested CV.

    The outer loop holds out one fold for scoring; on the remaining data an inner
    grid search picks the hyper-parameters. The mean and standard deviation of
    the outer-fold MAE are written to the logger, and the per-fold values are
    returned so they can be reused.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table; must contain a ``Salary`` column. Every column other than
        ``Salary`` and ``Salary_log`` is used as a feature.
    param_grid : dict, optional
        Grid handed to ``GridSearchCV``. Defaults to
        ``{"n_estimators": [50, 100], "max_depth": [None, 10]}``.
    outer_splits : int, default 5
        Number of outer (scoring) folds.
    inner_splits : int, default 3
        Number of inner (tuning) folds.
    random_state : int, default 42
        Seed used for both splitters and the forest.

    Returns
    -------
    numpy.ndarray
        MAE on each outer test fold (``outer_splits`` entries).

    Raises
    ------
    KeyError
        If ``df`` has no ``Salary`` column.
    """
    # Fail early with a readable message instead of a bare KeyError from indexing.
    if "Salary" not in df.columns:
        raise KeyError("Column 'Salary' is required to run the evaluation.")

    # Build the default here rather than in the signature to avoid a shared mutable default.
    if param_grid is None:
        param_grid = {
            "n_estimators": [50, 100],
            "max_depth": [None, 10]
        }

    X = df.drop(columns=["Salary", "Salary_log"], errors="ignore")
    y = df["Salary"]

    outer_cv = KFold(n_splits=outer_splits, shuffle=True, random_state=random_state)
    # Loop-invariant: with an integer seed each split() call reproduces the same
    # shuffling, so one splitter object can serve every outer fold.
    inner_cv = KFold(n_splits=inner_splits, shuffle=True, random_state=random_state)
    outer_scores = []

    logger.info("Iniciando Nested CV con SALARY.")
    for train_idx, test_idx in outer_cv.split(X):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        rf = RandomForestRegressor(random_state=random_state)
        grid = GridSearchCV(rf, param_grid, scoring='neg_mean_absolute_error', cv=inner_cv, n_jobs=-1)
        grid.fit(X_train, y_train)

        # Score the winning configuration on data the inner search never saw.
        preds = grid.best_estimator_.predict(X_test)
        outer_scores.append(mean_absolute_error(y_test, preds))

    outer_scores = np.asarray(outer_scores)
    logger.info(f"Nested CV MAE: {np.mean(outer_scores):.2f} ± {np.std(outer_scores):.2f}")
    return outer_scores

In [6]:
# Run the nested cross-validation on the loaded frame; the summary goes to the logger.
nested_cv_salary(df=df)

[32m2025-01-18 18:39:27.822[0m | [1mINFO    [0m | [36m__main__[0m:[36mnested_cv_salary[0m:[36m14[0m - [1mIniciando Nested CV con SALARY.[0m
[32m2025-01-18 18:39:29.697[0m | [1mINFO    [0m | [36m__main__[0m:[36mnested_cv_salary[0m:[36m29[0m - [1mNested CV MAE: 10337.70 ± 722.40[0m
