# # Table of Contents
# 1. [Importing Libraries](#import-libraries)
# 2. [Ensemble Model](#ensemble-model)

# # Importing Libraries <a id="import-libraries"></a>

In [1]:
import os

import pandas as pd

from sklearn.ensemble import VotingRegressor, RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score

from xgboost import XGBRegressor

In [2]:
# Location of the input CSV. The original absolute path is kept as the default so
# existing runs behave the same; set the SALARY_DATASET_PATH environment variable
# to point the notebook at the file on another machine without editing this cell.
DATASET_PATH = os.environ.get(
    "SALARY_DATASET_PATH",
    'C:/Users/Usuario/Documents/prueba_pwc/predictive_salary_model/data/processed/dataset_features.csv',
)
df = pd.read_csv(DATASET_PATH)

# # Ensemble Model <a id="ensemble-model"></a>

In [3]:
def voting_ensemble_salary(
    df: pd.DataFrame,
    *,
    test_size: float = 0.25,
    random_state: int = 42,
) -> None:
    """Fit a voting ensemble on ``Salary`` and print its hold-out MAE and R².

    The ensemble is a ``VotingRegressor`` over a ``RandomForestRegressor``, an
    ``XGBRegressor`` and a ``LinearRegression``. Every column of ``df`` other
    than ``Salary`` and ``Salary_log`` is used as a feature.

    Parameters
    ----------
    df : pd.DataFrame
        Table containing the ``Salary`` target plus the feature columns.
    test_size : float, default 0.25
        Fraction of rows held out for evaluation. 0.25 is the value
        ``train_test_split`` uses when no size is given, so the default
        reproduces the original split.
    random_state : int, default 42
        Seed shared by the train/test split, the random forest and XGBoost.

    Returns
    -------
    None
        The metrics are printed, not returned.

    Raises
    ------
    KeyError
        If ``df`` has no ``Salary`` column.
    """
    # Fail before any work is done, with a message that names the requirement.
    if "Salary" not in df.columns:
        raise KeyError("'Salary' column not found; it is required as the regression target")

    # Salary_log is removed from the features as well when present (presumably a
    # transform of the target that would leak it -- confirm). errors="ignore"
    # keeps the drop working for frames that do not carry that column.
    X = df.drop(columns=["Salary", "Salary_log"], errors="ignore")
    y = df["Salary"]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    rf = RandomForestRegressor(random_state=random_state)
    xgb = XGBRegressor(random_state=random_state)
    lr = LinearRegression()

    ensemble = VotingRegressor([('rf', rf), ('xgb', xgb), ('lr', lr)])
    ensemble.fit(X_train, y_train)

    # Evaluate only on the rows held out from fitting.
    preds = ensemble.predict(X_test)
    mae = mean_absolute_error(y_test, preds)
    r2 = r2_score(y_test, preds)

    print(f"VotingRegressor SALARY -> MAE: {mae:.2f}, R²: {r2:.2f}")

In [4]:
# Fit the voting ensemble on the loaded table and print its hold-out MAE and R².
voting_ensemble_salary(df)

VotingRegressor SALARY -> MAE: 10430.45, R²: 0.90


* A VotingRegressor (RandomForest + XGBoost + LinearRegression) reached MAE ~$10.43k and R²=0.90 on the held-out test split.
* Conclusion: The ensemble matches or slightly improves on a well-tuned RandomForest, but it does not dramatically surpass it. It may, however, offer greater robustness, because it combines the predictions of three different model families.