In [2]:
import os

import numpy as np
import pandas as pd
from sklearn.dummy import DummyRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

In [3]:
# Location of the cleaned dataset. The default is the original author's local
# path; set the SALARY_DATASET_PATH environment variable to run the notebook on
# another machine without editing this cell.
DATASET_PATH = os.environ.get(
    "SALARY_DATASET_PATH",
    'C:/Users/Usuario/Documents/prueba_pwc/predictive_salary_model/data/interim/dataset_cleaned.csv',
)
df = pd.read_csv(DATASET_PATH)

In [42]:
# Add the natural log of "Salary" as a new column, "Salary_log"; the raw
# "Salary" column is left in place.
df["Salary_log"] = df["Salary"].pipe(np.log)

In [43]:
# One-hot encode "Gender": build one "gender_<value>" indicator column per
# category (none dropped), then swap them in for the original column. The
# indicator columns end up at the right-hand side of the frame.
gender_dummies = pd.get_dummies(df["Gender"], prefix="gender", drop_first=False)
df = pd.concat([df.drop(columns=["Gender"]), gender_dummies], axis=1)

In [4]:
# How often each job title occurs (value_counts sorts by descending frequency).
value_counts = df["Job Title"].value_counts()
print(value_counts.head(20))  # show the 20 most frequent titles

# Distinct job titles, computed once; the cell's displayed value is their count.
valores_unicos = df["Job Title"].unique()
len(valores_unicos)

Job Title
Director of Marketing                    12
Director of Operations                   10
Senior Business Analyst                  10
Senior Marketing Manager                  9
Senior Marketing Analyst                  9
Junior Business Analyst                   8
Senior Data Scientist                     7
Senior Financial Analyst                  7
Junior Business Development Associate     7
Junior Financial Analyst                  7
Senior Project Manager                    7
Senior Software Engineer                  6
Senior Product Manager                    6
Junior Marketing Coordinator              6
Senior Project Coordinator                5
Junior Operations Analyst                 5
Junior Project Manager                    5
Senior Financial Manager                  5
Senior Product Designer                   5
Senior Operations Manager                 5
Name: count, dtype: int64


174

In [44]:
# Ordinal encoding for the education level; "Missing" gets the lowest rank.
edu_map = {
    "Missing": 0,
    "Bachelor's": 1,
    "Master's": 2,
    "PhD": 3
}

education = df["Education Level"]
df["Education_Level_ordinal"] = education.map(edu_map)

# Series.map turns every label that is not a key of edu_map into NaN without
# any warning. Stop here and name the offending labels instead of letting the
# NaN reach the feature matrix.
unmapped = education[df["Education_Level_ordinal"].isna()].unique()
assert len(unmapped) == 0, f"Unmapped 'Education Level' values: {list(unmapped)}"

# The text column is no longer needed once the ordinal version exists.
df = df.drop(columns=["Education Level"])

In [45]:
# Bucket "Years of Experience" into four labelled bands. Bin edges are
# right-inclusive (the pd.cut default); include_lowest=True makes the first
# band also contain its left edge, 0.
bins = [0, 2, 7, 15, float("inf")]
labels = ["junior", "semi-senior", "senior", "expert"]

years_of_experience = df["Years of Experience"]
df["experience_level"] = pd.cut(years_of_experience, bins, labels=labels, include_lowest=True)

In [46]:
# Ordinal rank of each experience band, lowest to highest.
exp_map = {
    "junior": 0,
    "semi-senior": 1,
    "senior": 2,
    "expert": 3
}

# "experience_level" is the categorical output of pd.cut, and mapping a
# categorical yields another categorical rather than a numeric column.
# Cast explicitly so the feature is plain numeric. float (not int) is used so
# that rows falling outside the bins, which are NaN, do not break the cast.
df["experience_level_ordinal"] = df["experience_level"].map(exp_map).astype(float)

# The labelled band is redundant once its ordinal rank exists.
df = df.drop(columns=["experience_level"])

In [47]:
# Remove the columns that are not wanted downstream; every column still in df
# (other than the salary ones) is later taken as a model feature.
columns_to_discard = ["id", "Description", "Job Title", "Age"]
df.drop(columns_to_discard, axis="columns", inplace=True)

In [48]:
# Target: the log-transformed salary ("Salary_log"). Switch the constant to
# "Salary" to model the raw value instead.
TARGET_COLUMN = "Salary_log"
y = df[TARGET_COLUMN]

# Features: every column except the raw salary and its log, so neither form of
# the target ends up in X regardless of which one is chosen above. Columns that
# are absent are simply not matched.
salary_columns = ["Salary", "Salary_log"]
feature_cols = df.columns[~df.columns.isin(salary_columns)]
X = df[feature_cols]

In [49]:
# 1. Train/test split: 20% of the rows are held out for testing, and the fixed
# random_state makes the split repeatable.
split_settings = {"test_size": 0.2, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_settings)

In [50]:
# 2. Baseline model: DummyRegressor with the 'mean' strategy, used as the
# reference score the real models have to beat.
baseline = DummyRegressor(strategy='mean').fit(X_train, y_train)
y_pred_base = baseline.predict(X_test)

# Report the three test-set metrics; they are on the scale of y.
print("=== Baseline (DummyRegressor) ===")
for metric_label, metric_fn in (
    ("MAE:", mean_absolute_error),
    ("MSE:", mean_squared_error),
    ("R2:", r2_score),
):
    print(metric_label, metric_fn(y_test, y_pred_base))

=== Baseline (DummyRegressor) ===
MAE: 0.4524474825951692
MSE: 0.27525537303634984
R2: -0.004641140426039803


In [52]:
# 3. Linear regression: fit on the training split, predict the test split.
lr = LinearRegression()
y_pred_lr = lr.fit(X_train, y_train).predict(X_test)

# Compute the test-set metrics first, then print them under the section header.
lr_scores = {
    "MAE:": mean_absolute_error(y_test, y_pred_lr),
    "MSE:": mean_squared_error(y_test, y_pred_lr),
    "R2:": r2_score(y_test, y_pred_lr),
}
print("\n=== Linear Regression ===")
for score_label, score_value in lr_scores.items():
    print(score_label, score_value)


=== Linear Regression ===
MAE: 0.1420740681548171
MSE: 0.032986879177174064
R2: 0.8796028010276997


In [53]:
# 4. Random forest: 100 trees, with a fixed random_state so the fitted model
# is repeatable between runs.
rf = RandomForestRegressor(random_state=42, n_estimators=100)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

# Same three test-set metrics as the other models, for direct comparison.
print("\n=== RandomForest ===")
print("MAE:", mean_absolute_error(y_true=y_test, y_pred=y_pred_rf))
print("MSE:", mean_squared_error(y_true=y_test, y_pred=y_pred_rf))
print("R2:", r2_score(y_true=y_test, y_pred=y_pred_rf))


=== RandomForest ===
MAE: 0.10597302791226011
MSE: 0.02232255363699167
R2: 0.9185260018879734
