# Import

In [17]:
# Imports
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
#!pip install matplotlib

from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from scipy.stats import pearsonr
from sklearn.model_selection import cross_val_score, KFold

In [18]:
# Load the salary dataset into a DataFrame.
# Source: https://www.kaggle.com/datasets/amirmahdiabbootalebi/salary-by-job-title-and-country
salary_csv_path = 'data/Salary.csv'
data = pd.read_csv(salary_csv_path)

# Feature

In [19]:
# Sort rows by ascending 'Salary' so that the order in which job titles first
# appear follows the salary ordering.
data = data.sort_values(by='Salary')

# Number each distinct 'Job Title' from 1 upward, in order of first appearance
# in the salary-sorted frame.
# NOTE(review): the codes are derived from 'Salary' over all rows, before any
# train/test split is made, so this feature may carry target information from
# rows later used for evaluation -- confirm this is acceptable.
titles_in_salary_order = data['Job Title'].unique()
jobtitle_salary_mapping = dict(
    zip(titles_in_salary_order, range(1, len(titles_in_salary_order) + 1))
)

# Store the numeric code of each row's job title in a new column.
data['Job Title Numeric'] = data['Job Title'].map(jobtitle_salary_mapping)

In [20]:
# Distinct job titles in their current order of first appearance in `data`.
unique_job_titles = data['Job Title'].unique()

# Rebuild the title -> numeric code mapping (codes start at 1).
jobtitle_salary_mapping = {
    title: code for code, title in enumerate(unique_job_titles, start=1)
}

# Print every distinct job title next to its numeric code.
for title, job_numeric in jobtitle_salary_mapping.items():
    print(f"{title}: {job_numeric}")

Business Operations Analyst: 1
HR Coordinator: 2
Front end Developer: 3
Software Engineer Manager: 4
Sales Associate: 5
Receptionist: 6
Sales Representative: 7
Delivery Driver: 8
Digital Marketing Manager: 9
Customer Service Representative: 10
HR Generalist: 11
Juniour HR Coordinator: 12
Sales Executive: 13
Software Developer: 14
Web Developer: 15
Accountant: 16
Marketing Analyst: 17
Sales Manager: 18
Help Desk Analyst: 19
Business Development Associate: 20
Operations Analyst: 21
Data Entry Clerk: 22
Customer Support Specialist: 23
Operations Manager: 24
Software Engineer: 25
Business Analyst: 26
Developer: 27
Marketing Coordinator: 28
Customer Success Rep: 29
Marketing Specialist: 30
Copywriter: 31
Project Manager: 32
Financial Analyst: 33
Technical Support Specialist: 34
Operations Coordinator: 35
Designer: 36
Customer Service Rep: 37
Customer Success Manager: 38
Product Manager: 39
Juniour HR Generalist: 40
Graphic Designer: 41
Account Manager: 42
Data Scientist: 43
Event Coordinato

In [21]:
# Sort rows by ascending 'Salary' so that the order in which countries first
# appear follows the salary ordering.
data = data.sort_values(by='Salary')

# Number each distinct 'Country' from 1 upward, in order of first appearance
# in the salary-sorted frame.
countries_in_salary_order = data['Country'].unique()
country_salary_mapping = {
    country: position + 1
    for position, country in enumerate(countries_in_salary_order)
}

# Store the numeric code of each row's country in a new column.
data['Country Numeric'] = data['Country'].map(country_salary_mapping)

In [22]:
# Encode each distinct 'Race' string as an integer code. factorize() numbers
# the values from 0 in order of first appearance; the second return value
# (the array of unique labels) is not needed here.
race_numeric, _ = data['Race'].factorize()

# Shift the codes by 1 so that they start at 1 instead of 0, and store them
# in a new 'Race Numeric' column. (The former trailing statement that assigned
# the column to itself was a no-op and has been removed.)
data['Race Numeric'] = race_numeric + 1

In [23]:
# Replace the 'Gender' strings with numeric codes: 'Male' -> 0, 'Female' -> 1.
# Any value that is not a key of the mapping becomes NaN.
# NOTE(review): the column is overwritten in place, so running this cell a
# second time maps the already-numeric values to NaN.
gender_codes = {'Male': 0, 'Female': 1}
data['Gender'] = data['Gender'].map(gender_codes)

In [24]:
# Select the columns used for modelling: the feature columns plus the
# 'Salary' target.
modelling_columns = [
    'Age',
    'Gender',
    'Education Level',
    'Years of Experience',
    'Salary',
    'Senior',
    'Job Title Numeric',
    'Country Numeric',
    'Race Numeric',
]
data_numeric = data[modelling_columns]

# Dividir en train_1 y test_1, el modelo se aplica a train_1

In [25]:
# Outer split: carve the data into train1 and test1. The notebook's stated
# intent is to work only with train1 and keep test1 untouched until the end
# of the machine-learning process.

feature_columns = [
    'Age',
    'Gender',
    'Education Level',
    'Years of Experience',
    'Senior',
    'Job Title Numeric',
    'Country Numeric',
    'Race Numeric',
]
X = data_numeric[feature_columns]
y = data_numeric["Salary"]  # target

# Split ALL rows into train1 (80%) and test1 (20%) with a fixed seed.
X_train1, X_test1, y_train1, y_test1 = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [26]:
# Report the shapes of the outer split. The objects printed are the
# train1/test1 partitions, so the labels say "Train1"/"Test1" (the previous
# labels said "Train2"/"Test2", which is the name of a different, later split).
for label, part in (
    ("Train1 features", X_train1),
    ("Test1 features", X_test1),
    ("Train1 target", y_train1),
    ("Test1 target", y_test1),
):
    print(f"{label} shape:", part.shape)

Train2 features shape: (5347, 8)
Test2 features shape: (1337, 8)
Train2 target shape: (5347,)
Test2 target shape: (1337,)


# Random Forest

In [27]:
# Feature matrix and target vector taken from the modelling frame.
# NOTE(review): the cells that follow split X_train1/y_train1 and do not read
# X or y again, so this re-assignment looks redundant -- confirm before removing.
rf_feature_columns = [
    'Age',
    'Gender',
    'Education Level',
    'Years of Experience',
    'Senior',
    'Job Title Numeric',
    'Country Numeric',
    'Race Numeric',
]
X = data_numeric[rf_feature_columns]
y = data_numeric["Salary"]  # target

In [28]:
# Inner split: divide train1 into train2 (80%) and test2 (20%) with a fixed seed.
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    X_train1,
    y_train1,
    test_size=0.2,
    random_state=42,
)

In [29]:
# Build a Random Forest regressor with 100 trees and a fixed seed.
forest_settings = {"n_estimators": 100, "random_state": 42}
random_forest = RandomForestRegressor(**forest_settings)

# Fit on the inner training split. The features are passed as-is: no scaling
# step is applied to them in the cells shown.
random_forest.fit(X=X_train2, y=y_train2)

In [30]:
# Predict the target for the inner hold-out split (X_test2) with the fitted forest.
predictions_rf = random_forest.predict(X_test2)

In [31]:
# Score the forest's predictions against the true targets of the inner
# hold-out split (test2).
mse_rf = mean_squared_error(y_test2, predictions_rf)
r2_rf = r2_score(y_test2, predictions_rf)
mae_rf = mean_absolute_error(y_test2, predictions_rf)
# pearsonr returns (coefficient, p-value); only the coefficient is kept.
pearson_corr_rf = pearsonr(y_test2, predictions_rf)[0]

# Print each metric on its own line.
for metric_label, metric_value in (
    ("Mean Squared Error", mse_rf),
    ("R-squared", r2_rf),
    ("Mean Absolute Error", mae_rf),
    ("Pearson Correlation", pearson_corr_rf),
):
    print(f"{metric_label} (Random Forest): {metric_value}")

Mean Squared Error (Random Forest): 81240428.136985
R-squared (Random Forest): 0.9710163087690005
Mean Absolute Error (Random Forest): 4169.5950028590305
Pearson Correlation (Random Forest): 0.9854150714833849


# Validación cruzada

In [34]:
from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import make_scorer, mean_squared_error, mean_absolute_error, r2_score
from scipy.stats import pearsonr
import numpy as np

# Metric helper: Pearson correlation coefficient only (p-value discarded).
def pearson_corr(y_true, y_pred):
    """Return the Pearson correlation coefficient between two sequences.

    Args:
        y_true: Observed values.
        y_pred: Predicted values, same length as ``y_true``.

    Returns:
        The first element of ``scipy.stats.pearsonr``'s result, i.e. the
        correlation coefficient; the accompanying p-value is dropped.
    """
    return pearsonr(y_true, y_pred)[0]

# Fresh Random Forest regressor (100 trees, fixed seed) to be cross-validated.
random_forest = RandomForestRegressor(n_estimators=100, random_state=42)

# Metrics to compute on every fold, each wrapped as a scorer with
# make_scorer's default settings.
metric_functions = {
    'mse': mean_squared_error,
    'mae': mean_absolute_error,
    'r2': r2_score,
    'pearson_corr': pearson_corr,
}
scoring = {
    metric_name: make_scorer(metric_function)
    for metric_name, metric_function in metric_functions.items()
}

# 5-fold cross-validation on the inner training split with all metrics at once.
cv_results = cross_validate(
    estimator=random_forest,
    X=X_train2,
    y=y_train2,
    cv=5,
    scoring=scoring,
)

# Per-fold scores for each metric (cross_validate prefixes the keys with 'test_').
mse_scores = cv_results['test_mse']
mae_scores = cv_results['test_mae']
r2_scores = cv_results['test_r2']
pearson_scores = cv_results['test_pearson_corr']

# Print the mean of each metric across the folds.
for metric_label, fold_scores in (
    ("MSE", mse_scores),
    ("MAE", mae_scores),
    ("R-squared", r2_scores),
    ("Pearson Correlation", pearson_scores),
):
    print(f"Promedio {metric_label}:", np.mean(fold_scores))

Promedio MSE: 102395163.33955608
Promedio MAE: 4707.3279064549815
Promedio R-squared: 0.9630619468397679
Promedio Pearson Correlation: 0.9815311687875099


# Grid Search

In [35]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

# Hyperparameter grid to explore.
param_grid = {
    # Number of trees in the forest.
    'n_estimators': [100, 200, 300],
    # Maximum tree depth (None leaves the depth unrestricted).
    'max_depth': [None, 5, 10, 15],
    # Add further hyperparameters to tune here.
}

# Base Random Forest regressor with a fixed seed; the grid supplies the rest.
random_forest = RandomForestRegressor(random_state=42)

# Grid search with 5-fold cross-validation, scored by negative mean squared
# error and run on all available cores.
grid_search = GridSearchCV(
    estimator=random_forest,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
)

# Fit every hyperparameter combination on the inner training split.
grid_search.fit(X_train2, y_train2)

# Report the best combination found by the search.
best_params = grid_search.best_params_
print("Mejores hiperparámetros:", best_params)

Mejores hiperparámetros: {'max_depth': None, 'n_estimators': 100}


In [36]:
# Take the estimator that the grid search selected as best (its best_estimator_ attribute).
best_model = grid_search.best_estimator_

In [37]:
# Predict the target for the inner hold-out split (X_test2) with the best model.
predictions_best = best_model.predict(X_test2)

In [38]:
# Score the best model's predictions against the true targets of the inner
# hold-out split (test2).
mse_best = mean_squared_error(y_test2, predictions_best)
r2_best = r2_score(y_test2, predictions_best)
mae_best = mean_absolute_error(y_test2, predictions_best)
# pearsonr returns (coefficient, p-value); only the coefficient is kept.
pearson_corr_best = pearsonr(y_test2, predictions_best)[0]

# Print each metric of the best model on its own line.
for metric_label, metric_value in (
    ("Mean Squared Error", mse_best),
    ("R-squared", r2_best),
    ("Mean Absolute Error", mae_best),
    ("Pearson Correlation", pearson_corr_best),
):
    print(f"{metric_label} (Best Model): {metric_value}")

Mean Squared Error (Best Model): 81240428.136985
R-squared (Best Model): 0.9710163087690005
Mean Absolute Error (Best Model): 4169.5950028590305
Pearson Correlation (Best Model): 0.9854150714833849
