In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, explained_variance_score, mean_squared_log_error
import xgboost as xgb
import joblib

# Read the processed dataset from its CSV file.
data = pd.read_csv("../data/processed/data_cleaned_no_outliers.csv")

# The target (y) is the "PorcentajeCumplimiento" column; every other column
# of the frame is kept as a feature (X).
y = data["PorcentajeCumplimiento"]
X = data.drop(columns=["PorcentajeCumplimiento"])

# Hold out 20% of the rows as the test set. The fixed random_state makes the
# shuffle, and therefore the split, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [5]:
# Candidate values for each hyperparameter explored by the search.
# Grid size: 3 * 3 * 3 * 3 * 3 * 4 * 3 * 3 = 8,748 combinations.
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[3, 6, 10],
    learning_rate=[0.01, 0.1, 0.05],
    subsample=[0.6, 0.8, 1.0],
    colsample_bytree=[0.6, 0.8, 1.0],
    reg_alpha=[0, 0.1, 1, 10],
    reg_lambda=[1, 0.1, 10],
    gamma=[0, 0.1, 0.2],
)

# Base XGBoost regressor trained with the squared-error objective; the seed
# is fixed so repeated fits are reproducible.
model = xgb.XGBRegressor(
    random_state=42,
    objective="reg:squarederror",
)


In [None]:

# Exhaustive search over every combination in param_grid, scored with
# 10-fold cross-validation on negated MSE (so higher is better).
# n_jobs=-1 asks for all available processors; verbose=2 prints progress
# for each fit.
grid_search = GridSearchCV(
    model,
    param_grid,
    scoring="neg_mean_squared_error",
    cv=10,
    n_jobs=-1,
    verbose=2,
)

# Run the search on the training split only.
grid_search.fit(X_train, y_train)

# Report the hyperparameter combination with the best cross-validated score.
print(f"Mejores hiperparámetros: {grid_search.best_params_}")
