# Model 2: XGBoost classifier tuned with a randomized hyperparameter search

# In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, learning_curve, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
import joblib


# Read the raw dataset (path is relative to the working directory) into a
# DataFrame.
csv_path = "data_with_missing.csv"
df = pd.read_csv(csv_path)


from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder

# --- Preprocessing and train/test split -------------------------------------
# Every preprocessing statistic (imputation values, category codes, scaling
# mean/std) is learned from the training rows only and then applied to all
# rows, so no information about the held-out test rows leaks into the fitted
# transformers.
from sklearn.preprocessing import OrdinalEncoder

# Rows without a label cannot be used for supervised learning. Imputing the
# label would fabricate classes (mean-imputing a numeric Target even yields
# non-integer "classes"), so such rows are dropped instead.
df = df.dropna(subset=["Target"]).reset_index(drop=True)

X = df.drop("Target", axis=1)

# Encode the labels as consecutive integers starting at 0. target_encoder is
# kept so predictions can be mapped back to the original labels with
# target_encoder.inverse_transform(...).
target_encoder = LabelEncoder()
y = pd.Series(
    target_encoder.fit_transform(df["Target"]),
    index=df.index,
    name="Target",
)

# Feature columns only: the target is handled separately above.
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = X.select_dtypes(include=['object']).columns.tolist()

# Split row positions first. With the same test_size/random_state this picks
# the same rows as splitting the final matrix would, but it lets the
# transformers below be fitted before anything is transformed.
train_rows, test_rows = train_test_split(
    np.arange(len(X)), test_size=0.2, random_state=42
)

# Numeric features: fill gaps with the training-set column mean.
imputer = SimpleImputer(strategy="mean")
if num_cols:
    imputer.fit(X.iloc[train_rows][num_cols])
    X[num_cols] = imputer.transform(X[num_cols])

# Categorical features: fill gaps with the most frequent training value, then
# map categories to integer codes. Categories that never occur in the
# training rows are encoded as -1 instead of raising at transform time.
if cat_cols:
    imputer_cat = SimpleImputer(strategy="most_frequent")
    cat_encoder = OrdinalEncoder(
        handle_unknown="use_encoded_value", unknown_value=-1
    )
    cat_encoder.fit(imputer_cat.fit_transform(X.iloc[train_rows][cat_cols]))
    X[cat_cols] = cat_encoder.transform(imputer_cat.transform(X[cat_cols]))

# Standardize using the mean/std of the training rows only.
scaler = StandardScaler()
scaler.fit(X.iloc[train_rows])
X_scaled = scaler.transform(X)

X_train, X_test = X_scaled[train_rows], X_scaled[test_rows]
y_train, y_test = y.iloc[train_rows], y.iloc[test_rows]


# Base estimator; the tuned hyperparameters are supplied by the search below.
model = xgb.XGBClassifier(
    eval_metric='mlogloss',
    use_label_encoder=False,
)

# Candidate values the randomized search samples from.
param_dist = dict(
    n_estimators=[100, 200, 300],
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.1, 0.2],
    subsample=[0.7, 0.8, 1.0],
)

# Try 10 sampled combinations, each scored by accuracy with 5-fold
# cross-validation on the training split.
search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_dist,
    n_iter=10,
    scoring='accuracy',
    cv=5,
    random_state=42,
    verbose=1,
)
search.fit(X_train, y_train)

# Estimator refitted with the best-scoring parameter combination.
best_model = search.best_estimator_


# Score the tuned model on the held-out test split.
y_pred = best_model.predict(X_test)

report = classification_report(y_test, y_pred)
print("\nReporte de clasificación:")
print(report)

# Confusion matrix of true vs. predicted classes, drawn as a heatmap.
cm = confusion_matrix(y_test, y_pred)
cm_display = ConfusionMatrixDisplay(confusion_matrix=cm)
cm_display.plot(cmap='Blues')
plt.title("Matriz de Confusión - XGBoost")
plt.show()


# Learning curve: accuracy of the tuned model for 5 training-set fractions
# between 10% and 100%, each evaluated with 5-fold cross-validation.
train_sizes, train_scores, test_scores = learning_curve(
    estimator=best_model,
    X=X_train,
    y=y_train,
    train_sizes=np.linspace(0.1, 1.0, 5),
    cv=5,
    scoring='accuracy',
)

# Average over the folds (axis 1) to get one score per training size.
train_mean = train_scores.mean(axis=1)
test_mean = test_scores.mean(axis=1)

plt.plot(train_sizes, train_mean, label='Precisión en entrenamiento')
plt.plot(train_sizes, test_mean, label='Precisión en validación')
plt.xlabel("Tamaño del conjunto de entrenamiento")
plt.ylabel("Precisión")
plt.title("Curva de aprendizaje - XGBoost")
plt.grid(True)
plt.legend()
plt.show()

