# In [None]:
import xgboost as xgb
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import f1_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import numpy as np

# Tune an XGBoost classifier on the scikit-learn breast-cancer dataset with an
# exhaustive grid search (5-fold CV, scored by F1), evaluate the best model on
# a held-out test split, plot gain-based feature importance, and save the
# winning model to JSON.

# 1️⃣ Load dataset
data = load_breast_cancer()
X, y = data.data, data.target

# 2️⃣ Split into train/test sets (stratified so both splits keep the class ratio)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 3️⃣ Create the base model
# NOTE: `use_label_encoder` is intentionally not passed. It is a deprecated
# option that XGBoost 2.x no longer recognises; passing it there only produces
# an "unused parameter" warning on every single fit of the grid search.
# `n_jobs=1` keeps each individual fit single-threaded because the grid search
# below already fans out across all cores; letting both layers use every core
# oversubscribes the CPU and slows the whole search down.
xgb_model = xgb.XGBClassifier(
    objective="binary:logistic",
    eval_metric="logloss",
    random_state=42,
    n_jobs=1,
)

# 4️⃣ Declare the hyper-parameter grid
# 3 * 4 * 3 * 3 * 3 = 324 candidates, i.e. 1620 fits with 5-fold CV.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 4, 5, 6],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.7, 0.8, 1.0],
    'colsample_bytree': [0.7, 0.8, 1.0],
}

# 5️⃣ Grid search with cross-validation
# Parallelism lives here (one worker per core); the estimator itself is
# single-threaded, see the note on `xgb_model` above.
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring='f1',
    cv=5,
    verbose=2,
    n_jobs=-1,
)

print("🔍 Running grid search...")
grid_search.fit(X_train, y_train)

# 6️⃣ Show the search results
print("\n✅ Best parameters found:")
print(grid_search.best_params_)
print("\nBest F1-score (CV):", round(grid_search.best_score_, 4))

# 7️⃣ Evaluate the best model on the held-out test set
# `best_estimator_` has already been refit on the full training split
# (GridSearchCV's default `refit=True`).
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

f1 = f1_score(y_test, y_pred)
print("\n=== Test Set Evaluation ===")
print("F1-score:", round(f1, 4))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# 8️⃣ Plot feature importance (ranked by average gain)
xgb.plot_importance(best_model, importance_type='gain', title="Feature Importance (Gain)")
plt.show()

# 9️⃣ Save the best model
best_model.save_model("best_xgb_grid_model.json")
print("💾 Saved best model to best_xgb_grid_model.json")
