# Hyperparameter Tuning for XGBoost (Cathy)


---

In [4]:
! pip install pandas numpy matplotlib seaborn scikit-learn xgboost



In [6]:
import json
import pandas as pd
import numpy as np

from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

In [10]:
train_clean = pd.read_csv("data/model_ready/train_clean.csv")
test_clean  = pd.read_csv("data/model_ready/test_clean.csv")

target_col = "status"

X_train = train_clean.drop(columns=[target_col])
y_train = train_clean[target_col]

X_test  = test_clean.drop(columns=[target_col])
y_test  = test_clean[target_col]

In [12]:
param_dist = {
    "n_estimators": [200, 300, 400, 500],
    "max_depth": [3, 5, 7, 9],
    "learning_rate": [0.01, 0.05, 0.1, 0.2],
    "subsample": [0.6, 0.8, 1.0],
    "colsample_bytree": [0.6, 0.8, 1.0],
    "min_child_weight": [1, 3, 5]
}

In [40]:
xgb = XGBClassifier(
    objective="binary:logistic",
    random_state=42,
    n_jobs=-1
)

random_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_dist,
    n_iter=1728,                    # number of random combos to try
    scoring="f1_weighted",       # very important for startup dataset likely imbalanced
    cv=5,                        # 5-fold CV
    n_jobs=-1,
    verbose=1,
    return_train_score=True
)

In [42]:
random_search.fit(X_train, y_train)

Fitting 5 folds for each of 1728 candidates, totalling 8640 fits


In [48]:
best_params = random_search.best_params_
print("Best params:", best_params)

os.makedirs("docs/tuning_results", exist_ok=True)
with open("docs/tuning_results（XGBoost)/xgboost_best_params.json", "w") as f:
    json.dump(best_params, f, indent=4)

Best params: {'subsample': 0.6, 'n_estimators': 400, 'min_child_weight': 1, 'max_depth': 5, 'learning_rate': 0.01, 'colsample_bytree': 0.6}


In [52]:
RANDOM_STATE = 42
best_model = XGBClassifier(**best_params, random_state=RANDOM_STATE, use_label_encoder=False, eval_metric="logloss", n_jobs=-1)
best_model.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [62]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import os


output_dir = "docs/tuning_results（XGBoost)"
os.makedirs(output_dir, exist_ok=True)


y_pred = best_model.predict(X_test)

acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

report_text = (
    "XGBoost Tuned Model Report\n"
    "============================\n\n"
    f"Accuracy:  {acc:.4f}\n"
    f"Precision: {prec:.4f}\n"
    f"Recall:    {rec:.4f}\n"
    f"F1 Score:  {f1:.4f}\n\n"
    "Confusion Matrix:\n"
    f"{cm}\n"
)


print(report_text)

file_path = os.path.join(output_dir, "xgboost_tuned_report.txt")
with open(file_path, "w") as f:
    f.write(report_text)

print(f"Report saved to: {file_path}")


XGBoost Tuned Model Report

Accuracy:  0.7405
Precision: 0.7687
Recall:    0.8583
F1 Score:  0.8110

Confusion Matrix:
[[ 34  31]
 [ 17 103]]

Report saved to: docs/tuning_results（XGBoost)/xgboost_tuned_report.txt


In [66]:
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(6,5))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("XGBoost Tuned Confusion Matrix")
plt.savefig("docs/tuning_results（XGBoost)/xgboost_tuned_cm.png", dpi=300)
plt.close()

In [71]:
cv_df = pd.DataFrame(random_search.cv_results_)
cv_df.to_csv("docs/tuning_results（XGBoost)/xgboost_cv_results.csv", index=False)