## Tinh chỉnh siêu tham số (hyperparameter tuning)

### Import thư viện

In [1]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder
from scipy.stats import randint, uniform
import matplotlib.pyplot as plt
import joblib
import warnings
warnings.filterwarnings("ignore")

### Tải và xử lý dữ liệu

In [2]:
# Read the engineered training table. "Survived" is the label column; every
# other column is used as a feature.
train = pd.read_excel("../exps/data/train_feat.xlsx")

y = train["Survived"]
X = train.drop(columns="Survived")

# Integer-encode each object-dtype column. Values are cast to str first so the
# encoder always receives uniform string labels.
object_cols = X.select_dtypes(include="object").columns
for col in object_cols:
    X[col] = LabelEncoder().fit_transform(X[col].astype(str))

print("Dữ liệu đã encode xong. Dtypes:")
print(X.dtypes.value_counts())

# Stratified 80/20 hold-out split with a fixed seed, so the class balance of
# `y` is kept in both parts and the split is repeatable.
split = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_val, y_train, y_val = split

print(f"Train: {X_train.shape}, Validation: {X_val.shape}")

Dữ liệu đã encode xong. Dtypes:
int64      17
float64     2
Name: count, dtype: int64
Train: (712, 19), Validation: (179, 19)


### Thiết lập RandomizedSearchCV (Dò thô)

In [3]:
# Coarse search space. scipy's `uniform(loc, scale)` draws from
# [loc, loc + scale]; `randint(low, high)` draws integers in [low, high).
param_dist = dict(
    max_depth=randint(low=3, high=10),
    learning_rate=uniform(loc=0.01, scale=0.2),
    n_estimators=randint(low=200, high=800),
    subsample=uniform(loc=0.6, scale=0.4),
    colsample_bytree=uniform(loc=0.6, scale=0.4),
    gamma=uniform(loc=0, scale=0.4),
    min_child_weight=randint(low=1, high=10),
    reg_alpha=uniform(loc=0, scale=2),
    reg_lambda=uniform(loc=1, scale=5),
)

# Base estimator; the search overrides the sampled hyperparameters on clones.
xgb_model = XGBClassifier(
    random_state=42,
    eval_metric="logloss",
    use_label_encoder=False,
)

# 100 random candidates, each scored by 5-fold CV accuracy on the train split.
search_settings = {
    "n_iter": 100,
    "scoring": "accuracy",
    "cv": 5,
    "verbose": 1,
    "n_jobs": -1,
    "random_state": 42,
}
random_search = RandomizedSearchCV(xgb_model, param_dist, **search_settings)
random_search.fit(X_train, y_train)

# Keep the winning configuration for the fine-tuning step.
best_random_params = random_search.best_params_

print("Best parameters (Random Search):")
print(best_random_params)
print(f"Best CV Accuracy: {random_search.best_score_:.4f}")

Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best parameters (Random Search):
{'colsample_bytree': np.float64(0.6546485325768115), 'gamma': np.float64(0.28356439876404743), 'learning_rate': np.float64(0.12056399538158155), 'max_depth': 4, 'min_child_weight': 7, 'n_estimators': 737, 'reg_alpha': np.float64(1.6202267893583615), 'reg_lambda': np.float64(5.335361592900519), 'subsample': np.float64(0.9652962210225885)}
Best CV Accuracy: 0.8301


### Fine-tuning quanh tham số tốt nhất

In [4]:
# Fine grid centred on the random-search optimum.
#
# Each axis holds the optimum plus one neighbour on either side. The sampling
# ratios are clamped to [0.6, 1.0] and the depth to >= 1, so when the optimum
# sits on a bound the clamped neighbour equals the optimum itself. Building
# each axis from a set removes such repeats (otherwise GridSearchCV would fit
# identical candidates several times); sorting keeps the ascending
# low / centre / high order.
depth = best_random_params["max_depth"]
lr = best_random_params["learning_rate"]
row_frac = best_random_params["subsample"]
col_frac = best_random_params["colsample_bytree"]

param_grid_fine = {
    "max_depth": sorted({max(1, depth - 1), depth, depth + 1}),
    "learning_rate": sorted({lr * 0.8, lr, lr * 1.2}),
    "subsample": sorted({
        max(0.6, row_frac - 0.1),
        row_frac,
        min(1.0, row_frac + 0.1),
    }),
    "colsample_bytree": sorted({
        max(0.6, col_frac - 0.1),
        col_frac,
        min(1.0, col_frac + 0.1),
    }),
}

# Hyperparameters that are not part of the fine grid stay fixed at the values
# found by the random search.
fine_model = XGBClassifier(
    eval_metric="logloss",
    use_label_encoder=False,
    random_state=42,
    n_estimators=best_random_params["n_estimators"],
    gamma=best_random_params["gamma"],
    reg_alpha=best_random_params["reg_alpha"],
    reg_lambda=best_random_params["reg_lambda"],
    min_child_weight=best_random_params["min_child_weight"]
)

# Exhaustive search over the fine grid, scored by 5-fold CV accuracy.
grid_search = GridSearchCV(
    estimator=fine_model,
    param_grid=param_grid_fine,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
    verbose=1
)

grid_search.fit(X_train, y_train)

print("Best parameters (Fine-tuning):")
print(grid_search.best_params_)
print(f"Best CV Accuracy: {grid_search.best_score_:.4f}")

best_params = grid_search.best_params_

Fitting 5 folds for each of 81 candidates, totalling 405 fits
Best parameters (Fine-tuning):
{'colsample_bytree': np.float64(0.6546485325768115), 'learning_rate': np.float64(0.12056399538158155), 'max_depth': 4, 'subsample': np.float64(0.9652962210225885)}
Best CV Accuracy: 0.8301


### Thử nghiệm tham số

In [5]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
import itertools

# Hand-picked values to try; every combination is evaluated.
param_grid = {
    "learning_rate": [0.05, 0.03],
    "max_depth": [5, 6, 7],
    "n_estimators": [600, 800, 1000],
    "subsample": [0.8, 1.0],
    "colsample_bytree": [0.8, 1.0],
}

# Settings shared by every candidate model.
fixed_params = dict(
    gamma=0.1,
    reg_alpha=0.3,
    reg_lambda=2.0,
    random_state=42,
    eval_metric="logloss",
    use_label_encoder=False,
)

# NOTE(review): candidates are ranked by accuracy on X_val, i.e. the hold-out
# split itself is used for model selection — the best score reported here is
# therefore likely optimistic; confirm this is intended.
best_acc = 0
best_params_combo = None
names = list(param_grid)

for values in itertools.product(*param_grid.values()):
    params = dict(zip(names, values))

    model = XGBClassifier(**params, **fixed_params)
    model.fit(X_train, y_train)
    acc = accuracy_score(y_val, model.predict(X_val))

    print(f"{params} → {acc:.4f}")

    # Strict comparison: on ties the earliest combination is kept.
    if acc > best_acc:
        best_acc, best_params_combo = acc, params

print("\nBest Validation Accuracy:", round(best_acc, 4))
print("Best Params:", best_params_combo)


{'learning_rate': 0.05, 'max_depth': 5, 'n_estimators': 600, 'subsample': 0.8, 'colsample_bytree': 0.8} → 0.8101
{'learning_rate': 0.05, 'max_depth': 5, 'n_estimators': 600, 'subsample': 0.8, 'colsample_bytree': 1.0} → 0.7989
{'learning_rate': 0.05, 'max_depth': 5, 'n_estimators': 600, 'subsample': 1.0, 'colsample_bytree': 0.8} → 0.8156
{'learning_rate': 0.05, 'max_depth': 5, 'n_estimators': 600, 'subsample': 1.0, 'colsample_bytree': 1.0} → 0.8101
{'learning_rate': 0.05, 'max_depth': 5, 'n_estimators': 800, 'subsample': 0.8, 'colsample_bytree': 0.8} → 0.8101
{'learning_rate': 0.05, 'max_depth': 5, 'n_estimators': 800, 'subsample': 0.8, 'colsample_bytree': 1.0} → 0.7933
{'learning_rate': 0.05, 'max_depth': 5, 'n_estimators': 800, 'subsample': 1.0, 'colsample_bytree': 0.8} → 0.8156
{'learning_rate': 0.05, 'max_depth': 5, 'n_estimators': 800, 'subsample': 1.0, 'colsample_bytree': 1.0} → 0.8101
{'learning_rate': 0.05, 'max_depth': 5, 'n_estimators': 1000, 'subsample': 0.8, 'colsample_bytre

### Huấn luyện mô hình cuối cùng & đánh giá

In [6]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# Hyperparameters of the final model, written out explicitly.
final_params = {
    "learning_rate": 0.03,
    "max_depth": 5,
    "n_estimators": 800,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "gamma": 0.1,
    "reg_alpha": 0.3,
    "reg_lambda": 2.0,
    "random_state": 42,
    "eval_metric": "logloss",
    "use_label_encoder": False,
}

# Fit on the training split only, so the hold-out split stays unseen.
final_model = XGBClassifier(**final_params)
final_model.fit(X_train, y_train)

# Score on the hold-out split.
y_pred = final_model.predict(X_val)
val_acc = accuracy_score(y_val, y_pred)

print(f"🏆 Final Validation Accuracy: {val_acc:.4f}")


🏆 Final Validation Accuracy: 0.8380


In [7]:
import joblib

# Persist whatever estimator is currently bound to `final_model`.
model_path = "XGBoost_best_final.pkl"
joblib.dump(final_model, model_path)
print(f"Model saved as {model_path}")

Model saved as XGBoost_best_final.pkl


In [8]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

test = pd.read_excel("../exps/data/test_feat.xlsx")

# Process test the same way as train: object columns are label-encoded on
# their string form, which is the recipe used when the tuning features were
# built. (One-hot encoding here would yield a different feature representation
# from the one the hyperparameters were tuned on.) The encoder is fitted on
# train and test together so every category present in either table gets a code.
full = pd.concat([train.drop("Survived", axis=1), test])

for col in full.select_dtypes(include="object").columns:
    full[col] = LabelEncoder().fit_transform(full[col].astype(str))

# Split the combined frame back by position: the first len(train) rows are the
# training rows, the rest are the test rows.
X = full.iloc[:len(train), :]
X_test = full.iloc[len(train):, :]
y = train["Survived"]

In [None]:
from xgboost import XGBClassifier

# Hyperparameters for the model that produces the submission.
submission_params = {
    "learning_rate": 0.03,
    "max_depth": 5,
    "n_estimators": 800,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "gamma": 0.1,
    "reg_alpha": 0.3,
    "reg_lambda": 2.0,
    "random_state": 42,
    "eval_metric": "logloss",
    "use_label_encoder": False,
}

# Rebind `final_model` to a fresh estimator and fit it on X / y.
final_model = XGBClassifier(**submission_params)

final_model.fit(X, y)

In [None]:
# Predict a label for every test row.
test_pred = final_model.predict(X_test)

# Two-column submission table: the passenger id taken from the test file and
# the predicted label.
passenger_ids = test["PassengerId"]
submission = pd.DataFrame(
    {"PassengerId": passenger_ids, "Survived": test_pred}
)

# Write without the index column.
submission.to_csv("submission.csv", index=False)
print("File submission.csv đã được tạo!")

File submission.csv đã được tạo!
