# 1.Environment Setup

In [None]:
# Import thư viện cần thiết
import os
import random
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import StratifiedKFold, cross_val_score

warnings.filterwarnings('ignore')

In [None]:
# Fix the random seeds so results are consistent between runs.
SEED = 42
# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
# assigning it here presumably only affects child processes, not the hash
# randomization of this already-running kernel — confirm this is the intent.
os.environ["PYTHONHASHSEED"] = str(SEED)
np.random.seed(SEED)  # seed NumPy's global random state
random.seed(SEED)  # seed the standard-library `random` module
print(f"Seed: {SEED}")

# 2.Data Processing

In [None]:
def read_csv(file_path, target_col='target'):
    """
    Load a CSV file and split it into a feature frame and a label series.

    Args:
        file_path: Path (or file-like object) accepted by ``pandas.read_csv``.
        target_col: Name of the label column. Defaults to ``'target'``, which
            is the column name the original implementation hard-coded.

    Returns:
        A tuple ``(X, y)`` where ``X`` is the DataFrame without the label
        column and ``y`` is the label column as a Series.

    Raises:
        KeyError: If ``target_col`` is not a column of the loaded file.
    """
    df = pd.read_csv(file_path)
    # Fail with an explicit message instead of the generic pandas "not found
    # in axis" error; the exception type stays KeyError for existing callers.
    if target_col not in df.columns:
        raise KeyError(
            f"Column {target_col!r} not found; available columns: "
            f"{list(df.columns)}"
        )
    X = df.drop(columns=[target_col])
    y = df[target_col]
    return X, y

# --- Read the datasets ---
print("Đang đọc dữ liệu...")

## 2.1 Dataset

In [None]:
# Load all datasets.
# Each variant is stored as three CSV files (train / val / test) under
# `splits/`; read_csv returns the features and the `target` column separately.

# Raw ("Original") splits
X_train, y_train = read_csv('splits/raw_train.csv')
X_val, y_val = read_csv('splits/raw_val.csv')
X_test, y_test = read_csv('splits/raw_test.csv')

# Feature-engineered ("FE") splits
X_fe_train, y_fe_train = read_csv('splits/fe_train.csv')
X_fe_val, y_fe_val = read_csv('splits/fe_val.csv')
X_fe_test, y_fe_test = read_csv('splits/fe_test.csv')

# "DT" splits (referred to as "Original + Decision Tree" further down)
X_dt_train, y_dt_train = read_csv('splits/dt_train.csv')
X_dt_val, y_dt_val = read_csv('splits/dt_val.csv')
X_dt_test, y_dt_test = read_csv('splits/dt_test.csv')

# "FE + DT" splits
X_fe_dt_train, y_fe_dt_train = read_csv('splits/fe_dt_train.csv')
X_fe_dt_val, y_fe_dt_val = read_csv('splits/fe_dt_val.csv')
X_fe_dt_test, y_fe_dt_test = read_csv('splits/fe_dt_test.csv')

# 3.Gradient Boosting (GB)

In [None]:
# --- Gradient Boosting Functions ---

def find_optimal_gb(
    X_train, y_train,
    n_estimators_range=range(50, 501, 50),
    cv_splits=3
):
    """
    Find the best number of trees (n_estimators) for Gradient Boosting.

    Every candidate is scored with stratified k-fold cross-validation
    (accuracy) on the training data, the scores are plotted, and a final model
    with the best candidate is refit on the whole training set.

    Args:
        X_train: Training features.
        y_train: Training labels.
        n_estimators_range: Iterable of candidate ``n_estimators`` values.
            Any iterable works, including one-shot generators.
        cv_splits: Number of stratified folds.

    Returns:
        A tuple ``(best_model, best_n, best_cv_score)``: the refit model, the
        chosen ``n_estimators`` and the highest mean CV accuracy.

    Raises:
        ValueError: If ``n_estimators_range`` is empty.
    """
    # Materialize once: the range is needed for the search, for picking the
    # winner and for the plot. Re-listing a one-shot iterable after the loop
    # would yield nothing and break the lookup below.
    candidates = list(n_estimators_range)
    if not candidates:
        raise ValueError("n_estimators_range must contain at least one value")

    def make_model(n):
        # Single place for the fixed hyperparameters so the CV models and the
        # final model cannot drift apart.
        return GradientBoostingClassifier(
            n_estimators=n,
            learning_rate=0.1,
            max_depth=5,
            subsample=1.0,
            random_state=SEED
        )

    cv = StratifiedKFold(n_splits=cv_splits, shuffle=True, random_state=SEED)
    scores = [
        cross_val_score(
            make_model(n), X_train, y_train, cv=cv, scoring='accuracy', n_jobs=-1
        ).mean()
        for n in candidates
    ]

    # np.argmax returns the first maximum, so ties go to the earliest candidate.
    best_n = candidates[int(np.argmax(scores))]
    print(f"n_estimators tối ưu (CV): {best_n}")

    # Plot mean CV accuracy against n_estimators
    plt.figure(figsize=(10, 6))
    plt.plot(candidates, scores, 'bo-')
    plt.title(f'Chọn n_estimators tối ưu cho Gradient Boosting (CV={cv_splits}-fold)')
    plt.xlabel('n_estimators')
    plt.ylabel('Cross-Validation Accuracy')
    plt.grid(True)
    plt.show()

    # Refit on the full training set with the selected n_estimators
    best_model = make_model(best_n)
    best_model.fit(X_train, y_train)
    return best_model, best_n, max(scores)


def evaluate_val_gb(
    X_train, y_train, X_val, y_val,
    n_estimators_range=range(50, 501, 50),
    cv_splits=3
):
    """
    Train Gradient Boosting and evaluate it on the validation set.

    Delegates the n_estimators search to ``find_optimal_gb``, then prints the
    validation accuracy and the classification report of the returned model.

    Returns:
        A tuple ``(model, validation_accuracy, {"n_estimators": best_n})``.
    """
    separator = "\n" + "="*50
    print(separator)
    print("Tìm n_estimators tối ưu cho Gradient Boosting...")

    # The third return value (best CV score) is not used here.
    fitted, chosen_n, _ = find_optimal_gb(
        X_train,
        y_train,
        n_estimators_range=n_estimators_range,
        cv_splits=cv_splits,
    )

    predictions = fitted.predict(X_val)
    accuracy = accuracy_score(y_val, predictions)
    report = classification_report(y_val, predictions)

    print(f"\nĐộ chính xác GB trên tập validation: {accuracy:.4f}")
    print("Classification Report:")
    print(report)

    chosen_params = {"n_estimators": chosen_n}
    return fitted, accuracy, chosen_params


def evaluate_test_gb(gb_model, X_test, y_test):
    """
    Evaluate a fitted Gradient Boosting model on the test set.

    Prints the test accuracy and the classification report, then returns the
    accuracy.
    """
    predictions = gb_model.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    report = classification_report(y_test, predictions)

    print(f"\nĐộ chính xác GB trên tập test: {accuracy:.4f}")
    print("Classification Report:")
    print(report)
    return accuracy


## 3.1 GB on Original Dataset

In [None]:
# 1. On the Original dataset: tune and validate, then score the same model
#    on the test split.
gb_model, val_acc, best_params = evaluate_val_gb(
    X_train, y_train, X_val, y_val
)
test_acc = evaluate_test_gb(gb_model, X_test, y_test)

## 3.2 GB on FE Dataset

In [None]:
# 2. On the Feature Engineering (FE) dataset.
#    gb_model and best_params are rebound here; only the accuracies are kept
#    under dataset-specific names.
gb_model, val_fe_acc, best_params = evaluate_val_gb(
    X_fe_train, y_fe_train, X_fe_val, y_fe_val
)
test_fe_acc = evaluate_test_gb(gb_model, X_fe_test, y_fe_test)

## 3.3 GB on Original DT Dataset

In [None]:
# 3. On the Original + Decision Tree (DT) dataset.
gb_model, val_dt_acc, best_params = evaluate_val_gb(
    X_dt_train, y_dt_train, X_dt_val, y_dt_val
)
test_dt_acc = evaluate_test_gb(gb_model, X_dt_test, y_dt_test)


## 3.4 GB on FE + DT Dataset

In [None]:
# 4. On the FE + DT dataset.
gb_model, val_fe_dt_acc, best_params = evaluate_val_gb(
    X_fe_dt_train, y_fe_dt_train, X_fe_dt_val, y_fe_dt_val
)
test_fe_dt_acc = evaluate_test_gb(gb_model, X_fe_dt_test, y_fe_dt_test)
# Progress messages printed once all four dataset variants have been evaluated.
print("\n" + "="*50)
print("Đã hoàn tất huấn luyện và đánh giá trên 4 bộ dữ liệu.")
print("Đang vẽ biểu đồ tổng hợp...")

## 4.Result Visualization

In [None]:
# --- Draw the comparison chart ---
# Grouped bar chart of validation vs. test accuracy for the four dataset
# variants evaluated with Gradient Boosting.
# NOTE: val_acc, test_acc and the other accuracy names are rebound by the
# XGBoost cells further down the file, so this cell has to run before them
# to show the Gradient Boosting numbers.

plt.rcParams['font.family'] = 'Serif'

labels = ['Original', 'FE', 'Original + DT', "FE + DT"]
val_accs = [val_acc, val_fe_acc, val_dt_acc, val_fe_dt_acc]
test_accs = [test_acc, test_fe_acc, test_dt_acc, test_fe_dt_acc]

x = np.arange(len(labels))  # one group position per dataset variant
width = 0.3  # width of each bar

fig, ax = plt.subplots(figsize=(8, 6))

# The two bars of a group are shifted half a bar width to either side of x.
rects1 = ax.bar(x - width/2, val_accs, width,
                label='Validation Accuracy',
                color='tab:blue', edgecolor='black', linewidth=1.2)
rects2 = ax.bar(x + width/2, test_accs, width,
                label='Test Accuracy',
                color='tab:red', edgecolor='black', linewidth=1.2)

# The y-axis starts at 0.5; the upper limit above 1.0 leaves room for the
# legend and the value labels.
ax.set_ylim(0.5, 1.05)
ax.set_ylabel('Accuracy')
ax.set_title('Gradient Boosting Performance', fontsize=16)
ax.set_xticks(x)
ax.set_xticklabels(labels)
ax.legend(ncol=2, loc="upper center")

def autolabel(rects):
    """Write each bar's height (two decimals) just above the bar on `ax`."""
    for rect in rects:
        h = rect.get_height()
        ax.annotate(f'{h:.2f}', xy=(rect.get_x() + rect.get_width() / 2, h),
                    xytext=(0, 3),  # 3 points vertical offset
                    textcoords="offset points",
                    ha='center', va='bottom')

autolabel(rects1)
autolabel(rects2)

fig.tight_layout()
plt.show()

# II.6 XGBoost

In [None]:
# --- XGBoost Functions ---

def find_optimal_xgb(
    X_train, y_train,
    n_estimators_range=range(50, 501, 50),
    cv_splits=3,
    learning_rate=0.1,
    max_depth=5,
    subsample=1.0,
    use_gpu=False
):
    """
    Find the best number of trees (n_estimators) for XGBoost.

    Every candidate is scored with stratified k-fold cross-validation
    (accuracy) on the training data, the scores are plotted, and a final model
    with the best candidate is refit on the whole training set.

    Args:
        X_train: Training features.
        y_train: Training labels; the number of distinct values selects a
            binary or multi-class objective.
        n_estimators_range: Iterable of candidate ``n_estimators`` values.
            Any iterable works, including one-shot generators.
        cv_splits: Number of stratified folds.
        learning_rate: Passed through to ``XGBClassifier``.
        max_depth: Passed through to ``XGBClassifier``.
        subsample: Passed through to ``XGBClassifier``.
        use_gpu: Use ``tree_method='gpu_hist'`` instead of ``'hist'``.

    Returns:
        A tuple ``(best_model, best_n, best_cv_score)``: the refit model, the
        chosen ``n_estimators`` and the highest mean CV accuracy.

    Raises:
        ValueError: If ``n_estimators_range`` is empty.
    """
    # Materialize once: the range is needed for the search, for picking the
    # winner and for the plot. Re-listing a one-shot iterable after the loop
    # would yield nothing and break the lookup below.
    candidates = list(n_estimators_range)
    if not candidates:
        raise ValueError("n_estimators_range must contain at least one value")

    # Choose objective and eval_metric from the number of classes
    is_binary = len(np.unique(y_train)) == 2

    # Everything except n_estimators is shared by the CV models and the final
    # model, so it is defined once.
    # NOTE(review): recent XGBoost releases reportedly deprecate 'gpu_hist' in
    # favour of tree_method='hist' with device='cuda' — confirm against the
    # installed version before relying on use_gpu=True.
    # NOTE(review): n_jobs=-1 is set both here and in cross_val_score, which
    # may oversubscribe CPU cores — left unchanged, verify if runs are slow.
    shared_params = dict(
        learning_rate=learning_rate,
        max_depth=max_depth,
        subsample=subsample,
        objective='binary:logistic' if is_binary else 'multi:softprob',
        eval_metric='logloss' if is_binary else 'mlogloss',
        random_state=SEED,
        n_jobs=-1,
        tree_method='gpu_hist' if use_gpu else 'hist',
        verbosity=0,
    )

    cv = StratifiedKFold(n_splits=cv_splits, shuffle=True, random_state=SEED)
    scores = [
        cross_val_score(
            XGBClassifier(n_estimators=n, **shared_params),
            X_train, y_train, cv=cv, scoring='accuracy', n_jobs=-1
        ).mean()
        for n in candidates
    ]

    # np.argmax returns the first maximum, so ties go to the earliest candidate.
    best_n = candidates[int(np.argmax(scores))]
    print(f"n_estimators tối ưu (CV): {best_n}")

    # Plot mean CV accuracy against n_estimators
    plt.figure(figsize=(10, 6))
    plt.plot(candidates, scores, 'bo-')
    plt.title(f'Chọn n_estimators tối ưu cho XGBoost (CV={cv_splits}-fold)')
    plt.xlabel('n_estimators')
    plt.ylabel('Cross-Validation Accuracy')
    plt.grid(True)
    plt.show()

    # Refit on the full training set with the selected n_estimators
    best_model = XGBClassifier(n_estimators=best_n, **shared_params)
    best_model.fit(X_train, y_train)
    return best_model, best_n, max(scores)


def evaluate_val_xgb(
    X_train, y_train, X_val, y_val,
    n_estimators_range=range(50, 501, 50),
    cv_splits=3,
    learning_rate=0.1,
    max_depth=5,
    subsample=1.0,
    use_gpu=False
):
    """
    Train XGBoost and evaluate it on the validation set.

    Delegates the n_estimators search to ``find_optimal_xgb`` (forwarding all
    tuning options unchanged), then prints the validation accuracy and the
    classification report of the returned model.

    Returns:
        A tuple ``(model, validation_accuracy, {"n_estimators": best_n})``.
    """
    search_options = {
        "n_estimators_range": n_estimators_range,
        "cv_splits": cv_splits,
        "learning_rate": learning_rate,
        "max_depth": max_depth,
        "subsample": subsample,
        "use_gpu": use_gpu,
    }

    separator = "\n" + "="*50
    print(separator)
    print("Tìm n_estimators tối ưu cho XGBoost...")

    # The third return value (best CV score) is not used here.
    fitted, chosen_n, _ = find_optimal_xgb(X_train, y_train, **search_options)

    predictions = fitted.predict(X_val)
    accuracy = accuracy_score(y_val, predictions)
    report = classification_report(y_val, predictions)

    print(f"\nĐộ chính xác XGBoost trên tập validation: {accuracy:.4f}")
    print("Classification Report:")
    print(report)

    chosen_params = {"n_estimators": chosen_n}
    return fitted, accuracy, chosen_params


def evaluate_test_xgb(xgb_model, X_test, y_test):
    """
    Evaluate a fitted XGBoost model on the test set.

    Prints the test accuracy and the classification report, then returns the
    accuracy.
    """
    predictions = xgb_model.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    report = classification_report(y_test, predictions)

    print(f"\nĐộ chính xác XGBoost trên tập test: {accuracy:.4f}")
    print("Classification Report:")
    print(report)
    return accuracy


# 3.1 XGBoost on Original Dataset

In [None]:
# 1. On the Original dataset: tune and validate, then score the same model
#    on the test split.
#    NOTE: val_acc / test_acc reuse the names assigned in the Gradient Boosting
#    section, so those earlier values are overwritten from here on.
xgb_model, val_acc, best_params = evaluate_val_xgb(
    X_train, y_train, X_val, y_val
)
test_acc = evaluate_test_xgb(xgb_model, X_test, y_test)

# 3.2 XGBoost on FE Dataset

In [None]:
# 2. On the Feature Engineering (FE) dataset.
#    xgb_model and best_params are rebound here; only the accuracies are kept
#    under dataset-specific names.
xgb_model, val_fe_acc, best_params = evaluate_val_xgb(
    X_fe_train, y_fe_train, X_fe_val, y_fe_val
)
test_fe_acc = evaluate_test_xgb(xgb_model, X_fe_test, y_fe_test)


# 3.3 XGBoost on Original DT Dataset

In [None]:
# 3. On the Original + Decision Tree (DT) dataset.
xgb_model, val_dt_acc, best_params = evaluate_val_xgb(
    X_dt_train, y_dt_train, X_dt_val, y_dt_val
)
test_dt_acc = evaluate_test_xgb(xgb_model, X_dt_test, y_dt_test)


# 3.4 XGBoost on FE + DT Dataset

In [None]:
# 4. On the FE + DT dataset.
xgb_model, val_fe_dt_acc, best_params = evaluate_val_xgb(
    X_fe_dt_train, y_fe_dt_train, X_fe_dt_val, y_fe_dt_val
)
test_fe_dt_acc = evaluate_test_xgb(xgb_model, X_fe_dt_test, y_fe_dt_test)
# Progress messages printed once all four dataset variants have been evaluated.
print("\n" + "="*50)
print("Đã hoàn tất huấn luyện và đánh giá trên 4 bộ dữ liệu.")
print("Đang vẽ biểu đồ tổng hợp...")


# 4.Result Visualization XGBoost

In [None]:
# --- Draw the comparison chart ---
# Grouped bar chart of validation vs. test accuracy for the four dataset
# variants evaluated with XGBoost.
# NOTE: the accuracy names below are shared with the Gradient Boosting
# section; this cell plots whatever was assigned to them most recently.

plt.rcParams['font.family'] = 'Serif'

labels = ['Original', 'FE', 'Original + DT', "FE + DT"]
val_accs = [val_acc, val_fe_acc, val_dt_acc, val_fe_dt_acc]
test_accs = [test_acc, test_fe_acc, test_dt_acc, test_fe_dt_acc]

x = np.arange(len(labels))  # one group position per dataset variant
width = 0.3  # width of each bar

fig, ax = plt.subplots(figsize=(8, 6))

# The two bars of a group are shifted half a bar width to either side of x.
rects1 = ax.bar(x - width/2, val_accs, width,
                label='Validation Accuracy',
                color='tab:blue', edgecolor='black', linewidth=1.2)
rects2 = ax.bar(x + width/2, test_accs, width,
                label='Test Accuracy',
                color='tab:red', edgecolor='black', linewidth=1.2)

# The y-axis starts at 0.5; the upper limit above 1.0 leaves room for the
# legend and the value labels.
ax.set_ylim(0.5, 1.05)
ax.set_ylabel('Accuracy')
ax.set_title('XGBoost Performance', fontsize=16)
ax.set_xticks(x)
ax.set_xticklabels(labels)
ax.legend(ncol=2, loc="upper center")

def autolabel(rects):
    """Write each bar's height (two decimals) just above the bar on `ax`."""
    for rect in rects:
        h = rect.get_height()
        ax.annotate(f'{h:.2f}', xy=(rect.get_x() + rect.get_width() / 2, h),
                    xytext=(0, 3),  # 3 points vertical offset
                    textcoords="offset points",
                    ha='center', va='bottom')

autolabel(rects1)
autolabel(rects2)

fig.tight_layout()
plt.show()