import

In [None]:
import pandas as pd
import numpy as np
import xgboost
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
import os
import random
import os
# Report which XGBoost release and build configuration this run is using.
print("XGBoost version:", xgboost.__version__)
build_details = xgboost.build_info()
print(build_details)


# Notebook-wide settings.
# Seed handed to the split, SMOTE, the classifier and the parameter search.
random_state = 42
# Directory prefix joined onto the prediction CSV name; '' leaves the name as-is.
outputPath = ''
# Folder that the input CSV files are read from.
dir_path = "C:\\school\\SchoolProgram\\NTUST_CSIE_DS\\DataSet"

load data

In [None]:
# Read the feature table from the dataset folder.
source_csv = os.path.join(dir_path, 'preprocessing_T1_basic.csv')
df_origin = pd.read_csv(source_csv)

# Keep only numeric columns, then discard every column that contains a NaN.
numeric_only = df_origin.select_dtypes(include=[np.number])
df_preprocessing = numeric_only.dropna(axis=1)

print("Processed data shape:", df_preprocessing.shape)

正規化、split data

In [None]:
# Separate the feature columns from the target column.
X = df_preprocessing.drop(columns=['label'])
y = df_preprocessing['label']

# Split BEFORE scaling: the hold-out rows must not contribute to the scaler's
# mean/variance, otherwise test-set statistics leak into training.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=random_state, stratify=y
)

# Standardize: learn the statistics on the training rows only, then apply the
# same transform to the hold-out rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Oversample the training set with SMOTE (the test set is left untouched).
target_pos_ratio = 0.30  # desired share of positive rows after resampling
# SMOTE takes a float strategy as minority/majority, so convert the share
# p into the ratio p / (1 - p).
# NOTE(review): imbalanced-learn is expected to raise if the training set is
# already more balanced than this target -- confirm against the data.
sampling_strategy = target_pos_ratio / (1 - target_pos_ratio)

smote = SMOTE(
    sampling_strategy=sampling_strategy,
    random_state=random_state
)
X_train, y_train = smote.fit_resample(X_train, y_train)

# The .mean() calls below read as "positive share" only if labels are 0/1
# (assumed -- TODO confirm).
print("SMOTE 後訓練集大小:", X_train.shape, "正類比例:", y_train.mean())
print(f"訓練集大小: {X_train.shape}")
print(f"測試集大小: {X_test.shape}")
print(f"訓練集中警示帳戶比例: {y_train.mean():.2%}")
print(f"測試集中警示帳戶比例: {y_test.mean():.2%}")

training XGBoost model

In [None]:
# Candidate hyper-parameter values that the randomized search samples from.
param_grid = {
    'n_estimators': [200, 400, 600],
    'max_depth': [6, 8, 10],
    'learning_rate': [0.05, 0.1, 0.2],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

# Base estimator that the search clones for every candidate.
xgb_clf = XGBClassifier(
    objective='binary:logistic',
    random_state=random_state,
    tree_method='hist',
    device='gpu'
)

print("開始訓練 XGBoost 模型...")
# Randomized (not exhaustive grid) search over param_grid, scored by ROC AUC
# with 3-fold cross-validation.
# NOTE(review): X_train was already oversampled with SMOTE in the previous
# cell, so synthetic rows can sit in both the fit and validation folds and the
# CV score is likely optimistic -- the held-out X_test is the fairer measure.
grid_search = RandomizedSearchCV(
    estimator=xgb_clf,
    param_distributions=param_grid,
    cv=3,
    scoring='roc_auc',
    n_jobs=1,  # presumably kept at 1 because all fits share one GPU -- confirm
    refit=True,  # refit the best candidate on all of X_train when the search ends
    verbose=3,
    random_state=random_state
)
grid_search.fit(X_train, y_train)

best_params, best_score = grid_search.best_params_, grid_search.best_score_
print(f"最佳參數:{best_params}, 最佳分數: {best_score}")

# With refit=True the search has already trained the best configuration on the
# full training set, so reuse that model instead of fitting an identical one
# a second time.
xgb_clf = grid_search.best_estimator_
xgb_clf.save_model('xgb_model.json')
print("模型訓練完成！")

evaluate model

In [None]:
import sys

# Put the parent of the working directory on the import path before importing
# Util (presumably that is where the Util package lives -- confirm).
parent_dir = os.path.dirname(os.getcwd())
sys.path.append(parent_dir)

from Util import Evaluater

# Evaluate the trained model on the train and test partitions.
data_splits = (X_train, X_test, y_train, y_test)
Evaluater.evaluate_model(xgb_clf, data_splits)

# Feature importance.
import matplotlib.pyplot as plt
# NOTE: the next two expressions compute importances but their values are
# discarded here (a notebook only echoes the last expression of a cell).
xgb_clf.feature_importances_
booster = xgb_clf.get_booster()
booster.get_score(importance_type='weight')
xgboost.plot_importance(xgb_clf)
plt.show()

output result

In [None]:
from datetime import datetime

# Accounts that must appear in the output file.
df_test_acct = pd.read_csv(os.path.join(dir_path, 'acct_alert.csv'))

# Pick the rows of the feature table that belong to those accounts.
submit_mask = df_origin['acct'].isin(df_test_acct['acct'])
df_submit = df_origin.loc[submit_mask]

# Reuse exactly the feature columns the scaler was fitted on. Re-running
# dropna(axis=1) on the subset could keep a different set of columns, because
# a column with NaNs elsewhere may be NaN-free inside the subset.
feature_cols = df_preprocessing.columns.drop('label')
X_submit = scaler.transform(df_submit[feature_cols])
y_submit_pred = xgb_clf.predict(X_submit)

# Predictions come out in df_origin row order, so pair each one with the
# account of the row it was computed from, then join onto the requested
# account list by key. Pairing positionally with df_test_acct would assign
# labels to the wrong accounts whenever the two files are ordered differently.
df_scored = pd.DataFrame({
    'acct': df_submit['acct'].to_numpy(),
    'label': y_submit_pred
}).drop_duplicates(subset='acct')  # keep one prediction per account
df_pred = df_test_acct[['acct']].merge(df_scored, on='acct', how='left')

# Fail loudly rather than write a file with blank labels.
missing_count = int(df_pred['label'].isna().sum())
if missing_count:
    raise ValueError(
        f"{missing_count} account(s) in acct_alert.csv have no row in the feature table"
    )
# Restore the predictor's dtype in case the join widened the column.
df_pred['label'] = df_pred['label'].astype(y_submit_pred.dtype)

# Timestamp the file name so repeated runs do not overwrite each other.
current_time = datetime.now().strftime("%m%d_%H%M")
output_file = f"xgboost_{current_time}.csv"

df_pred.to_csv(os.path.join(outputPath, output_file), index=False)
print(f"(Finish) Output saved to {os.path.join(outputPath, output_file)}")