引入函式

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import precision_recall_curve, f1_score, classification_report, confusion_matrix, auc, roc_auc_score
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier
import seaborn as sns

載入與清理

In [None]:
csv_path = "Final Transactions.csv"

data = pd.read_csv(csv_path)

print(f"本地資料載入完成，形狀：{data.shape}")
print(data.head())

print("\n=== 原始數值欄位統計量 ===")
print(data.describe())

# 清理特徵名稱 (避免 LightGBM 的 JSON 錯誤)
def clean_column_names(df):
    df.columns = [re.sub(r'[^\\w\\s]', '_', col).replace(' ', '_') for col in df.columns]
    return df

本地資料載入完成，形狀：(1754155, 10)
   Unnamed: 0  TRANSACTION_ID          TX_DATETIME  CUSTOMER_ID  TERMINAL_ID  \
0           0               0  2023-01-01 00:00:31          596         3156   
1           1               1  2023-01-01 00:02:10         4961         3412   
2           2               2  2023-01-01 00:07:56            2         1365   
3           3               3  2023-01-01 00:09:29         4128         8737   
4           4               4  2023-01-01 00:10:34          927         9906   

   TX_AMOUNT  TX_TIME_SECONDS  TX_TIME_DAYS  TX_FRAUD  TX_FRAUD_SCENARIO  
0     533.07               31             0         0                  0  
1     808.56              130             0         0                  0  
2    1442.94              476             0         1                  1  
3     620.65              569             0         0                  0  
4     490.66              634             0         0                  0  

=== 原始數值欄位統計量 ===
         Unnamed: 0  TRA

特徵工程

In [3]:
# A. 時間轉化
data['TX_DATETIME'] = pd.to_datetime(data['TX_DATETIME'])
data['hour'] = data['TX_DATETIME'].dt.hour
data['day_of_week'] = data['TX_DATETIME'].dt.dayofweek

# B. 客戶行為特徵
data = data.sort_values(['CUSTOMER_ID', 'TX_DATETIME'])
data['time_since_last_tx'] = data.groupby('CUSTOMER_ID')['TX_TIME_SECONDS'].diff().fillna(0)
data['amt_to_mean'] = data['TX_AMOUNT'] / data.groupby('CUSTOMER_ID')['TX_AMOUNT'].transform('mean')

# C. 移除 ID 欄位 (避免 Data Leakage)
drop_cols = ['TRANSACTION_ID', 'TX_FRAUD_SCENARIO', 'TX_DATETIME', 'TX_TIME_SECONDS', 
             'TX_TIME_DAYS', 'CUSTOMER_ID', 'TERMINAL_ID']
X = data.drop(['TX_FRAUD'] + drop_cols, axis=1)
y = data['TX_FRAUD']

X = clean_column_names(X)

print("完成")

完成


資料分割

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

ratio = (y_train == 0).sum() / (y_train == 1).sum()

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("完成")

完成


定義 Ensemble

In [5]:
model_xgb = xgb.XGBClassifier(
    n_estimators=500, max_depth=6, learning_rate=0.05,
    scale_pos_weight=ratio,
    random_state=42, use_label_encoder=False, eval_metric='logloss', tree_method='hist'
)

model_lgb = lgb.LGBMClassifier(
    n_estimators=500, num_leaves=31, learning_rate=0.05,
    class_weight='balanced',
    random_state=42, verbose=-1
)

model_cat = CatBoostClassifier(
    iterations=500, learning_rate=0.05, depth=6,
    auto_class_weights='Balanced',
    random_state=42, verbose=0
)

ensemble_model = VotingClassifier(
    estimators=[('xgb', model_xgb), ('lgb', model_lgb), ('cat', model_cat)],
    voting='soft',
    weights=[1, 1, 1]
)

print("完成")

完成


訓練與評估

In [6]:
print("開始訓練(LightGBM + XGBoost + CatBoost)...")
ensemble_model.fit(X_train_scaled, y_train)

probs = ensemble_model.predict_proba(X_test_scaled)[:, 1]

precisions, recalls, thresholds = precision_recall_curve(y_test, probs)
f1_scores = 2 * (precisions * recalls) / (precisions + recalls + 1e-10)
best_threshold = thresholds[np.argmax(f1_scores)]

final_preds = (probs >= best_threshold).astype(int)

開始訓練(LightGBM + XGBoost + CatBoost)...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


輸出結果

In [None]:
cm = confusion_matrix(y_test, final_preds)
roc = roc_auc_score(y_test, probs)
pr_auc = auc(recalls, precisions)

# cm = [[TN, FP], [FN, TP]]
TN, FP, FN, TP = cm.ravel()
prec_score = TP / (TP + FP + 1e-10)
rec_score = TP / (TP + FN + 1e-10)

final_f1 = f1_score(y_test, final_preds)

print(f"\n=== (最佳門檻 = {best_threshold:.4f}) ===")
print(f"ROC-AUC: {roc:.4f} | PR-AUC: {pr_auc:.4f}")
print(f"Precision: {prec_score:.4f} | Recall: {rec_score:.4f} | F1: {final_f1:.4f}")
print("Confusion Matrix:\n", cm)


=== (最佳門檻 = 0.8095) ===
ROC-AUC: 0.9840 | PR-AUC: 0.9767
Precision: 1.0000 | Recall: 0.9579 | F1: 0.9785
Confusion Matrix:
 [[303636      1]
 [  1987  45207]]
