In [27]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, precision_recall_curve
import kagglehub

RANDOM_STATE_VAL = 42
TEST_SET_RATIO = 0.3

print("正在載入數據集...")
path_source = kagglehub.dataset_download("mlg-ulb/creditcardfraud")
credit_data = pd.read_csv(f"{path_source}/creditcard.csv")
credit_data['Class'] = credit_data['Class'].astype(int)
credit_data.drop('Time', axis=1, inplace=True)

amount_scaler = StandardScaler()
credit_data['Amount'] = amount_scaler.fit_transform(credit_data['Amount'].values.reshape(-1, 1))

num_fraud = credit_data[credit_data['Class'] == 1].shape[0]
num_nonfraud = credit_data[credit_data['Class'] == 0].shape[0]
print(f'詐欺交易數量: {num_fraud}, 正常交易數量: {num_nonfraud}')
print(f'正樣本 (詐欺) 百分比: {num_fraud / (num_fraud + num_nonfraud) * 100:.3f}%')

features_X = credit_data.drop(columns=['Class']).values
labels_y = credit_data['Class'].values

X_train_set, X_test_set, y_train_set, y_test_set = train_test_split(
    features_X, labels_y, test_size=TEST_SET_RATIO, random_state=RANDOM_STATE_VAL, stratify=labels_y
)

data_scaler = StandardScaler()
X_train_scaled = data_scaler.fit_transform(X_train_set)
X_test_scaled = data_scaler.transform(X_test_set)

count_class_0 = np.sum(y_train_set == 0)
count_class_1 = np.sum(y_train_set == 1)
dynamic_scale_pos_weight = count_class_0 / count_class_1 if count_class_1 > 0 else 1
print(f"動態計算的 scale_pos_weight: {dynamic_scale_pos_weight:.2f}")

xgb_model = XGBClassifier(
    n_estimators=100,
    scale_pos_weight=2.5,
    max_depth=6,
    learning_rate=0.1,
    random_state=RANDOM_SEED,
    eval_metric='logloss'
)
print("開始訓練 XGBoost 模型...")
xgb_classifier.fit(X_train_scaled, y_train_set)

y_pred_probabilities = xgb_classifier.predict_proba(X_test_scaled)[:, 1]

prec, rec, thresh = precision_recall_curve(y_test_set, y_pred_probabilities)
f1_scores = 2 * (prec * rec) / (prec + rec + 1e-9)
optimal_idx = np.argmax(f1_scores)
optimal_threshold = thresh[optimal_idx]

print(f"\n🔍 基於 F1 分數計算出的最佳閾值: {optimal_threshold:.4f}, 對應的 F1 分數 = {f1_scores[optimal_idx]:.4f}")

y_final_predictions = (y_pred_probabilities > optimal_threshold).astype(int)

print("\n使用最佳閾值的 XGBoost 分類報告:")
print(classification_report(y_test_set, y_final_predictions, target_names=['正常交易', '詐欺交易']))

正在載入數據集...
詐欺交易數量: 492, 正常交易數量: 284315
正樣本 (詐欺) 百分比: 0.173%
動態計算的 scale_pos_weight: 578.55
開始訓練 XGBoost 模型...


Parameters: { "use_label_encoder" } are not used.




🔍 基於 F1 分數計算出的最佳閾值: 0.9282, 對應的 F1 分數 = 0.8453

使用最佳閾值的 XGBoost 分類報告:
              precision    recall  f1-score   support

        正常交易       1.00      1.00      1.00     85295
        詐欺交易       0.96      0.75      0.84       148

    accuracy                           1.00     85443
   macro avg       0.98      0.87      0.92     85443
weighted avg       1.00      1.00      1.00     85443



In [28]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, precision_recall_curve
import kagglehub

# 一般參數
RANDOM_SEED = 42
TEST_SIZE = 0.3

# 載入資料集
path = kagglehub.dataset_download("mlg-ulb/creditcardfraud")
data = pd.read_csv(f"{path}/creditcard.csv")
data['Class'] = data['Class'].astype(int)
data.drop('Time', axis=1, inplace=True)
data['Amount'] = StandardScaler().fit_transform(data['Amount'].values.reshape(-1, 1))

# 顯示類別比例
fraud = data[data['Class'] == 1]
nonfraud = data[data['Class'] == 0]
print(f'Fraudulent:{len(fraud)}, non-fraudulent:{len(nonfraud)}')
print(f'the positive class (frauds) percentage: {len(fraud)/(len(fraud)+len(nonfraud))*100:.3f}%')

# 特徵與標籤
X = data.drop(columns=['Class']).to_numpy()
Y = data['Class'].to_numpy()

# 切分訓練與測試集
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=TEST_SIZE, random_state=RANDOM_SEED)

# 標準化
scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)
X_test_std = scaler.transform(X_test)

# 建立 XGBoost 模型
xgb_model = XGBClassifier(
    colsample_bytree=0.95,
    learning_rate=0.1,
    max_depth=7,
    n_estimators=250,
    subsample=0.8,
    scale_pos_weight=2.5,
    eval_metric='logloss',
    tree_method='hist',
    random_state=RANDOM_SEED
)

# 訓練模型
xgb_model.fit(X_train_std, y_train)

# 模型機率預測
y_prob = xgb_model.predict_proba(X_test_std)[:, 1]

threshold = 0.4031
y_pred_custom = (y_prob > threshold).astype(int)

# 分類報告
print(classification_report(y_test, y_pred_custom))

precision, recall, thresholds = precision_recall_curve(y_test, y_prob)
f1 = 2 * (precision * recall) / (precision + recall + 1e-8)
best_idx = f1.argmax()
best_threshold = thresholds[best_idx]

print(f"\n🔍 Best Threshold based on F1: {best_threshold:.4f}, F1 = {f1[best_idx]:.4f}")

Fraudulent:492, non-fraudulent:284315
the positive class (frauds) percentage: 0.173%
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85307
           1       0.91      0.86      0.88       136

    accuracy                           1.00     85443
   macro avg       0.95      0.93      0.94     85443
weighted avg       1.00      1.00      1.00     85443


🔍 Best Threshold based on F1: 0.6064, F1 = 0.9070


In [33]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, precision_recall_curve
import kagglehub

RANDOM_SEED = 42
TEST_SIZE = 0.3

path = kagglehub.dataset_download("mlg-ulb/creditcardfraud")
data = pd.read_csv(f"{path}/creditcard.csv")

data['Class'] = data['Class'].astype(int)
data.drop('Time', axis=1, inplace=True)
data['Amount'] = StandardScaler().fit_transform(data['Amount'].values.reshape(-1, 1))

fraud = data[data['Class'] == 1]
nonfraud = data[data['Class'] == 0]

# 修改輸出文字
print(f'詐騙案例筆數: {len(fraud)}, 正常案例筆數: {len(nonfraud)}')
print(f'詐騙案例占比 (%): {len(fraud)/(len(fraud)+len(nonfraud))*100:.3f}')

X = data.drop(columns=['Class']).to_numpy()
Y = data['Class'].to_numpy()

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=TEST_SIZE, random_state=RANDOM_SEED)

scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)
X_test_std = scaler.transform(X_test)

# XGBoost 模型參數
xgb_model = XGBClassifier(
    colsample_bytree=1.0,
    learning_rate=0.1,
    max_depth=6,
    n_estimators=100,
    subsample=0.8,
    scale_pos_weight=2.5,
    eval_metric='logloss',
    tree_method='hist',
    random_state=RANDOM_SEED
)

xgb_model.fit(X_train_std, y_train)

y_prob = xgb_model.predict_proba(X_test_std)[:, 1]

threshold = 0.4031
y_pred_custom = (y_prob > threshold).astype(int)


print("\n--- 模型分類成效報告 (固定閾值) ---")
print(classification_report(y_test, y_pred_custom, target_names=['常規交易 (Class 0)', '可疑交易 (Class 1)']))

precision, recall, thresholds = precision_recall_curve(y_test, y_prob)
f1 = 2 * (precision * recall) / (precision + recall + 1e-8)
best_idx = f1.argmax()
best_threshold = thresholds[best_idx]





詐騙案例筆數: 492, 正常案例筆數: 284315
詐騙案例占比 (%): 0.173

--- 模型分類成效報告 (固定閾值) ---
                precision    recall  f1-score   support

常規交易 (Class 0)       1.00      1.00      1.00     85307
可疑交易 (Class 1)       0.94      0.86      0.90       136

      accuracy                           1.00     85443
     macro avg       0.97      0.93      0.95     85443
  weighted avg       1.00      1.00      1.00     85443

