In [5]:
import numpy as np
import pandas as pd
from sklearn.ensemble import IsolationForest
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    classification_report
)
import kagglehub

RANDOM_SEED_VALUE = 42
TEST_DATA_RATIO = 0.3

data_path = kagglehub.dataset_download("mlg-ulb/creditcardfraud")
raw_data = pd.read_csv(f"{data_path}/creditcard.csv")
raw_data.drop(['Time'], axis=1, inplace=True)
std_scaler = StandardScaler()
raw_data['Amount'] = std_scaler.fit_transform(
    raw_data['Amount'].values.reshape(-1, 1)
)
features_all = raw_data.drop(columns=['Class']).values
labels_all = raw_data['Class'].values

X_train_full, X_test_full, y_train_full, y_test_full = train_test_split(
    features_all, labels_all,
    test_size=TEST_DATA_RATIO,
    random_state=RANDOM_SEED_VALUE,
    stratify=labels_all
)

fraud_transactions = raw_data[raw_data['Class'] == 1]
nonfraud_transactions = raw_data[raw_data['Class'] == 0]

print(f'總詐騙交易筆數: {len(fraud_transactions)}，總正常交易筆數: {len(nonfraud_transactions)}')
print(f'整體數據中詐騙交易百分比: {len(fraud_transactions)/(len(fraud_transactions)+len(nonfraud_transactions))*100:.3f}%')

def assess_model_performance(true_labels, predicted_labels, model_name_tag="模型"):
    accuracy_val = accuracy_score(true_labels, predicted_labels)
    precision_val = precision_score(true_labels, predicted_labels, zero_division=0)
    recall_val = recall_score(true_labels, predicted_labels)
    f1_val = f1_score(true_labels, predicted_labels)


    print(f'   (Accuracy): {accuracy_val:.6f}')
    print(f'    (Precision): {precision_val:.6f}')
    print(f'    (Recall): {recall_val:.6f}')
    print(f'     (F1 Score): {f1_val:.6f}')

    print(classification_report(true_labels, predicted_labels, target_names=['正常交易 (Class 0)', '詐騙交易 (Class 1)']))


selected_feature_columns = ['V1','V2','V3','V4','V5','V6','V7',
                           'V9','V10','V11','V12','V14','V16','V17','V18','V19','Amount']

X_selected_from_data = raw_data[selected_feature_columns].to_numpy()
y_labels_from_data = raw_data['Class'].to_numpy()

nonfraud_samples_for_balance = nonfraud_transactions.sample(n=5000, random_state=RANDOM_SEED_VALUE)
data_for_balancing = pd.concat([fraud_transactions, nonfraud_samples_for_balance])

X_features_from_balanced = data_for_balancing[selected_feature_columns].to_numpy()
y_labels_from_balanced = data_for_balancing['Class'].to_numpy()

X_train_balanced, X_test_balanced, y_train_balanced, y_test_balanced = train_test_split(
    X_features_from_balanced, y_labels_from_balanced,
    test_size=TEST_DATA_RATIO,
    random_state=RANDOM_SEED_VALUE,
    stratify=y_labels_from_balanced
)

isolation_forest_model = IsolationForest(
        n_estimators=400,
        max_samples='auto',
        contamination='auto',
        bootstrap=True,
        random_state=RANDOM_SEED_VALUE
    )
isolation_forest_model.fit(X_train_balanced[y_train_balanced == 0])

anomaly_scores_train = (-isolation_forest_model.decision_function(X_train_balanced)).reshape(-1, 1)
anomaly_scores_test  = (-isolation_forest_model.decision_function(X_test_balanced )).reshape(-1, 1)

X_train_with_anomaly_score = np.hstack([X_train_balanced, anomaly_scores_train])
X_test_with_anomaly_score  = np.hstack([X_test_balanced , anomaly_scores_test ])

pos_weight_for_xgb = (y_train_balanced == 0).sum() / (y_train_balanced == 1).sum()

xgb_hybrid_model = XGBClassifier(
    n_estimators=350,
    max_depth=6,
    learning_rate=0.08,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=pos_weight_for_xgb,
    random_state=RANDOM_SEED_VALUE,
    eval_metric='logloss',
    use_label_encoder=False # 避免 UserWarning
)
xgb_hybrid_model.fit(X_train_with_anomaly_score, y_train_balanced)

y_predictions_hybrid = xgb_hybrid_model.predict(X_test_with_anomaly_score)

assess_model_performance(y_test_balanced, y_predictions_hybrid, model_name_tag="混合模型 (Isolation Forest + XGBoost)")

總詐騙交易筆數: 492，總正常交易筆數: 284315
整體數據中詐騙交易百分比: 0.173%


Parameters: { "use_label_encoder" } are not used.



   (Accuracy): 0.986044
    (Precision): 0.969925
    (Recall): 0.871622
     (F1 Score): 0.918149
                precision    recall  f1-score   support

正常交易 (Class 0)       0.99      1.00      0.99      1500
詐騙交易 (Class 1)       0.97      0.87      0.92       148

      accuracy                           0.99      1648
     macro avg       0.98      0.93      0.96      1648
  weighted avg       0.99      0.99      0.99      1648

