In [33]:
#XG BOOST

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, precision_recall_curve
import kagglehub

RANDOM_SEED = 42
TEST_SIZE = 0.3

path = kagglehub.dataset_download("mlg-ulb/creditcardfraud")
data = pd.read_csv(f"{path}/creditcard.csv")

data['Class'] = data['Class'].astype(int)
data.drop('Time', axis=1, inplace=True)
data['Amount'] = StandardScaler().fit_transform(data['Amount'].values.reshape(-1, 1))

fraud = data[data['Class'] == 1]
nonfraud = data[data['Class'] == 0]

# 修改輸出文字
print(f'詐騙案例筆數: {len(fraud)}, 正常案例筆數: {len(nonfraud)}')
print(f'詐騙案例占比 (%): {len(fraud)/(len(fraud)+len(nonfraud))*100:.3f}')

X = data.drop(columns=['Class']).to_numpy()
Y = data['Class'].to_numpy()

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=TEST_SIZE, random_state=RANDOM_SEED)

scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)
X_test_std = scaler.transform(X_test)

# XGBoost 模型參數
xgb_model = XGBClassifier(
    colsample_bytree=1.0,
    learning_rate=0.1,
    max_depth=6,
    n_estimators=100,
    subsample=0.8,
    scale_pos_weight=2.5,
    eval_metric='logloss',
    tree_method='hist',
    random_state=RANDOM_SEED
)

xgb_model.fit(X_train_std, y_train)

y_prob = xgb_model.predict_proba(X_test_std)[:, 1]

threshold = 0.4031
y_pred_custom = (y_prob > threshold).astype(int)


print("\n--- 模型分類成效報告 (固定閾值) ---")
print(classification_report(y_test, y_pred_custom, target_names=['常規交易 (Class 0)', '可疑交易 (Class 1)']))

precision, recall, thresholds = precision_recall_curve(y_test, y_prob)
f1 = 2 * (precision * recall) / (precision + recall + 1e-8)
best_idx = f1.argmax()
best_threshold = thresholds[best_idx]





詐騙案例筆數: 492, 正常案例筆數: 284315
詐騙案例占比 (%): 0.173

--- 模型分類成效報告 (固定閾值) ---
                precision    recall  f1-score   support

常規交易 (Class 0)       1.00      1.00      1.00     85307
可疑交易 (Class 1)       0.94      0.86      0.90       136

      accuracy                           1.00     85443
     macro avg       0.97      0.93      0.95     85443
  weighted avg       1.00      1.00      1.00     85443



In [50]:
#isolation forest


import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier , IsolationForest
from sklearn.cluster import KMeans
from sklearn.metrics import (
    silhouette_score,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    classification_report
)
import kagglehub

RANDOM_SEED = 42
TEST_SIZE = 0.3

path = kagglehub.dataset_download("mlg-ulb/creditcardfraud")
data = pd.read_csv(f"{path}/creditcard.csv")
data['Class'] = data['Class'].astype(int)

data = data.drop(['Time'], axis=1)
data['Amount'] = StandardScaler().fit_transform(
    data['Amount'].values.reshape(-1, 1)
)

fraud = data[data['Class'] == 1]
nonfraud = data[data['Class'] == 0]

print(f'詐騙交易筆數: {len(fraud)}，非詐騙交易筆數: {len(nonfraud)}')
print(f'正向類別(詐騙)百分比: {len(fraud)/(len(fraud)+len(nonfraud))*100:.3f}%')

def evaluate_model_performance(y_true_values, y_predicted_values, model_identifier="模型"):
    acc_score = accuracy_score(y_true_values, y_predicted_values)
    prec_score = precision_score(y_true_values, y_predicted_values, zero_division=0)
    rec_score = recall_score(y_true_values, y_predicted_values)
    f1_val_score = f1_score(y_true_values, y_predicted_values)


    print(f'        準確率 (Accuracy): {acc_score:.6f}')
    print(f'        精確率 (Precision): {prec_score:.6f}')
    print(f'        召回率 (Recall): {rec_score:.6f}')
    print(f'        F1 分數 (F1 Score): {f1_val_score:.6f}')
    print("\n詳細分類報告:")
    print(classification_report(y_true_values, y_predicted_values, target_names=['類別 0 (正常)', '類別 1 (詐騙)']))

selected_feature_names = ['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V9', 'V10', 'V11', 'V12', 'V14', 'V16', 'V17', 'V18', 'V19', 'Amount']
X_features_selected = np.asarray(data[selected_feature_names])
y_labels_selected = np.asarray(data['Class'])

nonfraud_subset = nonfraud.sample(n=5000, random_state=RANDOM_SEED)
balanced_data_df = pd.concat([fraud, nonfraud_subset])
X_features_balanced = np.asarray(balanced_data_df[selected_feature_names])
y_labels_balanced = np.asarray(balanced_data_df['Class'])

X_train_data, X_test_data, y_train_labels, y_test_labels = train_test_split(
    X_features_balanced, y_labels_balanced, test_size=TEST_SIZE, random_state=RANDOM_SEED
)


X_main_features = data[selected_feature_names].to_numpy()
y_main_labels = data['Class'].to_numpy()

nonfraud_sampled_df = nonfraud.sample(n=5000, random_state=RANDOM_SEED)
data_balanced_final = pd.concat([fraud, nonfraud_sampled_df])

X_balanced_np = data_balanced_final[selected_feature_names].to_numpy()
y_balanced_np = data_balanced_final['Class'].to_numpy()


X_training_set, X_testing_set, y_training_labels, y_testing_labels = train_test_split(
    X_balanced_np, y_balanced_np, test_size=TEST_SIZE, random_state=RANDOM_SEED, stratify=y_balanced_np)


iso_forest_model = IsolationForest(
        n_estimators=1000,
        max_samples='auto',
        contamination='auto',
        bootstrap=False,
        random_state=RANDOM_SEED
    )
iso_forest_model.fit(X_training_set[y_training_labels == 0])

iso_scores_train = (-iso_forest_model.decision_function(X_training_set)).reshape(-1, 1)
iso_scores_test  = (-iso_forest_model.decision_function(X_testing_set )).reshape(-1, 1)
anomaly_scores_test_flat = iso_scores_test.ravel()
threshold_value = np.percentile(anomaly_scores_test_flat, 96)
y_predicted_labels = (anomaly_scores_test_flat > threshold_value).astype(int)

evaluate_model_performance(y_testing_labels, y_predicted_labels, model_identifier="孤立森林 (非監督式)")

詐騙交易筆數: 492，非詐騙交易筆數: 284315
正向類別(詐騙)百分比: 0.173%
        準確率 (Accuracy): 0.946602
        精確率 (Precision): 0.954545
        召回率 (Recall): 0.425676
        F1 分數 (F1 Score): 0.588785

詳細分類報告:
              precision    recall  f1-score   support

   類別 0 (正常)       0.95      1.00      0.97      1500
   類別 1 (詐騙)       0.95      0.43      0.59       148

    accuracy                           0.95      1648
   macro avg       0.95      0.71      0.78      1648
weighted avg       0.95      0.95      0.94      1648

