In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import xgboost as xgb
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer

# 加載數據
data = pd.read_csv('/Users/linyinghsiao/Desktop/chatgpt_output拷貝.csv')

# 處理類別型特徵：轉換為數值型
label_encoders = {}
categorical_columns = data.select_dtypes(include=['object']).columns

for col in categorical_columns:
    label_encoders[col] = LabelEncoder()
    data[col] = label_encoders[col].fit_transform(data[col])

# 處理 NaN 值：填充或刪除
imputer = SimpleImputer(strategy='median')
data = pd.DataFrame(imputer.fit_transform(data), columns=data.columns)

# 分割特徵和標籤
X = data.drop('label', axis=1)
y = data['label']

# 應用 SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# 分割數據集
X_train_resampled, X_test_resampled, y_train_resampled, y_test_resampled = train_test_split(
    X_resampled, y_resampled, test_size=0.2, random_state=42)

# 訓練 XGBoost 模型
xgb_model = xgb.XGBClassifier(objective='binary:logistic', random_state=42)
xgb_model.fit(X_train_resampled, y_train_resampled)

# 評估模型
y_pred = xgb_model.predict(X_test_resampled)

# 計算並顯示分類報告和混淆矩陣
class_report = classification_report(y_test_resampled, y_pred)
conf_matrix = confusion_matrix(y_test_resampled, y_pred)
print("Classification Report:\n", class_report)
print("Confusion Matrix:\n", conf_matrix)

# 計算並顯示其他模型指標
accuracy = accuracy_score(y_test_resampled, y_pred)
precision = precision_score(y_test_resampled, y_pred, average='macro')
recall = recall_score(y_test_resampled, y_pred, average='macro')
f1 = f1_score(y_test_resampled, y_pred, average='macro')

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)


Classification Report:
               precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      5909
         1.0       1.00      1.00      1.00      6091

    accuracy                           1.00     12000
   macro avg       1.00      1.00      1.00     12000
weighted avg       1.00      1.00      1.00     12000

Confusion Matrix:
 [[5898   11]
 [   3 6088]]
Accuracy: 0.9988333333333334
Precision: 0.9988440186174019
Recall: 0.9988229514683642
F1 Score: 0.9988330407831407


保存

In [2]:
import joblib

# 保存模型
joblib.dump(xgb_model, 'xgb_model.pkl')

# 保存 LabelEncoders 和 Imputer
for col, le in label_encoders.items():
    joblib.dump(le, f'label_encoder_{col}.pkl')
joblib.dump(imputer, 'imputer.pkl')


['imputer.pkl']

In [3]:
# 加載模型
xgb_model = joblib.load('xgb_model.pkl')

# 加載 LabelEncoders 和 Imputer
label_encoders_loaded = {}
for col in categorical_columns:
    label_encoders_loaded[col] = joblib.load(f'label_encoder_{col}.pkl')
imputer_loaded = joblib.load('imputer.pkl')


In [4]:
# 加載新數據
new_data = pd.read_csv('/Users/linyinghsiao/Documents/GitHub/fraud-detection-E.SUN/datasets/dataset_1st/public_processed.csv')

# 處理類別型特徵
for col in categorical_columns:
    new_data[col] = label_encoders_loaded[col].transform(new_data[col])

# 處理 NaN 值
new_data = pd.DataFrame(imputer_loaded.transform(new_data), columns=new_data.columns)

# 預處理新數據
X_new = new_data.drop('label', axis=1)  # 假設新數據中也有 'label' 列

# 使用模型進行預測
y_pred_new = xgb_model.predict(X_new)

# 輸出預測結果
print(y_pred_new)


ValueError: y contains previously unseen labels: 'a2c1209018e4e52e04f6fabb48f05f1b8bc09dc838ff6cb19906377fab414587'