In [None]:
# --- 第 1 部分 ---
# 載入函式庫與資料集
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn import metrics
from xgboost import XGBClassifier

# Seed NumPy's global generator. train_test_split below is called without a
# random_state, in which case scikit-learn documents that it draws from this
# global generator, so the seed is what makes the split repeatable.
np.random.seed(123456)

data = pd.read_csv('/kaggle/input/creditcardfraud/creditcard.csv')

# Rescale the two raw columns:
#   Time   -> shifted by its minimum, divided by its standard deviation
#   Amount -> shifted by its mean, divided by its standard deviation
data['Time'] = data['Time'].sub(data['Time'].min()).div(data['Time'].std())
data['Amount'] = data['Amount'].sub(data['Amount'].mean()).div(data['Amount'].std())

# Split into a 70% training set and a 30% test set.
feature_matrix = data.drop(columns='Class').values
labels = data['Class'].values
x_train, x_test, y_train, y_test = train_test_split(
    feature_matrix, labels, test_size=0.3)

In [None]:
# --- Part 2 ---
# Train the ensemble (gradient-boosted trees limited to depth 3) on all
# features and score it on the held-out test set.
ensemble = XGBClassifier(max_depth=3, n_jobs=4)
ensemble.fit(x_train, y_train)

# Predict once and reuse the result for both metrics instead of running
# inference over the whole test set twice.
y_pred = ensemble.predict(x_test)
print('XGB f1', metrics.f1_score(y_test, y_pred))
print('XGB recall', metrics.recall_score(y_test, y_pred))

In [None]:
# --- Part 3 ---
# Feature selection: keep only the features whose absolute correlation with
# the label (DataFrame.corr, Pearson by default) exceeds `threshold`.
threshold = 0.1

# Feature column names, in the same order as the columns of x_train / x_test
# (both were built from data.drop('Class', axis=1).values).
feature_names = data.drop('Class', axis=1).columns
if x_train.shape[1] != len(feature_names):
    raise ValueError(
        'x_train has %d columns but data has %d feature columns; '
        're-run the loading cell before this one'
        % (x_train.shape[1], len(feature_names)))

# Rank features on the training rows only, so the test set plays no part in
# deciding which features are kept (no test-set leakage).
train_frame = pd.DataFrame(x_train, columns=feature_names)
train_frame['Class'] = y_train
correlations = train_frame.corr()['Class'].drop('Class')
fs = list(correlations[abs(correlations) > threshold].index.values)
if not fs:
    raise ValueError('no feature has |correlation| > %r with Class' % threshold)

# Reuse the existing train/test partition by slicing columns rather than
# drawing a new random split: the filtered model is then scored on exactly
# the same test rows as the full-feature model, so the metrics are comparable.
# `data` itself is left untouched, which keeps this cell safe to re-run.
selected_idx = [feature_names.get_loc(name) for name in fs]
x_train_f, x_test_f = x_train[:, selected_idx], x_test[:, selected_idx]
y_train_f, y_test_f = y_train, y_test

ensemble = XGBClassifier(max_depth=3, n_jobs=4)
ensemble.fit(x_train_f, y_train_f)

# Predict once and reuse the result for both metrics.
y_pred_f = ensemble.predict(x_test_f)
print('XGB f1', metrics.f1_score(y_test_f, y_pred_f))
print('XGB recall', metrics.recall_score(y_test_f, y_pred_f))

In [None]:
# --- Part 4 ---
# Repeat both experiments with XGBClassifier's default settings (no explicit
# max_depth): first on the full feature set, then on the filtered one.
# After the loop `ensemble` is the model fitted on the filtered features.
for fit_x, fit_y, eval_x, eval_y in (
        (x_train, y_train, x_test, y_test),
        (x_train_f, y_train_f, x_test_f, y_test_f)):
    ensemble = XGBClassifier(n_jobs=4)
    ensemble.fit(fit_x, fit_y)

    # Predict once per model and reuse the result for both metrics.
    eval_pred = ensemble.predict(eval_x)
    print('XGB f1', metrics.f1_score(eval_y, eval_pred))
    print('XGB recall', metrics.recall_score(eval_y, eval_pred))