In [None]:
# --- 第 1 部分 ---
# 載入函式庫與資料集
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics

# Seed NumPy's global random generator before any randomised step runs.
np.random.seed(123456)
data = pd.read_csv('/kaggle/input/creditcardfraud/creditcard.csv')

# Time: shift so the smallest value becomes 0, then divide by the
# column's standard deviation.
time_col = data['Time']
data['Time'] = (time_col - time_col.min()) / time_col.std()

# Amount: subtract the mean and divide by the standard deviation.
amount_col = data['Amount']
data['Amount'] = (amount_col - amount_col.mean()) / amount_col.std()

# Split the data into a 70% training set and a 30% test set.
features = data.drop('Class', axis=1).values
labels = data.Class.values
x_train, x_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.3)


In [None]:
# --- Part 2 ---
# Train the base learners
base_classifiers = [
    ('DT', DecisionTreeClassifier(max_depth=3)),
    ('NB', GaussianNB()),
    ('LR', LogisticRegression(solver='liblinear')),
]

# Fit every (name, estimator) pair on the training split and report
# F1, recall and the confusion matrix measured on the test split.
for name, model in base_classifiers:
    model.fit(x_train, y_train)
    predictions = model.predict(x_test)

    f1 = metrics.f1_score(y_test, predictions)
    recall = metrics.recall_score(y_test, predictions)
    confusion = metrics.confusion_matrix(y_test, predictions)

    print(name + ' f1', f1)
    print(name + ' recall', recall)
    print(confusion)


In [None]:
# --- Part 3 ---
# Inspect how strongly each feature correlates with the label
fig, ax = plt.subplots(figsize=(8, 8))

# Correlation of every column with 'Class'; the label's own entry is dropped.
class_column = data.corr()['Class']
correlations = class_column.drop('Class')

# Bars are ordered from the lowest to the highest correlation value.
ordered = correlations.sort_values()
ordered.plot(kind='bar', ax=ax)
ax.set_title('Correlations to Class')

In [None]:
# --- Part 4 ---
# Filter features by their correlation with the label
threshold = 0.1

# Keep only the features whose absolute correlation with 'Class' exceeds
# the threshold.
# NOTE(review): the correlations are computed on the full frame, test rows
# included, so the selection step has seen the test data.
correlations = data.corr()['Class'].drop('Class')
selected_features = correlations[correlations.abs() > threshold].index
fs = list(selected_features.values)
fs.append('Class')

# `data` is deliberately NOT overwritten: the full frame stays available, so
# re-running the earlier cells still sees every column. The filtered frame
# gets its own name instead.
data_filtered = data[fs]

# Reuse the train/test partition created in Part 1 rather than drawing a new
# random split. The raw and the filtered models are then scored on exactly
# the same rows, so any difference comes from the feature filtering alone.
feature_columns = data.drop('Class', axis=1).columns
keep = feature_columns.isin(selected_features)
assert keep.shape[0] == x_train.shape[1], \
    'feature columns of `data` no longer match the columns of x_train'

x_train_f, x_test_f = x_train[:, keep], x_test[:, keep]
y_train_f, y_test_f = y_train, y_test

# Refit the same base learners on the filtered features and report
# F1, recall and the confusion matrix measured on the filtered test split.
for name, model in base_classifiers:
    model.fit(x_train_f, y_train_f)

    predictions = model.predict(x_test_f)
    print(name + ' f1', metrics.f1_score(y_test_f, predictions))
    print(name + ' recall', metrics.recall_score(y_test_f, predictions))
    print(metrics.confusion_matrix(y_test_f, predictions))


In [None]:
# --- Part 5 ---
# Tune the decision tree
def _depth_sweep(x_fit, y_fit, x_eval, y_eval, depths):
    """Fit one decision tree per depth and score it on the evaluation split.

    Returns a pair of lists (F1 scores, recall scores), one entry per
    value in `depths`, in the same order.
    """
    f1_scores, recall_scores = [], []
    for depth in depths:
        tree = DecisionTreeClassifier(max_depth=depth)
        tree.fit(x_fit, y_fit)

        predicted = tree.predict(x_eval)
        f1_scores.append(metrics.f1_score(y_eval, predicted))
        recall_scores.append(metrics.recall_score(y_eval, predicted))
    return f1_scores, recall_scores


plt.figure(figsize=(8, 8))
range_ = list(range(3, 12))  # candidate max_depth values: 3 .. 11

# Sweep on the raw features first ...
raw_f1, raw_recall = _depth_sweep(x_train, y_train, x_test, y_test, range_)
plt.plot(range_, raw_f1, label='Raw F1', linestyle='-')
plt.plot(range_, raw_recall, label='Raw Recall', linestyle=':')

# ... then on the filtered features.
filter_f1, filter_recall = _depth_sweep(x_train_f, y_train_f,
                                        x_test_f, y_test_f, range_)
plt.plot(range_, filter_f1, label='Filtered F1', linestyle='--')
plt.plot(range_, filter_recall, label='Filtered Recall', linestyle='-.')

plt.legend()
plt.xlabel('Max Depth')
plt.ylabel('Score')
plt.show()

# Best score reached over the swept depths, per metric and per feature set.
for label, scores in (("Raw Data Max F1:", raw_f1),
                      ("Raw Data Max Recall:", raw_recall),
                      ("Filtered Data Max F1:", filter_f1),
                      ("Filtered Data Max Recall:", filter_recall)):
    print(label, max(scores))

In [None]:
# --- Part 6 ---
# Build the voting ensemble
base_classifiers = [('DT', DecisionTreeClassifier(max_depth = 10)),
                    ('NB', GaussianNB()),
                    ('LR', LogisticRegression(solver = 'liblinear'))]

# The first pass uses the raw features, the second pass the filtered ones;
# the two pairs of printed lines appear in that order.
for x_fit, y_fit, x_eval, y_eval in (
        (x_train, y_train, x_test, y_test),
        (x_train_f, y_train_f, x_test_f, y_test_f)):
    ensemble = VotingClassifier(base_classifiers)
    ensemble.fit(x_fit, y_fit)

    # Predict once and reuse the result for both metrics instead of running
    # the whole ensemble over the test set a second time.
    ensemble_predictions = ensemble.predict(x_eval)
    print('Voting f1', metrics.f1_score(y_eval, ensemble_predictions))
    print('Voting recall', metrics.recall_score(y_eval, ensemble_predictions))

In [None]:
# --- Part 7 ---
# Add more base learners
base_classifiers = [('DT1', DecisionTreeClassifier(max_depth = 10)),
                    ('DT2', DecisionTreeClassifier(max_depth = 7)),
                    ('DT3', DecisionTreeClassifier(max_depth = 6)),
                    ('NB', GaussianNB()),
                    ('LR', LogisticRegression(solver = 'liblinear'))]

# The first pass uses the raw features, the second pass the filtered ones;
# the two pairs of printed lines appear in that order.
for x_fit, y_fit, x_eval, y_eval in (
        (x_train, y_train, x_test, y_test),
        (x_train_f, y_train_f, x_test_f, y_test_f)):
    ensemble = VotingClassifier(base_classifiers)
    ensemble.fit(x_fit, y_fit)

    # Predict once and reuse the result for both metrics instead of running
    # the whole ensemble over the test set a second time.
    ensemble_predictions = ensemble.predict(x_eval)
    print('Voting f1', metrics.f1_score(y_eval, ensemble_predictions))
    print('Voting recall', metrics.recall_score(y_eval, ensemble_predictions))