### Hybrid Pipeline 雙階段混合
> 非監督式篩選清洗 + 監督式分類

> autoencoder(AE) + catboost

#### 載入外部程式庫

Anaconda 環境需額外安裝以下套件：
> tensorflow

> catboost

In [1]:
import pandas as pd
from pandas.testing import assert_frame_equal # panda 檢查
# model
from src.model import AE_CatBoost_Model
from src.data_processing import DataProcessor
# Evaluater
import sys
import os


#### 準備資料集

##### Load csv


In [2]:
# Dataset key -> path of the preprocessed CSV to load for it.
entities_path = {
    "T1": "../DataSet/preprocessing_T1_basic.csv",
    "T12": "../DataSet/preprocessing_T1_2.csv",
    "T123": "../DataSet/preprocessing_T1_2_3.csv",
}

# Read every CSV into a DataFrame stored under the same key.
entities = {index: pd.read_csv(path) for index, path in entities_path.items()}

# Preview the first rows of each loaded dataset.
print("原始資料：")
for v in entities.values():
    print(f"{v.head()}\n\n")

原始資料：
                                                acct  txn_count  first_txn_ts  \
0  00000577cfcd0bde8ee693021419ef13a1f7f933ec8626...          1       1953000   
1  00000eec52ea49377de91bc7b54eb3192943e6c20e0a51...          1       4489500   
2  000015150c92e2a41c4715a088df78d77a7d4f3017aadc...          1       5823000   
3  00002846e6b430580825e2b10fe3ff1e3ddb93f42c608d...          1       1100100   
4  00002b3d8f9c7b91c407a5725849deb521fcf1dd5eea1f...          1        777300   

   last_txn_ts  std_txn_ts  acct_type_x  cross_type    uni_amt  acct_type_y  \
0      1953000         0.0            2           1    47500.0            2   
1      4489500         0.0            2           1     6150.0            2   
2      5823000         0.0            2           1  1150000.0            2   
3      1100100         0.0            2           1     8550.0            2   
4       777300         0.0            2           1     1450.0            2   

   num_unique_dest_accts  num_un

##### Cut acct ID

In [3]:
# Split the first (ID) column off every dataset and check that all datasets
# carry the same IDs.
# NOTE: this cell is not idempotent. Every run removes the first column of each
# frame in `entities`, so re-running it without reloading the CSVs drops a
# feature column.
entities_id = pd.DataFrame()

for index, entity in entities.items():
    # Keep the ID as a one-column DataFrame. The name `acct_id` avoids shadowing
    # the builtin `id` in the notebook's global namespace.
    acct_id = entity.iloc[:, [0]]
    entities[index] = entity.iloc[:, 1:]

    # The first dataset provides the reference IDs; every later one is compared
    # against it. A mismatch is reported but does not stop the notebook.
    if entities_id.empty:
        entities_id = acct_id
    else:
        try:
            assert_frame_equal(entities_id, acct_id)
        except AssertionError as e:
            print(f"Data ID error : from {index}")
            print(e)

print("資料ID :")
entities_id.head()

資料ID :


Unnamed: 0,acct
0,00000577cfcd0bde8ee693021419ef13a1f7f933ec8626...
1,00000eec52ea49377de91bc7b54eb3192943e6c20e0a51...
2,000015150c92e2a41c4715a088df78d77a7d4f3017aadc...
3,00002846e6b430580825e2b10fe3ff1e3ddb93f42c608d...
4,00002b3d8f9c7b91c407a5725849deb521fcf1dd5eea1f...


#### 建立 Evaluater 評分 (結果產出)

In [4]:
def get_evaluater_score(model,X_train, X_test, y_train, y_test):
    """Evaluate ``model`` with the project's ``Util.Evaluater`` helper.

    The parent of the current working directory is added to ``sys.path`` (once)
    so that the ``Util`` package can be imported from the notebook.

    Args:
        model: Fitted model, passed through to ``Evaluater.evaluate_model``.
        X_train: Training features.
        X_test: Features of the set to evaluate against.
        y_train: Training labels.
        y_test: Labels of the set to evaluate against.

    Returns:
        None. The four data arguments are forwarded as one
        ``(X_train, X_test, y_train, y_test)`` tuple.
    """
    project_root = os.path.dirname(os.getcwd())
    # This function is called many times per run; only register the path once
    # so sys.path does not accumulate duplicate entries.
    if project_root not in sys.path:
        sys.path.append(project_root)
    # Imported here because it only resolves after the path tweak above.
    from Util import Evaluater

    Evaluater.evaluate_model(model, (X_train, X_test, y_train, y_test))

#### 實作 : 定義運行資料集(dict)

In [5]:
# Keys of the loaded datasets, in the order they were inserted into `entities`.
entities_index = list(entities)

print(entities_index)

['T1', 'T12', 'T123']


In [6]:
#==== Run configuration: choose from the dataset keys printed by the cell above ====#

# Dataset keys to train on; edit this list by hand.
train_entities_index = ["T1"]

# Number of training turns; edit by hand.
# NOTE(review): in the visible notebook this value is referenced only by a
# commented-out `model.fit(..., train_turn=model_train_turn, ...)` call, so
# changing it currently has no effect.
model_train_turn = 10

In [7]:
# Output container: dataset key -> trained model, filled in by the training loop.
best_models = dict()

In [8]:
# Quick check of DataProcessor.cut_label on a copy of the 'T1' dataset:
# show the first rows of both returned parts (label first, then the rest).
test_entity = entities['T1'].copy()
test_label, test_entity_nolabel = DataProcessor.cut_label(test_entity)
for part in (test_label, test_entity_nolabel):
    print(part.head())

0    0
1    0
2    0
3    0
4    0
Name: label, dtype: int64
   txn_count  first_txn_ts  last_txn_ts  std_txn_ts  acct_type_x  cross_type  \
0          1       1953000      1953000         0.0            2           1   
1          1       4489500      4489500         0.0            2           1   
2          1       5823000      5823000         0.0            2           1   
3          1       1100100      1100100         0.0            2           1   
4          1        777300       777300         0.0            2           1   

     uni_amt  acct_type_y  num_unique_dest_accts  num_unique_source_accts  \
0    47500.0            2                    0.0                      1.0   
1     6150.0            2                    1.0                      0.0   
2  1150000.0            2                    1.0                      0.0   
3     8550.0            2                    0.0                      1.0   
4     1450.0            2                    0.0                      1.0

#### 實作 : 資料分析 ( AE + CatBoost )

In [9]:
# Random seeds for the repeated train/test splits; one full train + evaluate
# pass is run per seed.
seeds = [21, 42, 84]


def _print_banner(title, rule_lines=2):
    """Print ``title`` with ``rule_lines`` rows of 50 '=' above and below it."""
    rule = "=" * 50
    for _ in range(rule_lines):
        print(rule)
    print(title)
    for _ in range(rule_lines):
        print(rule)


for index in train_entities_index:
    best_models[index] = None
    entity = entities[index]

    # All label-1 rows of the whole dataset (rows that end up in the training
    # split included). This set depends only on the dataset, not on the seed,
    # so it is built once per dataset instead of once per seed.
    # cut_label is given a copy so `entity` itself is left untouched.
    #   label             -> label column of the dataset
    #   entity_label1_cut -> dataset with the label column removed
    #   X_label1          -> rows of entity_label1_cut whose label is 1
    #   y_label1          -> matching labels, all 1
    label, entity_label1_cut = DataProcessor.cut_label(entity.copy())
    X_label1 = DataProcessor.split_diff_label(entity_label1_cut, label, positive_label=True)
    y_label1 = pd.Series([1] * X_label1.shape[0])

    # One split / fit / evaluation per seed.
    for seed in seeds:
        _print_banner(f"訓練資料集：{index} , seed: {seed}", rule_lines=1)
        print("\n")

        # Seeded train/test split of the dataset.
        X_train, X_test, y_train, y_test = DataProcessor.train_test_seed_split(
            entity,
            rand_seed=seed,
        )

        # Use a fresh model for every seed so that no fitted state can carry
        # over from a previous split (whether `fit` re-initialises the model
        # is not visible from this notebook).
        model = AE_CatBoost_Model()
        model.fit(X_train, y_train)
        # model.fit(X_train, y_train, train_turn=model_train_turn,tune_params=True)

        # Keep the most recently trained model for this dataset. No comparison
        # between seeds is made, so this ends up being the last seed's model.
        best_models[index] = model

        # Evaluation on the regular train/test split.
        get_evaluater_score(best_models[index], X_train, X_test, y_train, y_test)

        # Label-1 rows of the held-out test split only (not seen in training).
        X_test_label1 = DataProcessor.split_diff_label(X_test, y_test, positive_label=True)
        y_test_label1 = pd.Series([1] * X_test_label1.shape[0])

        # Both evaluation sets below contain label 1 only, so metrics that need
        # both classes are not meaningful there (the recorded run reports
        # ROC AUC as nan for them).
        _print_banner("Label 1 的準確度：")
        get_evaluater_score(best_models[index], X_train, X_label1, y_train, y_label1)
        _print_banner("Test 集合中 Label 1 的準確度：")
        get_evaluater_score(best_models[index], X_train, X_test_label1, y_train, y_test_label1)


訓練資料集：T1 , seed: 21


步驟 1: 訓練 AutoEncoder 並篩選樣本...
[1m39378/39378[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 535us/step
篩選出 1008059 筆可靠正常樣本，剔除 252015 筆潛在雜訊。

步驟 3: 建立混合資料集並訓練 CatBoost...

CatBoost 訓練完成
訓練資料集:
Confusion Matrix:
[[1209913   49458]
 [      5     698]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.96      0.98   1259371
           1       0.01      0.99      0.03       703

    accuracy                           0.96   1260074
   macro avg       0.51      0.98      0.50   1260074
weighted avg       1.00      0.96      0.98   1260074

Accuracy: 0.9607
ROC AUC Score: 0.9959
------------------------------------------------------------
測試資料集:
Confusion Matrix:
[[518361  21370]
 [     4    297]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.96      0.98    539731
           1       0.01      0.99      0.03       301

    accuracy         

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


訓練資料集:
Confusion Matrix:
[[1209913   49458]
 [      5     698]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.96      0.98   1259371
           1       0.01      0.99      0.03       703

    accuracy                           0.96   1260074
   macro avg       0.51      0.98      0.50   1260074
weighted avg       1.00      0.96      0.98   1260074

Accuracy: 0.9607
ROC AUC Score: 0.9959
------------------------------------------------------------
測試資料集:
Confusion Matrix:
[[  0   0]
 [  4 297]]

Classification Report:
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00         0
         1.0       1.00      0.99      0.99       301

    accuracy                           0.99       301
   macro avg       0.50      0.49      0.50       301
weighted avg       1.00      0.99      0.99       301

Accuracy: 0.9867
ROC AUC Score: nan
--------------------------------------------------

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


步驟 1: 訓練 AutoEncoder 並篩選樣本...
[1m39378/39378[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 523us/step
篩選出 1008059 筆可靠正常樣本，剔除 252015 筆潛在雜訊。

步驟 3: 建立混合資料集並訓練 CatBoost...

CatBoost 訓練完成
訓練資料集:
Confusion Matrix:
[[1222031   37340]
 [      7     696]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.97      0.98   1259371
           1       0.02      0.99      0.04       703

    accuracy                           0.97   1260074
   macro avg       0.51      0.98      0.51   1260074
weighted avg       1.00      0.97      0.98   1260074

Accuracy: 0.9704
ROC AUC Score: 0.9964
------------------------------------------------------------
測試資料集:
Confusion Matrix:
[[523749  15982]
 [    10    291]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.97      0.98    539731
           1       0.02      0.97      0.04       301

    accuracy                           0.97

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


訓練資料集:
Confusion Matrix:
[[1222031   37340]
 [      7     696]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.97      0.98   1259371
           1       0.02      0.99      0.04       703

    accuracy                           0.97   1260074
   macro avg       0.51      0.98      0.51   1260074
weighted avg       1.00      0.97      0.98   1260074

Accuracy: 0.9704
ROC AUC Score: 0.9964
------------------------------------------------------------
測試資料集:
Confusion Matrix:
[[  0   0]
 [ 10 291]]

Classification Report:
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00         0
         1.0       1.00      0.97      0.98       301

    accuracy                           0.97       301
   macro avg       0.50      0.48      0.49       301
weighted avg       1.00      0.97      0.98       301

Accuracy: 0.9668
ROC AUC Score: nan
--------------------------------------------------

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


步驟 1: 訓練 AutoEncoder 並篩選樣本...
[1m39378/39378[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 526us/step
篩選出 1008059 筆可靠正常樣本，剔除 252015 筆潛在雜訊。

步驟 3: 建立混合資料集並訓練 CatBoost...

CatBoost 訓練完成
訓練資料集:
Confusion Matrix:
[[1216956   42415]
 [      4     699]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.97      0.98   1259371
           1       0.02      0.99      0.03       703

    accuracy                           0.97   1260074
   macro avg       0.51      0.98      0.51   1260074
weighted avg       1.00      0.97      0.98   1260074

Accuracy: 0.9663
ROC AUC Score: 0.9971
------------------------------------------------------------
測試資料集:
Confusion Matrix:
[[521515  18216]
 [    10    291]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.97      0.98    539731
           1       0.02      0.97      0.03       301

    accuracy                           0.97

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


訓練資料集:
Confusion Matrix:
[[1216956   42415]
 [      4     699]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.97      0.98   1259371
           1       0.02      0.99      0.03       703

    accuracy                           0.97   1260074
   macro avg       0.51      0.98      0.51   1260074
weighted avg       1.00      0.97      0.98   1260074

Accuracy: 0.9663
ROC AUC Score: 0.9971
------------------------------------------------------------
測試資料集:
Confusion Matrix:
[[  0   0]
 [ 10 291]]

Classification Report:
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00         0
         1.0       1.00      0.97      0.98       301

    accuracy                           0.97       301
   macro avg       0.50      0.48      0.49       301
weighted avg       1.00      0.97      0.98       301

Accuracy: 0.9668
ROC AUC Score: nan
--------------------------------------------------

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
