### Hybrid Pipeline 雙階段混合
> 非監督式篩選清洗 + 監督式分類

> autoencoder(AE) + catboost

#### 載入外部程式庫

Anaconda 環境需額外安裝以下套件：
> tensorflow

> catboost

In [1]:
import pandas as pd
from pandas.testing import assert_frame_equal # panda 檢查
# model
from src.data_processing import DataProcessor
# Evaluater
import sys
import os


#### 準備資料集

##### Load csv


In [2]:
# Feature-set tag -> preprocessed CSV file (paths are relative to the
# notebook's working directory).
entities_path = {
    "T1": "../DataSet/preprocessing_T1_basic.csv",
    "T12": "../DataSet/preprocessing_T1_2.csv",
    "T123": "../DataSet/preprocessing_T1_2_3.csv",
}

# Read every CSV into a DataFrame, stored under the same tag.
entities = {index: pd.read_csv(path) for index, path in entities_path.items()}

# Preview the first rows of each loaded table.
print("原始資料：")
for frame in entities.values():
    print(f"{frame.head()}\n\n")

原始資料：
                                                acct  txn_count  first_txn_ts  \
0  00000577cfcd0bde8ee693021419ef13a1f7f933ec8626...          1       1953000   
1  00000eec52ea49377de91bc7b54eb3192943e6c20e0a51...          1       4489500   
2  000015150c92e2a41c4715a088df78d77a7d4f3017aadc...          1       5823000   
3  00002846e6b430580825e2b10fe3ff1e3ddb93f42c608d...          1       1100100   
4  00002b3d8f9c7b91c407a5725849deb521fcf1dd5eea1f...          1        777300   

   last_txn_ts  std_txn_ts  acct_type_x  cross_type    uni_amt  acct_type_y  \
0      1953000         0.0            2           1    47500.0            2   
1      4489500         0.0            2           1     6150.0            2   
2      5823000         0.0            2           1  1150000.0            2   
3      1100100         0.0            2           1     8550.0            2   
4       777300         0.0            2           1     1450.0            2   

   num_unique_dest_accts  num_un

##### Cut acct ID

In [3]:
# Split the leading ID column off every dataset and check that all datasets
# list the same IDs in the same order.
# NOTE(review): this cell rewrites `entities` in place, so running it a second
# time strips one more column from every dataset.
entities_id = pd.DataFrame()

for index, entity in entities.items():
    # Column 0 is kept as a one-column DataFrame. It is deliberately not
    # called `id`, which would shadow the builtin for the rest of the notebook.
    acct_ids = entity.iloc[:, [0]]
    # Replacing the value of an existing key while iterating is safe because
    # the dict's size does not change.
    entities[index] = entity.iloc[:, 1:]

    # The first dataset seen provides the reference ID frame.
    if entities_id.empty:
        entities_id = acct_ids
        continue

    # Later datasets are compared against the reference; a mismatch is
    # reported but does not stop the loop.
    try:
        assert_frame_equal(entities_id, acct_ids)
    except AssertionError as e:
        print(f"Data ID error : from {index}")
        print(e)

print("資料ID :")
entities_id.head()

資料ID :


Unnamed: 0,acct
0,00000577cfcd0bde8ee693021419ef13a1f7f933ec8626...
1,00000eec52ea49377de91bc7b54eb3192943e6c20e0a51...
2,000015150c92e2a41c4715a088df78d77a7d4f3017aadc...
3,00002846e6b430580825e2b10fe3ff1e3ddb93f42c608d...
4,00002b3d8f9c7b91c407a5725849deb521fcf1dd5eea1f...


#### 建立 Evaluater評分 (結果產出)

In [4]:
def get_evaluater_score(model,X_train, X_test, y_train, y_test):
    """Evaluate ``model`` through the project's ``Util.Evaluater`` helper.

    The parent of the current working directory is put on ``sys.path`` before
    ``Util`` is imported (presumably that is where the ``Util`` package
    lives -- confirm against the repository layout).

    Args:
        model: Object handed to ``Evaluater.evaluate_model`` as its first
            argument.
        X_train: Training features.
        X_test: Features to score on.
        y_train: Training labels.
        y_test: Labels matching ``X_test``.

    Returns:
        None. The four data arguments are forwarded as a single
        ``(X_train, X_test, y_train, y_test)`` tuple.
    """
    # Register the directory once only; an unconditional append would add a
    # duplicate sys.path entry on every call.
    project_root = os.path.dirname(os.getcwd())
    if project_root not in sys.path:
        sys.path.append(project_root)

    from Util import Evaluater

    Evaluater.evaluate_model(model, (X_train, X_test, y_train, y_test))

#### 假資料

In [5]:
# 4倍


#### 減少資料

In [6]:
# 2.5%

#### 實作 : 定義運行資料集(dict)

In [7]:
# Tags (dict keys) of the loaded datasets, in insertion order.
entities_index = list(entities)

print(entities_index)

['T1', 'T12', 'T123']


In [8]:
# ==== Run configuration (edit by hand) ====

# Datasets to train on; pick from the tags printed by the preceding cell.
train_entities_index = [
    "T1",
    "T12",
    "T123",
]

# Training-round setting; passed as `train_turn` to `model.fit` further down.
model_train_turn = 50

In [9]:
# Result container: dataset tag -> fitted model (filled by the training loop).
best_models: dict = {}

#### 實作 : 資料分析 ( AE + CatBoost )

In [10]:
from src.model import AE_CatBoost_Model 

In [11]:
# Fit one AE + CatBoost model per selected dataset and print its evaluation.
for index in train_entities_index:
    seeds = [42]  # wider sweep kept for reference: [21, 42, 84]
    # NOTE(review): the meaning of the 0.95 constructor argument is not
    # visible in this notebook -- confirm in src.model.
    model = AE_CatBoost_Model(0.95)
    best_models[index] = None
    entity = entities[index]

    # One split and one fit per seed; the same model object is reused.
    for seed in seeds:
        print("=" * 50, f"訓練資料集：{index} , seed: {seed}", "=" * 50, sep="\n")

        # Initial train/test split, driven by the seed.
        X_train, X_test, y_train, y_test = DataProcessor.train_test_seed_split(
            entity, rand_seed=seed
        )

        # Fit with parameter tuning switched on.
        model.fit(
            X_train,
            y_train,
            train_turn=model_train_turn,
            tune_params=True,
        )

        # Keep the fitted model for this dataset (overwritten on every seed).
        best_models[index] = model

        # Score on the held-out split.
        print("\n" + "-" * 50, "evaluater 測試：", "-" * 50, sep="\n")
        get_evaluater_score(best_models[index], X_train, X_test, y_train, y_test)

        # ---- Positive-class (label = 1, "alert") check ----
        # Per the original author's notes:
        # - cut_label returns the label column and the entity without it.
        # - X_label1 / y_label1: the label-1 rows of the whole dataset.
        # - X_test_label1 / y_test_label1: the label-1 rows of the test split.
        entity_label1 = entity.copy()
        label, entity_label1_cut = DataProcessor.cut_label(entity_label1)
        X_label1 = DataProcessor.split_diff_label(
            entity_label1_cut, label, positive_label=True
        )
        y_label1 = pd.Series([1] * X_label1.shape[0])  # all ones

        # NOTE(review): the two test-split variables below are built but not
        # used again in this cell.
        X_test_label1 = DataProcessor.split_diff_label(
            X_test, y_test, positive_label=True
        )
        y_test_label1 = pd.Series([1] * X_test_label1.shape[0])

        # NOTE(review): this scores on label-1 rows taken from the full
        # dataset, which presumably includes rows seen during training --
        # confirm that this is intended.
        print("\n" + "-" * 50, "Label = 1 (alert) 的準確度：", "-" * 50, sep="\n")
        get_evaluater_score(best_models[index], X_train, X_label1, y_train, y_label1)

        # Disabled threshold sweep, kept for reference:
        # thresholds_to_test = [0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.98, 0.99]
        # for th in thresholds_to_test:
        #     print(f"\nTesting Threshold: {th}")
        #     model = best_models[index]
        #     model.set_threshold(th)  # change the model's internal threshold
        #     # The evaluator needs no change: it calls model.predict(),
        #     # which then uses the new threshold.
        #     get_evaluater_score(model, X_train, X_test, y_train, y_test)



訓練資料集：T1 , seed: 42
步驟 1: 訓練 AutoEncoder 並篩選樣本...
[1m39378/39378[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 551us/step
篩選出 1008059 筆可靠正常樣本，剔除 252015 筆潛在雜訊。

步驟 2: 建立混合資料集並訓練 CatBoost...

執行超參數調整 (RandomizedSearchCV)...
Fitting 3 folds for each of 50 candidates, totalling 150 fits
0:	test: 0.7773806	best: 0.7773806 (0)	total: 268ms	remaining: 4m 27s
1:	test: 0.7952383	best: 0.7952383 (1)	total: 371ms	remaining: 3m 5s
2:	test: 0.8278227	best: 0.8278227 (2)	total: 489ms	remaining: 2m 42s
3:	test: 0.8280825	best: 0.8280825 (3)	total: 609ms	remaining: 2m 31s
4:	test: 0.8238946	best: 0.8280825 (3)	total: 726ms	remaining: 2m 24s
5:	test: 0.9243175	best: 0.9243175 (5)	total: 881ms	remaining: 2m 25s
6:	test: 0.9256021	best: 0.9256021 (6)	total: 1s	remaining: 2m 22s
7:	test: 0.9589575	best: 0.9589575 (7)	total: 1.12s	remaining: 2m 19s
8:	test: 0.9575365	best: 0.9589575 (7)	total: 1.21s	remaining: 2m 13s
9:	test: 0.9705662	best: 0.9705662 (9)	total: 1.33s	remaining: 2m 12s
10:	test: 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


步驟 1: 訓練 AutoEncoder 並篩選樣本...
[1m39378/39378[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 528us/step
篩選出 1008059 筆可靠正常樣本，剔除 252015 筆潛在雜訊。

步驟 2: 建立混合資料集並訓練 CatBoost...

執行超參數調整 (RandomizedSearchCV)...
Fitting 3 folds for each of 50 candidates, totalling 150 fits
0:	test: 0.9803394	best: 0.9803394 (0)	total: 74.6ms	remaining: 1m 14s
1:	test: 0.9811174	best: 0.9811174 (1)	total: 151ms	remaining: 1m 15s
2:	test: 0.9775892	best: 0.9811174 (1)	total: 234ms	remaining: 1m 17s
3:	test: 0.9835699	best: 0.9835699 (3)	total: 314ms	remaining: 1m 18s
4:	test: 0.9907881	best: 0.9907881 (4)	total: 398ms	remaining: 1m 19s
5:	test: 0.9935246	best: 0.9935246 (5)	total: 483ms	remaining: 1m 19s
6:	test: 0.9948005	best: 0.9948005 (6)	total: 576ms	remaining: 1m 21s
7:	test: 0.9954657	best: 0.9954657 (7)	total: 666ms	remaining: 1m 22s
8:	test: 0.9954303	best: 0.9954657 (7)	total: 753ms	remaining: 1m 22s
9:	test: 0.9954014	best: 0.9954657 (7)	total: 843ms	remaining: 1m 23s
10:	test: 0.9954703	best:

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


步驟 1: 訓練 AutoEncoder 並篩選樣本...
[1m39378/39378[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 542us/step
篩選出 1008059 筆可靠正常樣本，剔除 252015 筆潛在雜訊。

步驟 2: 建立混合資料集並訓練 CatBoost...

執行超參數調整 (RandomizedSearchCV)...
Fitting 3 folds for each of 50 candidates, totalling 150 fits
0:	test: 0.9625816	best: 0.9625816 (0)	total: 230ms	remaining: 3m 49s
1:	test: 0.9725145	best: 0.9725145 (1)	total: 530ms	remaining: 4m 24s
2:	test: 0.9705776	best: 0.9725145 (1)	total: 827ms	remaining: 4m 34s
3:	test: 0.9796240	best: 0.9796240 (3)	total: 1.1s	remaining: 4m 33s
4:	test: 0.9794388	best: 0.9796240 (3)	total: 1.35s	remaining: 4m 29s
5:	test: 0.9809910	best: 0.9809910 (5)	total: 1.62s	remaining: 4m 28s
6:	test: 0.9810528	best: 0.9810528 (6)	total: 1.91s	remaining: 4m 30s
7:	test: 0.9822513	best: 0.9822513 (7)	total: 2.2s	remaining: 4m 32s
8:	test: 0.9827797	best: 0.9827797 (8)	total: 2.51s	remaining: 4m 36s
9:	test: 0.9843418	best: 0.9843418 (9)	total: 2.82s	remaining: 4m 38s
10:	test: 0.9848363	best: 0.

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [12]:
# Repeat the per-dataset AE + CatBoost run with a larger training budget.
for index in train_entities_index:
    seeds = [42]  # wider sweep kept for reference: [21, 42, 84]
    # NOTE(review): the meaning of the 0.95 constructor argument is not
    # visible in this notebook -- confirm in src.model.
    model = AE_CatBoost_Model(0.95)
    best_models[index] = None
    entity = entities[index]

    # One split and one fit per seed; the same model object is reused.
    for seed in seeds:
        print("=" * 50, f"訓練資料集：{index} , seed: {seed}", "=" * 50, sep="\n")

        # Initial train/test split, driven by the seed.
        X_train, X_test, y_train, y_test = DataProcessor.train_test_seed_split(
            entity, rand_seed=seed
        )

        # NOTE(review): train_turn is hard-coded to 200 here, so the
        # notebook-level `model_train_turn` setting is ignored by this cell.
        model.fit(
            X_train,
            y_train,
            train_turn=200,
            tune_params=True,
        )

        # Keep the fitted model for this dataset (overwritten on every seed).
        best_models[index] = model

        # Score on the held-out split.
        print("\n" + "-" * 50, "evaluater 測試：", "-" * 50, sep="\n")
        get_evaluater_score(best_models[index], X_train, X_test, y_train, y_test)

        # ---- Positive-class (label = 1, "alert") check ----
        # Per the original author's notes:
        # - cut_label returns the label column and the entity without it.
        # - X_label1 / y_label1: the label-1 rows of the whole dataset.
        # - X_test_label1 / y_test_label1: the label-1 rows of the test split.
        entity_label1 = entity.copy()
        label, entity_label1_cut = DataProcessor.cut_label(entity_label1)
        X_label1 = DataProcessor.split_diff_label(
            entity_label1_cut, label, positive_label=True
        )
        y_label1 = pd.Series([1] * X_label1.shape[0])  # all ones

        # NOTE(review): the two test-split variables below are built but not
        # used again in this cell.
        X_test_label1 = DataProcessor.split_diff_label(
            X_test, y_test, positive_label=True
        )
        y_test_label1 = pd.Series([1] * X_test_label1.shape[0])

        # NOTE(review): this scores on label-1 rows taken from the full
        # dataset, which presumably includes rows seen during training --
        # confirm that this is intended.
        print("\n" + "-" * 50, "Label = 1 (alert) 的準確度：", "-" * 50, sep="\n")
        get_evaluater_score(best_models[index], X_train, X_label1, y_train, y_label1)

        # Disabled threshold sweep, kept for reference:
        # thresholds_to_test = [0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.98, 0.99]
        # for th in thresholds_to_test:
        #     print(f"\nTesting Threshold: {th}")
        #     model = best_models[index]
        #     model.set_threshold(th)  # change the model's internal threshold
        #     # The evaluator needs no change: it calls model.predict(),
        #     # which then uses the new threshold.
        #     get_evaluater_score(model, X_train, X_test, y_train, y_test)



訓練資料集：T1 , seed: 42
步驟 1: 訓練 AutoEncoder 並篩選樣本...
[1m39378/39378[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 544us/step
篩選出 1008059 筆可靠正常樣本，剔除 252015 筆潛在雜訊。

步驟 2: 建立混合資料集並訓練 CatBoost...

執行超參數調整 (RandomizedSearchCV)...
Fitting 3 folds for each of 200 candidates, totalling 600 fits


KeyboardInterrupt: 