### Hybrid Pipeline 雙階段混和
> 非監督篩式清洗 + 監督式分類

> autoencoder(AE) + catboost

#### 載入外部程式庫

anaconda 需額外載入
> tensorflow

> catboost

In [1]:
import numpy as np
import pandas as pd
from pandas.testing import assert_frame_equal # panda 檢查
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report
from tensorflow.keras.models import Model           #AE
from tensorflow.keras.layers import Input, Dense    #AE
from tensorflow.keras.optimizers import Adam        #AE
from catboost import CatBoostClassifier             #catboost
# Evaluater
import sys
import os
import itertools


#### 準備資料集

##### Load csv


In [2]:
entities_path = {
     "T1"   :   "../DataSet/preprocessing_T1_basic.csv"
    ,"T12"  :   "../DataSet/preprocessing_T1_2.csv"
    ,"T123" :   "../DataSet/preprocessing_T1_2_3.csv"
}
entities = {}

# Load 有標籤的資料集(alert)
alert_path = "../DataSet/acct_alert.csv"
entity_label = pd.read_csv(alert_path)
entity_label = entity_label.iloc[:,[0]] # 提出ID (只要ID)

# Load preprocessing and cut 
for index, path in entities_path.items():
    entities[index] = pd.read_csv(path)

print("原始資料：")
for v in entities.values():
    print(f"{v.head()}\n\n")

print(f"有標籤ID:\n{entity_label}")

原始資料：
                                                acct  txn_count  first_txn_ts  \
0  00000577cfcd0bde8ee693021419ef13a1f7f933ec8626...          1       1953000   
1  00000eec52ea49377de91bc7b54eb3192943e6c20e0a51...          1       4489500   
2  000015150c92e2a41c4715a088df78d77a7d4f3017aadc...          1       5823000   
3  00002846e6b430580825e2b10fe3ff1e3ddb93f42c608d...          1       1100100   
4  00002b3d8f9c7b91c407a5725849deb521fcf1dd5eea1f...          1        777300   

   last_txn_ts  std_txn_ts  acct_type_x  cross_type    uni_amt  acct_type_y  \
0      1953000         0.0            2           1    47500.0            2   
1      4489500         0.0            2           1     6150.0            2   
2      5823000         0.0            2           1  1150000.0            2   
3      1100100         0.0            2           1     8550.0            2   
4       777300         0.0            2           1     1450.0            2   

   num_unique_dest_accts  num_un

##### 生成有標籤的資料集

In [6]:
entities_label = {}
# entity_label -> label ( only id )

for index, entity in entities.items():
    missing_ids = entity_label.loc[~entity_label['acct'].isin(entity['acct']), 'acct']
    if not missing_ids.empty:
        print("error")
        continue
    A_indexed = entity_label.set_index('acct')
    entities_label[index] = A_indexed.join(entity.set_index('acct'))
    entities_label[index] = entities_label[index].reset_index()

print("原始資料：")
for v in entities_label.values():
    print(f"{v.head()}\n\n")

原始資料：
                                                acct  txn_count  first_txn_ts  \
0  80bd1c28b47357a3d37a01835ebb1bed5edf54e791bd3d...          8       7218300   
1  b8c11db05d00b5ac66be10ffee5f6ce6ef9221c733a4bb...          3       1598700   
2  daa05c68b290ac3cc522abad400c5304dffba07baa232c...         10        831600   
3  174e26ecc9cee56aaaca855c743a106275c58629740a49...         20        365700   
4  007cf5c98aa4f9f3e444c9cdaca74d0f7542e9a2804201...         17       1013100   

   last_txn_ts    std_txn_ts  acct_type_x  cross_type  uni_amt  acct_type_y  \
0      7392900  6.103451e+04            1           1  22500.0            1   
1      1606800  3.749667e+03            1           1   1250.0            1   
2      6959700  2.485175e+06            1           1   2850.0            1   
3      7563600  1.736112e+06            1           1   4550.0            1   
4      9926700  3.636601e+06            1           1  31500.0            1   

   num_unique_dest_accts  num_un

##### Cut_acct

In [10]:
# Cut preprocessing
entities_id = pd.DataFrame()

for index, entity in entities.items():
    id = entity.iloc[:,[0]]
    entities[index] = entity.iloc[:,1:]

    # 判斷entity id 是否相同
    if entities_id.empty:
        entities_id = id
    else :
        try:
            assert_frame_equal(entities_id, id)
        except AssertionError as e:
            print(f"Data ID error : from {index}")
            print(e)

print("資料ID :")
entities_id.head()

資料ID :


Unnamed: 0,acct
0,00000577cfcd0bde8ee693021419ef13a1f7f933ec8626...
1,00000eec52ea49377de91bc7b54eb3192943e6c20e0a51...
2,000015150c92e2a41c4715a088df78d77a7d4f3017aadc...
3,00002846e6b430580825e2b10fe3ff1e3ddb93f42c608d...
4,00002b3d8f9c7b91c407a5725849deb521fcf1dd5eea1f...


In [11]:
entities_label_id = pd.DataFrame()

for index,entity in entities_label.items():
    id = entity.iloc[:,[0]]
    entities_label[index] = entity.iloc[:,1:]
    
    # 判斷entity id 是否相同
    if entities_label_id.empty:
        entities_label_id = id
    else :
        try:
            assert_frame_equal(entities_label_id, id)
        except AssertionError as e:
            print(f"Data ID error : from {index}")
            print(e)
    
print("id :")
entities_label_id.head()

id :


Unnamed: 0,acct
0,80bd1c28b47357a3d37a01835ebb1bed5edf54e791bd3d...
1,b8c11db05d00b5ac66be10ffee5f6ce6ef9221c733a4bb...
2,daa05c68b290ac3cc522abad400c5304dffba07baa232c...
3,174e26ecc9cee56aaaca855c743a106275c58629740a49...
4,007cf5c98aa4f9f3e444c9cdaca74d0f7542e9a2804201...


#### 資料前處理

> Autoencoder 需要標準化

> catboost 基本無須前處理

> 警示上戶須加上特徵值

##### 標準化

In [12]:
# === 標準化 MinMaxScaler ===
entities_scaler = {}
scaler = MinMaxScaler()

for index,label in entities.items():
    entities_scaler[index] = scaler.fit_transform(label)


# print
print("\n標準化後：")
for v in entities.values():
    print(v[:5]) # 顯示前 5 筆


標準化後：
   txn_count  first_txn_ts  last_txn_ts  std_txn_ts  acct_type_x  cross_type  \
0          1       1953000      1953000         0.0            2           1   
1          1       4489500      4489500         0.0            2           1   
2          1       5823000      5823000         0.0            2           1   
3          1       1100100      1100100         0.0            2           1   
4          1        777300       777300         0.0            2           1   

     uni_amt  acct_type_y  num_unique_dest_accts  num_unique_source_accts  \
0    47500.0            2                    0.0                      1.0   
1     6150.0            2                    1.0                      0.0   
2  1150000.0            2                    1.0                      0.0   
3     8550.0            2                    0.0                      1.0   
4     1450.0            2                    0.0                      1.0   

   is_high_freq  has_foreign_currency  label  
0 

#### 建立 Autoencoder (AE)

In [13]:
def build_autoencoder(input_dim):
    """建立簡單的 AE 模型"""
    input_layer = Input(shape=(input_dim,))
    
    # Encoder: 壓縮特徵
    encoded = Dense(16, activation='relu')(input_layer)
    encoded = Dense(8, activation='relu')(encoded)
    
    # Decoder: 還原特徵
    decoded = Dense(16, activation='relu')(encoded)
    output_layer = Dense(input_dim, activation='sigmoid')(decoded) # 若資料正規化到 0-1 用 sigmoid
    
    autoencoder = Model(inputs=input_layer, outputs=output_layer)
    autoencoder.compile(optimizer=Adam(learning_rate=0.001), loss='mse')
    return autoencoder

#### 建立 Step1 : 非監督清洗

In [14]:
# X_unlabeled_scaled    標準化無標記綜合資料(含特徵)
# X_unlabeled           無標記綜合資料(含特徵)
def AE(X_unlabeled_scaled,X_unlabeled):
    # 建立並訓練 AE
    # 注意：這裡使用全部未標記資料訓練，模型會傾向學習"大眾(正常)"的模式
    input_dim = X_unlabeled_scaled.shape[1]
    ae_model = build_autoencoder(input_dim)
    ae_model.fit(
        X_unlabeled_scaled, X_unlabeled_scaled,
        epochs=20, 
        batch_size=256, 
        shuffle=True, 
        verbose=0
    )

    # 計算重建誤差 (Reconstruction Error / MSE)
    reconstructions = ae_model.predict(X_unlabeled_scaled)
    mse = np.mean(np.power(X_unlabeled_scaled - reconstructions, 2), axis=1)

    # 設定閾值：選取誤差最小的前 80% 作為「可靠的正常樣本 (Reliable Negatives)」
    # 剩下的 20% 被視為模糊地帶，暫時不參與訓練，避免誤導模型
    threshold = np.percentile(mse, 80) 
    mask_reliable = mse < threshold

    X_reliable_negatives = X_unlabeled[mask_reliable] # 取得原始數值(非 scaled)
    print(f"篩選出 {len(X_reliable_negatives)} 筆可靠正常樣本，剔除 {len(X_unlabeled) - len(X_reliable_negatives)} 筆潛在雜訊。")
    return X_reliable_negatives

#### 建立 Step2 : 監督式分類

##### 定義模型及參數網格

In [15]:
# 定義 CatBoost 模型
# auto_class_weights='Balanced' 對於異常偵測極為重要，自動處理樣本不平衡
model = CatBoostClassifier(
    auto_class_weights='Balanced', # 覆蓋預設值 (None -> Balanced)
    eval_metric='AUC',             # 覆蓋預設值 (Logloss -> AUC)
)

In [16]:
# 定義參數網格 (Parameter Grid)
param_dist = {
    'iterations': [200, 500],           # 樹的數量
    'learning_rate': [0.01, 0.05, 0.1], # 學習率
    'depth': [4, 6, 8],                 # 樹的深度 (太深容易 overfitting)
    'l2_leaf_reg': [1, 3, 5, 7],        # L2 正則化係數
    'random_strength': [1, 5, 10],      # 防止過擬合的隨機性強度
    'bagging_temperature': [0, 1]       # 貝葉斯自助抽樣的強度
}

##### 資料集切分

In [17]:
def catBoost_cut(X_reliable_negatives,X_known_anomalies):
    # 建立&切分訓練集
    # 建構訓練集：可靠正常 (Label 0) + 已知異常 (Label 1)
    X_train_final = np.vstack([X_reliable_negatives, X_known_anomalies])
    y_train_final = np.hstack([np.zeros(len(X_reliable_negatives)), np.ones(len(X_known_anomalies))])

    # 切分驗證集 (為了調參使用)
    X_train, X_test, y_train, y_test = train_test_split(X_train_final, y_train_final, test_size=0.2, stratify=y_train_final, random_state=42)
    return X_train, X_test, y_train, y_test

In [None]:
# x -> 無標籤資料集
# y -> x 的標籤
def result_cut(x,y):
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(df_preprocessing.copy().drop(columns=['label']))

    # 分割資料集
    y = df_preprocessing['label']
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=random_state, stratify=y)

##### 參數調整

In [18]:
def catBoost_get_best_model(train_turn,X_train, X_test, y_train, y_test):
    # 使用 RandomizedSearchCV 進行參數調整
    random_search = RandomizedSearchCV(
        estimator=model,
        param_distributions=param_dist,
        n_iter = train_turn,  # 隨機嘗試 10 組參數
        scoring='roc_auc',
       cv=3,       # 3-Fold Cross Validation
       verbose=1,
       n_jobs=-1   # 使用所有 CPU 核心
    )

    # 開始訓練與搜索
    random_search.fit(X_train, y_train, eval_set=(X_test, y_test))

    # 取得最佳模型
    best_model = random_search.best_estimator_
    print(f"\n最佳參數組合: {random_search.best_params_}")
    print(f"最佳 CV AUC 分數: {random_search.best_score_:.4f}")
    return best_model

#### 建立 Evaluater評分 (結果產出)

In [19]:
def get_evaluater_score(model,X_train, X_test, y_train, y_test):
    # 假設您的環境設定
    sys.path.append(os.path.dirname(os.getcwd()))
    from Util import Evaluater

    Evaluater.evaluate_model(model, (X_train, X_test, y_train, y_test))

#### 實作 : 定義運行資料集(dict)

In [20]:
# entities 標籤 (dict)
entities_index = []
for index in entities:
    entities_index.append(index)

print(entities_index)

['T1', 'T12', 'T123']


In [23]:
#==== 實作引用資料參考上面 輸出 index ====#

#       這裡手動調整跑訓練的資料集(參考上方輸出)
#                       vv
train_entities_index = ["T1"]
#                       ^^
#       這裡手動調整跑訓練的資料集(參考上方輸出)


# 載入訓練用資料
X_unlabeled_scaled = {} # 標準化無標記綜合資料(含特徵)
X_unlabeled = {}        # 無標記綜合資料(含特徵)
X_known_anomalies = {}  # 已知警示帳戶(含特徵)

for index in train_entities_index:
    X_unlabeled_scaled[index] = entities_scaler[index]
    X_unlabeled[index] = entities[index]
    X_known_anomalies[index] = entities_label[index]

In [24]:
# 定義輸出資料集
best_models = {}
entities_scores = {}

#### 實作 : 資料分析 ( AE + CatBoost )

In [26]:
for index in train_entities_index:
    # AE
    X_reliable_negatives = AE(X_unlabeled_scaled[index],X_unlabeled[index])
    # 切分資料
    X_train, X_test, y_train, y_test = catBoost_cut(X_reliable_negatives,X_known_anomalies[index])
    # CatBoost + 參數調整
    best_models[index] = catBoost_get_best_model(10,X_train, X_test, y_train, y_test)
    # evaluater
    get_evaluater_score(best_models[index],X_train, X_test, y_train, y_test)

[1m56254/56254[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 671us/step
篩選出 1440084 筆可靠正常樣本，剔除 360022 筆潛在雜訊。
Fitting 3 folds for each of 10 candidates, totalling 30 fits
0:	test: 0.9998968	best: 0.9998968 (0)	total: 293ms	remaining: 58.3s
1:	test: 0.9999322	best: 0.9999322 (1)	total: 429ms	remaining: 42.4s
2:	test: 0.9999322	best: 0.9999322 (1)	total: 549ms	remaining: 36.1s
3:	test: 0.9999383	best: 0.9999383 (3)	total: 674ms	remaining: 33s
4:	test: 0.9999397	best: 0.9999397 (4)	total: 799ms	remaining: 31.1s
5:	test: 0.9999390	best: 0.9999397 (4)	total: 940ms	remaining: 30.4s
6:	test: 0.9999421	best: 0.9999421 (6)	total: 1.06s	remaining: 29.2s
7:	test: 0.9999428	best: 0.9999428 (7)	total: 1.17s	remaining: 28.2s
8:	test: 0.9999465	best: 0.9999465 (8)	total: 1.29s	remaining: 27.4s
9:	test: 0.9999499	best: 0.9999499 (9)	total: 1.4s	remaining: 26.7s
10:	test: 0.9999499	best: 0.9999499 (10)	total: 1.51s	remaining: 26s
11:	test: 0.9999514	best: 0.9999514 (11)	total: 1.65s	remaining: