### Hybrid Pipeline 雙階段混合
> 非監督式篩選清洗 + 監督式分類

> autoencoder(AE) + catboost

#### 載入外部程式庫

anaconda 需額外安裝
> tensorflow

> catboost

In [27]:
import numpy as np
import pandas as pd
from pandas.testing import assert_frame_equal # panda 檢查
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report
from tensorflow.keras.models import Model           #AE
from tensorflow.keras.layers import Input, Dense    #AE
from tensorflow.keras.optimizers import Adam        #AE
from catboost import CatBoostClassifier             #catboost
# Evaluater
import sys
import os
import itertools


#### 準備資料集

##### Load csv


In [28]:
# Map each feature-set key to the preprocessed CSV file it is read from.
entities_path = {
    "T1": "../DataSet/preprocessing_T1_basic.csv",
    "T12": "../DataSet/preprocessing_T1_2.csv",
    "T123": "../DataSet/preprocessing_T1_2_3.csv",
}

# Read every CSV into a DataFrame, keyed exactly like entities_path.
entities = {key: pd.read_csv(csv_path) for key, csv_path in entities_path.items()}

# Show the first rows of each loaded table.
print("原始資料：")
for frame in entities.values():
    preview = frame.head()
    print(f"{preview}\n\n")

原始資料：
                                                acct  txn_count  first_txn_ts  \
0  00000577cfcd0bde8ee693021419ef13a1f7f933ec8626...          1       1953000   
1  00000eec52ea49377de91bc7b54eb3192943e6c20e0a51...          1       4489500   
2  000015150c92e2a41c4715a088df78d77a7d4f3017aadc...          1       5823000   
3  00002846e6b430580825e2b10fe3ff1e3ddb93f42c608d...          1       1100100   
4  00002b3d8f9c7b91c407a5725849deb521fcf1dd5eea1f...          1        777300   

   last_txn_ts  std_txn_ts  acct_type_x  cross_type    uni_amt  acct_type_y  \
0      1953000         0.0            2           1    47500.0            2   
1      4489500         0.0            2           1     6150.0            2   
2      5823000         0.0            2           1  1150000.0            2   
3      1100100         0.0            2           1     8550.0            2   
4       777300         0.0            2           1     1450.0            2   

   num_unique_dest_accts  num_un

##### Cut Data

###### 切 acct

In [29]:
# Split the account-ID column off every feature table.
entities_id = pd.DataFrame()

for index, entity in entities.items():
    # Column 0 is kept aside as the ID; every remaining column stays as data.
    # (Named acct_ids rather than `id` so the builtin id() is not shadowed.)
    acct_ids = entity.iloc[:, [0]]
    entities[index] = entity.iloc[:, 1:]

    # The first table supplies the reference IDs; later tables are compared to it.
    if entities_id.empty:
        entities_id = acct_ids
        continue

    try:
        assert_frame_equal(entities_id, acct_ids)
    except AssertionError as e:
        # Report the mismatch but keep going so the remaining tables are still checked.
        print(f"Data ID error : from {index}")
        print(e)

print("資料ID :")
entities_id.head()

資料ID :


Unnamed: 0,acct
0,00000577cfcd0bde8ee693021419ef13a1f7f933ec8626...
1,00000eec52ea49377de91bc7b54eb3192943e6c20e0a51...
2,000015150c92e2a41c4715a088df78d77a7d4f3017aadc...
3,00002846e6b430580825e2b10fe3ff1e3ddb93f42c608d...
4,00002b3d8f9c7b91c407a5725849deb521fcf1dd5eea1f...


###### 切訓練資料用 ( label 分開 )  ( 工具 )

In [30]:
def cut_label(data):
    """Remove the ``label`` column from ``data`` and return it.

    The frame is modified in place: after the call ``data`` only holds the
    remaining columns. Pass a copy when the caller still needs the labelled
    frame afterwards.

    Args:
        data: pandas DataFrame that contains a ``label`` column.

    Returns:
        pandas Series with the removed ``label`` values, indexed like ``data``.

    Raises:
        KeyError: if ``data`` has no ``label`` column.
    """
    # DataFrame.pop drops the column in place and hands it back in one step.
    return data.pop('label')

###### 切資料集(定義)

In [31]:
def tran_test_cut(df_preprocessing,rand_seed):
    """Split a labelled frame into stratified train/test features and labels.

    The ``label`` column is separated from the features before splitting, so it
    never ends up inside ``X_train`` / ``X_test``. The input frame itself is left
    untouched, which allows the same frame to be split again with another seed.

    Author's notes on the intended use of the outputs:
      * X_train is fed to the autoencoder for training.
      * y_train is used to extract the rows whose label is 1.
      * The test part is meant for producing results: X_test goes through the
        autoencoder and the "high-risk" rows are taken as the result.

    Args:
        df_preprocessing: pandas DataFrame containing a ``label`` column.
        rand_seed: value passed to ``train_test_split`` as ``random_state``.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)`` using a 70/30 split
        stratified on the label.

    Raises:
        KeyError: if ``df_preprocessing`` has no ``label`` column.
    """
    y = df_preprocessing['label']
    # drop() returns a new frame, so the caller's frame keeps its label column.
    X = df_preprocessing.drop(columns=['label'])
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=rand_seed, stratify=y
    )
    return X_train, X_test, y_train, y_test

#### 取得已知答案（label = 1）的資料

In [32]:
def get_label_1_data(data,label):
    """Return the rows of ``data`` whose label equals 1.

    Rows and labels are matched by position, so ``label`` must list one value
    per row of ``data`` in the same order.

    Args:
        data: pandas DataFrame (or 2-D numpy array) of samples, one per row.
        label: sequence / Series / array of labels, one per row of ``data``.

    Returns:
        The subset of ``data`` with label 1, of the same type as ``data``
        (empty when no label equals 1).

    Raises:
        ValueError: if ``label`` and ``data`` have different lengths.
    """
    # Flatten to a 1-D boolean mask so Series, lists and column vectors all work.
    mask = np.asarray(label).ravel() == 1
    if mask.shape[0] != len(data):
        raise ValueError(
            f"label has {mask.shape[0]} entries but data has {len(data)} rows"
        )
    # A boolean numpy mask selects rows for both DataFrames and ndarrays.
    return data[mask]

In [None]:
# Preview the known-positive (label == 1) rows of every dataset.
for entity in entities.values():
    # cut_label is meant to strip the label column from the frame it is given, so hand it
    # a throwaway copy: the shared frames in `entities` still need that column when the
    # training loop splits them.
    label = cut_label(entity.copy())
    print(get_label_1_data(entity,label))

#### 資料前處理

> Autoencoder 需要標準化

> catboost 基本無須前處理

> 警示帳戶須加上特徵值

##### 標準化

In [None]:
# === Normalisation with MinMaxScaler ===
def scaler(data):
    """Fit a fresh MinMaxScaler on ``data`` and return the transformed values.

    A new scaler is created and fitted on every call; the fitted scaler object
    itself is not returned.
    """
    return MinMaxScaler().fit_transform(data)

In [None]:

# Scale every feature table; the results are keyed exactly like `entities`.
entities_scalr = {key: scaler(frame) for key, frame in entities.items()}

# Print the first five scaled rows of each table.
print("\n標準化後：")
for scaled in entities_scalr.values():
    print(scaled[:5])

#### 建立 Autoencoder (AE)

In [None]:
def build_autoencoder(input_dim):
    """Build and compile a small dense autoencoder.

    Layout: input -> 16 -> 8 -> 16 -> input_dim. The hidden layers use ReLU and
    the output layer uses sigmoid (suited to inputs normalised to the 0-1 range).
    The model is compiled with Adam (learning rate 0.001) and MSE loss.

    Args:
        input_dim: number of features per sample.

    Returns:
        The compiled Keras ``Model``.
    """
    inputs = Input(shape=(input_dim,))

    # Encoder (16 -> 8) compresses the features; the decoder (16) starts restoring them.
    hidden = inputs
    for units in (16, 8, 16):
        hidden = Dense(units, activation='relu')(hidden)

    # Final decoder layer maps back to the original feature count.
    outputs = Dense(input_dim, activation='sigmoid')(hidden)

    net = Model(inputs=inputs, outputs=outputs)
    net.compile(optimizer=Adam(learning_rate=0.001), loss='mse')
    return net

#### 建立 Step1 : 非監督清洗

In [None]:
def AE(X_unlabeled_scaled,X_unlabeled, *, reliable_percentile=80, epochs=20, batch_size=256):
    """Train an autoencoder and split the samples by reconstruction error.

    The autoencoder is trained on all of the given rows (author's note: it will
    therefore tend to learn the pattern of the majority, i.e. "normal" samples).
    Rows whose per-row reconstruction MSE is strictly below the
    ``reliable_percentile``-th percentile are returned as reliable negatives; the
    rest are treated as an ambiguous zone and returned separately so they do not
    mislead later training.

    Args:
        X_unlabeled_scaled: normalised feature matrix, shape (n_samples, n_features).
        X_unlabeled: the same rows in their original (unscaled) values; must support
            boolean-mask row selection.
        reliable_percentile: percentile of the reconstruction error used as the
            cut-off (default 80, i.e. keep roughly the lowest-error 80%).
        epochs: number of training epochs (default 20).
        batch_size: training batch size (default 256).

    Returns:
        Tuple ``(X_reliable_negatives, X_uncertain)`` taken from ``X_unlabeled``.
    """
    # Build and train the autoencoder to reproduce its own input.
    input_dim = X_unlabeled_scaled.shape[1]
    ae_model = build_autoencoder(input_dim)
    ae_model.fit(
        X_unlabeled_scaled, X_unlabeled_scaled,
        epochs=epochs,
        batch_size=batch_size,
        shuffle=True,
        verbose=0
    )

    # Reconstruction error (MSE) per row.
    reconstructions = ae_model.predict(X_unlabeled_scaled)
    mse = np.mean(np.power(X_unlabeled_scaled - reconstructions, 2), axis=1)

    # Rows below the percentile threshold count as reliable negatives.
    threshold = np.percentile(mse, reliable_percentile)
    mask_reliable = mse < threshold

    X_reliable_negatives = X_unlabeled[mask_reliable]  # original (non-scaled) values
    X_uncertain = X_unlabeled[~mask_reliable]          # high-error rows removed in this stage
    print(f"篩選出 {len(X_reliable_negatives)} 筆可靠正常樣本，剔除 {len(X_unlabeled) - len(X_reliable_negatives)} 筆潛在雜訊。")
    return X_reliable_negatives,X_uncertain

#### 建立 Step2 : 監督式分類

##### 定義模型及參數網格

In [None]:
# Define the CatBoost model (used as the base estimator of the parameter search below).
# auto_class_weights='Balanced' is very important for anomaly detection: it handles class imbalance automatically.
model = CatBoostClassifier(
    auto_class_weights='Balanced', # override the default (None -> Balanced)
    eval_metric='AUC',             # override the default (Logloss -> AUC)
)

In [None]:
# Parameter grid sampled by the randomized search.
# NOTE(review): CatBoost documents bagging_temperature as applying to Bayesian bootstrap —
# confirm the effective bootstrap_type for this model accepts it.
param_dist = {
    'iterations': [200, 500],           # number of trees
    'learning_rate': [0.01, 0.05, 0.1], # learning rate
    'depth': [4, 6, 8],                 # tree depth (too deep overfits easily)
    'l2_leaf_reg': [1, 3, 5, 7],        # L2 regularisation coefficient
    'random_strength': [1, 5, 10],      # amount of randomness used against overfitting
    'bagging_temperature': [0, 1]       # strength of the Bayesian bootstrap
}

##### 資料集切分

In [None]:
def catBoost_cut(X_reliable_negatives,X_known_anomalies):
    """Assemble the supervised training set and split off a validation part.

    Reliable negatives are labelled 0 and known anomalies are labelled 1. The
    stacked data is then split 80/20, stratified on the label, with a fixed
    ``random_state`` of 42. The held-out part is intended for parameter tuning.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``.
    """
    n_negative = len(X_reliable_negatives)
    n_positive = len(X_known_anomalies)

    # Negatives first, then positives; the label vector follows the same order.
    features = np.vstack([X_reliable_negatives, X_known_anomalies])
    targets = np.repeat([0.0, 1.0], [n_negative, n_positive])

    X_train, X_test, y_train, y_test = train_test_split(
        features,
        targets,
        test_size=0.2,
        stratify=targets,
        random_state=42,
    )
    return X_train, X_test, y_train, y_test

##### 參數調整

In [None]:
def catBoost_get_best_model(train_turn,X_train, X_test, y_train, y_test):
    """Tune the module-level CatBoost ``model`` with RandomizedSearchCV.

    Args:
        train_turn: number of parameter combinations to sample (``n_iter``).
        X_train, y_train: data the search is fitted on (3-fold cross-validation).
        X_test, y_test: passed to ``fit`` as ``eval_set``.

    Returns:
        The best estimator found by the search. The best parameters and the best
        cross-validated ROC-AUC score are printed.
    """
    search_settings = dict(
        estimator=model,
        param_distributions=param_dist,
        n_iter=train_turn,   # how many random parameter sets to try
        scoring='roc_auc',
        cv=3,                # 3-fold cross validation
        verbose=1,
        n_jobs=-1,           # use every CPU core
    )
    searcher = RandomizedSearchCV(**search_settings)

    # Run the search; eval_set is handed through as an extra fit argument.
    searcher.fit(X_train, y_train, eval_set=(X_test, y_test))

    # Pick the winner and report how it was found.
    winner = searcher.best_estimator_
    print(f"\n最佳參數組合: {searcher.best_params_}")
    print(f"最佳 CV AUC 分數: {searcher.best_score_:.4f}")
    return winner

#### 建立 Evaluater評分 (結果產出)

In [None]:
def get_evaluater_score(model,X_train, X_test, y_train, y_test):
    """Evaluate ``model`` with the project's ``Util.Evaluater`` helper.

    The parent of the current working directory is put on ``sys.path`` so that the
    ``Util`` package can be imported (this assumes the notebook is run from a
    sub-folder of the project root). The path is only added once, so repeated
    calls do not keep growing ``sys.path``.

    Args:
        model: fitted model to evaluate.
        X_train, X_test, y_train, y_test: data split, forwarded to
            ``Evaluater.evaluate_model`` as a single 4-tuple.
    """
    project_root = os.path.dirname(os.getcwd())
    if project_root not in sys.path:
        sys.path.append(project_root)
    from Util import Evaluater

    Evaluater.evaluate_model(model, (X_train, X_test, y_train, y_test))

#### 實作 : 定義運行資料集(dict)

In [None]:
# Keys of the `entities` dict, i.e. the dataset names that can be selected.
entities_index = list(entities)

print(entities_index)

In [None]:
#==== The dataset names used here are the index values printed above ====#

#       Manually choose the dataset(s) to train on here (see the output above)
#                       vv
train_entities_index = ["T1"]
#                       ^^
#       Manually choose the dataset(s) to train on here (see the output above)


# Containers for the training data
X_unlabeled_scaled = {} # normalised unlabeled combined data (with features)
X_unlabeled = {}        # unlabeled combined data (with features)
X_known_anomalies = {}  # known alert accounts (with features)

In [None]:
# Output containers
best_models = {}      # filled by the training loop: dataset key -> tuned model
entities_scores = {}  # NOTE(review): not written anywhere in the visible cells — confirm intended use

#### 實作 : 資料分析 ( AE + CatBoost )

In [None]:
# Seeds for the train/test split and the number of parameter sets sampled per search.
seeds = [21, 42, 84]
turn = 10

for index in train_entities_index:
    labelled_frame = entities[index]

    for seed in seeds:
        # Seed-dependent train/test split.
        train_data, test_data, train_label, test_label = tran_test_cut(labelled_frame, seed)

        # Stage 1: autoencoder filtering, run separately on the train and the test part.
        # NOTE(review): train_1, test_0 and test_1 are not used further in this cell.
        train_0, train_1 = AE(scaler(train_data), train_data)
        test_0, test_1 = AE(scaler(test_data), test_data)

        # Stage 2 input: reliable negatives plus the rows labelled 1.
        known_anomalies = get_label_1_data(train_data, train_label)
        X_train, X_test, y_train, y_test = catBoost_cut(train_0, known_anomalies)

        # CatBoost with randomized parameter search.
        # NOTE(review): the entry is keyed by dataset only, so each seed overwrites the previous one.
        tuned_model = catBoost_get_best_model(turn, X_train, X_test, y_train, y_test)
        best_models[index] = tuned_model

        # Evaluation.
        get_evaluater_score(tuned_model, X_train, X_test, y_train, y_test)