### Hybrid Pipeline 雙階段混合
> 非監督式篩選清洗 + 監督式分類

> autoencoder(AE) + catboost

#### 載入外部程式庫

Anaconda 環境需額外安裝以下套件：
> tensorflow

> catboost

In [None]:
import pandas as pd
from pandas.testing import assert_frame_equal # panda 檢查
# model
from src.data_processing import DataProcessor
# Evaluater
import sys
import os


#### 準備資料集

##### Load csv


In [None]:
# Location of the preprocessed CSV for each dataset tag.
entities_path = {
    "T1": "../DataSet/preprocessing_T1_basic.csv",
    "T12": "../DataSet/preprocessing_T1_2.csv",
    "T123": "../DataSet/preprocessing_T1_2_3.csv",
}

# Read every CSV into a DataFrame, keyed by the same tag as its path.
entities = {
    tag: pd.read_csv(csv_path) for tag, csv_path in entities_path.items()
}

# Preview the first rows of each loaded table.
print("原始資料：")
for frame in entities.values():
    print(f"{frame.head()}\n\n")

##### Cut acct ID

In [None]:
# Split the first column (the account ID, per the section title) off every
# dataset and check that all datasets carry the same ID column.
#
# NOTE: this cell overwrites `entities[...]` in place, so running it a second
# time strips one more column from every dataset. Reload the CSVs before
# re-running it.
entities_id = pd.DataFrame()
id_source = None  # tag of the dataset that supplied the reference ID column

for index, entity in entities.items():
    # `acct_id` rather than `id`, so the builtin `id()` is not shadowed in
    # every later cell of the notebook.
    acct_id = entity.iloc[:, [0]]
    entities[index] = entity.iloc[:, 1:]

    # The first dataset provides the reference IDs. An explicit marker is used
    # instead of `entities_id.empty` so that an empty first ID frame is still
    # compared against the following ones rather than silently replaced.
    if id_source is None:
        entities_id = acct_id
        id_source = index
        continue

    # Every other dataset must have an identical ID column; a mismatch is
    # reported but does not stop the run.
    try:
        assert_frame_equal(entities_id, acct_id)
    except AssertionError as e:
        print(f"Data ID error : from {index}")
        print(e)

print("資料ID :")
entities_id.head()

#### 建立 Evaluater評分 (結果產出)

In [None]:
def get_evaluater_score(model, X_train, X_test, y_train, y_test):
    """Score ``model`` on one train/test split using the project's ``Evaluater``.

    The parent of the current working directory is added to ``sys.path`` so
    that the ``Util`` package can be imported from there (assumes the notebook
    runs one level below the directory that contains ``Util`` - TODO confirm).

    Args:
        model: Model object handed to ``Evaluater.evaluate_model`` unchanged.
        X_train: Training features.
        X_test: Features to evaluate on.
        y_train: Training labels.
        y_test: Labels that go with ``X_test``.

    Returns:
        None. Whatever ``Evaluater.evaluate_model`` returns is discarded.

    Raises:
        ImportError: If ``Util`` cannot be imported.
    """
    project_root = os.path.dirname(os.getcwd())
    # This function is called several times per training run; add the path
    # only once instead of growing sys.path with duplicates on every call.
    if project_root not in sys.path:
        sys.path.append(project_root)

    from Util import Evaluater

    Evaluater.evaluate_model(model, (X_train, X_test, y_train, y_test))

#### 假資料

In [None]:
# 4倍


#### 減少資料

In [None]:
# 2.5%

#### 實作 : 定義運行資料集(dict)

In [None]:
# Dataset tags, i.e. the keys of the `entities` dict, in insertion order.
entities_index = list(entities)

print(entities_index)

In [None]:
# ==== Run configuration: pick values using the tag list printed above ====

# Datasets to train on. Edit this list by hand; every entry is used as a key
# into `entities`.
train_entities_index = [
    "T1",
    "T12",
    "T123",
]

# Number of training rounds. Edit this value by hand.
model_train_turn = 50

In [None]:
# Output container: the training cells store one fitted model per dataset tag here.
best_models = {}

#### 實作 : 資料分析 ( AE + CatBoost )

In [None]:
from src.model import AE_CatBoost_Model 

In [None]:
# Train and evaluate one AE + CatBoost model for every selected dataset.
for index in train_entities_index:
    seeds = [42]    # alternative seed list: [21,42,84]
    # NOTE(review): the model is created once per dataset, outside the seed
    # loop, so with more than one seed the same instance is fitted again.
    # 0.95 is passed positionally; presumably a threshold - TODO confirm.
    model = AE_CatBoost_Model(0.95)
    best_models[index] = None
    entity = entities[index]
    # Split the data once per seed
    for seed in seeds:

        print ("="*50)
        print (f"訓練資料集：{index} , seed: {seed}")
        print ("="*50)

        # First split: training data vs. test data
        X_train, X_test, y_train, y_test = DataProcessor.train_test_seed_split(
                entity,
                rand_seed=seed
        )

        # Train the model
        # model.fit(X_train, y_train) 
        model.fit(X_train, y_train, train_turn=model_train_turn,tune_params=True)

        # Store the model for this dataset.
        # NOTE(review): no comparison between seeds is made here; the entry is
        # simply overwritten with the most recently fitted model.
        best_models[index] = model

        # Evaluate on the held-out test split
        print ("\n"+"-"*50)
        print ("evaluater 測試：")
        print ("-"*50)
        get_evaluater_score(best_models[index],X_train, X_test, y_train, y_test)

        # Measure how well the label = 1 (alert) rows are recovered.
        # The bare string below is a no-op expression used as an inline note.
        # In English it reads:
        #   - entity_label1: dataset consisting only of label 1
        #   - X_label1, y_label1: training data cut from entity_label1 (all label 1)
        #   - X_test_label1, y_test_label1: test data cut from X_test (all label 1)
        """
        - entity_label1 : 全部為 label 1 的資料集
        - X_label1, y_label1 : 從 entity_label1 切分出來的訓練資料 (All label 1)
        - X_test_label1, y__test_label1 : 從 X_test 切分出來的測試資料 (All label 1)
        """
        # All label 1
        entity_label1 = entity.copy()
        # Per the original note: `label` is the entity's label column and
        # `entity_label1_cut` is the entity with that column removed.
        label,entity_label1_cut = DataProcessor.cut_label(entity_label1)
        # Per the original note: keep only the rows whose label is 1 (alert).
        X_label1 = DataProcessor.split_diff_label(entity_label1_cut, label, positive_label=True)
        # Matching labels: a Series of ones, one per row of X_label1.
        y_label1 = pd.Series([1]*X_label1.shape[0])
        # Label-1 rows taken from the test split only (no training rows).
        # NOTE(review): X_test_label1 / y_test_label1 are not used below; the
        # final evaluation uses X_label1, which is drawn from the whole entity
        # and so presumably includes rows the model was trained on - confirm
        # which of the two is intended.
        X_test_label1 = DataProcessor.split_diff_label(X_test, y_test, positive_label=True)
        y_test_label1 = pd.Series([1]*X_test_label1.shape[0])

        # Produce the score
        print ("\n"+"-"*50)
        print ("Label = 1 (alert) 的準確度：")
        print ("-"*50)
        get_evaluater_score(best_models[index],X_train, X_label1, y_train, y_label1)
        # Disabled experiment: sweep the decision threshold.
        # thresholds_to_test = [0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.98, 0.99]
        # for th in thresholds_to_test:
        #     print(f"\nTesting Threshold: {th}")
    
        #     # 1. Change the threshold stored inside the model
        #     model = best_models[index]
        #     model.set_threshold(th)
    
        #     # 2. Reuse the existing evaluator as-is; Evaluater's code needs no change,
        #     # because model.predict() called by Evaluater will now use the new threshold
        #     get_evaluater_score(model, X_train, X_test, y_train, y_test)



In [None]:
# Train and evaluate one AE + CatBoost model for every selected dataset,
# this time with a fixed 200 training rounds.
for index in train_entities_index:
    seeds = [42]    # alternative seed list: [21,42,84]
    # NOTE(review): the model is created once per dataset, outside the seed
    # loop, so with more than one seed the same instance is fitted again.
    # 0.95 is passed positionally; presumably a threshold - TODO confirm.
    model = AE_CatBoost_Model(0.95)
    best_models[index] = None
    entity = entities[index]
    # Split the data once per seed
    for seed in seeds:

        print ("="*50)
        print (f"訓練資料集：{index} , seed: {seed}")
        print ("="*50)

        # First split: training data vs. test data
        X_train, X_test, y_train, y_test = DataProcessor.train_test_seed_split(
                entity,
                rand_seed=seed
            )

        # Train the model
        # NOTE(review): train_turn is hard-coded to 200 here and ignores the
        # `model_train_turn` setting from the configuration cell - confirm
        # this is intentional.
        # model.fit(X_train, y_train) 
        model.fit(X_train, y_train, train_turn=200,tune_params=True)

        # Store the model for this dataset.
        # NOTE(review): no comparison between seeds is made here; the entry is
        # simply overwritten with the most recently fitted model.
        best_models[index] = model

        # Evaluate on the held-out test split
        print ("\n"+"-"*50)
        print ("evaluater 測試：")
        print ("-"*50)
        get_evaluater_score(best_models[index],X_train, X_test, y_train, y_test)

        # Measure how well the label = 1 (alert) rows are recovered.
        # The bare string below is a no-op expression used as an inline note.
        # In English it reads:
        #   - entity_label1: dataset consisting only of label 1
        #   - X_label1, y_label1: training data cut from entity_label1 (all label 1)
        #   - X_test_label1, y_test_label1: test data cut from X_test (all label 1)
        """
        - entity_label1 : 全部為 label 1 的資料集
        - X_label1, y_label1 : 從 entity_label1 切分出來的訓練資料 (All label 1)
        - X_test_label1, y__test_label1 : 從 X_test 切分出來的測試資料 (All label 1)
        """
        # All label 1
        entity_label1 = entity.copy()
        # Per the original note: `label` is the entity's label column and
        # `entity_label1_cut` is the entity with that column removed.
        label,entity_label1_cut = DataProcessor.cut_label(entity_label1)
        # Per the original note: keep only the rows whose label is 1 (alert).
        X_label1 = DataProcessor.split_diff_label(entity_label1_cut, label, positive_label=True)
        # Matching labels: a Series of ones, one per row of X_label1.
        y_label1 = pd.Series([1]*X_label1.shape[0])
        # Label-1 rows taken from the test split only (no training rows).
        # NOTE(review): X_test_label1 / y_test_label1 are not used below; the
        # final evaluation uses X_label1, which is drawn from the whole entity
        # and so presumably includes rows the model was trained on - confirm
        # which of the two is intended.
        X_test_label1 = DataProcessor.split_diff_label(X_test, y_test, positive_label=True)
        y_test_label1 = pd.Series([1]*X_test_label1.shape[0])

        # Produce the score
        print ("\n"+"-"*50)
        print ("Label = 1 (alert) 的準確度：")
        print ("-"*50)
        get_evaluater_score(best_models[index],X_train, X_label1, y_train, y_label1)
        # Disabled experiment: sweep the decision threshold.
        # thresholds_to_test = [0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.98, 0.99]
        # for th in thresholds_to_test:
        #     print(f"\nTesting Threshold: {th}")
    
        #     # 1. Change the threshold stored inside the model
        #     model = best_models[index]
        #     model.set_threshold(th)
    
        #     # 2. Reuse the existing evaluator as-is; Evaluater's code needs no change,
        #     # because model.predict() called by Evaluater will now use the new threshold
        #     get_evaluater_score(model, X_train, X_test, y_train, y_test)

