## baseline, attention, pretrainedEmb를 진행한 3가지 모델에 대해 앙상블 수행

In [1]:
!pip install wandb



In [2]:
from tensorflow.keras.layers import Layer
import tensorflow.keras.backend as K
from tensorflow.keras.models import load_model
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix
import pickle

import wandb
wandb.login()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33m00_jw[0m ([33mjiwoong-team[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [3]:
# 커스텀 어텐션 층을 읽기 전에 한 번 정의
class AttentionLayer(Layer):
    def __init__(self, **kwargs):
        super(AttentionLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        self.W = self.add_weight(name="att_weight", shape=(input_shape[-1], 1),
                                 initializer="normal")
        self.b = self.add_weight(name="att_bias", shape=(input_shape[1], 1),
                                 initializer="zeros")
        super(AttentionLayer, self).build(input_shape)

    def call(self, x):
        e = K.tanh(K.dot(x, self.W) + self.b)
        a = K.softmax(e, axis=1)
        output = x * a
        return K.sum(output, axis=1)


In [4]:
# 저장된 모델 불러오기
model_pretrained_emb = load_model('/content/drive/MyDrive/Colab Notebooks/Aiffel/AIFFEL_DLThon_DKTC_online13/notebooks/jiwoong/models/model_pretrained_emb_fine.keras')
model_attention = load_model('/content/drive/MyDrive/Colab Notebooks/Aiffel/AIFFEL_DLThon_DKTC_online13/notebooks/jiwoong/models/model_attention.keras', custom_objects={"AttentionLayer": AttentionLayer})
model_baseline = load_model('/content/drive/MyDrive/Colab Notebooks/Aiffel/AIFFEL_DLThon_DKTC_online13/notebooks/jiwoong/models/baseline.keras')

In [5]:
# 검증 데이터 불러오기
with open('/content/drive/MyDrive/Colab Notebooks/Aiffel/AIFFEL_DLThon_DKTC_online13/notebooks/jiwoong/data/preprocessed_data_no_aug.pkl', 'rb') as f:
    data = pickle.load(f)
    X_val = data['X_val']
    y_val = data['y_val']

## 소프트 보팅(확률 평균)

In [6]:
ordered_columns = ['협박 대화', '갈취 대화', '직장 내 괴롭힘 대화', '기타 괴롭힘 대화', '일반 대화'] # kaggle에 명시된 순서로 재배치

# W&B 프로젝트 초기화
run = wandb.init(
    # Set the wandb entity where your project will be logged (generally your team name).
    entity="jiwoong-team",
    # Set the wandb project where this run will be logged.
    project="DLThon-DKTC",
    # Track hyperparameters and run metadata.
    name=f"jiwoong_BiLSTM_ensemble_soft", # 실험 이름
    notes="baseline, attention, pretrainedEmb 모델 3개를 앙상블(소프트 보팅)", # 실험에 대한 간단한 설명
    config={ # 세부 구성 내용
        "experiment_name": "BiLSTM_ensemble",
        "general_conversation_type": "LLM에서 생성한 대화 데이터", # 일반 데이터 타입
        "architecture": "BiLSTM(64), Asemble",
    },
)

y_pred1 = model_baseline.predict(X_val)
y_pred2 = model_attention.predict(X_val)
y_pred3 = model_pretrained_emb.predict(X_val)

# 예측 결과
y_pred_ensemble = (y_pred1 + y_pred2 + y_pred3) / 3
final_preds = np.argmax(y_pred_ensemble, axis=1)
# 실제 결과
y_true_labels = np.argmax(y_val, axis=1)

# wandb Table 생성
report = classification_report(y_true_labels, final_preds, target_names=ordered_columns, output_dict=True)
columns = ["class", "precision", "recall", "f1-score", "support"]
data = []

for label, metrics in report.items():
    if isinstance(metrics, dict):  # dict일 때만 values 가져오기
        row = [label] + [metrics.get(col, None) for col in columns[1:]]
        data.append(row)

table = wandb.Table(columns=columns, data=data)

# wandb에 훈련 결과 기록
wandb.log({
    "classification_report_table": table,
    "macro_f1": report["macro avg"]["f1-score"],
    "accuracy": report["accuracy"],
    "threat_f1": report["협박 대화"]["f1-score"],
    "extortion_f1": report["갈취 대화"]["f1-score"],
    "workplace_bullying_f1": report["직장 내 괴롭힘 대화"]["f1-score"],
    "other bullying": report["기타 괴롭힘 대화"]["f1-score"],
    "general_conversation_f1": report["일반 대화"]["f1-score"],
})

run.finish()

print(confusion_matrix(y_true_labels, final_preds))
print(classification_report(y_true_labels, final_preds, target_names=ordered_columns))

[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 15ms/step
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step


0,1
accuracy,▁
extortion_f1,▁
general_conversation_f1,▁
macro_f1,▁
other bullying,▁
threat_f1,▁
workplace_bullying_f1,▁

0,1
accuracy,0.89763
extortion_f1,0.86139
general_conversation_f1,0.99363
macro_f1,0.90081
other bullying,0.85642
threat_f1,0.86726
workplace_bullying_f1,0.92537


[[147  18   4  10   0]
 [  5 174   7   9   0]
 [  1   1 186   6   0]
 [  7  16   9 170   0]
 [  0   0   2   0 156]]
              precision    recall  f1-score   support

       협박 대화       0.92      0.82      0.87       179
       갈취 대화       0.83      0.89      0.86       195
 직장 내 괴롭힘 대화       0.89      0.96      0.93       194
   기타 괴롭힘 대화       0.87      0.84      0.86       202
       일반 대화       1.00      0.99      0.99       158

    accuracy                           0.90       928
   macro avg       0.90      0.90      0.90       928
weighted avg       0.90      0.90      0.90       928



## 하드 보팅(클래스 투표)
- 각 모델의 최종 예측 클래스만 보고 다수결로 결정

In [7]:
# W&B 프로젝트 초기화
run = wandb.init(
    # Set the wandb entity where your project will be logged (generally your team name).
    entity="jiwoong-team",
    # Set the wandb project where this run will be logged.
    project="DLThon-DKTC",
    # Track hyperparameters and run metadata.
    name=f"jiwoong_BiLSTM_ensemble_hard", # 실험 이름
    notes="baseline, attention, pretrainedEmb 모델 3개를 앙상블(하드 보팅)", # 실험에 대한 간단한 설명
    config={ # 세부 구성 내용
        "experiment_name": "BiLSTM_ensemble",
        "general_conversation_type": "LLM에서 생성한 대화 데이터", # 일반 데이터 타입
        "architecture": "BiLSTM(64), Asemble",
    },
)

pred1 = np.argmax(model_baseline.predict(X_val), axis=1)
pred2 = np.argmax(model_attention.predict(X_val), axis=1)
pred3 = np.argmax(model_pretrained_emb.predict(X_val), axis=1)

# 예측 결과
stacked = np.stack([pred1, pred2, pred3], axis=1)
final_preds = [np.bincount(row).argmax() for row in stacked]

# wandb Table 생성
report = classification_report(y_true_labels, final_preds, target_names=ordered_columns, output_dict=True)
columns = ["class", "precision", "recall", "f1-score", "support"]
data = []

for label, metrics in report.items():
    if isinstance(metrics, dict):  # dict일 때만 values 가져오기
        row = [label] + [metrics.get(col, None) for col in columns[1:]]
        data.append(row)

table = wandb.Table(columns=columns, data=data)

# wandb에 훈련 결과 기록
wandb.log({
    "classification_report_table": table,
    "macro_f1": report["macro avg"]["f1-score"],
    "accuracy": report["accuracy"],
    "threat_f1": report["협박 대화"]["f1-score"],
    "extortion_f1": report["갈취 대화"]["f1-score"],
    "workplace_bullying_f1": report["직장 내 괴롭힘 대화"]["f1-score"],
    "other bullying": report["기타 괴롭힘 대화"]["f1-score"],
    "general_conversation_f1": report["일반 대화"]["f1-score"],
})

run.finish()

print(confusion_matrix(y_true_labels, final_preds))
print(classification_report(y_true_labels, final_preds, target_names=ordered_columns))

[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step


0,1
accuracy,▁
extortion_f1,▁
general_conversation_f1,▁
macro_f1,▁
other bullying,▁
threat_f1,▁
workplace_bullying_f1,▁

0,1
accuracy,0.89547
extortion_f1,0.86352
general_conversation_f1,0.99363
macro_f1,0.8985
other bullying,0.84987
threat_f1,0.85549
workplace_bullying_f1,0.93


[[148  18   3  10   0]
 [  7 174   6   8   0]
 [  1   1 186   6   0]
 [ 11  15   9 167   0]
 [  0   0   2   0 156]]
              precision    recall  f1-score   support

       협박 대화       0.89      0.83      0.86       179
       갈취 대화       0.84      0.89      0.86       195
 직장 내 괴롭힘 대화       0.90      0.96      0.93       194
   기타 괴롭힘 대화       0.87      0.83      0.85       202
       일반 대화       1.00      0.99      0.99       158

    accuracy                           0.90       928
   macro avg       0.90      0.90      0.90       928
weighted avg       0.90      0.90      0.90       928



하나의 모델을 사용하는 것보다 3개의 모델을 앙상블 하는 게 더 높은 검증 점수를 기록했다.