In [None]:
# Mount Google Drive at /content/drive so the checkpoint and CSV paths used
# later in this notebook resolve. Requires the Colab runtime (google.colab).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import torch
import torch.nn as nn
from transformers import BertTokenizer, BertModel
import torch.nn.functional as F
import pandas as pd

class SharedBERT(nn.Module):
    """Frozen pretrained BERT encoder shared by the downstream classifier heads.

    Every encoder parameter has ``requires_grad`` switched off at construction
    time, so the encoder is used purely as a fixed feature extractor.

    Args:
        model_name: Identifier or path handed to ``BertModel.from_pretrained``.
    """

    def __init__(self, model_name="beomi/kcbert-base"):
        super().__init__()
        encoder = BertModel.from_pretrained(model_name)
        # Freeze the whole encoder in one call (same effect as looping over
        # parameters() and clearing requires_grad on each one).
        encoder.requires_grad_(False)
        self.bert = encoder

    def forward(self, input_ids, attention_mask):
        """Run the encoder and return the ``pooler_output`` field of its result."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        return encoded.pooler_output

class BinaryClassifier(nn.Module):
    """Single linear head that maps a feature vector to two logits.

    Args:
        hidden_size: Width of the incoming feature vector (default 768).
    """

    def __init__(self, hidden_size=768):
        super().__init__()
        # The attribute name is part of the state_dict keys
        # ("classifier.weight" / "classifier.bias"), so it must not change.
        self.classifier = nn.Linear(in_features=hidden_size, out_features=2)

    def forward(self, cls_output):
        """Return the raw (un-normalised) two-class logits for ``cls_output``."""
        logits = self.classifier(cls_output)
        return logits

class MultiClassifier(nn.Module):
    """Three-layer MLP head producing ``num_classes`` logits per feature vector.

    Layer stack: Linear(hidden_size, 512) -> ReLU -> BatchNorm1d(512)
    -> Linear(512, 256) -> ReLU -> Dropout(0.5) -> Linear(256, num_classes).

    Args:
        hidden_size: Width of the incoming feature vector (default 768).
        num_classes: Number of output logits (default 5).
    """

    def __init__(self, hidden_size=768, num_classes=5):
        super().__init__()
        # The position of each layer fixes its index inside the Sequential and
        # therefore its state_dict key ("classifier.0.weight", ...); keep the
        # order unchanged so saved checkpoints still load.
        layers = [
            nn.Linear(hidden_size, 512),
            nn.ReLU(),
            nn.BatchNorm1d(512),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Dropout(p=0.5),
            nn.Linear(256, num_classes),
        ]
        self.classifier = nn.Sequential(*layers)

    def forward(self, cls_output):
        """Return the raw class logits for ``cls_output``."""
        logits = self.classifier(cls_output)
        return logits

# 예측 함수 정의

def predict_pipeline(texts, tokenizer, shared_bert, binary_model, multi_model,
                     threshold=0.68, batch_size=64, max_length=128, device=None):
    """Two-stage prediction: binary gate first, multi-class head on gated items.

    Each text is encoded by ``shared_bert``. ``binary_model`` produces two
    logits; a text is treated as a "threat" when the softmax probability of
    class 1 is strictly greater than ``threshold``. Only those texts are passed
    to ``multi_model``, and their final label is the argmax of its logits.
    All other texts get label 0 (general conversation).

    Args:
        texts: A list (or other iterable) of strings. A single string is
            treated as a one-element list.
        tokenizer: Callable accepting ``(list_of_texts, return_tensors="pt",
            padding=True, truncation=True, max_length=...)`` and returning a
            mapping with ``'input_ids'`` and ``'attention_mask'`` that
            supports ``.to(device)``.
        shared_bert: Module called as ``shared_bert(input_ids, attention_mask)``.
        binary_model: Module returning two logits per row.
        multi_model: Module returning one logit per class per row.
        threshold: Class-1 probability above which a text is sent to
            ``multi_model``. Defaults to 0.68, the value previously hard-coded.
        batch_size: Number of texts encoded per forward pass. ``None`` encodes
            everything in one batch (the previous behaviour).
        max_length: Truncation length forwarded to the tokenizer.
        device: Device for the tokenized inputs. When ``None`` it is taken from
            the first parameter of ``shared_bert`` (CPU if it has none).

    Returns:
        list[int]: One label per input text, in input order.

    Raises:
        ValueError: If ``batch_size`` is not ``None`` and is less than 1.
    """
    shared_bert.eval()
    binary_model.eval()
    multi_model.eval()

    # A bare string would otherwise be iterated character by character.
    if isinstance(texts, str):
        texts = [texts]
    texts = list(texts)
    if not texts:
        return []

    if batch_size is None:
        step = len(texts)
    elif batch_size < 1:
        raise ValueError(f"batch_size must be >= 1 or None, got {batch_size}")
    else:
        step = batch_size

    if device is None:
        # Follow the encoder instead of assuming a GPU is present.
        first_param = next(shared_bert.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    # Everything starts as general conversation (0); threats are overwritten below.
    final_preds = [0] * len(texts)

    with torch.no_grad():
        for start in range(0, len(texts), step):
            batch_texts = texts[start:start + step]
            inputs = tokenizer(
                batch_texts,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=max_length,
            ).to(device)

            cls_output = shared_bert(inputs['input_ids'], inputs['attention_mask'])

            # Stage 1: binary gate (general vs. threat).
            binary_probs = F.softmax(binary_model(cls_output), dim=1)
            threat_rows = torch.nonzero(binary_probs[:, 1] > threshold, as_tuple=True)[0]
            if threat_rows.numel() == 0:
                continue

            # Stage 2: multi-class head only on rows that passed the gate.
            multi_logits = multi_model(cls_output[threat_rows])
            multi_preds = torch.argmax(multi_logits, dim=1)

            # One host transfer per tensor instead of one .item() per element.
            for row, label in zip(threat_rows.tolist(), multi_preds.tolist()):
                final_preds[start + row] = label

    return final_preds


# test.csv 기반으로 예측하고 submission.csv 저장

def run_submission(test_csv_path, submission_csv_path, tokenizer, shared_bert, binary_model, multi_model):
    """Predict labels for a test CSV and write a submission CSV.

    Reads ``test_csv_path`` (which must contain ``text`` and ``idx`` columns),
    runs ``predict_pipeline`` on the ``text`` column, converts the integer
    predictions to the submission's label strings, and writes a two-column
    CSV (``file_name``, ``class``) to ``submission_csv_path``. A preview of the
    first ten predictions and the per-label counts are printed along the way.

    Args:
        test_csv_path: Path of the input CSV.
        submission_csv_path: Path of the CSV to write.
        tokenizer, shared_bert, binary_model, multi_model: Forwarded unchanged
            to ``predict_pipeline``.

    Returns:
        None.
    """
    # Internal class index -> label string expected in the submission file.
    label_by_class = {
        0: "04",  # general conversation
        1: "00",  # threat
        2: "01",  # extortion
        3: "02",  # workplace harassment
        4: "03",  # other harassment
    }

    test_df = pd.read_csv(test_csv_path)
    raw_preds = predict_pipeline(
        test_df['text'].tolist(), tokenizer, shared_bert, binary_model, multi_model
    )

    # Coerce to plain ints first so the dictionary lookup cannot miss on type.
    preds_clean = list(map(int, raw_preds))
    mapped_preds = [label_by_class[class_idx] for class_idx in preds_clean]

    print("✔ preds_clean 예시:", preds_clean[:10])
    print("✔ mapped_preds 예시:", mapped_preds[:10])

    # Build the submission frame; "idx" is the identifier column of the test CSV.
    submission = pd.DataFrame({"file_name": test_df["idx"], "class": mapped_preds})
    submission["class"] = submission["class"].astype(str)
    submission.to_csv(submission_csv_path, index=False)
    print(f"submission.csv 저장 완료: {submission_csv_path}")

    # Per-label counts, ordered by label.
    label_counts = submission["class"].value_counts().sort_index()
    print("\n📊 클래스 분포:")
    print(label_counts)

In [None]:
# Use the GPU when one is available; otherwise fall back to the CPU so the
# models can still be constructed and their checkpoints loaded.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = BertTokenizer.from_pretrained("beomi/kcbert-base")
shared_bert = SharedBERT().to(DEVICE)
shared_bert.eval()

# NOTE(review): torch.load unpickles the file — only load checkpoints that
# come from a trusted source.
# map_location remaps the stored tensors onto DEVICE, so a checkpoint saved
# from a different device than the current one still loads.
binary_model = BinaryClassifier().to(DEVICE)
binary_model.load_state_dict(
    torch.load("/content/drive/MyDrive/DLthon/binary_4.pt", map_location=DEVICE)
)
binary_model.eval()

# Alternative multi-class checkpoint that was tried earlier:
#   /content/drive/MyDrive/DLthon/multi_4000_generated.pt
multi_model = MultiClassifier().to(DEVICE)
multi_model.load_state_dict(
    torch.load("/content/drive/MyDrive/DLthon/multi_augmentation_3.pt", map_location=DEVICE)
)
multi_model.eval()

MultiClassifier(
  (classifier): Sequential(
    (0): Linear(in_features=768, out_features=512, bias=True)
    (1): ReLU()
    (2): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): Linear(in_features=512, out_features=256, bias=True)
    (4): ReLU()
    (5): Dropout(p=0.5, inplace=False)
    (6): Linear(in_features=256, out_features=5, bias=True)
  )
)

In [None]:
# Predict on the test set and write the submission CSV to Drive.
# NOTE(review): the "0.68" in the output filename presumably records the
# binary-gate threshold used by predict_pipeline — keep the two in sync.
run_submission(
    test_csv_path="/content/drive/MyDrive/DLthon/test.csv",
    submission_csv_path="/content/drive/MyDrive/DLthon/kc_bert_only_augmentation_0.68.csv",
    tokenizer=tokenizer,
    shared_bert=shared_bert,
    binary_model=binary_model,
    multi_model=multi_model
)

✔ preds_clean 예시: [2, 3, 3, 0, 4, 1, 1, 2, 0, 2]
✔ mapped_preds 예시: ['01', '02', '02', '04', '03', '00', '00', '01', '04', '01']
✅ submission.csv 저장 완료: /content/drive/MyDrive/DLthon/kc_bert_only_augmentation_0.68.csv

📊 클래스 분포:
class
00     99
01     98
02    103
03    112
04     88
Name: count, dtype: int64
