# DKTC 모델링 및 학습

## 1. 데이터 전처리

In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Read the two source CSV files.
train_df = pd.read_csv('../../data/train.csv')
general_df = pd.read_csv('../../data/train_general_conversation.csv')

# Stack both frames into a single frame with a fresh 0..n-1 row index.
df = pd.concat((train_df, general_df), ignore_index=True)

# Fit the encoder on the 'class' column, then store the integer ids
# alongside the original labels.
encoder = LabelEncoder()
encoder.fit(df['class'])
df['encoded_class'] = encoder.transform(df['class'])

# Show a preview of the combined data.
print("Combined DataFrame Head:")
display(df.head())

# List which integer id was assigned to each class label.
print("Class Label Mapping:")
for i, label in enumerate(encoder.classes_):
    print(f'{i}: {label}')

Combined DataFrame Head:


Unnamed: 0,idx,class,conversation,id,encoded_class
0,0.0,협박 대화,지금 너 스스로를 죽여달라고 애원하는 것인가?\n 아닙니다. 죄송합니다.\n 죽을 ...,,4
1,1.0,협박 대화,길동경찰서입니다.\n9시 40분 마트에 폭발물을 설치할거다.\n네?\n똑바로 들어 ...,,4
2,2.0,기타 괴롭힘 대화,너 되게 귀여운거 알지? 나보다 작은 남자는 첨봤어.\n그만해. 니들 놀리는거 재미...,,1
3,3.0,갈취 대화,어이 거기\n예??\n너 말이야 너. 이리 오라고\n무슨 일.\n너 옷 좋아보인다?...,,0
4,4.0,갈취 대화,저기요 혹시 날이 너무 뜨겁잖아요? 저희 회사에서 이 선크림 파는데 한 번 손등에 ...,,0


Class Label Mapping:
0: 갈취 대화
1: 기타 괴롭힘 대화
2: 일반 대화
3: 직장 내 괴롭힘 대화
4: 협박 대화


## 2. 모델 및 토크나이저 로드

In [2]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

MODEL_NAME = "monologg/koelectra-base-v3-discriminator"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# One output logit per class known to the fitted label encoder.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=len(encoder.classes_))

# Select the compute device: CUDA first, then Apple MPS, then CPU.
# Probing only MPS would silently fall back to CPU on CUDA machines.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
model.to(device)
print(f'Using device: {device}')

  from .autonotebook import tqdm as notebook_tqdm
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at monologg/koelectra-base-v3-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using device: mps


## 3. 데이터셋 및 데이터로더 생성

In [3]:
# Hold out 20% of the rows for validation, stratified on the encoded
# class id, with a fixed seed so the split is reproducible.
all_texts = df['conversation'].tolist()
all_labels = df['encoded_class'].tolist()
train_texts, val_texts, train_labels, val_labels = train_test_split(
    all_texts,
    all_labels,
    test_size=0.2,
    random_state=42,
    stratify=all_labels,
)

# Tokenize each split with identical settings (train first, then validation).
train_encodings, val_encodings = (
    tokenizer(split_texts, truncation=True, padding=True, max_length=128)
    for split_texts in (train_texts, val_texts)
)

# PyTorch dataset wrapper around tokenizer output and integer labels.
class DKTCDataset(Dataset):
    """Map-style dataset pairing per-example encodings with their labels.

    ``encodings`` must expose ``.items()`` yielding ``(name, values)`` pairs
    where ``values[idx]`` is the data for example ``idx``; ``labels`` is an
    indexable sequence with one entry per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of examples is defined by the label sequence.
        return len(self.labels)

    def __getitem__(self, idx):
        # Build one sample: every encoding field plus the label, as tensors.
        sample = {}
        for field_name, field_values in self.encodings.items():
            sample[field_name] = torch.tensor(field_values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap each split's encodings and labels in a dataset.
train_dataset = DKTCDataset(encodings=train_encodings, labels=train_labels)
val_dataset = DKTCDataset(encodings=val_encodings, labels=val_labels)

# Batch both splits in groups of 16; only the training data is shuffled.
train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=16, shuffle=False)

## 4. 모델 학습 및 평가

In [4]:
import torch.optim as optim
from sklearn.metrics import f1_score
import numpy as np

def train_model(model, dataloader, optimizer, device=None):
    """Run one training pass over ``dataloader`` and return the mean batch loss.

    Args:
        model: Module called as
            ``model(input_ids, attention_mask=..., labels=...)`` whose output
            exposes a ``loss`` attribute.
        dataloader: Sized iterable of batches; each batch is a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` tensors.
        optimizer: Optimizer that updates the model's parameters.
        device: Device the batch tensors are moved to. When omitted, the
            device of the model's first parameter is used (CPU if the model
            has no parameters), so the function does not rely on a
            module-level ``device`` variable.

    Returns:
        The sum of the per-batch losses divided by ``len(dataloader)``.
    """
    if device is None:
        # Follow the model so inputs always land where the weights live.
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    model.train()
    total_loss = 0
    for batch in dataloader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        # Passing labels makes the model compute the loss itself.
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
    return total_loss / len(dataloader)

def evaluate_model(model, dataloader, device=None):
    """Predict over ``dataloader`` and return the weighted-average F1 score.

    Args:
        model: Module called as ``model(input_ids, attention_mask=...)``
            whose output exposes a ``logits`` attribute; the predicted class
            is the argmax over dimension 1 of the logits.
        dataloader: Iterable of batches; each batch is a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` tensors.
        device: Device the input tensors are moved to. When omitted, the
            device of the model's first parameter is used (CPU if the model
            has no parameters), so the function does not rely on a
            module-level ``device`` variable.

    Returns:
        ``f1_score(true_labels, predictions, average='weighted')`` over all
        batches.
    """
    if device is None:
        # Follow the model so inputs always land where the weights live.
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()
    predictions, true_labels = [], []
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1).cpu().numpy()
            predictions.extend(preds)
            # Labels are only compared on the host, so they are not copied
            # to the compute device first.
            true_labels.extend(batch['labels'].cpu().numpy())
    return f1_score(true_labels, predictions, average='weighted')

# AdamW over every model parameter with a learning rate of 5e-5.
optimizer = optim.AdamW(params=model.parameters(), lr=5e-5)

# Each epoch: one training pass, then a validation pass, then a one-line report.
epochs = 3
for epoch in range(epochs):
    train_loss = train_model(model, train_loader, optimizer)
    val_f1 = evaluate_model(model, val_loader)
    epoch_summary = f'Epoch {epoch + 1}/{epochs} | Train Loss: {train_loss:.4f} | Validation F1: {val_f1:.4f}'
    print(epoch_summary)

Epoch 1/3 | Train Loss: 0.7417 | Validation F1: 0.8842
Epoch 2/3 | Train Loss: 0.3050 | Validation F1: 0.8855
Epoch 3/3 | Train Loss: 0.1918 | Validation F1: 0.8911
