In [1]:
import torch
# Show which PyTorch build this kernel is running.
print(torch.__version__)

2.7.1+cu118


In [2]:
!pip install sentencepiece torch torchvision --quiet

In [3]:
import os, re, unicodedata, random, math
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the chatbot question/answer CSV directly from GitHub into a DataFrame.
URL = "https://github.com/songys/Chatbot_data/raw/master/ChatbotData.csv"
df = pd.read_csv(URL)

# Columns: Q (question), A (answer), label (ground truth for intent classification; not used here)
print(df.head())
print(df.shape)


                 Q            A  label
0           12시 땡!   하루가 또 가네요.      0
1      1지망 학교 떨어졌어    위로해 드립니다.      0
2     3박4일 놀러가고 싶다  여행은 언제나 좋죠.      0
3  3박4일 정도 놀러가고 싶다  여행은 언제나 좋죠.      0
4          PPL 심하네   눈살이 찌푸려지죠.      0
(11823, 3)


In [4]:
# Runs of Hangul Compatibility Jamo (U+3131-U+3163, i.e. "ㄱ"-"ㅣ").
# NFKC rewrites these into conjoining jamo from the U+1100 block, which the
# whitelist below does not contain, so they would be silently deleted.
# Such runs are therefore split off and kept out of the NFKC step.
_COMPAT_JAMO_RUN = re.compile(r"([\u3131-\u3163]+)")


def normalize_text(s: str) -> str:
    """Normalize one utterance before tokenization.

    Steps:
      1. NFKC-normalize the text (full-width forms become ASCII, decomposed
         Hangul is composed), leaving standalone compatibility jamo such as
         "ㅋㅋ" or "ㅠㅠ" untouched.
      2. Replace every character outside the whitelist (digits, ASCII letters,
         Hangul syllables, compatibility jamo, whitespace and basic
         punctuation) with a space.
      3. Collapse whitespace runs to a single space and strip both ends.

    Args:
        s: Input value; it is converted with ``str()`` first, so non-string
            values (e.g. NaN) are accepted.

    Returns:
        The cleaned string, possibly empty.
    """
    text = str(s)

    # 1) NFKC normalization, applied only to the pieces between jamo runs.
    #    re.split with a capturing group puts the captured jamo runs at the
    #    odd indices of the result.
    pieces = _COMPAT_JAMO_RUN.split(text)
    text = "".join(
        piece if idx % 2 else unicodedata.normalize("NFKC", piece)
        for idx, piece in enumerate(pieces)
    )

    # 2) Keep only allowed characters (Hangul, Latin letters, digits, basic
    #    punctuation). Adjust the character set here if needed.
    text = re.sub(r"[^0-9A-Za-z가-힣ㄱ-ㅎㅏ-ㅣ\s.,?!~’'\"()\-\:;@/]", " ", text)

    # 3) Tidy up whitespace.
    return re.sub(r"\s+", " ", text).strip()

# Drop rows with a missing question or answer. `.copy()` gives an independent
# frame so the column assignments below do not write into a slice.
df = df.dropna(subset=["Q","A"]).copy()

# Keep the raw text exactly once. This cell reassigns `df`, so when it is
# re-run the frame already holds normalized Q/A; without the guard the
# *_orig columns would be overwritten with normalized text.
if "Q_orig" not in df.columns:
    df["Q_orig"] = df["Q"]
if "A_orig" not in df.columns:
    df["A_orig"] = df["A"]

# Always normalize from the preserved raw text so re-runs give the same result.
df["Q"] = df["Q_orig"].map(normalize_text)
df["A"] = df["A_orig"].map(normalize_text)

# Remove samples that are too short or too long (thresholds are in characters
# and can be tuned freely).
MIN_CHARS, MAX_CHARS = 1, 128
mask = (
    df["Q"].str.len().between(MIN_CHARS, MAX_CHARS) &
    df["A"].str.len().between(MIN_CHARS, MAX_CHARS)
)
# Also drop exact duplicate (Q, A) pairs and renumber the rows.
df = df[mask].drop_duplicates(subset=["Q","A"]).reset_index(drop=True)
print("After cleaning:", df.shape)

# Train/validation split: 5% held out, fixed seed for a repeatable split.
train_df, valid_df = train_test_split(df[["Q","A"]], test_size=0.05, random_state=42, shuffle=True)
print(len(train_df), len(valid_df))


After cleaning: (11750, 5)
11162 588


In [5]:
import io
import sentencepiece as spm

os.makedirs("artifacts", exist_ok=True)

# Build the SentencePiece training corpus: all training questions followed by
# all training answers, one sentence per line; empty strings are skipped.
corpus_path = "artifacts/spm_corpus.txt"
train_sentences = pd.concat([train_df["Q"], train_df["A"]], axis=0).astype(str)
with io.open(corpus_path, "w", encoding="utf-8") as corpus_file:
    corpus_file.writelines(sentence + "\n" for sentence in train_sentences if sentence)

spm_model_prefix = "artifacts/spm_ko"
VOCAB_SIZE = 8000  # 4k-16k is a reasonable range for a corpus of this size

trainer_options = dict(
    input=corpus_path,
    model_prefix=spm_model_prefix,
    vocab_size=VOCAB_SIZE,
    model_type="unigram",            # "bpe" works as well
    character_coverage=0.9995,       # suited to mixed Korean/English text
    # Pin the special-token ids so the PyTorch code can rely on them.
    pad_id=0, unk_id=1, bos_id=2, eos_id=3,
    pad_piece="[PAD]", unk_piece="[UNK]", bos_piece="<s>", eos_piece="</s>",
    user_defined_symbols=[],
)
spm.SentencePieceTrainer.Train(**trainer_options)

# Load the freshly trained model and read the special-token ids back from it.
sp = spm.SentencePieceProcessor()
sp.load(f"{spm_model_prefix}.model")

PAD_ID, UNK_ID, BOS_ID, EOS_ID = (
    sp.pad_id(),   # 0
    sp.unk_id(),   # 1
    sp.bos_id(),   # 2
    sp.eos_id(),   # 3
)

print("Vocab size:", sp.get_piece_size(), "PAD/BOS/EOS:", PAD_ID, BOS_ID, EOS_ID)


Vocab size: 8000 PAD/BOS/EOS: 0 2 3


sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input: artifacts/spm_corpus.txt
  input_format: 
  model_prefix: artifacts/spm_ko
  model_type: UNIGRAM
  vocab_size: 8000
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  seed_sentencepieces_file: 
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 1
  bos_id: 2
  eos_id: 3
  pad_id: 0
  unk_piece: [UNK]
  bos_piece: <s>
  eos_piece: </s>
  pad_piece: [PAD]
  unk_surface:  ⁇ 
  enable_differential

In [6]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", DEVICE)

MAX_TOKENS = 64  # Maximum length in subword tokens (same limit for questions and answers)

def encode_text(s: str, sp, max_len=MAX_TOKENS):
    """Tokenize ``s`` into subword ids, leaving room for BOS and EOS.

    Args:
        s: Text to tokenize.
        sp: Tokenizer exposing ``encode(text, out_type=int)`` (a
            SentencePiece processor in this notebook).
        max_len: Total length budget including the two special tokens the
            caller adds afterwards.

    Returns:
        A list of at most ``max_len - 2`` token ids (no BOS/EOS included).
        When ``max_len`` is 2 or less the list is empty.
    """
    ids = sp.encode(s, out_type=int)
    # Reserve two slots for BOS/EOS. Clamp at zero: a negative slice bound
    # would count from the end and keep tokens instead of dropping them all.
    budget = max(0, max_len - 2)
    return ids[:budget]

class ChatDataset(Dataset):
    """Question/answer pairs served as token-id tensors for seq2seq training.

    Each item is tokenized on access with ``encode_text`` and returned as
    ``(src_ids, tgt_in, tgt_out)``:
      * ``src_ids``: BOS + question tokens + EOS (encoder input)
      * ``tgt_in``:  BOS + answer tokens (decoder input)
      * ``tgt_out``: answer tokens + EOS (decoder target, shifted by one)
    """

    def __init__(self, frame, sp, max_len=MAX_TOKENS):
        """Store the "Q" and "A" columns of ``frame`` plus the tokenizer settings."""
        self.sp = sp
        self.max_len = max_len
        self.q = frame["Q"].tolist()
        self.a = frame["A"].tolist()

    def __len__(self):
        """Number of question/answer pairs."""
        return len(self.q)

    def __getitem__(self, i):
        """Tokenize pair ``i`` and build the three id tensors."""
        question_ids = encode_text(self.q[i], self.sp, self.max_len)
        answer_ids = encode_text(self.a[i], self.sp, self.max_len)

        # Assemble the encoder input and the shifted decoder input/target.
        encoder_input = torch.tensor([BOS_ID, *question_ids, EOS_ID])
        decoder_input = torch.tensor([BOS_ID, *answer_ids])
        decoder_target = torch.tensor([*answer_ids, EOS_ID])
        return encoder_input, decoder_input, decoder_target

def collate_fn(batch):
    """Pad a list of ``(src, tgt_in, tgt_out)`` samples into three batch tensors.

    Every group is right-padded with ``PAD_ID`` to the longest sequence in
    that group, giving ``torch.long`` tensors of shape (batch, max_len).
    """
    def pad_to_max(seqs):
        # Allocate a PAD-filled matrix, then copy each sequence into its row.
        lengths = [t.size(0) for t in seqs]
        out = torch.full((len(seqs), max(lengths)), PAD_ID, dtype=torch.long)
        for row, (seq, n) in enumerate(zip(seqs, lengths)):
            out[row, :n] = seq
        return out

    # Transpose the list of triples into three parallel groups.
    srcs, tgts_in, tgts_out = zip(*batch)
    return tuple(pad_to_max(group) for group in (srcs, tgts_in, tgts_out))

# Wrap the train/validation frames; both share the tokenizer and length limit.
train_ds = ChatDataset(train_df, sp, MAX_TOKENS)
valid_ds = ChatDataset(valid_df, sp, MAX_TOKENS)

# Only the training loader shuffles; collate_fn pads each batch with PAD_ID.
train_loader = DataLoader(train_ds, batch_size=128, shuffle=True, collate_fn=collate_fn, num_workers=0)
valid_loader = DataLoader(valid_ds, batch_size=128, shuffle=False, collate_fn=collate_fn, num_workers=0)

#########################################
# Positional Encoding (sine/cosine)
#########################################
import math
class PositionalEncoding(nn.Module):
    """Fixed sine/cosine positional encoding followed by dropout.

    A table of shape (max_len, 1, d_model) is precomputed and registered as
    the buffer ``pe``. Even feature indices hold sines and odd indices hold
    cosines of ``position * exp(-ln(10000) * 2i / d_model)``.

    Args:
        d_model: Feature dimension of the embeddings (even or odd).
        max_len: Number of positions to precompute; inputs to ``forward``
            must not be longer than this along the sequence axis.
        dropout: Dropout probability applied after adding the encoding.
    """

    def __init__(self, d_model, max_len=2048, dropout=0.1):
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        pe = torch.zeros(max_len, d_model)  # (max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1)  # (max_len, 1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0)/d_model))
        angles = position * div_term  # (max_len, ceil(d_model / 2))
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer odd (cosine) column than even
        # (sine) columns, so trim the angle table to the cosine column count.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        # Buffer: follows .to(device) and is saved in state_dict, but is not trained.
        self.register_buffer('pe', pe.unsqueeze(1))  # (max_len, 1, d_model)

    def forward(self, x): # x: (seq, batch, d_model)
        """Add the encodings for the first ``x.size(0)`` positions, then apply dropout."""
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

#########################################
# Transformer Seq2Seq
#########################################
class TransformerChatbot(nn.Module):
    """Encoder-decoder Transformer that maps question tokens to answer tokens.

    One token embedding (with ``PAD_ID`` as the padding index) is shared by
    the encoder and decoder inputs, sinusoidal positions are added, and a
    linear layer projects decoder states to vocabulary logits.

    Args:
        vocab_size: Number of token ids.
        d_model: Embedding / model dimension.
        nhead: Number of attention heads.
        num_layers: Number of layers in both the encoder and the decoder.
        dim_ff: Hidden size of the feed-forward sublayers.
        dropout: Dropout used by the positional encoding and the transformer.
    """

    def __init__(self, vocab_size, d_model=256, nhead=8, num_layers=4, dim_ff=1024, dropout=0.1):
        super().__init__()
        self.tok_emb = nn.Embedding(vocab_size, d_model, padding_idx=PAD_ID)
        self.pos_enc = PositionalEncoding(d_model, dropout=dropout)
        self.transformer = nn.Transformer(
            d_model=d_model, nhead=nhead,
            num_encoder_layers=num_layers,
            num_decoder_layers=num_layers,
            dim_feedforward=dim_ff, dropout=dropout,
            batch_first=False  # PyTorch default layout: (seq, batch, dim)
        )
        self.generator = nn.Linear(d_model, vocab_size)

    def forward(self, src, tgt_in, src_key_padding_mask, tgt_key_padding_mask, memory_key_padding_mask):
        """Compute next-token logits for every decoder position.

        Args:
            src: Encoder token ids, shape (src_len, batch).
            tgt_in: Decoder input token ids, shape (tgt_len, batch).
            src_key_padding_mask: Bool mask (batch, src_len), True at padding.
            tgt_key_padding_mask: Bool mask (batch, tgt_len), True at padding.
            memory_key_padding_mask: Bool mask (batch, src_len) applied to the
                encoder output inside the decoder's cross-attention.

        Returns:
            Logits of shape (tgt_len, batch, vocab_size).
        """
        # Scale embeddings by sqrt(d_model) before adding positions.
        scale = math.sqrt(self.tok_emb.embedding_dim)
        src = self.tok_emb(src) * scale
        tgt = self.tok_emb(tgt_in) * scale

        src = self.pos_enc(src)
        tgt = self.pos_enc(tgt)

        # Causal mask as a boolean matrix (True = may not attend), built on the
        # input's device. A boolean mask matches the dtype of the key-padding
        # masks; mixing a float attention mask with boolean padding masks is
        # deprecated in PyTorch. It also avoids a CPU allocation plus copy on
        # every forward pass.
        tgt_len = tgt.size(0)
        tgt_mask = torch.triu(
            torch.ones(tgt_len, tgt_len, dtype=torch.bool, device=tgt.device),
            diagonal=1,
        )
        out = self.transformer(
            src=src, tgt=tgt,
            tgt_mask=tgt_mask,
            src_key_padding_mask=src_key_padding_mask,
            tgt_key_padding_mask=tgt_key_padding_mask,
            memory_key_padding_mask=memory_key_padding_mask
        )
        logits = self.generator(out)  # (seq, batch, vocab)
        return logits

# Vocabulary size as reported by the loaded SentencePiece model.
VOCAB = sp.get_piece_size()
model = TransformerChatbot(VOCAB).to(DEVICE)

# Targets equal to PAD_ID are excluded from the loss; label smoothing of 0.1 is applied.
criterion = nn.CrossEntropyLoss(ignore_index=PAD_ID, label_smoothing=0.1)
optimizer = torch.optim.Adam(model.parameters(), lr=3e-4, betas=(0.9, 0.98), eps=1e-9)


Device: cuda




In [7]:
def make_padding_mask(batch_ids):  # (batch, seq) → (batch, seq) boolean
    """Return a boolean tensor that is True wherever ``batch_ids`` equals ``PAD_ID``."""
    is_pad = batch_ids == PAD_ID
    return is_pad

def to_seq_first(x):  # (batch, seq) → (seq, batch)
    """Swap the first two dimensions of ``x`` and return a contiguous copy."""
    swapped = torch.transpose(x, 0, 1)
    return swapped.contiguous()


In [13]:
from tqdm import tqdm

def run_epoch(dataloader, train=True):
    """Run one pass over ``dataloader`` and return ``(mean_loss, perplexity)``.

    Uses the notebook-level ``model``, ``criterion`` and ``optimizer``.
    With ``train=True`` the model is put in training mode and the optimizer
    takes one step per batch, with gradient norm clipped to 1.0. With
    ``train=False`` the model is put in eval mode and gradients are disabled.

    The mean is weighted by the number of non-PAD target tokens in each
    batch; perplexity is ``exp(mean_loss)``.
    """
    model.train(train)
    loss_sum = 0.0
    token_count = 0

    for batch in dataloader:
        # The loader yields batch-first tensors; the model expects (seq, batch).
        src, tgt_in, tgt_out = (to_seq_first(t).to(DEVICE) for t in batch)

        # Key-padding masks are batch-first: (batch, seq).
        src_kpm = make_padding_mask(src.transpose(0, 1))
        tgt_kpm = make_padding_mask(tgt_in.transpose(0, 1))

        with torch.set_grad_enabled(train):
            logits = model(src, tgt_in, src_kpm, tgt_kpm, src_kpm)  # (seq, batch, vocab)
            vocab = logits.size(-1)
            loss = criterion(logits.view(-1, vocab), tgt_out.reshape(-1))
            if train:
                optimizer.zero_grad()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()

        # Weight the batch loss by its number of real (non-PAD) target tokens.
        real_tokens = (tgt_out != PAD_ID).sum().item()
        loss_sum += loss.item() * real_tokens
        token_count += real_tokens

    # max(1, ...) guards against division by zero for an empty loader.
    mean_loss = loss_sum / max(1, token_count)
    return mean_loss, math.exp(mean_loss)

EPOCHS = 100
# Answer quality changes with the number of epochs (author's observations):
# 10  > only basic greetings (e.g. "안녕하세요.")
# 50  > can respond with empathy to statements about the user's situation
#       (e.g. "싸우면서 정 들 거예요.", "잠시 쉬어도 돼요.")
# 100 > can answer questions about the bot itself
#       (e.g. "위로봇이요.", "저는 배터리가 밥이예요.")

# Lowest validation loss seen so far in this run of the cell.
# NOTE(review): this is reset to inf every time the cell runs while `model`
# is not re-created here, so re-running the cell overwrites the checkpoint
# after the first epoch of the new run — confirm that this is intended.
best_val = float("inf")

for epoch in range(1, EPOCHS+1):
    tr_loss, tr_ppl = run_epoch(train_loader, train=True)
    va_loss, va_ppl = run_epoch(valid_loader, train=False)
    print(f"[{epoch:02d}] train loss {tr_loss:.4f} | ppl {tr_ppl:.2f}  ||  valid loss {va_loss:.4f} | ppl {va_ppl:.2f}")
    # Save a checkpoint whenever the validation loss improves.
    if va_loss < best_val:
        best_val = va_loss
        torch.save(model.state_dict(), "artifacts/best_transformer.pt")


[01] train loss 2.2193 | ppl 9.20  ||  valid loss 4.0000 | ppl 54.60
[02] train loss 2.1635 | ppl 8.70  ||  valid loss 4.0130 | ppl 55.31
[03] train loss 2.1091 | ppl 8.24  ||  valid loss 4.0251 | ppl 55.98
[04] train loss 2.0541 | ppl 7.80  ||  valid loss 4.0081 | ppl 55.04
[05] train loss 2.0071 | ppl 7.44  ||  valid loss 4.0348 | ppl 56.53
[06] train loss 1.9574 | ppl 7.08  ||  valid loss 4.0194 | ppl 55.67
[07] train loss 1.9161 | ppl 6.79  ||  valid loss 4.0543 | ppl 57.65
[08] train loss 1.8736 | ppl 6.51  ||  valid loss 4.0303 | ppl 56.28
[09] train loss 1.8361 | ppl 6.27  ||  valid loss 4.0588 | ppl 57.90
[10] train loss 1.8003 | ppl 6.05  ||  valid loss 4.0780 | ppl 59.03
[11] train loss 1.7668 | ppl 5.85  ||  valid loss 4.0941 | ppl 59.99
[12] train loss 1.7431 | ppl 5.72  ||  valid loss 4.0851 | ppl 59.45
[13] train loss 1.7101 | ppl 5.53  ||  valid loss 4.1026 | ppl 60.49
[14] train loss 1.6876 | ppl 5.41  ||  valid loss 4.1207 | ppl 61.60
[15] train loss 1.6639 | ppl 5.28 

In [19]:
@torch.no_grad()
def greedy_decode(question: str, max_len=MAX_TOKENS):
    """Generate an answer for ``question`` by greedy (arg-max) decoding.

    The question is normalized, tokenized and wrapped in BOS/EOS. The decoder
    starts from BOS and repeatedly appends the highest-scoring next token
    until EOS is produced or ``max_len - 1`` tokens have been generated.
    Uses the notebook-level ``model`` (switched to eval mode) and ``sp``.

    Returns:
        The decoded answer text, without BOS/EOS.
    """
    model.eval()

    # Encoder input: (seq, 1) plus its batch-first padding mask (1, seq).
    question_ids = encode_text(normalize_text(question), sp, max_len)
    src = torch.tensor([BOS_ID, *question_ids, EOS_ID], dtype=torch.long).unsqueeze(1).to(DEVICE)
    src_kpm = make_padding_mask(src.transpose(0, 1))

    # Decoder input starts as a lone BOS token, shape (1, 1).
    ys = torch.tensor([[BOS_ID]], dtype=torch.long, device=DEVICE)

    generated = []
    for _ in range(max_len - 1):
        tgt_kpm = make_padding_mask(ys.transpose(0, 1))
        logits = model(src, ys, src_kpm, tgt_kpm, src_kpm)  # (tgt_seq, 1, vocab)

        # Take the most likely token at the last decoder position.
        token = logits[-1, 0].argmax(-1).item()
        if token == EOS_ID:
            break
        generated.append(token)
        ys = torch.cat([ys, torch.tensor([[token]], device=DEVICE)], dim=0)

    # `generated` never contains the leading BOS nor the terminating EOS.
    return sp.decode(generated)

# 학습된 베스트 모델 로드(가장 좋은 검증 손실)
# Load the best checkpoint (lowest validation loss) written by the training loop.
model.load_state_dict(torch.load("artifacts/best_transformer.pt", map_location=DEVICE))

# Try a few sample questions.
samples = [
    "안녕하세요",
    "너 이름이 뭐야?",
    "오늘 너무 피곤하다",
    "여자친구랑 싸웠어",
    "공부하기 싫어",
    "1+1이 뭐야?", # arithmetic was never learned, so the answer is not correct
    "수련이 필요해", # words that are uncommon in everyday talk are answered poorly
    "밥 먹었어?"
]
for q in samples:
    print("Q:", q)
    print("A:", greedy_decode(q))
    print("-"*50)


Q: 안녕하세요
A: 안녕하세요.
--------------------------------------------------
Q: 너 이름이 뭐야?
A: 위로봇이요.
--------------------------------------------------
Q: 오늘 너무 피곤하다
A: 맛있는거 드세요.
--------------------------------------------------
Q: 여자친구랑 싸웠어
A: 싸우면서 정 들 거예요.
--------------------------------------------------
Q: 공부하기 싫어
A: 잠시 쉬어도 돼요.
--------------------------------------------------
Q: 1+1이 뭐야?
A: 잘 이겨내고 있네요.
--------------------------------------------------
Q: 수련이 필요해
A: 지금보다 더 잘 살 거예요.
--------------------------------------------------
Q: 밥 먹었어?
A: 저는 배터리가 밥이예요.
--------------------------------------------------
