In [1]:
# ===== 0) Library versions =====
import torch
from torch.nn.utils.rnn import pad_sequence
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import konlpy

# Report the versions of the core libraries so results can be tied to an environment.
print(f"torch        : {torch.__version__}")
print(f"numpy        : {np.__version__}")
print(f"matplotlib   : {matplotlib.__version__}")
print(f"konlpy       : {konlpy.__version__}")

torch        : 2.7.1+cu118
numpy        : 2.2.6
matplotlib   : 3.10.3
konlpy       : 0.6.0


In [2]:
# ===== 1) Reproducibility & device setup =====

# 재현성 고정
import random, os
# Seed every RNG the notebook touches (Python, NumPy, PyTorch CPU and CUDA).
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

# Disable cuDNN benchmark mode and request deterministic cuDNN behaviour.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
print("DEVICE:", DEVICE)

DEVICE: cuda


In [3]:
# ===== 2) NSMC 데이터 준비 =====
import pandas as pd
from pathlib import Path
import urllib.request

# Local directory that holds the NSMC files; created on first run.
DATA_DIR = Path("./data_nsmc")
DATA_DIR.mkdir(parents=True, exist_ok=True)

# Paths of the two NSMC files inside DATA_DIR.
train_path = DATA_DIR / "ratings_train.txt"
test_path  = DATA_DIR / "ratings_test.txt"

def _download_if_needed():
    """Download the NSMC train/test files into DATA_DIR unless they already exist.

    Each file is first written to a sibling ``<name>.part`` path and only renamed
    to its final name once the transfer has finished. An interrupted or failed
    download therefore never leaves a truncated file that a later run would
    mistake for a complete one (the ``exists()`` check is the only guard).

    Failures are reported on stdout and not re-raised (best effort, as before);
    a missing file will surface when the data is loaded.
    """
    base = "https://raw.githubusercontent.com/e9t/nsmc/master"
    files = [
        ("ratings_train.txt", f"{base}/ratings_train.txt"),
        ("ratings_test.txt",  f"{base}/ratings_test.txt"),
    ]
    for fname, url in files:
        fpath = DATA_DIR / fname
        if fpath.exists():
            continue
        # Temporary target; replaced into place only after a complete download.
        tmp_path = fpath.with_name(fpath.name + ".part")
        try:
            print(f"다운 중 {fname} ...")
            urllib.request.urlretrieve(url, tmp_path.as_posix())
            tmp_path.replace(fpath)
        except Exception as e:
            print(f"다운 실패 {fname}: {e}\n 파일 위치를 확인 {fpath}.")
        finally:
            # After a successful rename the .part file is gone; after a failure
            # this removes whatever partial content was written.
            tmp_path.unlink(missing_ok=True)

_download_if_needed()

# Load the tab-separated train/test files into DataFrames.
train_df = pd.read_csv(train_path, sep='\t')
test_df  = pd.read_csv(test_path,  sep='\t')

# 결측/중복 제거 + 라벨/텍스트 정리
def clean_df(df):
    """Return a cleaned copy of an NSMC frame; the input frame is not modified.

    Steps, in order:
      1. drop rows whose ``document`` is missing,
      2. cast ``document`` to str and strip surrounding whitespace,
      3. drop rows whose ``document`` is empty after stripping,
      4. drop duplicate documents (the first occurrence is kept),
      5. cast ``label`` to int (0/1 = negative/positive).

    The original row index is preserved (it is not reset).

    The steps are written as one method chain with ``assign`` so that no column
    is ever assigned on a filtered slice, which avoids pandas'
    SettingWithCopyWarning that the statement-by-statement form could trigger.
    """
    return (
        df.dropna(subset=['document'])
          .assign(document=lambda d: d['document'].astype(str).str.strip())
          .loc[lambda d: d['document'].str.len() > 0]
          .drop_duplicates(subset=['document'])
          .assign(label=lambda d: d['label'].astype(int))
    )

# Apply the same cleaning to both official splits.
train_df = clean_df(train_df)
test_df  = clean_df(test_df)

# Quick check of size and label distribution after cleaning.
print("학습 데이터 크기 :", len(train_df), train_df['label'].value_counts().to_dict())
print("테스트 데이터 크기 :", len(test_df),  test_df['label'].value_counts().to_dict())

# Train/validation split: 10% of the training data is held out, stratified by label
# and seeded with SEED.
from sklearn.model_selection import train_test_split
tr_df, val_df = train_test_split(train_df, test_size=0.1, random_state=SEED, stratify=train_df['label'])
print("Train:", len(tr_df), "Val:", len(val_df), "Test:", len(test_df))

학습 데이터 크기 : 146182 {0: 73342, 1: 72840}
테스트 데이터 크기 : 49157 {1: 24711, 0: 24446}
Train: 131563 Val: 14619 Test: 49157


In [4]:
# ===== 3) SentencePiece 학습 =====
import sentencepiece as spm

# Directory for SentencePiece training corpora and the resulting model files;
# created on first run.
SPM_DIR = Path("./spm_models")
SPM_DIR.mkdir(parents=True, exist_ok=True)

def write_corpus_txt(df, out_path):
    """Write the ``document`` column of *df* to *out_path* as UTF-8, one document per line.

    Newlines inside a document are replaced by a single space so that every
    document occupies exactly one line of the output file.
    """
    with open(out_path, "w", encoding="utf-8") as handle:
        handle.writelines(
            text.replace("\n", " ") + "\n" for text in df['document'].tolist()
        )

def train_spm_model(train_df, model_prefix, vocab_size=8000, model_type="unigram"):
    """Train a SentencePiece model on the ``document`` column of *train_df*.

    The corpus is first dumped to ``SPM_DIR/<model_prefix>.txt``; the trainer is
    then run with ``SPM_DIR/<model_prefix>`` as its model prefix.

    Args:
        train_df: DataFrame with a ``document`` column.
        model_prefix: base name (without extension) for the corpus and model files.
        vocab_size: value passed as ``--vocab_size``.
        model_type: value passed as ``--model_type`` (e.g. "unigram", "bpe").
    """
    corpus_file = SPM_DIR / f"{model_prefix}.txt"
    write_corpus_txt(train_df, corpus_file)

    flags = [
        f"--input={corpus_file}",
        f"--model_prefix={SPM_DIR / model_prefix}",
        f"--vocab_size={vocab_size}",
        f"--model_type={model_type}",
        "--character_coverage=0.9995",
        # Required: pin the pad/unk/bos/eos ids (downstream code pads with id 0).
        "--pad_id=0 --unk_id=1 --bos_id=2 --eos_id=3",
        "--input_sentence_size=1000000 --shuffle_input_sentence=true",
    ]
    cmd = " ".join(flags)
    print("SPM Train cmd:\n", cmd)
    spm.SentencePieceTrainer.Train(cmd)

def load_spm(model_path):
    """Create a SentencePieceProcessor and load the model file at *model_path* (str or Path)."""
    processor = spm.SentencePieceProcessor()
    processor.Load(str(model_path))
    return processor

# SentencePiece models used in this notebook: two 8k models (unigram and BPE)
# plus a larger 16k unigram model.
SPM_CONFIGS = [
    {"model_prefix": "spm_unigram_8k",  "vocab_size": 8000,  "model_type": "unigram"},
    {"model_prefix": "spm_bpe_8k",      "vocab_size": 8000,  "model_type": "bpe"},
    {"model_prefix": "spm_unigram_16k", "vocab_size": 16000, "model_type": "unigram"},
]

# Train only the models whose .model file is not on disk yet.
for cfg in SPM_CONFIGS:
    model_path = SPM_DIR / (cfg['model_prefix'] + ".model")
    if model_path.exists():
        print("Exists:", model_path.name)
        continue
    train_spm_model(tr_df, **cfg)

Exists: spm_unigram_8k.model
Exists: spm_bpe_8k.model
Exists: spm_unigram_16k.model


In [5]:
# ===== 4) sp_tokenize 구현 =====
from torch.nn.utils.rnn import pad_sequence
import torch

def sp_tokenize(sp_processor, corpus, vocab_path):
    """Encode sentences with SentencePiece and load the matching vocabulary.

    Args:
        sp_processor: loaded processor; only ``EncodeAsIds(str)`` is called on it.
        corpus: list of raw sentences (str).
        vocab_path: str or Path of the ``.vocab`` file written by SentencePiece
            (tab-separated, the piece is the first field of each line).

    Returns:
        tensor: LongTensor [B, T], right-padded with 0. For an empty corpus an
            empty tensor of shape [0, 0] is returned instead of raising.
        word_index: dict {piece: idx}, idx being the line number in the vocab file.
        index_word: dict {idx: piece}.
    """
    # 1) Sentences -> id sequences.
    seqs = [torch.tensor(sp_processor.EncodeAsIds(s), dtype=torch.long) for s in corpus]
    if seqs:
        # Padding value 0 matches the --pad_id=0 the models were trained with.
        tensor = pad_sequence(seqs, batch_first=True, padding_value=0)
    else:
        # pad_sequence rejects an empty list, so build the empty batch directly.
        tensor = torch.empty((0, 0), dtype=torch.long)

    # 2) Read the vocab file; the line order is used as the piece id.
    word_index, index_word = {}, {}
    with open(vocab_path, encoding="utf-8") as f:
        for idx, line in enumerate(f):
            piece = line.split("\t")[0].strip()
            word_index[piece] = idx
            index_word[idx]   = piece

    return tensor, word_index, index_word

In [6]:
# ===== 5) KoNLPy 토크나이저 파이프라인 =====
from collections import Counter, defaultdict

# Probe which KoNLPy analyzers can be imported and instantiated in this environment.
# Each analyzer that loads is stored in AVAILABLE under a lowercase key; a failure
# is printed and that analyzer is simply left out.
AVAILABLE = {}
try:
    from konlpy.tag import Okt
    AVAILABLE["okt"] = Okt()
except Exception as e:
    print("OKT 사용 불가 :", e)

try:
    from konlpy.tag import Mecab
    AVAILABLE["mecab"] = Mecab()
except Exception as e:
    print("MeCab 사용 불가 :", e)

try:
    from konlpy.tag import Kkma
    AVAILABLE["kkma"] = Kkma()
except Exception as e:
    print("Kkma 사용 불가 :", e)

# Show the keys of the analyzers that loaded.
print("가능한 모델 :", list(AVAILABLE.keys()))

def tokenize_konlpy(text, analyzer="okt"):
    """Split *text* into morphemes with one of the analyzers in AVAILABLE.

    Args:
        text: sentence to tokenize.
        analyzer: key into AVAILABLE ("okt", "mecab" or "kkma").

    Returns:
        list[str] of tokens.

    Fallbacks:
        * If *analyzer* is not in AVAILABLE, "okt" is used when present,
          otherwise the first available analyzer.
        * If no analyzer is available at all, or the selected key is not one of
          the three morphological analyzers, the text is split on whitespace.
          (Previously an empty AVAILABLE raised IndexError before this fallback
          could be reached.)
    """
    if analyzer not in AVAILABLE:
        if not AVAILABLE:
            return text.split()
        analyzer = "okt" if "okt" in AVAILABLE else next(iter(AVAILABLE))

    # All three analyzers are called through the same ``morphs`` method.
    # Note from the original author: Kkma gives good quality but is comparatively slow.
    if analyzer in ("okt", "mecab", "kkma"):
        return AVAILABLE[analyzer].morphs(text)
    return text.split()

def build_vocab_from_tokens(token_lists, min_freq=1, pad_id=0, unk_id=1, max_vocab=None):
    """Build a frequency-ordered vocabulary from tokenized sentences.

    Args:
        token_lists: List[List[str]], one token list per sentence.
        min_freq: tokens seen fewer than this many times are left out
            (this argument used to be accepted but ignored).
        pad_id: id reserved for "<pad>".
        unk_id: id reserved for "<unk>".
        max_vocab: optional cap on the vocabulary size, counting the two
            reserved entries; None means no cap.

    Returns:
        (word2id, id2word): token -> id and id -> token dictionaries.

    Regular tokens receive the smallest ids not taken by pad_id/unk_id, most
    frequent first. With the default ids (0 and 1) they start at 2 exactly as
    before; with other reserved ids they no longer collide with them.
    """
    cnt = Counter(t for lst in token_lists for t in lst)
    # Reserved entries.
    word2id = {"<pad>": pad_id, "<unk>": unk_id}
    start = 2
    # Most frequent first, honouring the minimum frequency.
    most = [(w, c) for w, c in cnt.most_common() if c >= min_freq]
    if max_vocab is not None:
        most = most[:max(0, max_vocab - start)]

    reserved = {pad_id, unk_id}
    next_id = 0
    for w, _ in most:
        if w in word2id:
            # A corpus token spelled like a reserved entry keeps the reserved id.
            continue
        while next_id in reserved:
            next_id += 1
        word2id[w] = next_id
        next_id += 1

    id2word = {i: w for w, i in word2id.items()}
    return word2id, id2word

def texts_to_tensor_by_vocab(texts, word2id, analyzer="okt", pad_id=0, unk_id=1):
    """Tokenize *texts* with tokenize_konlpy and map the tokens to ids.

    Tokens missing from *word2id* are mapped to *unk_id*. The per-sentence id
    sequences are padded with *pad_id* into a single LongTensor [B, T].
    """
    encoded = [
        torch.tensor(
            [word2id.get(tok, unk_id) for tok in tokenize_konlpy(sentence, analyzer=analyzer)],
            dtype=torch.long,
        )
        for sentence in texts
    ]
    return pad_sequence(encoded, batch_first=True, padding_value=pad_id)

가능한 모델 : ['okt', 'mecab', 'kkma']


In [7]:
# ===== 6) 데이터셋, 모델 =====
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from tqdm import tqdm

class SimpleTensorDataset(Dataset):
    """Dataset pairing row ``i`` of an input tensor with label ``i``.

    Labels are converted to a float32 tensor on construction; the input tensor
    is stored as given.
    """

    def __init__(self, x_tensor, y_array):
        self.x = x_tensor
        self.y = torch.tensor(y_array, dtype=torch.float32)

    def __len__(self):
        # The number of samples is defined by the labels.
        return len(self.y)

    def __getitem__(self, i):
        return self.x[i], self.y[i]

class BiLSTMClassifier(nn.Module):
    """Embedding -> (Bi)LSTM -> masked mean pooling -> dropout -> linear, one logit per sequence."""

    def __init__(self, vocab_size, embed_dim=256, hidden=256, num_layers=1, bidirectional=True, dropout=0.2, pad_id=0):
        super().__init__()
        directions = 2 if bidirectional else 1
        # Dropout is handed to nn.LSTM only when layers are stacked (num_layers > 1).
        lstm_dropout = dropout if num_layers > 1 else 0

        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_id)
        self.lstm = nn.LSTM(
            embed_dim,
            hidden,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=bidirectional,
            dropout=lstm_dropout,
        )
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden * directions, 1)

    def forward(self, x, pad_id=0):
        """Return logits of shape [B] for token ids ``x`` of shape [B, T].

        Positions equal to *pad_id* are excluded from the mean over time.
        """
        valid = x.ne(pad_id).float()                          # [B, T], 1.0 on real tokens
        states, _ = self.lstm(self.embedding(x))              # [B, T, H*D]
        summed = (states * valid.unsqueeze(-1)).sum(dim=1)    # [B, H*D]
        # clamp keeps rows made only of padding from dividing by zero
        counts = valid.sum(dim=1).clamp(min=1)                # [B]
        pooled = summed / counts.unsqueeze(-1)
        return self.fc(self.dropout(pooled)).squeeze(1)       # [B]

def train_epoch(model, loader, optimizer, criterion, pad_id=0):
    """Run one training pass over *loader*.

    For every batch: forward, loss, backward, gradient-norm clipping at 1.0 and
    an optimizer step. Predictions are ``sigmoid(logits) >= 0.5``.

    Returns:
        (mean loss per sample, accuracy), both accumulated over the whole pass.
    """
    model.train()
    n_seen = n_correct = 0
    loss_total = 0.0
    for x, y in tqdm(loader, leave=False):
        x, y = x.to(DEVICE), y.to(DEVICE)
        optimizer.zero_grad()
        logits = model(x, pad_id=pad_id)
        loss = criterion(logits, y)
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        # Bookkeeping only; no gradients are needed here.
        with torch.no_grad():
            batch = y.size(0)
            hits = ((torch.sigmoid(logits) >= 0.5).float() == y).sum().item()
            n_correct += hits
            n_seen += batch
            # criterion returns a batch mean, so weight it by the batch size
            loss_total += loss.item() * batch
    return loss_total/n_seen, n_correct/n_seen

@torch.no_grad()
def eval_epoch(model, loader, criterion, pad_id=0):
    model.eval()
    total, correct, loss_sum = 0, 0, 0.0
    for x, y in loader:
        x = x.to(DEVICE); y = y.to(DEVICE)
        logits = model(x, pad_id=pad_id)
        loss = criterion(logits, y)
        preds = (torch.sigmoid(logits) >= 0.5).float()
        correct += (preds == y).sum().item()
        total += y.size(0)
        loss_sum += loss.item() * y.size(0)
    return loss_sum/total, correct/total

In [8]:
# ===== 7) 하이퍼파라미터 조정 =====
from dataclasses import dataclass, asdict
from typing import Optional, Dict
from sklearn.metrics import classification_report

# Training hyperparameters shared by every experiment in this notebook.
BATCH_SIZE = 256     # DataLoader batch size
EPOCHS     = 5       # training epochs per experiment
EMBED_DIM  = 256     # embedding dimension
HIDDEN     = 256     # LSTM hidden size
DROPOUT    = 0.2     # dropout probability passed to the classifier
LR         = 0.001   # optimizer learning rate

@dataclass
class ExpConfig:
    """Description of one tokenizer experiment.

    ``kind`` selects the pipeline: "spm" uses ``spm_model``/``spm_vocab``,
    "konlpy" uses ``analyzer``/``max_vocab``.
    """
    name: str            # label used in logs and in the result table
    kind: str            # 'spm' or 'konlpy'
    spm_model: Optional[str] = None          # path to the SentencePiece .model file
    spm_vocab : Optional[str] = None         # path to the SentencePiece .vocab file
    analyzer  : Optional[str] = None         # 'okt'/'mecab'/'kkma'
    max_vocab : Optional[int] = None         # konlpy only: vocabulary size cap
    note      : str = ""                     # free-text description

def build_tensors_from_config(cfg: ExpConfig, tr_df, val_df, test_df):
    """Turn the three data splits into padded id tensors for the tokenizer in *cfg*.

    Args:
        cfg: experiment description; ``cfg.kind`` is "spm" or "konlpy".
        tr_df, val_df, test_df: DataFrames with ``document`` and ``label`` columns.

    Returns:
        A 10-tuple ``(xtr, ytr, xva, yva, xte, yte, vocab_size, pad_id,
        word_to_id, id_to_word)``. The x values are LongTensors [B, T] padded
        with ``pad_id`` (always 0); the y values are the ``label`` columns as arrays.

    Raises:
        ValueError: if ``cfg.kind`` is neither "spm" nor "konlpy".
    """
    if cfg.kind == "spm":
        sp = load_spm(cfg.spm_model)
        # train/val/test -> tensors (the vocabulary is taken from the training call)
        xtr, wi, iw = sp_tokenize(sp, tr_df['document'].tolist(), cfg.spm_vocab)
        xva, _, _   = sp_tokenize(sp, val_df['document'].tolist(), cfg.spm_vocab)
        xte, _, _   = sp_tokenize(sp, test_df['document'].tolist(), cfg.spm_vocab)
        vocab_size  = len(wi)
        pad_id = 0
        return (xtr, tr_df['label'].values,
                xva, val_df['label'].values,
                xte, test_df['label'].values,
                vocab_size, pad_id, wi, iw)

    elif cfg.kind == "konlpy":
        analyzer = cfg.analyzer if cfg.analyzer in AVAILABLE else ("okt" if "okt" in AVAILABLE else list(AVAILABLE.keys())[0])
        # The vocabulary is built from the training split only.
        tr_tokens = [tokenize_konlpy(s, analyzer=analyzer) for s in tr_df['document'].tolist()]
        word2id, id2word = build_vocab_from_tokens(tr_tokens, min_freq=1, max_vocab=cfg.max_vocab, pad_id=0, unk_id=1)

        # Reuse the training tokens computed above instead of running the
        # analyzer over the whole training set a second time. The mapping and
        # padding match texts_to_tensor_by_vocab (unk id 1, pad id 0).
        tr_seqs = [
            torch.tensor([word2id.get(t, 1) for t in toks], dtype=torch.long)
            for toks in tr_tokens
        ]
        xtr = pad_sequence(tr_seqs, batch_first=True, padding_value=0)
        xva = texts_to_tensor_by_vocab(val_df['document'].tolist(), word2id, analyzer=analyzer, pad_id=0, unk_id=1)
        xte = texts_to_tensor_by_vocab(test_df['document'].tolist(), word2id, analyzer=analyzer, pad_id=0, unk_id=1)
        vocab_size = len(word2id)
        pad_id = 0
        return (xtr, tr_df['label'].values,
                xva, val_df['label'].values,
                xte, test_df['label'].values,
                vocab_size, pad_id, word2id, id2word)
    else:
        raise ValueError("Unknown kind")

def run_experiment(cfg: ExpConfig):
    """Train and evaluate one BiLSTM classifier for the tokenizer described by *cfg*.

    Uses the notebook-level splits ``tr_df``/``val_df``/``test_df`` and the
    hyperparameter constants. The model is trained for EPOCHS epochs, the
    weights of the epoch with the best validation accuracy are restored, and
    that model is evaluated on the test split.

    Returns:
        dict with keys ``name``, ``kind``, ``note``, ``vocab_size``,
        ``test_acc``, ``test_loss``, ``val_best`` and ``report`` (the
        ``classification_report`` output as a dict).
    """
    print("\n==============================")
    print("Running:", cfg.name)
    print("==============================")
    (xtr, ytr, xva, yva, xte, yte, vocab_size, pad_id, w2i, i2w) = build_tensors_from_config(cfg, tr_df, val_df, test_df)

    # Dataloaders (only the training data is shuffled)
    tr_loader  = DataLoader(SimpleTensorDataset(xtr, ytr), batch_size=BATCH_SIZE, shuffle=True)
    va_loader  = DataLoader(SimpleTensorDataset(xva, yva), batch_size=BATCH_SIZE, shuffle=False)
    te_loader  = DataLoader(SimpleTensorDataset(xte, yte), batch_size=BATCH_SIZE, shuffle=False)

    # Model
    model = BiLSTMClassifier(vocab_size=vocab_size, embed_dim=EMBED_DIM, hidden=HIDDEN, dropout=DROPOUT, pad_id=pad_id).to(DEVICE)
    optimizer = torch.optim.AdamW(model.parameters(), lr=LR)
    criterion = nn.BCEWithLogitsLoss()

    best_val_acc = 0.0
    best_state   = None

    for epoch in range(1, EPOCHS+1):
        tr_loss, tr_acc = train_epoch(model, tr_loader, optimizer, criterion, pad_id=pad_id)
        va_loss, va_acc = eval_epoch(model, va_loader, criterion, pad_id=pad_id)
        print(f"[{cfg.name}] Epoch {epoch:02d}/{EPOCHS} | "
              f"train loss {tr_loss:.4f} acc {tr_acc:.4f} | "
              f"val loss {va_loss:.4f} acc {va_acc:.4f}")
        if va_acc > best_val_acc:
            best_val_acc = va_acc
            # clone() is required: for a model already on the CPU, .cpu() returns
            # the live tensor without copying, so the snapshot would keep changing
            # with further training and "best" would silently become "last".
            best_state   = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}

    # Evaluate with the best-validation weights
    if best_state is not None:
        model.load_state_dict(best_state)

    te_loss, te_acc = eval_epoch(model, te_loader, criterion, pad_id=pad_id)
    print(f"[{cfg.name}] TEST acc {te_acc:.4f}, loss {te_loss:.4f}")

    # Detailed per-class report
    y_pred = []
    model.eval()
    with torch.no_grad():
        for x, _ in te_loader:
            x = x.to(DEVICE)
            logits = model(x, pad_id=pad_id)
            pred = (torch.sigmoid(logits) >= 0.5).long().cpu().numpy().tolist()
            y_pred += pred
    # Use the labels that were actually fed to te_loader rather than re-reading
    # the global test_df, so y_true and y_pred always describe the same rows.
    y_true = [int(label) for label in yte]
    report = classification_report(y_true, y_pred, output_dict=True)
    return dict(
        name=cfg.name, kind=cfg.kind, note=cfg.note,
        vocab_size=vocab_size, test_acc=te_acc, test_loss=te_loss, val_best=best_val_acc, report=report
    )

In [None]:
# ===== 8) 실험 목록 & 실행 =====
# SentencePiece experiments: one per model trained above.
EXP_LIST = [
    ExpConfig(name="SPM-Unigram-8k",  kind="spm",
              spm_model=str(SPM_DIR/"spm_unigram_8k.model"),
              spm_vocab=str(SPM_DIR/"spm_unigram_8k.vocab"),
              note="SentencePiece unigram, vocab=8k"),
    ExpConfig(name="SPM-BPE-8k",      kind="spm",
              spm_model=str(SPM_DIR/"spm_bpe_8k.model"),
              spm_vocab=str(SPM_DIR/"spm_bpe_8k.vocab"),
              note="SentencePiece BPE, vocab=8k"),
    ExpConfig(name="SPM-Unigram-16k", kind="spm",
              spm_model=str(SPM_DIR/"spm_unigram_16k.model"),
              spm_vocab=str(SPM_DIR/"spm_unigram_16k.vocab"),
              note="SentencePiece unigram, vocab=16k"),
]

# KoNLPy experiments: register an analyzer only if it actually loaded.
# OKT is guarded like the others; otherwise a missing Okt would make the
# pipeline fall back to another analyzer while still reporting "OKT-30k".
if "okt" in AVAILABLE:
    EXP_LIST.append(ExpConfig(name="OKT-30k", kind="konlpy", analyzer="okt", max_vocab=30000, note="OKT morphs, vocab cap 30k"))
if "mecab" in AVAILABLE:
    EXP_LIST.append(ExpConfig(name="MeCab-30k", kind="konlpy", analyzer="mecab", max_vocab=30000, note="MeCab morphs, vocab cap 30k"))
if "kkma" in AVAILABLE:
    EXP_LIST.append(ExpConfig(name="Kkma-30k", kind="konlpy", analyzer="kkma", max_vocab=30000, note="Kkma morphs, vocab cap 30k"))

# Run every experiment in order. Results are appended one at a time, so if the
# loop is interrupted RESULTS still holds the runs that finished; the table
# below is then not built until the remaining code is run.
RESULTS = []
for cfg in EXP_LIST:
    RESULTS.append(run_experiment(cfg))

# Result table: one row per experiment, sorted by test accuracy (best first).
import pandas as pd
res_table = pd.DataFrame([{
    "name": r["name"],
    "kind": r["kind"],
    "vocab_size": r["vocab_size"],
    "val_best": round(float(r["val_best"]), 4),
    "test_acc": round(float(r["test_acc"]), 4),
    "test_loss": round(float(r["test_loss"]), 4),
    "note": r["note"]
} for r in RESULTS]).sort_values(by="test_acc", ascending=False)
res_table


Running: SPM-Unigram-8k


                                                 

[SPM-Unigram-8k] Epoch 01/5 | train loss 0.4445 acc 0.7863 | val loss 0.3595 acc 0.8412


                                                 

[SPM-Unigram-8k] Epoch 02/5 | train loss 0.3113 acc 0.8656 | val loss 0.3396 acc 0.8544


                                                 

[SPM-Unigram-8k] Epoch 03/5 | train loss 0.2717 acc 0.8865 | val loss 0.3555 acc 0.8516


                                                 

[SPM-Unigram-8k] Epoch 04/5 | train loss 0.2082 acc 0.9153 | val loss 0.3681 acc 0.8549


                                                 

[SPM-Unigram-8k] Epoch 05/5 | train loss 0.1427 acc 0.9447 | val loss 0.4294 acc 0.8482
[SPM-Unigram-8k] TEST acc 0.8535, loss 0.3653

Running: SPM-BPE-8k


                                                 

[SPM-BPE-8k] Epoch 01/5 | train loss 0.4428 acc 0.7867 | val loss 0.3614 acc 0.8380


                                                 

[SPM-BPE-8k] Epoch 02/5 | train loss 0.3139 acc 0.8649 | val loss 0.3347 acc 0.8540


                                                 

[SPM-BPE-8k] Epoch 03/5 | train loss 0.2642 acc 0.8894 | val loss 0.3409 acc 0.8554


                                                 

[SPM-BPE-8k] Epoch 04/5 | train loss 0.2001 acc 0.9193 | val loss 0.3745 acc 0.8520


                                                 

[SPM-BPE-8k] Epoch 05/5 | train loss 0.1325 acc 0.9493 | val loss 0.4364 acc 0.8514
[SPM-BPE-8k] TEST acc 0.8544, loss 0.3453

Running: SPM-Unigram-16k


                                                 

[SPM-Unigram-16k] Epoch 01/5 | train loss 0.4584 acc 0.7776 | val loss 0.3753 acc 0.8317


                                                 

[SPM-Unigram-16k] Epoch 02/5 | train loss 0.3088 acc 0.8679 | val loss 0.3462 acc 0.8501


                                                 

[SPM-Unigram-16k] Epoch 03/5 | train loss 0.2392 acc 0.9016 | val loss 0.3542 acc 0.8553


                                                 

[SPM-Unigram-16k] Epoch 04/5 | train loss 0.1741 acc 0.9315 | val loss 0.3882 acc 0.8498


                                                 

[SPM-Unigram-16k] Epoch 05/5 | train loss 0.1411 acc 0.9476 | val loss 0.4629 acc 0.8483
[SPM-Unigram-16k] TEST acc 0.8531, loss 0.3528

Running: OKT-30k


                                                 

[OKT-30k] Epoch 01/5 | train loss 0.4441 acc 0.7889 | val loss 0.3730 acc 0.8352


                                                 

[OKT-30k] Epoch 02/5 | train loss 0.3079 acc 0.8670 | val loss 0.3287 acc 0.8560


                                                 

[OKT-30k] Epoch 03/5 | train loss 0.2387 acc 0.9017 | val loss 0.3304 acc 0.8600


                                                 

[OKT-30k] Epoch 04/5 | train loss 0.1921 acc 0.9246 | val loss 0.3737 acc 0.8555


                                                 

[OKT-30k] Epoch 05/5 | train loss 0.1223 acc 0.9539 | val loss 0.4369 acc 0.8535
[OKT-30k] TEST acc 0.8551, loss 0.3390

Running: MeCab-30k


                                                 

[MeCab-30k] Epoch 01/5 | train loss 0.4021 acc 0.8131 | val loss 0.3345 acc 0.8537


                                                 

[MeCab-30k] Epoch 02/5 | train loss 0.2906 acc 0.8758 | val loss 0.3071 acc 0.8706


                                                 

[MeCab-30k] Epoch 03/5 | train loss 0.2293 acc 0.9069 | val loss 0.3111 acc 0.8729


                                                 

[MeCab-30k] Epoch 04/5 | train loss 0.1719 acc 0.9335 | val loss 0.3405 acc 0.8681


                                                 

[MeCab-30k] Epoch 05/5 | train loss 0.1158 acc 0.9583 | val loss 0.3904 acc 0.8673
[MeCab-30k] TEST acc 0.8661, loss 0.3202

Running: Kkma-30k
