# 네이버 영화리뷰 감정 분석: SentencePiece 성능 개선 (vocab_size 변경)

## 목표
1. SentencePiece의 `unigram` 모델 타입을 기준으로, `vocab_size`를 [4000, 8000, 16000, 32000]으로 변경하며 모델을 각각 학습
2. 각 `vocab_size`에 따른 감정 분석 모델(BiLSTM, 1D CNN)의 성능 변화를 비교 분석하여 최적의 `vocab_size`를 탐색

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from soynlp.normalizer import emoticon_normalize
import sentencepiece as spm
import os

import warnings
warnings.filterwarnings('ignore')

## 1. 데이터 로드 및 전처리

In [2]:
train_df = pd.read_table('ratings_train.txt')
test_df = pd.read_table('ratings_test.txt')

In [3]:
def preprocess_data(df, is_train=True):
    df_name = '훈련' if is_train else '테스트'
    print(f'--- {df_name} 데이터 전처리 ---')
    print(f'전처리 전 데이터 개수: {len(df)}')
    df.dropna(inplace=True)
    print(f'결측치 제거 후: {len(df)}개')
    if is_train:
        df.drop_duplicates(subset=['document'], inplace=True)
        print(f'중복 제거 후: {len(df)}개')
    df['document'] = df['document'].apply(lambda x: emoticon_normalize(x, num_repeats=2) if isinstance(x, str) else x)
    df.dropna(inplace=True)
    df = df[~df['document'].str.match('^[ㄱ-ㅎㅏ-ㅣ]+$', na=False)]
    min_char_len = 3
    max_char_len = 140
    df = df[df['document'].str.len() >= min_char_len]
    df['document'] = df['document'].str.slice(0, max_char_len)
    print(f'최종 데이터 개수: {len(df)}개')
    return df

train_df_clean = preprocess_data(train_df.copy(), is_train=True)
test_df_clean = preprocess_data(test_df.copy(), is_train=False)

--- 훈련 데이터 전처리 ---
전처리 전 데이터 개수: 150000
결측치 제거 후: 149995개
중복 제거 후: 146182개
최종 데이터 개수: 145594개
--- 테스트 데이터 전처리 ---
전처리 전 데이터 개수: 50000
결측치 제거 후: 49997개
최종 데이터 개수: 49369개


## 2. 모델 및 학습/평가 함수 정의

In [4]:
class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, padding_idx=0):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=padding_idx)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, (hidden, cell) = self.lstm(embedded)
        hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1)
        out = self.fc(hidden)
        return self.sigmoid(out)

class CNNModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, dropout):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.convs = nn.ModuleList([
            nn.Conv1d(in_channels=embedding_dim, out_channels=n_filters, kernel_size=fs)
            for fs in filter_sizes
        ])
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        self.dropout = nn.Dropout(dropout)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        embedded = self.embedding(x).permute(0, 2, 1)
        conved = [torch.relu(conv(embedded)) for conv in self.convs]
        pooled = [torch.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        cat = self.dropout(torch.cat(pooled, dim=1))
        out = self.fc(cat)
        return self.sigmoid(out)

In [5]:
def train_and_evaluate(model, train_loader, val_loader, test_loader, num_epochs, device, lr=1e-3, model_name='Model'):
    model = model.to(device)
    criterion = nn.BCELoss().to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    
    print(f'--- {model_name} 모델 학습 시작 ---')
    for epoch in range(num_epochs):
        model.train()
        epoch_train_correct, epoch_train_total = 0, 0
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(inputs).squeeze()
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            predicted = (outputs > 0.5).float()
            epoch_train_correct += (predicted == labels).sum().item()
            epoch_train_total += labels.size(0)
        avg_train_acc = epoch_train_correct / epoch_train_total

        model.eval()
        epoch_val_correct, epoch_val_total = 0, 0
        with torch.no_grad():
            for inputs, labels in val_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = model(inputs).squeeze()
                predicted = (outputs > 0.5).float()
                epoch_val_correct += (predicted == labels).sum().item()
                epoch_val_total += labels.size(0)
        avg_val_acc = epoch_val_correct / epoch_val_total
        
        print(f'Epoch [{epoch+1}/{num_epochs}] -> Train Acc: {avg_train_acc:.4f} | Val Acc: {avg_val_acc:.4f}')

    model.eval()
    test_correct, test_total = 0, 0
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs).squeeze()
            predicted = (outputs > 0.5).float()
            test_total += labels.size(0)
            test_correct += (predicted == labels).sum().item()
    test_accuracy = test_correct / test_total
    print(f'Final Test Accuracy: {test_accuracy:.4f}')
    return test_accuracy

## 3. vocab_size에 따른 성능 비교 실험

In [6]:
# 말뭉치 파일 생성
corpus_file = 'nsmc_corpus.txt'
with open(corpus_file, 'w', encoding='utf-8') as f:
    for doc in train_df_clean['document']:
        f.write(doc + '\n')

results = {}
vocab_sizes_to_test = [4000, 8000, 16000, 32000]
model_type = 'unigram'

for vocab_size in vocab_sizes_to_test:
    print(f'{'='*50}')
    print(f'Running Experiment for vocab_size = {vocab_size}')
    print(f'{'='*50}')
    
    # 1. SentencePiece 모델 학습
    model_prefix = f'nsmc_spm_{model_type}_{vocab_size}'
    params = f'--input={corpus_file} --model_prefix={model_prefix} --vocab_size={vocab_size} --model_type={model_type}'
    spm.SentencePieceTrainer.Train(params)
    
    # 2. 토크나이저 로드 및 데이터 준비
    sp = spm.SentencePieceProcessor()
    sp.Load(f'{model_prefix}.model')
    
    def sp_tokenize(s, corpus):
        sequences = [s.EncodeAsIds(text) for text in corpus]
        sequences = [torch.tensor(seq, dtype=torch.long) for seq in sequences]
        return pad_sequence(sequences, batch_first=True, padding_value=0)
        
    X_train = sp_tokenize(sp, train_df_clean['document'].tolist())
    X_test = sp_tokenize(sp, test_df_clean['document'].tolist())
    y_train = torch.tensor(train_df_clean['label'].values, dtype=torch.float32)
    y_test = torch.tensor(test_df_clean['label'].values, dtype=torch.float32)
    
    val_size = 40000
    X_val, y_val = X_train[:val_size], y_train[:val_size]
    X_train_final, y_train_final = X_train[val_size:], y_train[val_size:]

    train_dataset = TensorDataset(X_train_final, y_train_final)
    val_dataset = TensorDataset(X_val, y_val)
    test_dataset = TensorDataset(X_test, y_test)

    batch_size = 512
    train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
    val_loader = DataLoader(val_dataset, shuffle=False, batch_size=batch_size)
    test_loader = DataLoader(test_dataset, shuffle=False, batch_size=batch_size)
    
    # 3. 모델 학습 및 평가
    if torch.backends.mps.is_available(): device = torch.device('mps')
    else: device = torch.device('cpu')
    
    VOCAB_SIZE = sp.GetPieceSize()
    EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM = 128, 128, 1
    NUM_EPOCHS, LEARNING_RATE = 7, 1e-3
    
    # BiLSTM
    bilstm_model = LSTMModel(VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, padding_idx=0)
    bilstm_acc = train_and_evaluate(bilstm_model, train_loader, val_loader, test_loader, NUM_EPOCHS, device, lr=LEARNING_RATE, model_name=f'BiLSTM (vocab_size={vocab_size})')
    
    # 1D CNN
    N_FILTERS, FILTER_SIZES, DROPOUT = 100, [3,4,5], 0.5
    cnn_model = CNNModel(VOCAB_SIZE, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT)
    cnn_acc = train_and_evaluate(cnn_model, train_loader, val_loader, test_loader, NUM_EPOCHS, device, lr=LEARNING_RATE, model_name=f'1D CNN (vocab_size={vocab_size})')
    
    results[vocab_size] = {'BiLSTM_Accuracy': bilstm_acc, 'CNN_Accuracy': cnn_acc}

Running Experiment for vocab_size = 4000


sentencepiece_trainer.cc(178) LOG(INFO) Running command: --input=nsmc_corpus.txt --model_prefix=nsmc_spm_unigram_4000 --vocab_size=4000 --model_type=unigram
sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input: nsmc_corpus.txt
  input_format: 
  model_prefix: nsmc_spm_unigram_4000
  model_type: UNIGRAM
  vocab_size: 4000
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  seed_sentencepieces_file: 
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id

--- BiLSTM (vocab_size=4000) 모델 학습 시작 ---
Epoch [1/7] -> Train Acc: 0.6930 | Val Acc: 0.7981
Epoch [2/7] -> Train Acc: 0.8239 | Val Acc: 0.8327
Epoch [3/7] -> Train Acc: 0.8506 | Val Acc: 0.8414
Epoch [4/7] -> Train Acc: 0.8654 | Val Acc: 0.8451
Epoch [5/7] -> Train Acc: 0.8773 | Val Acc: 0.8450
Epoch [6/7] -> Train Acc: 0.8890 | Val Acc: 0.8485
Epoch [7/7] -> Train Acc: 0.8999 | Val Acc: 0.8502
Final Test Accuracy: 0.8449
--- 1D CNN (vocab_size=4000) 모델 학습 시작 ---
Epoch [1/7] -> Train Acc: 0.6728 | Val Acc: 0.7744
Epoch [2/7] -> Train Acc: 0.7883 | Val Acc: 0.8190
Epoch [3/7] -> Train Acc: 0.8251 | Val Acc: 0.8330
Epoch [4/7] -> Train Acc: 0.8452 | Val Acc: 0.8423
Epoch [5/7] -> Train Acc: 0.8559 | Val Acc: 0.8447
Epoch [6/7] -> Train Acc: 0.8664 | Val Acc: 0.8489
Epoch [7/7] -> Train Acc: 0.8735 | Val Acc: 0.8497
Final Test Accuracy: 0.8471
Running Experiment for vocab_size = 8000


sentencepiece_trainer.cc(178) LOG(INFO) Running command: --input=nsmc_corpus.txt --model_prefix=nsmc_spm_unigram_8000 --vocab_size=8000 --model_type=unigram
sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input: nsmc_corpus.txt
  input_format: 
  model_prefix: nsmc_spm_unigram_8000
  model_type: UNIGRAM
  vocab_size: 8000
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  seed_sentencepieces_file: 
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id

--- BiLSTM (vocab_size=8000) 모델 학습 시작 ---
Epoch [1/7] -> Train Acc: 0.6830 | Val Acc: 0.7891
Epoch [2/7] -> Train Acc: 0.8216 | Val Acc: 0.8233
Epoch [3/7] -> Train Acc: 0.8530 | Val Acc: 0.8403
Epoch [4/7] -> Train Acc: 0.8737 | Val Acc: 0.8459
Epoch [5/7] -> Train Acc: 0.8889 | Val Acc: 0.8513
Epoch [6/7] -> Train Acc: 0.9028 | Val Acc: 0.8494
Epoch [7/7] -> Train Acc: 0.9150 | Val Acc: 0.8502
Final Test Accuracy: 0.8445
--- 1D CNN (vocab_size=8000) 모델 학습 시작 ---
Epoch [1/7] -> Train Acc: 0.6470 | Val Acc: 0.7525
Epoch [2/7] -> Train Acc: 0.7710 | Val Acc: 0.8104
Epoch [3/7] -> Train Acc: 0.8227 | Val Acc: 0.8329
Epoch [4/7] -> Train Acc: 0.8479 | Val Acc: 0.8419
Epoch [5/7] -> Train Acc: 0.8627 | Val Acc: 0.8488
Epoch [6/7] -> Train Acc: 0.8749 | Val Acc: 0.8508
Epoch [7/7] -> Train Acc: 0.8819 | Val Acc: 0.8520
Final Test Accuracy: 0.8472
Running Experiment for vocab_size = 16000


sentencepiece_trainer.cc(178) LOG(INFO) Running command: --input=nsmc_corpus.txt --model_prefix=nsmc_spm_unigram_16000 --vocab_size=16000 --model_type=unigram
sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input: nsmc_corpus.txt
  input_format: 
  model_prefix: nsmc_spm_unigram_16000
  model_type: UNIGRAM
  vocab_size: 16000
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  seed_sentencepieces_file: 
  hard_vocab_limit: 1
  use_all_vocab: 0
  un

--- BiLSTM (vocab_size=16000) 모델 학습 시작 ---
Epoch [1/7] -> Train Acc: 0.6565 | Val Acc: 0.7205
Epoch [2/7] -> Train Acc: 0.7960 | Val Acc: 0.8172
Epoch [3/7] -> Train Acc: 0.8511 | Val Acc: 0.8355
Epoch [4/7] -> Train Acc: 0.8749 | Val Acc: 0.8426
Epoch [5/7] -> Train Acc: 0.8932 | Val Acc: 0.8468
Epoch [6/7] -> Train Acc: 0.9091 | Val Acc: 0.8470
Epoch [7/7] -> Train Acc: 0.9222 | Val Acc: 0.8462
Final Test Accuracy: 0.8440
--- 1D CNN (vocab_size=16000) 모델 학습 시작 ---
Epoch [1/7] -> Train Acc: 0.6366 | Val Acc: 0.7396
Epoch [2/7] -> Train Acc: 0.7641 | Val Acc: 0.7967
Epoch [3/7] -> Train Acc: 0.8191 | Val Acc: 0.8232
Epoch [4/7] -> Train Acc: 0.8499 | Val Acc: 0.8366
Epoch [5/7] -> Train Acc: 0.8672 | Val Acc: 0.8435
Epoch [6/7] -> Train Acc: 0.8822 | Val Acc: 0.8474
Epoch [7/7] -> Train Acc: 0.8922 | Val Acc: 0.8488
Final Test Accuracy: 0.8461
Running Experiment for vocab_size = 32000


sentencepiece_trainer.cc(178) LOG(INFO) Running command: --input=nsmc_corpus.txt --model_prefix=nsmc_spm_unigram_32000 --vocab_size=32000 --model_type=unigram
sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input: nsmc_corpus.txt
  input_format: 
  model_prefix: nsmc_spm_unigram_32000
  model_type: UNIGRAM
  vocab_size: 32000
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  seed_sentencepieces_file: 
  hard_vocab_limit: 1
  use_all_vocab: 0
  un

--- BiLSTM (vocab_size=32000) 모델 학습 시작 ---
Epoch [1/7] -> Train Acc: 0.6598 | Val Acc: 0.7437
Epoch [2/7] -> Train Acc: 0.7980 | Val Acc: 0.8085
Epoch [3/7] -> Train Acc: 0.8501 | Val Acc: 0.8236
Epoch [4/7] -> Train Acc: 0.8757 | Val Acc: 0.8362
Epoch [5/7] -> Train Acc: 0.9033 | Val Acc: 0.8407
Epoch [6/7] -> Train Acc: 0.9219 | Val Acc: 0.8401
Epoch [7/7] -> Train Acc: 0.9365 | Val Acc: 0.8397
Final Test Accuracy: 0.8384
--- 1D CNN (vocab_size=32000) 모델 학습 시작 ---
Epoch [1/7] -> Train Acc: 0.6394 | Val Acc: 0.7346
Epoch [2/7] -> Train Acc: 0.7562 | Val Acc: 0.7873
Epoch [3/7] -> Train Acc: 0.8166 | Val Acc: 0.8142
Epoch [4/7] -> Train Acc: 0.8521 | Val Acc: 0.8283
Epoch [5/7] -> Train Acc: 0.8761 | Val Acc: 0.8372
Epoch [6/7] -> Train Acc: 0.8937 | Val Acc: 0.8407
Epoch [7/7] -> Train Acc: 0.9072 | Val Acc: 0.8430
Final Test Accuracy: 0.8382


## 4. 최종 결과 비교

In [7]:
results_df = pd.DataFrame(results).T
results_df.index.name = 'Vocab Size'
print(results_df)

            BiLSTM_Accuracy  CNN_Accuracy
Vocab Size                               
4000               0.844882      0.847131
8000               0.844518      0.847171
16000              0.844011      0.846098
32000              0.838421      0.838198
