# 로이터 뉴스 분류: 머신러닝 vs. 딥러닝(1D-CNN) 모델 성능 비교

## 목표
- [이전 실험](./news_classification_vocab_size.ipynb)에서 찾은 최적의 머신러닝 모델(`VotingClassifier`)과 PyTorch로 구현한 딥러닝 모델(`1D-CNN`)의 성능 비교

In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from tensorflow.keras.datasets import reuters
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import ComplementNB
from sklearn.ensemble import GradientBoostingClassifier, VotingClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

## 1. 최적 조건 설정 및 데이터 로드

In [2]:
OPTIMAL_NUM_WORDS = 20000
MAX_LEN = 300

(x_train_full, y_train_full), (x_test, y_test) = reuters.load_data(num_words=OPTIMAL_NUM_WORDS, test_split=0.2)

print(f'Vocabulary Size: {OPTIMAL_NUM_WORDS}')
print(f'Full Train samples: {len(x_train_full)}')
print(f'Test samples: {len(x_test)}')

Vocabulary Size: 20000
Full Train samples: 8982
Test samples: 2246


## 2. 비교 실험 1: 최적의 머신러닝 모델 (VotingClassifier)

In [3]:
x_train_str = [' '.join(map(str, doc)) for doc in x_train_full]
x_test_str = [' '.join(map(str, doc)) for doc in x_test]

vectorizer = CountVectorizer(token_pattern=r'[0-9]+').fit(x_train_str)
x_train_dtm = vectorizer.transform(x_train_str)
x_test_dtm = vectorizer.transform(x_test_str)

tfidf_transformer = TfidfTransformer()
x_train_tfidf = tfidf_transformer.fit_transform(x_train_dtm)
x_test_tfidf = tfidf_transformer.transform(x_test_dtm)

voting_classifier = VotingClassifier(estimators=[
         ('lr', LogisticRegression(C=10000, penalty='l2', max_iter=3000)),
         ('cb', ComplementNB()),
         ('grbt', GradientBoostingClassifier(random_state=0))
        ], voting='soft', n_jobs=-1)

print('Training VotingClassifier...')
voting_classifier.fit(x_train_tfidf, y_train_full)
predicted = voting_classifier.predict(x_test_tfidf)
ml_accuracy = accuracy_score(y_test, predicted)
print(f'VotingClassifier 최종 정확도: {ml_accuracy:.4f}')

Training VotingClassifier...
VotingClassifier 최종 정확도: 0.8206


## 3. 비교 실험 2: PyTorch 딥러닝 모델 (1D-CNN)

In [4]:
# 1. Device 설정 (MPS 가속)
if torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')
print(f'Using device: {device}')

# 2. 데이터 패딩 및 Tensor 변환
x_train_pad = pad_sequences(x_train_full, maxlen=MAX_LEN)
x_test_pad = pad_sequences(x_test, maxlen=MAX_LEN)

# 3. Train / Validation 분리
x_train, x_val, y_train, y_val = train_test_split(x_train_pad, y_train_full, test_size=0.2, random_state=42)

# 4. DataLoader 생성
train_data = TensorDataset(torch.from_numpy(x_train), torch.from_numpy(y_train))
val_data = TensorDataset(torch.from_numpy(x_val), torch.from_numpy(y_val))
test_data = TensorDataset(torch.from_numpy(x_test_pad), torch.from_numpy(y_test))

batch_size = 128
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
val_loader = DataLoader(val_data, batch_size=batch_size)
test_loader = DataLoader(test_data, batch_size=batch_size)

Using device: mps


In [5]:
# PyTorch 1D-CNN 모델 정의
class CNNClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, dropout):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.convs = nn.ModuleList([
            nn.Conv1d(in_channels=embedding_dim, 
                      out_channels=n_filters, 
                      kernel_size=fs) 
            for fs in filter_sizes
        ])
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text):
        embedded = self.embedding(text)
        embedded = embedded.permute(0, 2, 1) # Conv1d는 (N, C, L) 입력을 받음
        
        conved = [F.relu(conv(embedded)) for conv in self.convs]
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        
        cat = self.dropout(torch.cat(pooled, dim=1))
        return self.fc(cat)

In [6]:
# 모델, 손실 함수, 옵티마이저 정의
EMBEDDING_DIM = 128
N_FILTERS = 100
FILTER_SIZES = [3, 4, 5]
OUTPUT_DIM = 46 # 로이터 뉴스의 카테고리 수
DROPOUT = 0.5

model = CNNClassifier(OPTIMAL_NUM_WORDS, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT).to(device)
criterion = nn.CrossEntropyLoss().to(device)
optimizer = optim.Adam(model.parameters())

# 모델 학습
epochs = 15
for epoch in range(epochs):
    model.train()
    epoch_loss = 0
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device).long()
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    
    # 검증 데이터로 평가
    model.eval()
    val_loss, val_corrects = 0, 0
    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs, labels = inputs.to(device), labels.to(device).long()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            val_loss += loss.item()
            _, preds = torch.max(outputs, 1)
            val_corrects += torch.sum(preds == labels.data)
            
    print(f'Epoch {epoch+1}/{epochs} | Train Loss: {epoch_loss/len(train_loader):.4f} | Val Loss: {val_loss/len(val_loader):.4f} | Val Acc: {val_corrects.float()/len(val_data):.4f}')

Epoch 1/15 | Train Loss: 2.2458 | Val Loss: 1.7329 | Val Acc: 0.6082
Epoch 2/15 | Train Loss: 1.7040 | Val Loss: 1.4230 | Val Acc: 0.6834
Epoch 3/15 | Train Loss: 1.4343 | Val Loss: 1.2875 | Val Acc: 0.6845
Epoch 4/15 | Train Loss: 1.2788 | Val Loss: 1.1605 | Val Acc: 0.7407
Epoch 5/15 | Train Loss: 1.1256 | Val Loss: 1.1060 | Val Acc: 0.7457
Epoch 6/15 | Train Loss: 1.0319 | Val Loss: 1.0122 | Val Acc: 0.7713
Epoch 7/15 | Train Loss: 0.9542 | Val Loss: 0.9974 | Val Acc: 0.7685
Epoch 8/15 | Train Loss: 0.9021 | Val Loss: 0.9343 | Val Acc: 0.7791
Epoch 9/15 | Train Loss: 0.8313 | Val Loss: 0.9063 | Val Acc: 0.7941
Epoch 10/15 | Train Loss: 0.7734 | Val Loss: 0.8857 | Val Acc: 0.7935
Epoch 11/15 | Train Loss: 0.7074 | Val Loss: 0.8748 | Val Acc: 0.7958
Epoch 12/15 | Train Loss: 0.6692 | Val Loss: 0.8949 | Val Acc: 0.7947
Epoch 13/15 | Train Loss: 0.6096 | Val Loss: 0.8868 | Val Acc: 0.7947
Epoch 14/15 | Train Loss: 0.5556 | Val Loss: 0.8692 | Val Acc: 0.8152
Epoch 15/15 | Train Loss: 0.5

In [7]:
# 최종 평가
model.eval()
test_corrects = 0
with torch.no_grad():
    for inputs, labels in test_loader:
        inputs, labels = inputs.to(device), labels.to(device).long()
        outputs = model(inputs)
        _, preds = torch.max(outputs, 1)
        test_corrects += torch.sum(preds == labels.data)

dl_accuracy = test_corrects.float() / len(test_data)
print(f'PyTorch 1D-CNN 최종 정확도: {dl_accuracy:.4f}')

PyTorch 1D-CNN 최종 정확도: 0.7907


## 4. 최종 결론

In [8]:
print(f'- 최적 머신러닝 모델 (VotingClassifier) 정확도: {ml_accuracy:.4f}')
print(f'- 딥러닝 모델 (PyTorch 1D-CNN) 정확도: {dl_accuracy:.4f}')

if dl_accuracy > ml_accuracy:
    print('>> 딥러닝 모델이 더 높은 성능을 보였습니다.')
else:
    print('>> 머신러닝 앙상블 모델이 더 높은 성능을 보였습니다.')

- 최적 머신러닝 모델 (VotingClassifier) 정확도: 0.8206
- 딥러닝 모델 (PyTorch 1D-CNN) 정확도: 0.7907
>> 머신러닝 앙상블 모델이 더 높은 성능을 보였습니다.
