In [1]:
%pip install transformers gradio

Collecting transformers
  Using cached transformers-4.56.1-py3-none-any.whl.metadata (42 kB)
Collecting gradio
  Using cached gradio-5.45.0-py3-none-any.whl.metadata (16 kB)
Collecting huggingface-hub<1.0,>=0.34.0 (from transformers)
  Using cached huggingface_hub-0.34.4-py3-none-any.whl.metadata (14 kB)
Collecting tokenizers<=0.23.0,>=0.22.0 (from transformers)
  Using cached tokenizers-0.22.0-cp39-abi3-win_amd64.whl.metadata (6.9 kB)
Collecting safetensors>=0.4.3 (from transformers)
  Using cached safetensors-0.6.2-cp38-abi3-win_amd64.whl.metadata (4.1 kB)
Collecting aiofiles<25.0,>=22.0 (from gradio)
  Using cached aiofiles-24.1.0-py3-none-any.whl.metadata (10 kB)
Collecting brotli>=1.1.0 (from gradio)
  Using cached Brotli-1.1.0-cp312-cp312-win_amd64.whl.metadata (5.6 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Using cached fastapi-0.116.1-py3-none-any.whl.metadata (28 kB)
Collecting ffmpy (from gradio)
  Using cached ffmpy-0.6.1-py3-none-any.whl.metadata (2.9 kB)
Collecti

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
spacy 3.7.4 requires typer<0.10.0,>=0.3.0, but you have typer 0.17.4 which is incompatible.
weasel 0.3.4 requires typer<0.10.0,>=0.3.0, but you have typer 0.17.4 which is incompatible.

[notice] A new release of pip is available: 25.0.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [8]:
import re
import time
import random
from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.amp import GradScaler, autocast

from transformers import (
    AutoTokenizer,
    AutoModel,
    get_linear_schedule_with_warmup
)

# Korean font settings for matplotlib (plus ASCII minus signs on the axes).
plt.rcParams.update({
    "font.family": "Malgun Gothic",
    "axes.unicode_minus": False,
})

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(f"Using device: {device}")


Using device: cpu


In [14]:
def clean_text(text):
    """Normalise one raw review into a plain, single-spaced string.

    Missing values (anything ``pd.isna`` reports as missing) become ``""``.
    Every other value is converted with ``str``; characters other than Hangul
    syllables, Hangul jamo, ASCII letters, digits and whitespace are replaced
    by a space, runs of whitespace are collapsed to one space, and the result
    is stripped at both ends.
    """
    if pd.isna(text):
        return ""

    # Keep only Hangul, ASCII letters, digits and whitespace.
    kept = re.sub(r'[^가-힣ㄱ-ㅎㅏ-ㅣa-zA-Z0-9\s]', ' ', str(text))
    # Collapse whitespace runs, then trim the ends.
    return re.sub(r'\s+', ' ', kept).strip()

def load_ratings_data(train_path, test_path):
    """Read the train and test rating files with ``pd.read_table``.

    Args:
        train_path: Path of the training file.
        test_path: Path of the test file.

    Returns:
        ``(train_df, test_df)``. Rows containing any missing value are dropped
        from the training frame only; the test frame is returned as read.
    """
    train_df, test_df = (pd.read_table(path) for path in (train_path, test_path))
    return train_df.dropna(), test_df

def split_train_val(train_df, val_ratio=0.15, random_state=42):
    """Split ``train_df`` into train/validation frames, stratified by ``label``.

    For every distinct value in the ``label`` column, a ``val_ratio`` fraction
    of that class's rows (rounded down) is drawn at random for validation; the
    remaining rows form the training set.

    Args:
        train_df: DataFrame with a ``label`` column.
        val_ratio: Fraction of each class to place in the validation set.
        random_state: Seed passed to ``np.random.seed`` (this reseeds NumPy's
            global legacy RNG, as the function has always done).

    Returns:
        ``(train_data, val_data)``, both with a fresh ``0..n-1`` index.
    """
    np.random.seed(random_state)
    labels = train_df["label"].values
    indices = np.arange(len(train_df))
    val_parts = []

    for label in np.unique(labels):
        # Boolean indexing returns a copy, so the in-place shuffle below does
        # not disturb `indices`.
        label_indices = indices[labels == label]
        # The validation size is a *fraction* of the class. The previous
        # `len(...) - val_ratio` sent all but one row of every class to
        # validation.
        n_val = int(len(label_indices) * val_ratio)
        np.random.shuffle(label_indices)
        val_parts.append(label_indices[:n_val])

    # Keep an integer dtype even when nothing was selected, so `iloc` accepts it.
    if val_parts:
        val_indices = np.concatenate(val_parts)
    else:
        val_indices = np.array([], dtype=indices.dtype)
    train_indices = np.setdiff1d(indices, val_indices)

    train_data = train_df.iloc[train_indices].reset_index(drop=True)
    val_data = train_df.iloc[val_indices].reset_index(drop=True)

    return train_data, val_data

In [16]:
# Checkpoint identifier handed to AutoTokenizer.from_pretrained and, through
# SentimentClassifier, to AutoModel.from_pretrained.
model_name = 'klue/roberta-base'

In [17]:
class NLPDataset(Dataset):
    """Map-style dataset that tokenizes one text per item on access.

    Args:
        texts: Indexable collection of texts (each item is passed through ``str``).
        labels: Indexable collection of labels, aligned with ``texts``.
        tokenizer: Callable accepting ``truncation``, ``padding``,
            ``max_length`` and ``return_tensors`` keyword arguments and
            returning a mapping with ``input_ids`` and ``attention_mask`` tensors.
        max_length: Length every sequence is truncated/padded to.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts, self.labels = texts, labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of samples, taken from ``texts``."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for one sample."""
        encoded = self.tokenizer(
            str(self.texts[idx]),
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
        )

        # The tokenizer output is flattened to 1-D per sample.
        sample = {key: encoded[key].flatten() for key in ('input_ids', 'attention_mask')}
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

In [None]:
# NOTE(review): repeats the `model_name` assignment from the earlier cell with
# the same value; this cell looks redundant and could probably be removed.
model_name = 'klue/roberta-base'

In [23]:
# Dataset locations
train_path = "data/ratings_train.txt"
test_path = "data/ratings_test.txt"
train_df, test_df = load_ratings_data(train_path, test_path)

# Clean every document and drop the rows that end up empty.
# pandas keeps the old index labels after rows are filtered out, so the index
# is reset here. (Skip the reset if later logic relies on the original labels,
# e.g. when combining with iloc/mask-based indices.)
train_df = (
    train_df
    .assign(document=train_df['document'].apply(clean_text))
    .loc[lambda frame: frame['document'].str.len() > 0]
    .reset_index(drop=True)
)
test_df = (
    test_df
    .assign(document=test_df['document'].apply(clean_text))
    .loc[lambda frame: frame['document'].str.len() > 0]
    .reset_index(drop=True)
)
train_data, val_data = split_train_val(train_df, val_ratio=0.15, random_state=42)

tokenizer = AutoTokenizer.from_pretrained(model_name)

max_length = 128
batch_size = 32

# One dataset per split, all sharing the tokenizer and sequence length.
train_dataset, val_dataset, test_dataset = (
    NLPDataset(
        texts=frame['document'].values,
        labels=frame['label'].values,
        tokenizer=tokenizer,
        max_length=max_length,
    )
    for frame in (train_data, val_data, test_df)
)

# Only the training loader shuffles.
train_loader, val_loader, test_loader = (
    DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)
    for dataset, shuffle in (
        (train_dataset, True),
        (val_dataset, False),
        (test_dataset, False),
    )
)


In [21]:
class SentimentClassifier(nn.Module):
    """Pretrained encoder followed by dropout and a linear classification head.

    Args:
        model_name: Checkpoint identifier passed to ``AutoModel.from_pretrained``.
        n_classes: Number of output logits.
        dropout_rate: Dropout probability applied to the pooled encoder output.

    The encoder attribute keeps its original name ``bart`` because that name
    is part of the module's ``state_dict`` keys.
    """

    def __init__(self, model_name, n_classes, dropout_rate):
        super().__init__()
        self.bart = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(dropout_rate)
        hidden_size = self.bart.config.hidden_size
        self.fc = nn.Linear(hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        """Return the classification logits for a batch of token ids and masks."""
        encoded = self.bart(input_ids=input_ids, attention_mask=attention_mask)
        # The encoder's `pooler_output` serves as the sentence-level embedding.
        sentence_embedding = encoded.pooler_output
        return self.fc(self.dropout(sentence_embedding))

In [22]:
# Build the two-class classifier and move it to the selected device in one step.
model = SentimentClassifier(model_name=model_name, n_classes=2, dropout_rate=0.2).to(device)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [28]:
def validate_model(model, val_loader):
    """Return the model's accuracy (in percent) over ``val_loader``.

    The model is switched to eval mode and run without gradient tracking.
    Each batch must provide ``input_ids``, ``attention_mask`` and ``labels``;
    they are moved to the module-level ``device``. Autocast is enabled only
    when that device is CUDA.
    """
    model.eval()
    use_amp = device.type == 'cuda'
    n_correct = n_seen = 0

    with torch.no_grad():
        for batch in val_loader:
            ids, mask, targets = (
                batch[key].to(device)
                for key in ('input_ids', 'attention_mask', 'labels')
            )

            with autocast(device_type=device.type, enabled=use_amp):
                logits = model(ids, mask)

            # Predicted class = index of the largest logit per row.
            predicted = torch.max(logits, 1).indices
            n_seen += targets.size(0)
            n_correct += (predicted == targets).sum().item()

    return 100.0 * n_correct / n_seen

In [29]:
def train_model(model, train_loader, val_loader, epochs, lr, patience):
    optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=0.01) # weight_decay 학습 조기 종료를 막고 기울기의 급격한 변화를 막는다
    criterion = nn.CrossEntropyLoss()
    total_steps = len(train_loader) * epochs

    # weight_decay => SOTA
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=int(0.1 * total_steps),
        num_training_steps=total_steps
    )

    # NLP에서 이거 안 쓰면 답이 없음
    scaler = GradScaler(enabled=(device.type == 'cuda'))

    train_losses, val_accuracies, train_accuracies = [], [], []
    best_val_acc = 0.0
    epochs_no_improve = 0

    for epoch in range(epochs):
        print(f"\nEpoch {epoch + 1}/{epochs}")
        print("-" * 30)

        model.train()
        total_loss, correct_predictions, total_predictions = 0.0, 0, 0
        start_time = time.time()

        for batch_idx, batch in enumerate(train_loader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()

            with autocast(device_type=device.type, enabled=(device.type == 'cuda')):
                outputs = model(input_ids, attention_mask)
                loss = criterion(outputs, labels)

            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            scheduler.step()

            total_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            total_predictions += labels.size(0)
            correct_predictions += (predicted == labels).sum().item()

            if (batch_idx + 1) % 200 == 0:
                current_acc = 100.0 * correct_predictions / total_predictions
                current_loss = total_loss / (batch_idx + 1)
                print(f"  Batch {batch_idx + 1:4d}: Loss {current_loss:.4f}, Acc {current_acc:.2f}%")

        epoch_time = time.time() - start_time
        train_acc = 100.0 * correct_predictions / total_predictions
        avg_loss = total_loss / len(train_loader)
        val_acc = validate_model(model, val_loader) # 검증

        train_losses.append(avg_loss)
        train_accuracies.append(train_acc)
        val_accuracies.append(val_acc)

        print(f"\nResult:")
        print(f"  Train Loss: {avg_loss:.4f}, Train Acc: {train_acc:.2f}%")
        print(f"  Val Acc: {val_acc:.2f}%")
        print(f"  Time: {epoch_time:.1f}s, Learning Rate: {scheduler.get_last_lr()[0]:.2e}")

        if val_acc > best_val_acc:
            best_val_acc = val_acc
            torch.save(model.state_dict(), 'best_model.pth')
            print(f"  ✓ Best model saved (Val Acc: {best_val_acc:.2f}%)")
            epochs_no_improve = 0
        else:
            epochs_no_improve += 1
            print(f"  ! No improvement. (Patience: {epochs_no_improve}/{patience})")

        if epochs_no_improve >= patience:
            print(f"\n{patience} epochs without improvement. Early stopping.")
            break

    print(f"\nTrain complete! Best val accuracy: {best_val_acc:.2f}%")
    return train_losses, val_accuracies, train_accuracies

In [None]:
# Fine-tune for at most 5 epochs, stopping after 2 epochs without improvement.
training_args = {"epochs": 5, "lr": 3e-5, "patience": 2}
train_losses, val_accuracies, train_accuracies = train_model(
    model, train_loader, val_loader, **training_args
)

In [None]:
def plot_training_history(train_losses, val_accuracies, train_accuracies):
    """Draw the per-epoch training loss and train/validation accuracy side by side.

    Args:
        train_losses: Loss per epoch; its length sets the x-axis (epochs 1..n).
        val_accuracies: Validation accuracy per epoch, in percent.
        train_accuracies: Training accuracy per epoch, in percent.
    """
    _, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(15, 5))
    epochs = range(1, len(train_losses) + 1)

    def finish(ax, title, ylabel):
        # Shared cosmetics: title, axis labels, legend, light grid, integer ticks.
        ax.set_title(title, fontsize=14, fontweight='bold')
        ax.set_xlabel('Epoch')
        ax.set_ylabel(ylabel)
        ax.legend()
        ax.grid(True, alpha=0.3)
        ax.set_xticks(epochs)

    # Left panel: training loss.
    loss_ax.plot(epochs, train_losses, 'b-o', label='Train Loss', linewidth=2)
    finish(loss_ax, 'Training Loss', 'Loss')

    # Right panel: training vs. validation accuracy.
    accuracy_curves = (
        (train_accuracies, 'b-o', 'Train Accuracy'),
        (val_accuracies, 'r-o', 'Validation Accuracy'),
    )
    for series, fmt, name in accuracy_curves:
        acc_ax.plot(epochs, series, fmt, label=name, linewidth=2)
    finish(acc_ax, 'Training and Validation Accuracy', 'Accuracy (%)')

    plt.tight_layout()
    plt.show()

# Visualise the curves collected during training.
plot_training_history(
    train_losses=train_losses,
    val_accuracies=val_accuracies,
    train_accuracies=train_accuracies,
)