In [4]:
!pip install pytorch-lightning torchmetrics  datasets transformers

Collecting pytorch-lightning
  Downloading pytorch_lightning-2.5.0.post0-py3-none-any.whl.metadata (21 kB)
Collecting torchmetrics
  Downloading torchmetrics-1.6.3-py3-none-any.whl.metadata (20 kB)
Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting lightning-utilities>=0.10.0 (from pytorch-lightning)
  Downloading lightning_utilities-0.14.0-py3-none-any.whl.metadata (5.6 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.1.0->pytorch-lightning)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12=

In [3]:
import torch
import torch.nn as nn  # 신경망 모듈 제공
from torch.utils.data import DataLoader  # 데이터 로딩 및 배치 처리
from transformers import DistilBertTokenizer  # DistilBERT 토크나이저로 텍스트 전처리
from datasets import load_dataset  # Hugging Face의 IMDB 데이터셋 로드
import pytorch_lightning as pl  # PyTorch Lightning으로 학습 구조화
from torchmetrics import Accuracy  # 정확도 계산을 위한 메트릭

# 1. Data module definition
class IMDBDataModule(pl.LightningDataModule):
    """LightningDataModule that serves the Hugging Face ``imdb`` dataset.

    Raw text is tokenized lazily, one batch at a time, inside the collate
    function, so no pre-tokenized copy of the corpus is kept in memory.

    Args:
        batch_size: Number of samples per batch.
        max_length: Sequence length every review is truncated/padded to.
        val_fraction: Share of the official ``test`` split carved out as the
            validation set (the remainder stays the test set).
        split_seed: Seed for the validation/test split, so that every call to
            ``setup`` (Lightning calls it once per stage) and every run yields
            the same partition.
        num_workers: Worker processes used by each ``DataLoader``.
    """

    def __init__(self, batch_size=32, max_length=128, val_fraction=0.2, split_seed=42,
                 num_workers=2):
        super().__init__()
        self.batch_size = batch_size
        self.max_length = max_length
        self.val_fraction = val_fraction
        self.split_seed = split_seed
        self.num_workers = num_workers
        # DistilBERT tokenizer, used only for text preprocessing.
        self.tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
        # Populated by setup().
        self.dataset = None
        self.train_dataset = None
        self.val_dataset = None
        self.test_dataset = None

    def prepare_data(self):
        """Download the dataset into the local Hugging Face cache.

        No state is assigned here: Lightning runs this hook on a single
        process only, so attributes set here would be missing on other ranks.
        """
        load_dataset('imdb')

    def setup(self, stage=None):
        """Load the cached dataset and build the train/val/test datasets.

        Args:
            stage: Stage name passed by Lightning; unused, all splits are
                always prepared.
        """
        if self.dataset is None:
            # Served from the local cache once prepare_data() has run.
            self.dataset = load_dataset('imdb')

        self.train_dataset = self.dataset['train']
        held_out = self.dataset['test']

        # Carve the validation set out of the official test split. A seeded
        # generator keeps the partition identical across setup() calls.
        val_size = int(self.val_fraction * len(held_out))
        split_rng = torch.Generator().manual_seed(self.split_seed)
        self.val_dataset, self.test_dataset = torch.utils.data.random_split(
            held_out, [val_size, len(held_out) - val_size], generator=split_rng
        )

    def _collate_fn(self, batch):
        """Tokenize a list of ``{'text', 'label'}`` samples into tensors.

        Returns:
            dict with ``input_ids`` and ``attention_mask`` of shape
            ``[batch_size, max_length]`` and ``labels`` of shape ``[batch_size]``.
        """
        texts = [sample['text'] for sample in batch]
        # Labels: 0 = negative, 1 = positive.
        labels = torch.tensor([sample['label'] for sample in batch], dtype=torch.long)
        encodings = self.tokenizer(
            texts,
            truncation=True,            # cut reviews longer than max_length
            padding='max_length',       # pad shorter reviews up to max_length
            max_length=self.max_length,
            return_tensors='pt'
        )
        return {
            'input_ids': encodings['input_ids'],
            'attention_mask': encodings['attention_mask'],  # 1 = real token, 0 = padding
            'labels': labels
        }

    def _make_loader(self, dataset, shuffle):
        """Build a DataLoader with the settings shared by all three splits."""
        return DataLoader(dataset, batch_size=self.batch_size, shuffle=shuffle,
                          collate_fn=self._collate_fn, num_workers=self.num_workers)

    def train_dataloader(self):
        """Training loader (shuffled)."""
        return self._make_loader(self.train_dataset, shuffle=True)

    def val_dataloader(self):
        """Validation loader (not shuffled)."""
        return self._make_loader(self.val_dataset, shuffle=False)

    def test_dataloader(self):
        """Test loader (not shuffled)."""
        return self._make_loader(self.test_dataset, shuffle=False)

# 2. Model definition
class IMDBClassifier(pl.LightningModule):
    """Embedding -> RNN/LSTM/GRU -> dropout -> linear sentiment classifier.

    Args:
        model_type: Recurrent cell to use: ``'rnn'``, ``'lstm'`` or ``'gru'``.
        vocab_size: Size of the embedding table (defaults to the
            DistilBERT-uncased vocabulary size).
        embedding_dim: Dimension of the token embeddings.
        hidden_dim: Hidden-state size of the recurrent layer (per direction).
        num_layers: Number of stacked recurrent layers.
        bidirectional: Whether the recurrent layer runs in both directions.
        learning_rate: Learning rate for the Adam optimizer.

    Raises:
        ValueError: If ``model_type`` is not one of the supported cell names.
    """

    def __init__(self, model_type='lstm', vocab_size=30522, embedding_dim=100, hidden_dim=128,
                 num_layers=1, bidirectional=False, learning_rate=1e-3):
        super().__init__()
        self.save_hyperparameters()  # exposes the arguments as self.hparams and logs them

        # Token id 0 is treated as padding: its embedding is fixed at zero.
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)

        recurrent_cells = {'rnn': nn.RNN, 'lstm': nn.LSTM, 'gru': nn.GRU}
        if model_type not in recurrent_cells:
            raise ValueError("model_type must be 'rnn', 'lstm', or 'gru'")
        self.rnn = recurrent_cells[model_type](
            embedding_dim, hidden_dim, num_layers=num_layers,
            bidirectional=bidirectional, batch_first=True
        )

        # A bidirectional layer yields one final state per direction.
        output_dim = hidden_dim * 2 if bidirectional else hidden_dim
        self.fc = nn.Linear(output_dim, 2)  # two classes: negative / positive
        self.dropout = nn.Dropout(0.5)

        # The head emits [batch_size, 2] logits, hence the multiclass metric with 2 classes.
        self.train_accuracy = Accuracy(task="multiclass", num_classes=2)
        self.val_accuracy = Accuracy(task="multiclass", num_classes=2)
        self.test_accuracy = Accuracy(task="multiclass", num_classes=2)

    def forward(self, input_ids, attention_mask=None):
        """Compute class logits for a batch of token ids.

        Args:
            input_ids: Token ids, ``[batch_size, seq_len]``.
            attention_mask: Optional ``[batch_size, seq_len]`` mask with 1 for
                real tokens and 0 for padding. Padding is assumed to be on the
                right (real tokens first). When given, padded positions are
                skipped so the final hidden state belongs to the last real
                token; when ``None`` the full sequence is processed.

        Returns:
            Logits of shape ``[batch_size, 2]``.
        """
        embedded = self.embedding(input_ids)  # [batch_size, seq_len, embedding_dim]

        if attention_mask is None:
            _, hidden = self.rnn(embedded)
        else:
            # pack_padded_sequence needs int64 lengths on the CPU, each >= 1.
            lengths = attention_mask.sum(dim=1).clamp(min=1).to(device='cpu', dtype=torch.int64)
            packed = nn.utils.rnn.pack_padded_sequence(
                embedded, lengths, batch_first=True, enforce_sorted=False
            )
            _, hidden = self.rnn(packed)

        # LSTM returns (h_n, c_n); RNN and GRU return h_n directly.
        h_n = hidden[0] if isinstance(hidden, tuple) else hidden
        # h_n: [num_layers * num_directions, batch_size, hidden_dim]; the last
        # layer's forward state is h_n[-2] and backward state is h_n[-1] when
        # bidirectional, otherwise the last layer's state is h_n[-1].
        if self.hparams.bidirectional:
            combined = torch.cat((h_n[-2], h_n[-1]), dim=1)  # [batch_size, hidden_dim * 2]
        else:
            combined = h_n[-1]  # [batch_size, hidden_dim]

        combined = self.dropout(combined)
        return self.fc(combined)

    def _shared_step(self, batch):
        """Run the forward pass for one batch; return (loss, predictions, labels)."""
        labels = batch['labels']
        logits = self(batch['input_ids'], batch['attention_mask'])
        loss = nn.functional.cross_entropy(logits, labels)
        preds = torch.argmax(logits, dim=1)
        return loss, preds, labels

    def training_step(self, batch, batch_idx):
        """Compute the training loss and log loss/accuracy per step and per epoch."""
        loss, preds, labels = self._shared_step(batch)
        self.train_accuracy(preds, labels)
        self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True)
        self.log('train_acc', self.train_accuracy, on_step=True, on_epoch=True, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        """Log validation loss and accuracy."""
        loss, preds, labels = self._shared_step(batch)
        self.val_accuracy(preds, labels)
        self.log('val_loss', loss, on_epoch=True, prog_bar=True)
        self.log('val_acc', self.val_accuracy, on_epoch=True, prog_bar=True)

    def test_step(self, batch, batch_idx):
        """Log test loss and accuracy."""
        loss, preds, labels = self._shared_step(batch)
        self.test_accuracy(preds, labels)
        self.log('test_loss', loss, on_epoch=True)
        self.log('test_acc', self.test_accuracy, on_epoch=True)

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters at the configured learning rate."""
        return torch.optim.Adam(self.parameters(), lr=self.hparams.learning_rate)

# 3. Training entry point
def train_model(model_type, batch_size=32, max_epochs=5, seed=42):
    """Train one recurrent classifier on IMDB and evaluate it on the test split.

    Args:
        model_type: ``'rnn'``, ``'lstm'``, ``'gru'`` or ``'bidirectional_lstm'``
            (the latter is an LSTM with ``bidirectional=True``).
        batch_size: Number of samples per batch.
        max_epochs: Number of training epochs.
        seed: Seed applied to all random number generators before the model
            and data module are built, so runs are reproducible and the
            different architectures are compared under the same conditions.

    Returns:
        The list of metric dictionaries returned by ``Trainer.test``.
    """
    pl.seed_everything(seed, workers=True)

    # 'bidirectional_lstm' is shorthand for an LSTM run in both directions.
    bidirectional = model_type == 'bidirectional_lstm'
    if bidirectional:
        model_type = 'lstm'

    data_module = IMDBDataModule(batch_size=batch_size)

    model = IMDBClassifier(
        model_type=model_type,
        vocab_size=data_module.tokenizer.vocab_size,  # embedding table must cover the tokenizer's ids
        embedding_dim=100,
        hidden_dim=128,
        num_layers=1,
        bidirectional=bidirectional,
        learning_rate=1e-3
    )

    trainer = pl.Trainer(
        max_epochs=max_epochs,
        accelerator='gpu' if torch.cuda.is_available() else 'cpu',  # use the GPU when present
        devices=1,
        log_every_n_steps=10,
        enable_progress_bar=True
    )

    trainer.fit(model, data_module)
    return trainer.test(model, datamodule=data_module)

# 4. Run every model variant
if __name__ == "__main__":
    # Train the RNN, LSTM, GRU and bidirectional LSTM one after another.
    # Each entry pairs the banner to print with the variant name to train.
    scheduled_runs = (
        ("Training RNN...", 'rnn'),
        ("\nTraining LSTM...", 'lstm'),
        ("\nTraining GRU...", 'gru'),
        ("\nTraining Bidirectional LSTM...", 'bidirectional_lstm'),
    )
    for banner, variant in scheduled_runs:
        print(banner)
        train_model(variant)

Training RNN...


INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name           | Type               | Params | Mode 
--------------------------------------------------------------
0 | embedding      | Embedding          | 3.1 M  | train
1 | rnn            | RNN                | 29.4 K | train
2 | fc             | Linear             | 258    | train
3 | dropout        | Dropout            | 0      | train
4 | train_accuracy | MulticlassAccuracy | 0      | train
5 | val_accuracy   | MulticlassAccuracy | 0      | train
6 | test_accuracy  | MulticlassAccuracy | 0      | train
--------------------------------------------------------------
3.1 M     Trainable params
0         Non-t

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=5` reached.
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs



Training LSTM...


INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name           | Type               | Params | Mode 
--------------------------------------------------------------
0 | embedding      | Embedding          | 3.1 M  | train
1 | rnn            | LSTM               | 117 K  | train
2 | fc             | Linear             | 258    | train
3 | dropout        | Dropout            | 0      | train
4 | train_accuracy | MulticlassAccuracy | 0      | train
5 | val_accuracy   | MulticlassAccuracy | 0      | train
6 | test_accuracy  | MulticlassAccuracy | 0      | train
--------------------------------------------------------------
3.2 M     Trainable params
0         Non-trainable params
3.2 M     Total params
12.681    Total estimated model params size (MB)
7         Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=5` reached.
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs



Training GRU...


INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name           | Type               | Params | Mode 
--------------------------------------------------------------
0 | embedding      | Embedding          | 3.1 M  | train
1 | rnn            | GRU                | 88.3 K | train
2 | fc             | Linear             | 258    | train
3 | dropout        | Dropout            | 0      | train
4 | train_accuracy | MulticlassAccuracy | 0      | train
5 | val_accuracy   | MulticlassAccuracy | 0      | train
6 | test_accuracy  | MulticlassAccuracy | 0      | train
--------------------------------------------------------------
3.1 M     Trainable params
0         Non-trainable params
3.1 M     Total params
12.563    Total estimated model params size (MB)
7         Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=5` reached.
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs



Training Bidirectional LSTM...


INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name           | Type               | Params | Mode 
--------------------------------------------------------------
0 | embedding      | Embedding          | 3.1 M  | train
1 | rnn            | LSTM               | 235 K  | train
2 | fc             | Linear             | 514    | train
3 | dropout        | Dropout            | 0      | train
4 | train_accuracy | MulticlassAccuracy | 0      | train
5 | val_accuracy   | MulticlassAccuracy | 0      | train
6 | test_accuracy  | MulticlassAccuracy | 0      | train
--------------------------------------------------------------
3.3 M     Trainable params
0         Non-trainable params
3.3 M     Total params
13.153    Total estimated model params size (MB)
7         Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=5` reached.
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |          | 0/? [00:00<?, ?it/s]