In [1]:
!pip install torch torchtext spacy matplotlib
!python -m spacy download de_core_news_sm
!python -m spacy download en_core_web_sm
!pip install torchtext==0.6.0

Collecting de-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.7.0/de_core_news_sm-3.7.0-py3-none-any.whl (14.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.6/14.6 MB[0m [31m83.9 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('de_core_news_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m86.8 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation suc

In [2]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from collections import Counter

#############################
# 1. 資料讀取與前處理
#############################
def load_data(data_dir):
    # 組合檔案路徑
    train_src_path = os.path.join(data_dir, "train.txt.src")
    train_tgt_path = os.path.join(data_dir, "train.txt.tgt")
    val_src_path = os.path.join(data_dir, "val.txt.src")
    val_tgt_path = os.path.join(data_dir, "val.txt.tgt")
    test_src_path = os.path.join(data_dir, "test.txt.src")
    test_tgt_path = os.path.join(data_dir, "test.txt.tgt")

    with open(train_src_path, "r", encoding="utf-8") as f:
        train_src = f.readlines()
    with open(train_tgt_path, "r", encoding="utf-8") as f:
        train_tgt = f.readlines()
    with open(val_src_path, "r", encoding="utf-8") as f:
        val_src = f.readlines()
    with open(val_tgt_path, "r", encoding="utf-8") as f:
        val_tgt = f.readlines()
    with open(test_src_path, "r", encoding="utf-8") as f:
        test_src = f.readlines()
    with open(test_tgt_path, "r", encoding="utf-8") as f:
        test_tgt = f.readlines()

    return train_src, train_tgt, val_src, val_tgt, test_src, test_tgt

# 調整你的資料夾路徑
data_dir = "/content/drive/MyDrive/cnndm/data"
train_src, train_tgt, val_src, val_tgt, test_src, test_tgt = load_data(data_dir)
print("訓練集筆數：", len(train_src))


def tokenize(text):
    return text.lower().strip().split()

def numericalize(text, word2idx, add_special_tokens=True):
    tokens = tokenize(text)
    if add_special_tokens:
        tokens = ["<SOS>"] + tokens + ["<EOS>"]
    return [word2idx.get(token, word2idx["<UNK>"]) for token in tokens]

def build_vocab(texts, min_freq=5, max_size=None):
    counter = Counter()
    for line in texts:
        tokens = tokenize(line)
        counter.update(tokens)
    vocab_tokens = [token for token, freq in counter.items() if freq >= min_freq]
    if max_size is not None:
        vocab_tokens = sorted(vocab_tokens, key=lambda w: counter[w], reverse=True)[:max_size]
    vocab = {"<PAD>", "<SOS>", "<EOS>", "<UNK>"} | set(vocab_tokens)
    word2idx = {word: idx for idx, word in enumerate(sorted(vocab))}
    idx2word = {idx: word for word, idx in word2idx.items()}
    return word2idx, idx2word

# 建立來源與目標詞彙表（依照需求調整 min_freq 與 max_size）
src_word2idx, src_idx2word = build_vocab(train_src, min_freq=5, max_size=50000)
tgt_word2idx, tgt_idx2word = build_vocab(train_tgt, min_freq=5, max_size=30000)
print("來源字典大小:", len(src_word2idx))
print("目標字典大小:", len(tgt_word2idx))

#############################
# 2. Dataset 與 DataLoader 定義
#############################

def numericalize(text, word2idx, add_special_tokens=True, max_length=None):
    tokens = tokenize(text)
    if max_length is not None:
        tokens = tokens[:max_length]
    if add_special_tokens:
        tokens = ["<SOS>"] + tokens + ["<EOS>"]
    return [word2idx.get(token, word2idx["<UNK>"]) for token in tokens]


class CNNDMDataset(Dataset):
    def __init__(self, src_texts, tgt_texts, src_word2idx, tgt_word2idx, src_max_length=None, tgt_max_length=None):
        self.src_texts = src_texts
        self.tgt_texts = tgt_texts
        self.src_word2idx = src_word2idx
        self.tgt_word2idx = tgt_word2idx
        self.src_max_length = src_max_length
        self.tgt_max_length = tgt_max_length

    def __len__(self):
        return len(self.src_texts)

    def __getitem__(self, idx):
        src = numericalize(self.src_texts[idx], self.src_word2idx, max_length=self.src_max_length)
        tgt = numericalize(self.tgt_texts[idx], self.tgt_word2idx, max_length=self.tgt_max_length)
        return torch.tensor(src), torch.tensor(tgt)

def collate_fn(batch, pad_idx):
    src_batch, tgt_batch = zip(*batch)
    src_padded = torch.nn.utils.rnn.pad_sequence(src_batch, padding_value=pad_idx)
    tgt_padded = torch.nn.utils.rnn.pad_sequence(tgt_batch, padding_value=pad_idx)
    return src_padded, tgt_padded

# 建立訓練資料集與 DataLoader
# train_dataset = CNNDMDataset(train_src, train_tgt, src_word2idx, tgt_word2idx)
train_dataset = CNNDMDataset(train_src, train_tgt, src_word2idx, tgt_word2idx, src_max_length=400, tgt_max_length=100)

pad_idx_src = src_word2idx["<PAD>"]
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True,
                          collate_fn=lambda batch: collate_fn(batch, pad_idx_src))

#############################
# 3. 模型定義：Encoder, Attention, Decoder, Seq2Seq
#############################
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
        super(Encoder, self).__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout=dropout, bidirectional=True)
        self.fc = nn.Linear(hid_dim * 2, hid_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        # src: [src_len, batch_size]
        embedded = self.dropout(self.embedding(src))  # [src_len, batch_size, emb_dim]
        outputs, (hidden, cell) = self.rnn(embedded)
        hidden_cat = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1)
        hidden = torch.tanh(self.fc(hidden_cat))  # [batch_size, hid_dim]
        return outputs, hidden  # outputs: [src_len, batch_size, hid_dim*2]

class Attention(nn.Module):
    def __init__(self, hid_dim):
        super(Attention, self).__init__()
        self.attn = nn.Linear(hid_dim * 3, hid_dim)
        self.v = nn.Linear(hid_dim, 1, bias=False)

    def forward(self, hidden, encoder_outputs):
        # hidden: [batch_size, hid_dim]
        # encoder_outputs: [src_len, batch_size, hid_dim*2]
        src_len = encoder_outputs.shape[0]
        hidden = hidden.unsqueeze(1).repeat(1, src_len, 1)  # [batch_size, src_len, hid_dim]
        encoder_outputs = encoder_outputs.permute(1, 0, 2)   # [batch_size, src_len, hid_dim*2]
        energy = torch.tanh(self.attn(torch.cat((hidden, encoder_outputs), dim=2)))
        attention = self.v(energy).squeeze(2)  # [batch_size, src_len]
        return torch.softmax(attention, dim=1)

class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout, attention):
        super(Decoder, self).__init__()
        self.output_dim = output_dim
        self.attention = attention
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.rnn = nn.LSTM((hid_dim * 2) + emb_dim, hid_dim, n_layers, dropout=dropout)
        self.fc_out = nn.Linear((hid_dim * 2) + emb_dim + hid_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, encoder_outputs):
        # input: [batch_size]，當前時間步 token
        input = input.unsqueeze(0)  # [1, batch_size]
        embedded = self.dropout(self.embedding(input))  # [1, batch_size, emb_dim]
        # 判斷 hidden 維度：第一次使用 Encoder 輸出的 [batch_size, hid_dim] 或 Decoder 輸出 [n_layers, batch_size, hid_dim]
        att_hidden = hidden if hidden.dim() == 2 else hidden[-1]
        a = self.attention(att_hidden, encoder_outputs)  # [batch_size, src_len]
        a = a.unsqueeze(1)  # [batch_size, 1, src_len]
        encoder_outputs = encoder_outputs.permute(1, 0, 2)  # [batch_size, src_len, hid_dim*2]
        weighted = torch.bmm(a, encoder_outputs)  # [batch_size, 1, hid_dim*2]
        weighted = weighted.permute(1, 0, 2)  # [1, batch_size, hid_dim*2]
        rnn_input = torch.cat((embedded, weighted), dim=2)  # [1, batch_size, emb_dim + hid_dim*2]
        output, (hidden, cell) = self.rnn(rnn_input)
        embedded = embedded.squeeze(0)
        output = output.squeeze(0)
        weighted = weighted.squeeze(0)
        prediction = self.fc_out(torch.cat((output, weighted, embedded), dim=1))
        return prediction, hidden

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        # src: [src_len, batch_size], trg: [trg_len, batch_size]
        batch_size = trg.shape[1]
        trg_len = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim
        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(self.device)
        encoder_outputs, hidden = self.encoder(src)
        input = trg[0, :]  # 預設第一個 token 為 <SOS>

        for t in range(1, trg_len):
            output, hidden = self.decoder(input, hidden, encoder_outputs)
            outputs[t] = output
            teacher_force = torch.rand(1).item() < teacher_forcing_ratio
            top1 = output.argmax(1)
            input = trg[t] if teacher_force else top1
        return outputs

#############################
# 4. 模型訓練設定與訓練迴圈
#############################
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
INPUT_DIM = len(src_word2idx)
OUTPUT_DIM = len(tgt_word2idx)
ENC_EMB_DIM = 256
HID_DIM = 512
N_LAYERS = 2
DROPOUT = 0.5

encoder = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, DROPOUT)
attention = Attention(HID_DIM)
decoder = Decoder(OUTPUT_DIM, 256, HID_DIM, N_LAYERS, DROPOUT, attention)
model = Seq2Seq(encoder, decoder, device).to(device)

criterion = nn.CrossEntropyLoss(ignore_index=tgt_word2idx["<PAD>"])
optimizer = optim.Adam(model.parameters(), lr=1e-3)

scaler = torch.cuda.amp.GradScaler()

from tqdm import tqdm
import torch.cuda.amp

def train_one_epoch(model, data_loader, optimizer, criterion, clip=1.0):
    model.train()
    epoch_loss = 0
    scaler = torch.cuda.amp.GradScaler()
    for src_batch, tgt_batch in tqdm(data_loader, desc="Training batches"):
        src_batch = src_batch.to(device)
        tgt_batch = tgt_batch.to(device)
        optimizer.zero_grad()

        with torch.cuda.amp.autocast():
            output = model(src_batch, tgt_batch, teacher_forcing_ratio=0.5)
            output_dim = output.shape[-1]
            output = output[1:].view(-1, output_dim)  # 排除 <SOS>
            tgt_batch = tgt_batch[1:].view(-1)
            loss = criterion(output, tgt_batch)

        scaler.scale(loss).backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        scaler.step(optimizer)
        scaler.update()

        epoch_loss += loss.item()
    return epoch_loss / len(data_loader)



# def train_one_epoch(model, data_loader, optimizer, criterion, clip=1.0):
#     model.train()
#     epoch_loss = 0
#     for src_batch, tgt_batch in data_loader:
#         src_batch = src_batch.to(device)
#         tgt_batch = tgt_batch.to(device)
#         optimizer.zero_grad()
#         output = model(src_batch, tgt_batch, teacher_forcing_ratio=0.5)
#         output_dim = output.shape[-1]
#         output = output[1:].view(-1, output_dim)  # 排除 <SOS>
#         tgt_batch = tgt_batch[1:].view(-1)
#         loss = criterion(output, tgt_batch)
#         loss.backward()
#         torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
#         optimizer.step()
#         epoch_loss += loss.item()
#     return epoch_loss / len(data_loader)

NUM_EPOCHS = 5
for epoch in range(NUM_EPOCHS):
    train_loss = train_one_epoch(model, train_loader, optimizer, criterion)
    print(f"Epoch {epoch+1}, Train Loss = {train_loss:.4f}")


訓練集筆數： 287227
來源字典大小: 50004
目標字典大小: 30004


  scaler = torch.cuda.amp.GradScaler()
  scaler = torch.cuda.amp.GradScaler()
  with torch.cuda.amp.autocast():
Training batches: 100%|██████████| 17952/17952 [2:40:28<00:00,  1.86it/s]


Epoch 1, Train Loss = 4.0931


Training batches: 100%|██████████| 17952/17952 [2:40:51<00:00,  1.86it/s]


Epoch 2, Train Loss = 3.8424


Training batches:  25%|██▍       | 4419/17952 [39:29<2:00:01,  1.88it/s]


# Preparing Data

First, we'll import all the modules as before, with the addition of the `matplotlib` modules used for viewing the attention.

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator



import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

import spacy
import numpy as np

import random
import math
import time

Next, we'll set the random seed for reproducability.

In [None]:
SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

As before, we'll import spaCy and define the German and English tokenizers.

In [None]:
spacy_de = spacy.load('de_core_news_sm')
spacy_en = spacy.load('en_core_web_sm')

In [None]:
from torchtext.data import Field, TabularDataset, BucketIterator

# 將 tokenizer 改成英文的
import spacy
spacy_en = spacy.load('en_core_web_sm')
def tokenize_en(text):
    return [tok.text for tok in spacy_en.tokenizer(text)]
MAX_SRC_LEN = 100  # 根據需要調整

def tokenize_en_truncate(text):
    tokens = [tok.text for tok in spacy_en.tokenizer(text)]
    return tokens[:MAX_SRC_LEN]

SRC = Field(tokenize = tokenize_en_truncate,
            init_token = '<sos>',
            eos_token = '<eos>',
            lower = True,
            include_lengths = True)

TRG = Field(tokenize = tokenize_en,  # 目標摘要一般可以不用截斷，或另設定 MAX_TGT_LEN
            init_token = '<sos>',
            eos_token = '<eos>',
            lower = True)


When using packed padded sequences, we need to tell PyTorch how long the actual (non-padded) sequences are. Luckily for us, TorchText's `Field` objects allow us to use the `include_lengths` argument, this will cause our `batch.src` to be a tuple. The first element of the tuple is the same as before, a batch of numericalized source sentence as a tensor, and the second element is the non-padded lengths of each source sentence within the batch.

In [None]:
# SRC = Field(tokenize = tokenize_de,
#             init_token = '<sos>',
#             eos_token = '<eos>',
#             lower = True,
#             include_lengths = True)

# TRG = Field(tokenize = tokenize_en,
#             init_token = '<sos>',
#             eos_token = '<eos>',
#             lower = True)

We then load the data.

In [None]:
# def merge_src_tgt_to_tsv(src_file, tgt_file, out_file):
#     with open(src_file, 'r', encoding='utf-8') as f_src, \
#          open(tgt_file, 'r', encoding='utf-8') as f_tgt, \
#          open(out_file, 'w', encoding='utf-8') as f_out:

#         for src_line, tgt_line in zip(f_src, f_tgt):
#             src_line = src_line.strip()
#             tgt_line = tgt_line.strip()
#             # 以 TAB 分隔，第一欄是文章，第二欄是摘要
#             f_out.write(f"{src_line}\t{tgt_line}\n")

# # 假設你的檔案都在 /content/drive/MyDrive/cnndm/data/ 裡
# data_dir = "/content/drive/MyDrive/cnndm/data"

# merge_src_tgt_to_tsv(f"{data_dir}/train.txt.src", f"{data_dir}/train.txt.tgt", f"{data_dir}/train.tsv")
# merge_src_tgt_to_tsv(f"{data_dir}/val.txt.src",   f"{data_dir}/val.txt.tgt",   f"{data_dir}/val.tsv")
# merge_src_tgt_to_tsv(f"{data_dir}/test.txt.src",  f"{data_dir}/test.txt.tgt",  f"{data_dir}/test.tsv")


In [None]:
# from torchtext.data import Field, TabularDataset, BucketIterator

# # 將 tokenizer 改成英文的
# import spacy
# spacy_en = spacy.load('en_core_web_sm')
# def tokenize_en(text):
#     return [tok.text for tok in spacy_en.tokenizer(text)]

# # 定義 Fields，注意：這裡 source 與 target 都是英文
# SRC = Field(tokenize = tokenize_en,
#             init_token = '<sos>',
#             eos_token = '<eos>',
#             lower = True,
#             include_lengths = True)

# TRG = Field(tokenize = tokenize_en,
#             init_token = '<sos>',
#             eos_token = '<eos>',
#             lower = True)

In [None]:
# # 定義欄位順序：第一欄為 src, 第二欄為 tgt
# fields = [('src', SRC), ('tgt', TRG)]

In [None]:
# # 設定 TSV 檔案所在的路徑
# data_path = "/content/drive/MyDrive/cnndm/data"  # 根據你實際的路徑調整

# train_data, valid_data, test_data = TabularDataset.splits(
#     path = data_path,
#     train = "train.tsv",
#     validation = "val.tsv",
#     test = "test.tsv",
#     format = "tsv",
#     fields = fields
# )


In [None]:
# import pandas as pd

# # 讀取檔案
# train_df = pd.read_csv("/content/drive/MyDrive/cnndm/data/train.tsv", sep="\t", header=None, names=["src", "tgt"])
# val_df   = pd.read_csv("/content/drive/MyDrive/cnndm/data/val.tsv", sep="\t", header=None, names=["src", "tgt"])
# test_df  = pd.read_csv("/content/drive/MyDrive/cnndm/data/test.tsv", sep="\t", header=None, names=["src", "tgt"])

# print(train_df.head())


In [None]:
# SRC.build_vocab(train_df, min_freq = 2)
# TRG.build_vocab(train_df, min_freq = 2)

In [None]:
# train_data, valid_data, test_data = Multi30k.splits(exts = ('.de', '.en'),
#                                                     fields = (SRC, TRG))

In [None]:
fields = [('src', SRC), ('tgt', TRG)]
data_path = "/content/drive/MyDrive/cnndm/data"

train_data, valid_data, test_data = TabularDataset.splits(
    path = data_path,
    train = "train.tsv",
    validation = "val.tsv",
    test = "test.tsv",
    format = "tsv",
    fields = fields
)


And build the vocabulary.

In [None]:
SRC.build_vocab(train_data, min_freq = 2)
TRG.build_vocab(train_data, min_freq = 2)

Next, we handle the iterators.

One quirk about packed padded sequences is that all elements in the batch need to be sorted by their non-padded lengths in descending order, i.e. the first sentence in the batch needs to be the longest. We use two arguments of the iterator to handle this, `sort_within_batch` which tells the iterator that the contents of the batch need to be sorted, and `sort_key` a function which tells the iterator how to sort the elements in the batch. Here, we sort by the length of the `src` sentence.

In [None]:
BATCH_SIZE = 8

In [None]:
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
#     (train_df, val_df, test_df),
#      batch_size = BATCH_SIZE,
#      sort_within_batch = True,
#      sort_key = lambda x : len(x.src),
#      device = device)

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
     batch_size = BATCH_SIZE,
     sort_within_batch = True,
     sort_key = lambda x : len(x.src),
     device = device)

## Building the Model

### Encoder

Next up, we define the encoder.

The changes here all within the `forward` method. It now accepts the lengths of the source sentences as well as the sentences themselves.

After the source sentence (padded automatically within the iterator) has been embedded, we can then use `pack_padded_sequence` on it with the lengths of the sentences. Note that the tensor containing the lengths of the sequences must be a CPU tensor as of the latest version of PyTorch, which we explicitly do so with `to('cpu')`. `packed_embedded` will then be our packed padded sequence. This can be then fed to our RNN as normal which will return `packed_outputs`, a packed tensor containing all of the hidden states from the sequence, and `hidden` which is simply the final hidden state from our sequence. `hidden` is a standard tensor and not packed in any way, the only difference is that as the input was a packed sequence, this tensor is from the final **non-padded element** in the sequence.

We then unpack our `packed_outputs` using `pad_packed_sequence` which returns the `outputs` and the lengths of each, which we don't need.

The first dimension of `outputs` is the padded sequence lengths however due to using a packed padded sequence the values of tensors when a padding token was the input will be all zeros.

In [None]:
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout):
        super().__init__()

        self.embedding = nn.Embedding(input_dim, emb_dim)

        self.rnn = nn.GRU(emb_dim, enc_hid_dim, bidirectional = True)

        self.fc = nn.Linear(enc_hid_dim * 2, dec_hid_dim)

        self.dropout = nn.Dropout(dropout)

    def forward(self, src, src_len):

        #src = [src len, batch size]
        #src_len = [batch size]

        embedded = self.dropout(self.embedding(src))

        #embedded = [src len, batch size, emb dim]

        #need to explicitly put lengths on cpu!
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, src_len.to('cpu'))

        packed_outputs, hidden = self.rnn(packed_embedded)

        #packed_outputs is a packed sequence containing all hidden states
        #hidden is now from the final non-padded element in the batch

        outputs, _ = nn.utils.rnn.pad_packed_sequence(packed_outputs)

        #outputs is now a non-packed sequence, all hidden states obtained
        #  when the input is a pad token are all zeros

        #outputs = [src len, batch size, hid dim * num directions]
        #hidden = [n layers * num directions, batch size, hid dim]

        #hidden is stacked [forward_1, backward_1, forward_2, backward_2, ...]
        #outputs are always from the last layer

        #hidden [-2, :, : ] is the last of the forwards RNN
        #hidden [-1, :, : ] is the last of the backwards RNN

        #initial decoder hidden is final hidden state of the forwards and backwards
        #  encoder RNNs fed through a linear layer
        hidden = torch.tanh(self.fc(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1)))

        #outputs = [src len, batch size, enc hid dim * 2]
        #hidden = [batch size, dec hid dim]

        return outputs, hidden

### Attention

The attention module is where we calculate the attention values over the source sentence.

Previously, we allowed this module to "pay attention" to padding tokens within the source sentence. However, using *masking*, we can force the attention to only be over non-padding elements.

The `forward` method now takes a `mask` input. This is a **[batch size, source sentence length]** tensor that is 1 when the source sentence token is not a padding token, and 0 when it is a padding token. For example, if the source sentence is: ["hello", "how", "are", "you", "?", `<pad>`, `<pad>`], then the mask would be [1, 1, 1, 1, 1, 0, 0].

We apply the mask after the attention has been calculated, but before it has been normalized by the `softmax` function. It is applied using `masked_fill`. This fills the tensor at each element where the first argument (`mask == 0`) is true, with the value given by the second argument (`-1e10`). In other words, it will take the un-normalized attention values, and change the attention values over padded elements to be `-1e10`. As these numbers will be miniscule compared to the other values they will become zero when passed through the `softmax` layer, ensuring no attention is payed to padding tokens in the source sentence.

In [None]:
class Attention(nn.Module):
    def __init__(self, enc_hid_dim, dec_hid_dim):
        super().__init__()

        self.attn = nn.Linear((enc_hid_dim * 2) + dec_hid_dim, dec_hid_dim)
        self.v = nn.Linear(dec_hid_dim, 1, bias = False)

    def forward(self, hidden, encoder_outputs, mask):

        #hidden = [batch size, dec hid dim]
        #encoder_outputs = [src len, batch size, enc hid dim * 2]

        batch_size = encoder_outputs.shape[1]
        src_len = encoder_outputs.shape[0]

        #repeat decoder hidden state src_len times
        hidden = hidden.unsqueeze(1).repeat(1, src_len, 1)

        encoder_outputs = encoder_outputs.permute(1, 0, 2)

        #hidden = [batch size, src len, dec hid dim]
        #encoder_outputs = [batch size, src len, enc hid dim * 2]

        energy = torch.tanh(self.attn(torch.cat((hidden, encoder_outputs), dim = 2)))

        #energy = [batch size, src len, dec hid dim]

        attention = self.v(energy).squeeze(2)

        #attention = [batch size, src len]

        attention = attention.masked_fill(mask == 0, -1e10)

        return F.softmax(attention, dim = 1)

### Decoder

The decoder only needs a few small changes. It needs to accept a mask over the source sentence and pass this to the attention module. As we want to view the values of attention during inference, we also return the attention tensor.

In [None]:
class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout, attention):
        super().__init__()

        self.output_dim = output_dim
        self.attention = attention

        self.embedding = nn.Embedding(output_dim, emb_dim)

        self.rnn = nn.GRU((enc_hid_dim * 2) + emb_dim, dec_hid_dim)

        self.fc_out = nn.Linear((enc_hid_dim * 2) + dec_hid_dim + emb_dim, output_dim)

        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, encoder_outputs, mask):

        #input = [batch size]
        #hidden = [batch size, dec hid dim]
        #encoder_outputs = [src len, batch size, enc hid dim * 2]
        #mask = [batch size, src len]

        input = input.unsqueeze(0)

        #input = [1, batch size]

        embedded = self.dropout(self.embedding(input))

        #embedded = [1, batch size, emb dim]

        a = self.attention(hidden, encoder_outputs, mask)

        #a = [batch size, src len]

        a = a.unsqueeze(1)

        #a = [batch size, 1, src len]

        encoder_outputs = encoder_outputs.permute(1, 0, 2)

        #encoder_outputs = [batch size, src len, enc hid dim * 2]

        weighted = torch.bmm(a, encoder_outputs)

        #weighted = [batch size, 1, enc hid dim * 2]

        weighted = weighted.permute(1, 0, 2)

        #weighted = [1, batch size, enc hid dim * 2]

        rnn_input = torch.cat((embedded, weighted), dim = 2)

        #rnn_input = [1, batch size, (enc hid dim * 2) + emb dim]

        output, hidden = self.rnn(rnn_input, hidden.unsqueeze(0))

        #output = [seq len, batch size, dec hid dim * n directions]
        #hidden = [n layers * n directions, batch size, dec hid dim]

        #seq len, n layers and n directions will always be 1 in this decoder, therefore:
        #output = [1, batch size, dec hid dim]
        #hidden = [1, batch size, dec hid dim]
        #this also means that output == hidden
        assert (output == hidden).all()

        embedded = embedded.squeeze(0)
        output = output.squeeze(0)
        weighted = weighted.squeeze(0)

        prediction = self.fc_out(torch.cat((output, weighted, embedded), dim = 1))

        #prediction = [batch size, output dim]

        return prediction, hidden.squeeze(0), a.squeeze(1)

### Seq2Seq

The overarching seq2seq model also needs a few changes for packed padded sequences, masking and inference.

We need to tell it what the indexes are for the pad token and also pass the source sentence lengths as input to the `forward` method.

We use the pad token index to create the masks, by creating a mask tensor that is 1 wherever the source sentence is not equal to the pad token. This is all done within the `create_mask` function.

The sequence lengths as needed to pass to the encoder to use packed padded sequences.

The attention at each time-step is stored in the `attentions`

In [None]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, src_pad_idx, device):
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.src_pad_idx = src_pad_idx
        self.device = device

    def create_mask(self, src):
        mask = (src != self.src_pad_idx).permute(1, 0)
        return mask

    def forward(self, src, src_len, trg, teacher_forcing_ratio = 0.5):

        #src = [src len, batch size]
        #src_len = [batch size]
        #trg = [trg len, batch size]
        #teacher_forcing_ratio is probability to use teacher forcing
        #e.g. if teacher_forcing_ratio is 0.75 we use teacher forcing 75% of the time

        batch_size = src.shape[1]
        trg_len = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim

        #tensor to store decoder outputs
        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(self.device)

        #encoder_outputs is all hidden states of the input sequence, back and forwards
        #hidden is the final forward and backward hidden states, passed through a linear layer
        encoder_outputs, hidden = self.encoder(src, src_len)

        #first input to the decoder is the <sos> tokens
        input = trg[0,:]

        mask = self.create_mask(src)

        #mask = [batch size, src len]

        for t in range(1, trg_len):

            #insert input token embedding, previous hidden state, all encoder hidden states
            #  and mask
            #receive output tensor (predictions) and new hidden state
            output, hidden, _ = self.decoder(input, hidden, encoder_outputs, mask)

            #place predictions in a tensor holding predictions for each token
            outputs[t] = output

            #decide if we are going to use teacher forcing or not
            teacher_force = random.random() < teacher_forcing_ratio

            #get the highest predicted token from our predictions
            top1 = output.argmax(1)

            #if teacher forcing, use actual next token as next input
            #if not, use predicted token
            input = trg[t] if teacher_force else top1

        return outputs

## Training the Seq2Seq Model

Next up, initializing the model and placing it on the GPU.

In [None]:
INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TRG.vocab)
# ENC_EMB_DIM = 256
# DEC_EMB_DIM = 256
# ENC_HID_DIM = 512
# DEC_HID_DIM = 512
# ENC_DROPOUT = 0.5
# DEC_DROPOUT = 0.5
# SRC_PAD_IDX = SRC.vocab.stoi[SRC.pad_token]

# attn = Attention(ENC_HID_DIM, DEC_HID_DIM)
# enc = Encoder(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, ENC_DROPOUT)
# dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, DEC_DROPOUT, attn)

# model = Seq2Seq(enc, dec, SRC_PAD_IDX, device).to(device)


# 將原本的 256 改為 128
ENC_EMB_DIM = 128
DEC_EMB_DIM = 128

# 將原本的 512 改為 256
ENC_HID_DIM = 256
DEC_HID_DIM = 256

# dropout 可以適度降低或維持 0.5，通常影響不大，但可以嘗試 0.3
ENC_DROPOUT = 0.3
DEC_DROPOUT = 0.3

# 其餘部分維持不變
SRC_PAD_IDX = SRC.vocab.stoi[SRC.pad_token]

attn = Attention(ENC_HID_DIM, DEC_HID_DIM)
enc = Encoder(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, DEC_DROPOUT, attn)

model = Seq2Seq(enc, dec, SRC_PAD_IDX, device).to(device)


Then, we initialize the model parameters.

In [None]:
def init_weights(m):
    for name, param in m.named_parameters():
        if 'weight' in name:
            nn.init.normal_(param.data, mean=0, std=0.01)
        else:
            nn.init.constant_(param.data, 0)

model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(119237, 128)
    (rnn): GRU(128, 256, bidirectional=True)
    (fc): Linear(in_features=512, out_features=256, bias=True)
    (dropout): Dropout(p=0.3, inplace=False)
  )
  (decoder): Decoder(
    (attention): Attention(
      (attn): Linear(in_features=768, out_features=256, bias=True)
      (v): Linear(in_features=256, out_features=1, bias=False)
    )
    (embedding): Embedding(96947, 128)
    (rnn): GRU(640, 256)
    (fc_out): Linear(in_features=896, out_features=96947, bias=True)
    (dropout): Dropout(p=0.3, inplace=False)
  )
)

We'll print out the number of trainable parameters in the model, noticing that it has the exact same amount of parameters as the model without these improvements.

In [None]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 116,244,019 trainable parameters


Then we define our optimizer and criterion.

The `ignore_index` for the criterion needs to be the index of the pad token for the target language, not the source language.

In [None]:
optimizer = optim.Adam(model.parameters())

In [None]:
TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]

criterion = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX)

Next, we'll define our training and evaluation loops.

As we are using `include_lengths = True` for our source field, `batch.src` is now a tuple with the first element being the numericalized tensor representing the sentence and the second element being the lengths of each sentence within the batch.

Our model also returns the attention vectors over the batch of source source sentences for each decoding time-step. We won't use these during the training/evaluation, but we will later for inference.

In [None]:
# def train(model, iterator, optimizer, criterion, clip):

#     model.train()

#     epoch_loss = 0

#     for i, batch in enumerate(iterator):

#         src, src_len = batch.src
#         trg = batch.tgt

#         optimizer.zero_grad()

#         output = model(src, src_len, trg)

#         #trg = [trg len, batch size]
#         #output = [trg len, batch size, output dim]

#         output_dim = output.shape[-1]

#         output = output[1:].view(-1, output_dim)
#         trg = trg[1:].view(-1)

#         #trg = [(trg len - 1) * batch size]
#         #output = [(trg len - 1) * batch size, output dim]

#         loss = criterion(output, trg)

#         loss.backward()

#         torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

#         optimizer.step()

#         epoch_loss += loss.item()

#     return epoch_loss / len(iterator)

In [None]:
from tqdm import tqdm

def train(model, iterator, optimizer, criterion, clip):
    model.train()
    epoch_loss = 0

    # 用 tqdm 包裹 iterator，total 為迭代器的長度
    for i, batch in tqdm(enumerate(iterator), total=len(iterator), desc="Training"):
        src, src_len = batch.src
        tgt = batch.tgt  # 注意這裡使用 tgt

        optimizer.zero_grad()
        output = model(src, src_len, tgt)

        output_dim = output.shape[-1]
        output = output[1:].view(-1, output_dim)
        tgt = tgt[1:].view(-1)

        loss = criterion(output, tgt)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        epoch_loss += loss.item()

    return epoch_loss / len(iterator)


In [None]:
def evaluate(model, iterator, criterion):
    model.eval()
    epoch_loss = 0
    with torch.no_grad():
        for i, batch in tqdm(enumerate(iterator), total=len(iterator), desc="Evaluating"):
            src, src_len = batch.src
            tgt = batch.tgt

            output = model(src, src_len, tgt, 0)  # teacher_forcing_ratio=0
            output_dim = output.shape[-1]
            output = output[1:].view(-1, output_dim)
            tgt = tgt[1:].view(-1)

            loss = criterion(output, tgt)
            epoch_loss += loss.item()
    return epoch_loss / len(iterator)


Then, we'll define a useful function for timing how long epochs take.

In [None]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

The penultimate step is to train our model. Notice how it takes almost half the time as our model without the improvements added in this notebook.

In [None]:
N_EPOCHS = 10
CLIP = 1

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()

    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iterator, criterion)

    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut4-model.pt')

    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

Training: 100%|██████████| 35904/35904 [4:46:44<00:00,  2.09it/s]
Evaluating: 100%|██████████| 1671/1671 [06:24<00:00,  4.35it/s]


Epoch: 01 | Time: 293m 8s
	Train Loss: 6.126 | Train PPL: 457.464
	 Val. Loss: 6.609 |  Val. PPL: 741.920


Training: 100%|██████████| 35904/35904 [4:45:04<00:00,  2.10it/s]
Evaluating: 100%|██████████| 1671/1671 [06:21<00:00,  4.38it/s]


Epoch: 02 | Time: 291m 25s
	Train Loss: 5.661 | Train PPL: 287.536
	 Val. Loss: 6.579 |  Val. PPL: 720.005


Training: 100%|██████████| 35904/35904 [4:45:22<00:00,  2.10it/s]
Evaluating: 100%|██████████| 1671/1671 [06:24<00:00,  4.35it/s]


Epoch: 03 | Time: 291m 47s
	Train Loss: 5.500 | Train PPL: 244.584
	 Val. Loss: 6.581 |  Val. PPL: 721.437


Training: 100%|██████████| 35904/35904 [4:46:55<00:00,  2.09it/s]
Evaluating: 100%|██████████| 1671/1671 [06:26<00:00,  4.33it/s]


Epoch: 04 | Time: 293m 21s
	Train Loss: 5.391 | Train PPL: 219.327
	 Val. Loss: 6.575 |  Val. PPL: 717.190


Training:  91%|█████████ | 32653/35904 [4:20:30<26:57,  2.01it/s]

Finally, we load the parameters from our best validation loss and get our results on the test set.

We get the improved test perplexity whilst almost being twice as fast!

In [None]:
model.load_state_dict(torch.load('tut4-model.pt'))

test_loss = evaluate(model, test_iterator, criterion)

print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')

## Inference

Now we can use our trained model to generate translations.

**Note:** these translations will be poor compared to examples shown in paper as they use hidden dimension sizes of 1000 and train for 4 days! They have been cherry picked in order to show off what attention should look like on a sufficiently sized model.

Our `translate_sentence` will do the following:
- ensure our model is in evaluation mode, which it should always be for inference
- tokenize the source sentence if it has not been tokenized (is a string)
- numericalize the source sentence
- convert it to a tensor and add a batch dimension
- get the length of the source sentence and convert to a tensor
- feed the source sentence into the encoder
- create the mask for the source sentence
- create a list to hold the output sentence, initialized with an `<sos>` token
- create a tensor to hold the attention values
- while we have not hit a maximum length
  - get the input tensor, which should be either `<sos>` or the last predicted token
  - feed the input, all encoder outputs, hidden state and mask into the decoder
  - store attention values
  - get the predicted next token
  - add prediction to current output sentence prediction
  - break if the prediction was an `<eos>` token
- convert the output sentence from indexes to tokens
- return the output sentence (with the `<sos>` token removed) and the attention values over the sequence

In [None]:
def translate_sentence(sentence, src_field, trg_field, model, device, max_len = 50):

    model.eval()

    if isinstance(sentence, str):
        nlp = spacy.load('de')
        tokens = [token.text.lower() for token in nlp(sentence)]
    else:
        tokens = [token.lower() for token in sentence]

    tokens = [src_field.init_token] + tokens + [src_field.eos_token]

    src_indexes = [src_field.vocab.stoi[token] for token in tokens]

    src_tensor = torch.LongTensor(src_indexes).unsqueeze(1).to(device)

    src_len = torch.LongTensor([len(src_indexes)])

    with torch.no_grad():
        encoder_outputs, hidden = model.encoder(src_tensor, src_len)

    mask = model.create_mask(src_tensor)

    trg_indexes = [trg_field.vocab.stoi[trg_field.init_token]]

    attentions = torch.zeros(max_len, 1, len(src_indexes)).to(device)

    for i in range(max_len):

        trg_tensor = torch.LongTensor([trg_indexes[-1]]).to(device)

        with torch.no_grad():
            output, hidden, attention = model.decoder(trg_tensor, hidden, encoder_outputs, mask)

        attentions[i] = attention

        pred_token = output.argmax(1).item()

        trg_indexes.append(pred_token)

        if pred_token == trg_field.vocab.stoi[trg_field.eos_token]:
            break

    trg_tokens = [trg_field.vocab.itos[i] for i in trg_indexes]

    return trg_tokens[1:], attentions[:len(trg_tokens)-1]

Next, we'll make a function that displays the model's attention over the source sentence for each target token generated.

In [None]:
# def display_attention(sentence, translation, attention):

#     fig = plt.figure(figsize=(20,5))
#     ax = fig.add_subplot(111)

#     attention = attention.squeeze(1).cpu().detach().numpy()

#     cax = ax.matshow(attention, cmap='bone')

#     ax.tick_params(labelsize=15)

#     x_ticks = [''] + ['<sos>'] + [t.lower() for t in sentence] + ['<eos>']
#     y_ticks = [''] + translation

#     ax.set_xticklabels(x_ticks, rotation=45)
#     ax.set_yticklabels(y_ticks)

#     ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
#     ax.yaxis.set_major_locator(ticker.MultipleLocator(1))

#     plt.show()
#     plt.close()
def display_attention(sentence, translation, attention):
    fig, ax = plt.subplots(figsize=(18, 8))  # Increase figure size

    # Convert attention to numpy
    attention = attention.squeeze(1).cpu().detach().numpy()

    # Plot attention heatmap
    cax = ax.matshow(attention, cmap='bone')

    # Prepare tick labels
    x_ticks = [''] + ['<sos>'] + [t.lower() for t in sentence] + ['<eos>']
    y_ticks = [''] + translation

    # # Optionally truncate if too long
    # max_display_len = 10
    # if len(x_ticks) > max_display_len:
    #     x_ticks = x_ticks[:max_display_len] + ['...']
    #     attention = attention[:, :max_display_len]

    ax.set_xticklabels(x_ticks, rotation=55, ha='right')
    ax.set_yticklabels(y_ticks)
    ax.tick_params(labelsize=6)

    ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
    ax.yaxis.set_major_locator(ticker.MultipleLocator(1))

    # Adjust padding around the plot
    plt.subplots_adjust(bottom=0.3, top=0.9, left=0.15, right=0.95)

    # Add colorbar if you want to interpret intensity scale
    plt.colorbar(cax)
    plt.savefig('attention_plot.png', dpi=300, bbox_inches='tight')
    plt.show()
    plt.close()

Now, we'll grab some translations from our dataset and see how well our model did. Note, we're going to cherry pick examples here so it gives us something interesting to look at, but feel free to change the `example_idx` value to look at different examples.

First, we'll get a source and target from our dataset.

In [None]:
example_idx = 12

src = vars(train_data.examples[example_idx])['src']
trg = vars(train_data.examples[example_idx])['tgt']

print(f'src = {src}')
print(f'trg = {trg}')

Then we'll use our `translate_sentence` function to get our predicted translation and attention. We show this graphically by having the source sentence on the x-axis and the predicted translation on the y-axis. The lighter the square at the intersection between two words, the more attention the model gave to that source word when translating that target word.

Below is an example the model attempted to translate, it gets the translation correct except changes *are fighting* to just *fighting*.

In [None]:
translation, attention = translate_sentence(src, SRC, TRG, model, device)

print(f'predicted trg = {translation}')

In [None]:
display_attention(src, translation, attention)

Translations from the training set could simply be memorized by the model. So it's only fair we look at translations from the validation and testing set too.

Starting with the validation set, let's get an example.

In [None]:
example_idx = 14

src = vars(valid_data.examples[example_idx])['src']
trg = vars(valid_data.examples[example_idx])['tgt']

print(f'src = {src}')
print(f'trg = {trg}')

Then let's generate our translation and view the attention.

Here, we can see the translation is the same except for swapping *female* with *woman*.

In [None]:
translation, attention = translate_sentence(src, SRC, TRG, model, device)

print(f'predicted trg = {translation}')

display_attention(src, translation, attention)

Finally, let's get an example from the test set.

In [None]:
example_idx = 18

src = vars(test_data.examples[example_idx])['src']
trg = vars(test_data.examples[example_idx])['tgt']

print(f'src = {src}')
print(f'trg = {trg}')

Again, it produces a slightly different translation than target, a more literal version of the source sentence. It swaps *mountain climbing* for *climbing a mountain*.

In [None]:
translation, attention = translate_sentence(src, SRC, TRG, model, device)

print(f'predicted trg = {translation}')

display_attention(src, translation, attention)

## BLEU

Previously we have only cared about the loss/perplexity of the model. However there metrics that are specifically designed for measuring the quality of a translation - the most popular is *BLEU*. Without going into too much detail, BLEU looks at the overlap in the predicted and actual target sequences in terms of their n-grams. It will give us a number between 0 and 1 for each sequence, where 1 means there is perfect overlap, i.e. a perfect translation, although is usually shown between 0 and 100. BLEU was designed for multiple candidate translations per source sequence, however in this dataset we only have one candidate per source.

We define a `calculate_bleu` function which calculates the BLEU score over a provided TorchText dataset. This function creates a corpus of the actual and predicted translation for each source sentence and then calculates the BLEU score.

In [None]:
from torchtext.data.metrics import bleu_score

def calculate_bleu(data, src_field, trg_field, model, device, max_len = 50):

    trgs = []
    pred_trgs = []

    for datum in data:

        src = vars(datum)['src']
        trg = vars(datum)['tgt']

        pred_trg, _ = translate_sentence(src, src_field, trg_field, model, device, max_len)

        #cut off <eos> token
        pred_trg = pred_trg[:-1]

        pred_trgs.append(pred_trg)
        trgs.append([trg])

    return bleu_score(pred_trgs, trgs)

We get a BLEU of around 28. If we compare it to the paper that the attention model is attempting to replicate, they achieve a BLEU score of 26.75. This is similar to our score, however they are using a completely different dataset and their model size is much larger - 1000 hidden dimensions which takes 4 days to train! - so we cannot really compare against that either.

This number isn't really interpretable, we can't really say much about it. The most useful part of a BLEU score is that it can be used to compare different models on the same dataset, where the one with the **higher** BLEU score is "better".

In [None]:
bleu_score = calculate_bleu(test_data, SRC, TRG, model, device)

print(f'BLEU score = {bleu_score*100:.2f}')

In the next tutorials we will be moving away from using recurrent neural networks and start looking at other ways to construct sequence-to-sequence models. Specifically, in the next tutorial we will be using convolutional neural networks.

In [None]:
# Save the checkpoint dictionary to Google Drive
checkpoint = {
    'epoch': epoch,
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'valid_loss': best_valid_loss
}
checkpoint_path = '/content/drive/MyDrive/PPL_final_checkpoint.pt'
torch.save(checkpoint, checkpoint_path)


In [None]:
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
start_epoch = checkpoint['epoch'] + 1  # 從上次訓練結束後的下一個 epoch 開始
best_valid_loss = checkpoint['valid_loss']
print(f"Resuming training from epoch {start_epoch} with best_valid_loss = {best_valid_loss:.3f}")

In [None]:
ADDITIONAL_EPOCHS = 2  # 設定你想再訓練的 epoch 數

for epoch in range(start_epoch, start_epoch + ADDITIONAL_EPOCHS):
    start_time = time.time()

    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iterator, criterion)

    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut4-model.pt')

    # 將 checkpoint 存到 Google Drive 中，方便下次繼續
    checkpoint = {
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'valid_loss': best_valid_loss
    }
    torch.save(checkpoint, checkpoint_path)

    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')
