In [None]:
# Environment setup: install the Python packages and spaCy models for this notebook.
# NOTE(review): torchtext is installed unpinned on the next line and then re-installed
# at 0.6.0 further down; the captured output shows 0.18.0 being fetched first.
!pip install torch torchtext spacy matplotlib
# NOTE(review): no visible cell imports spaCy or these models - confirm they are still needed.
!python -m spacy download de_core_news_sm
!python -m spacy download en_core_web_sm
!pip install torchtext==0.6.0
!pip install rouge-score
# `rouge` provides the Rouge / FilesRouge names imported by the evaluation cell.
!pip install rouge
!pip install rouge2

Collecting torchtext
  Downloading torchtext-0.18.0-cp311-cp311-manylinux1_x86_64.whl.metadata (7.9 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata

In [None]:
# Mount Google Drive at /content/drive; the dataset paths used later live under this mount.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from collections import Counter

#############################
# 1. Preprocessing
#############################
def load_data(data_dir):
    """Read the six parallel text files of the dataset found in ``data_dir``.

    The files are ``{train,val,test}.txt.{src,tgt}``; each is read as UTF-8
    with ``readlines()``, so every returned line keeps its trailing newline.

    Args:
        data_dir: Directory that contains the six files.

    Returns:
        A 6-tuple of lists of lines, in the order
        ``(train_src, train_tgt, val_src, val_tgt, test_src, test_tgt)``.
    """
    def _read_lines(filename):
        # One file -> list of raw lines.
        with open(os.path.join(data_dir, filename), "r", encoding="utf-8") as handle:
            return handle.readlines()

    # The nested iteration order reproduces the documented return order.
    return tuple(
        _read_lines(f"{split}.txt.{side}")
        for split in ("train", "val", "test")
        for side in ("src", "tgt")
    )


# Dataset directory on the mounted Google Drive; every split is read fully into memory.
data_dir = "/content/drive/MyDrive/cnndm/data"
train_src, train_tgt, val_src, val_tgt, test_src, test_tgt = load_data(data_dir)
# Sanity check: number of lines read from train.txt.src.
print("Number of training sets：", len(train_src))


def tokenize(text):
    """Lower-case ``text``, trim surrounding whitespace and split on whitespace runs."""
    lowered = text.lower()
    trimmed = lowered.strip()
    return trimmed.split()

def numericalize(text, word2idx, add_special_tokens=True, max_length=None):
    """Convert one raw text line into a list of vocabulary indices.

    NOTE(review): this notebook defines ``numericalize`` a second time further
    down with a ``max_length`` parameter. The signature here is kept in sync so
    both definitions accept the same calls.

    Args:
        text: Raw text line; it is split with ``tokenize``.
        word2idx: Token -> index mapping. Must contain ``"<UNK>"``.
        add_special_tokens: When true, ``"<SOS>"`` is prepended and ``"<EOS>"``
            appended before the lookup.
        max_length: Optional cap on the number of tokens kept. It is applied
            before the special tokens are added, so the result can be up to
            ``max_length + 2`` items long.

    Returns:
        List of integer indices; tokens missing from ``word2idx`` map to the
        ``"<UNK>"`` index.
    """
    tokens = tokenize(text)
    if max_length is not None:
        tokens = tokens[:max_length]
    if add_special_tokens:
        tokens = ["<SOS>"] + tokens + ["<EOS>"]
    unk_idx = word2idx["<UNK>"]  # looked up once instead of once per token
    return [word2idx.get(token, unk_idx) for token in tokens]

def build_vocab(texts, min_freq=5, max_size=None):
    """Build token <-> index mappings from an iterable of text lines.

    The four special tokens always occupy fixed indices
    (``<PAD>``=0, ``<SOS>``=1, ``<EOS>``=2, ``<UNK>``=3). Previously their
    indices came from alphabetically sorting the whole vocabulary, so they
    depended on which tokens happened to be present and differed between the
    source and target vocabularies. With fixed indices, a padding index taken
    from one vocabulary is also valid for the other.

    Args:
        texts: Iterable of raw text lines; each is split with ``tokenize``.
        min_freq: Minimum corpus frequency for a token to be kept.
        max_size: If given, keep only the ``max_size`` most frequent tokens
            that pass ``min_freq``. The special tokens are added on top.

    Returns:
        ``(word2idx, idx2word)``: two dicts that are inverses of each other.
    """
    special_tokens = ["<PAD>", "<SOS>", "<EOS>", "<UNK>"]

    counter = Counter()
    for line in texts:
        counter.update(tokenize(line))

    vocab_tokens = [token for token, freq in counter.items() if freq >= min_freq]
    if max_size is not None:
        # Stable sort: equally frequent tokens keep first-seen order.
        vocab_tokens = sorted(vocab_tokens, key=lambda w: counter[w], reverse=True)[:max_size]

    # Regular tokens follow the specials in sorted order for reproducibility.
    regular_tokens = sorted(set(vocab_tokens) - set(special_tokens))
    word2idx = {word: idx for idx, word in enumerate(special_tokens + regular_tokens)}
    idx2word = {idx: word for word, idx in word2idx.items()}
    return word2idx, idx2word

# Separate vocabularies for articles (source) and summaries (target), built from the
# training split only. Each is capped by max_size; build_vocab then adds the four
# special tokens on top (the captured output reports 50004 and 30004 entries).
src_word2idx, src_idx2word = build_vocab(train_src, min_freq=5, max_size=50000)
tgt_word2idx, tgt_idx2word = build_vocab(train_tgt, min_freq=5, max_size=30000)
print("Source dictionary size:", len(src_word2idx))
print("Target dictionary size:", len(tgt_word2idx))

#############################
# 2. Dataset and DataLoader definitions
#############################

def numericalize(text, word2idx, add_special_tokens=True, max_length=None):
    """Map a raw text line to a list of vocabulary indices.

    The line is tokenized with ``tokenize``, optionally cut to the first
    ``max_length`` tokens, optionally wrapped in ``"<SOS>"`` / ``"<EOS>"``
    (after truncation), and then each token is looked up in ``word2idx``.
    Unknown tokens fall back to the ``"<UNK>"`` entry.
    """
    words = tokenize(text)
    # A slice bound of None keeps the whole list, so no explicit branch is needed.
    words = words[:max_length]
    if add_special_tokens:
        words = ["<SOS>", *words, "<EOS>"]

    indices = []
    for word in words:
        indices.append(word2idx.get(word, word2idx["<UNK>"]))
    return indices


class CNNDMDataset(Dataset):
    """Map-style dataset of parallel (source, target) text lines.

    Lines are kept as raw strings and converted to index tensors lazily in
    ``__getitem__`` via ``numericalize``.

    Args:
        src_texts: Sequence of source lines.
        tgt_texts: Sequence of target lines, parallel to ``src_texts``.
        src_word2idx: Source token -> index mapping.
        tgt_word2idx: Target token -> index mapping.
        src_max_length: Optional token cap forwarded to ``numericalize`` for sources.
        tgt_max_length: Optional token cap forwarded to ``numericalize`` for targets.

    Raises:
        ValueError: If the two text sequences differ in length.
    """

    def __init__(self, src_texts, tgt_texts, src_word2idx, tgt_word2idx, src_max_length=None, tgt_max_length=None):
        # Fail early: a mismatch would otherwise surface as an IndexError mid-epoch
        # or silently drop trailing targets.
        if len(src_texts) != len(tgt_texts):
            raise ValueError(
                f"src_texts and tgt_texts must be parallel, got "
                f"{len(src_texts)} and {len(tgt_texts)} lines"
            )
        self.src_texts = src_texts
        self.tgt_texts = tgt_texts
        self.src_word2idx = src_word2idx
        self.tgt_word2idx = tgt_word2idx
        self.src_max_length = src_max_length
        self.tgt_max_length = tgt_max_length

    def __len__(self):
        return len(self.src_texts)

    def __getitem__(self, idx):
        """Return ``(src_ids, tgt_ids)`` as 1-D ``torch.long`` tensors for example ``idx``."""
        src = numericalize(self.src_texts[idx], self.src_word2idx, max_length=self.src_max_length)
        tgt = numericalize(self.tgt_texts[idx], self.tgt_word2idx, max_length=self.tgt_max_length)
        # Explicit dtype keeps index tensors integral even for an empty list.
        return torch.tensor(src, dtype=torch.long), torch.tensor(tgt, dtype=torch.long)

def collate_fn(batch, pad_idx, tgt_pad_idx=None):
    """Pad a list of ``(src, tgt)`` tensor pairs into two batch tensors.

    Args:
        batch: Iterable of ``(src_tensor, tgt_tensor)`` pairs of 1-D tensors.
        pad_idx: Value used to pad the source sequences.
        tgt_pad_idx: Value used to pad the target sequences. Defaults to
            ``pad_idx``, which is the previous behaviour. Pass the target
            vocabulary's own padding index when it differs from the source's.

    Returns:
        ``(src_padded, tgt_padded)``, each shaped ``[max_len, batch_size]``
        (``pad_sequence`` with its default ``batch_first=False``).
    """
    if tgt_pad_idx is None:
        tgt_pad_idx = pad_idx
    src_batch, tgt_batch = zip(*batch)
    src_padded = torch.nn.utils.rnn.pad_sequence(src_batch, padding_value=pad_idx)
    tgt_padded = torch.nn.utils.rnn.pad_sequence(tgt_batch, padding_value=tgt_pad_idx)
    return src_padded, tgt_padded

# DataLoader
train_dataset = CNNDMDataset(train_src, train_tgt, src_word2idx, tgt_word2idx, src_max_length=400, tgt_max_length=100)

# Each vocabulary has its own "<PAD>" entry, so sources and targets are padded with
# their own index. Targets must use the target index: the training loss is built with
# ignore_index=tgt_word2idx["<PAD>"], and padding written with any other value would
# be scored as a real token.
pad_idx_src = src_word2idx["<PAD>"]
pad_idx_tgt = tgt_word2idx["<PAD>"]


def _collate_train_batch(batch):
    """Pad sources with the source <PAD> index and targets with the target <PAD> index."""
    src_batch, tgt_batch = zip(*batch)
    src_padded = torch.nn.utils.rnn.pad_sequence(src_batch, padding_value=pad_idx_src)
    tgt_padded = torch.nn.utils.rnn.pad_sequence(tgt_batch, padding_value=pad_idx_tgt)
    return src_padded, tgt_padded


train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True,
                          collate_fn=_collate_train_batch)

#############################
# 3. MODEL：Encoder, Attention, Decoder, Seq2Seq
#############################
class Encoder(nn.Module):
    """Bidirectional multi-layer LSTM encoder.

    ``forward`` returns the per-step LSTM outputs together with a single
    summary vector per example, obtained by projecting the last two slices of
    the final hidden state through ``fc`` and ``tanh``.
    """

    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=input_dim, embedding_dim=emb_dim)
        self.rnn = nn.LSTM(
            input_size=emb_dim,
            hidden_size=hid_dim,
            num_layers=n_layers,
            dropout=dropout,
            bidirectional=True,
        )
        self.fc = nn.Linear(in_features=2 * hid_dim, out_features=hid_dim)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, src):
        # src: [src_len, batch_size] -> token_vectors: [src_len, batch_size, emb_dim]
        token_vectors = self.dropout(self.embedding(src))
        outputs, state = self.rnn(token_vectors)
        final_hidden = state[0]  # the cell state (state[1]) is not used
        # Last two slices of h_n; for a bidirectional LSTM these are presumably the
        # top layer's forward and backward states (see the nn.LSTM docs).
        last_pair = torch.cat([final_hidden[-2], final_hidden[-1]], dim=1)
        summary = torch.tanh(self.fc(last_pair))  # [batch_size, hid_dim]
        return outputs, summary  # outputs: [src_len, batch_size, hid_dim*2]

class Attention(nn.Module):
    """Additive attention over encoder outputs.

    Scores each source position from the concatenation of the decoder state and
    that position's encoder output, then normalises the scores with a softmax
    over the source positions.
    """

    def __init__(self, hid_dim):
        super().__init__()
        # Input is decoder state (hid_dim) + encoder output (2 * hid_dim).
        self.attn = nn.Linear(3 * hid_dim, hid_dim)
        self.v = nn.Linear(hid_dim, 1, bias=False)

    def forward(self, hidden, encoder_outputs):
        num_positions = encoder_outputs.size(0)
        # Copy the decoder state once per source position: [batch_size, src_len, hid_dim]
        query = hidden.unsqueeze(1).repeat(1, num_positions, 1)
        # Batch-first view of the encoder outputs: [batch_size, src_len, hid_dim*2]
        memory = encoder_outputs.transpose(0, 1)
        features = torch.cat([query, memory], dim=2)
        scores = self.v(torch.tanh(self.attn(features))).squeeze(2)  # [batch_size, src_len]
        return scores.softmax(dim=1)

class Decoder(nn.Module):
    """Single-step attention decoder built on a multi-layer LSTM.

    Each call consumes one token per batch element, attends over the encoder
    outputs using the top-layer hidden state, and advances the LSTM by one
    step from the recurrent state it is given.

    Previously ``forward`` called ``self.rnn(rnn_input)`` without a state, so
    the LSTM restarted from zeros on every step and carried no memory between
    decoding steps. The state is now threaded through.

    Args:
        output_dim: Size of the target vocabulary.
        emb_dim: Token embedding size.
        hid_dim: LSTM hidden size; encoder outputs are expected to be ``2 * hid_dim`` wide.
        n_layers: Number of stacked LSTM layers.
        dropout: Dropout probability for the embeddings and between LSTM layers.
        attention: Module called as ``attention(hidden, encoder_outputs)`` that
            returns ``[batch_size, src_len]`` weights.
    """

    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout, attention):
        super(Decoder, self).__init__()
        self.output_dim = output_dim
        self.attention = attention
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.rnn = nn.LSTM((hid_dim * 2) + emb_dim, hid_dim, n_layers, dropout=dropout)
        self.fc_out = nn.Linear((hid_dim * 2) + emb_dim + hid_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def _unpack_state(self, hidden):
        """Normalise the accepted state formats to an ``(h, c)`` pair of 3-D tensors."""
        if isinstance(hidden, tuple):
            # State returned by a previous call to forward().
            h, c = hidden
        elif hidden.dim() == 2:
            # Encoder summary [batch_size, hid_dim]: seed every layer with it and
            # start from a zero cell state.
            h = hidden.unsqueeze(0).repeat(self.rnn.num_layers, 1, 1)
            c = torch.zeros_like(h)
        else:
            # Bare [n_layers, batch_size, hid_dim] hidden tensor without a cell state.
            h, c = hidden, torch.zeros_like(hidden)
        return h.contiguous(), c.contiguous()

    def forward(self, input, hidden, encoder_outputs):
        """Run one decoding step.

        Args:
            input: ``[batch_size]`` token indices for this step.
            hidden: Either the encoder summary ``[batch_size, hid_dim]`` (first
                step), an ``(h, c)`` tuple returned by a previous call, or a
                bare ``[n_layers, batch_size, hid_dim]`` hidden tensor.
            encoder_outputs: ``[src_len, batch_size, hid_dim*2]``.

        Returns:
            ``(prediction, state)``: ``prediction`` is ``[batch_size, output_dim]``
            logits; ``state`` is the LSTM ``(h, c)`` tuple to pass back unchanged
            as ``hidden`` on the next step.
        """
        input = input.unsqueeze(0)  # [1, batch_size]
        embedded = self.dropout(self.embedding(input))  # [1, batch_size, emb_dim]
        h, c = self._unpack_state(hidden)
        a = self.attention(h[-1], encoder_outputs)  # [batch_size, src_len]
        a = a.unsqueeze(1)  # [batch_size, 1, src_len]
        encoder_outputs = encoder_outputs.permute(1, 0, 2)  # [batch_size, src_len, hid_dim*2]
        weighted = torch.bmm(a, encoder_outputs)  # [batch_size, 1, hid_dim*2]
        weighted = weighted.permute(1, 0, 2)  # [1, batch_size, hid_dim*2]
        rnn_input = torch.cat((embedded, weighted), dim=2)  # [1, batch_size, emb_dim + hid_dim*2]
        # Feed the previous state so the LSTM is recurrent across decoding steps.
        output, (h, c) = self.rnn(rnn_input, (h, c))
        embedded = embedded.squeeze(0)
        output = output.squeeze(0)
        weighted = weighted.squeeze(0)
        prediction = self.fc_out(torch.cat((output, weighted, embedded), dim=1))
        return prediction, (h, c)

class Seq2Seq(nn.Module):
    """Ties an encoder and a step-wise decoder together for teacher-forced decoding."""

    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        """Decode ``trg_len - 1`` steps and return the per-step logits.

        ``trg`` is indexed as ``[trg_len, batch_size]``. Row 0 of the result is
        left at zero because decoding starts from ``trg[0]``. At every step the
        next input is the gold token with probability ``teacher_forcing_ratio``,
        otherwise the decoder's own arg-max prediction.
        """
        trg_len, batch_size = trg.shape[0], trg.shape[1]
        vocab_size = self.decoder.output_dim
        outputs = torch.zeros(trg_len, batch_size, vocab_size).to(self.device)

        encoder_outputs, state = self.encoder(src)
        step_input = trg[0, :]

        for t in range(1, trg_len):
            step_logits, state = self.decoder(step_input, state, encoder_outputs)
            outputs[t] = step_logits
            # One random draw per step decides the input for the whole batch.
            use_gold = torch.rand(1).item() < teacher_forcing_ratio
            step_input = trg[t] if use_gold else step_logits.argmax(1)
        return outputs

#############################
# 4. Model training settings and training cycles
#############################
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Vocabulary sizes determine the embedding tables and the output projection width.
INPUT_DIM = len(src_word2idx)
OUTPUT_DIM = len(tgt_word2idx)
ENC_EMB_DIM = 256
HID_DIM = 512
N_LAYERS = 2
DROPOUT = 0.5

encoder = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, DROPOUT)
attention = Attention(HID_DIM)
# NOTE(review): the decoder embedding size is the literal 256 rather than a named constant.
decoder = Decoder(OUTPUT_DIM, 256, HID_DIM, N_LAYERS, DROPOUT, attention)
model = Seq2Seq(encoder, decoder, device).to(device)

# ignore_index is the *target* vocabulary's <PAD> index, so target batches must be
# padded with that same index for padding to be excluded from the loss.
criterion = nn.CrossEntropyLoss(ignore_index=tgt_word2idx["<PAD>"])
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# NOTE(review): train_one_epoch creates its own GradScaler when called as in the
# training loop below, so this module-level instance looks unused in the visible code.
scaler = torch.cuda.amp.GradScaler()

from tqdm import tqdm
import torch.cuda.amp

def train_one_epoch(model, data_loader, optimizer, criterion, clip=1.0, scaler=None):
    """Train ``model`` for one pass over ``data_loader`` with mixed precision.

    Args:
        model: Module called as ``model(src, tgt, teacher_forcing_ratio=0.5)``
            that returns logits indexed ``[trg_len, batch_size, vocab]``.
        data_loader: Iterable of ``(src_batch, tgt_batch)`` tensor pairs.
        optimizer: Optimizer over ``model.parameters()``.
        criterion: Loss taking ``(logits[N, vocab], targets[N])``.
        clip: Maximum gradient norm for clipping.
        scaler: Optional gradient scaler to reuse across epochs. When omitted a
            fresh ``GradScaler`` is created (enabled only on CUDA), as before.

    Returns:
        Mean batch loss as a float, or ``0.0`` when ``data_loader`` yields no
        batches (previously a ZeroDivisionError).
    """
    model.train()
    use_amp = device.type == "cuda"  # AMP is a CUDA feature here; a no-op on CPU
    if scaler is None:
        scaler = torch.cuda.amp.GradScaler(enabled=use_amp)

    epoch_loss = 0.0
    num_batches = 0
    for src_batch, tgt_batch in tqdm(data_loader, desc="Training batches"):
        src_batch = src_batch.to(device)
        tgt_batch = tgt_batch.to(device)
        optimizer.zero_grad()

        with torch.cuda.amp.autocast(enabled=use_amp):
            output = model(src_batch, tgt_batch, teacher_forcing_ratio=0.5)
            output_dim = output.shape[-1]
            # Row 0 is the <SOS> position: the model never predicts it.
            logits = output[1:].reshape(-1, output_dim)
            targets = tgt_batch[1:].reshape(-1)
            loss = criterion(logits, targets)

        scaler.scale(loss).backward()
        # Gradients are still multiplied by the loss scale at this point. Unscale
        # them first so the clipping threshold applies to the true gradient norm.
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        scaler.step(optimizer)
        scaler.update()

        epoch_loss += loss.item()
        num_batches += 1

    return epoch_loss / num_batches if num_batches else 0.0


# Train for a fixed number of epochs and report the mean training loss of each.
# NOTE(review): this loop neither evaluates on the validation split nor saves a
# checkpoint; the captured output below stops part-way through epoch 9.
NUM_EPOCHS = 10
for epoch in range(NUM_EPOCHS):
    train_loss = train_one_epoch(model, train_loader, optimizer, criterion)
    print(f"Epoch {epoch+1}, Train Loss = {train_loss:.4f}")


訓練集筆數： 287227
來源字典大小: 50004
目標字典大小: 30004


  scaler = torch.cuda.amp.GradScaler()
  scaler = torch.cuda.amp.GradScaler()
  with torch.cuda.amp.autocast():
Training batches: 100%|██████████| 17952/17952 [2:36:01<00:00,  1.92it/s]


Epoch 1, Train Loss = 4.0950


Training batches: 100%|██████████| 17952/17952 [2:35:56<00:00,  1.92it/s]


Epoch 2, Train Loss = 3.8702


Training batches: 100%|██████████| 17952/17952 [2:36:23<00:00,  1.91it/s]


Epoch 3, Train Loss = 3.7618


Training batches: 100%|██████████| 17952/17952 [2:36:15<00:00,  1.91it/s]


Epoch 4, Train Loss = 3.7079


Training batches: 100%|██████████| 17952/17952 [2:35:58<00:00,  1.92it/s]


Epoch 5, Train Loss = 3.6739


Training batches: 100%|██████████| 17952/17952 [2:35:53<00:00,  1.92it/s]


Epoch 6, Train Loss = 3.5895


Training batches: 100%|██████████| 17952/17952 [2:35:52<00:00,  1.92it/s]


Epoch 7, Train Loss = 3.5855


Training batches: 100%|██████████| 17952/17952 [2:35:48<00:00,  1.92it/s]


Epoch 8, Train Loss = 3.6436


Training batches:  12%|█▏        | 2147/17952 [18:34<2:18:31,  1.90it/s]

In [None]:
def generate_predictions(model, dataset, device, idx2word, max_length=100):
    """Greedy-decode a summary for every example in ``dataset``.

    Decoding of a sequence stops at its first ``<EOS>``. Previously the
    ``<EOS>`` token was merely skipped and every later token was still
    appended. The step loop also exits early once all sequences in the batch
    have finished.

    Reads the module-level ``tgt_word2idx``, ``pad_idx_src`` and ``collate_fn``.

    Args:
        model: Object exposing ``eval()``, ``encoder(src)`` and
            ``decoder(token, hidden, encoder_outputs)``.
        dataset: Dataset yielding ``(src, tgt)`` tensor pairs; targets are ignored.
        device: Device the source batches are moved to.
        idx2word: Target index -> token mapping used to render the output.
        max_length: Maximum number of decoding steps per example.

    Returns:
        List of space-joined token strings, one per dataset example, in order.
    """
    model.eval()
    sos_idx = tgt_word2idx["<SOS>"]
    eos_idx = tgt_word2idx["<EOS>"]
    predictions = []
    loader = DataLoader(dataset, batch_size=32, shuffle=False,
                        collate_fn=lambda batch: collate_fn(batch, pad_idx_src))
    with torch.no_grad():
        for src_batch, _ in loader:
            src_batch = src_batch.to(device)
            batch_size = src_batch.shape[1]
            encoder_outputs, hidden = model.encoder(src_batch)
            input_token = torch.full((batch_size,), sos_idx, dtype=torch.long, device=device)
            batch_preds = [[] for _ in range(batch_size)]
            finished = [False] * batch_size
            for _step in range(max_length):
                output, hidden = model.decoder(input_token, hidden, encoder_outputs)
                input_token = output.argmax(1)
                for i, token in enumerate(input_token.tolist()):
                    if finished[i]:
                        continue  # this sequence already emitted <EOS>
                    if token == eos_idx:
                        finished[i] = True
                    else:
                        batch_preds[i].append(token)
                if all(finished):
                    break  # nothing left to decode in this batch
            predictions.extend(
                " ".join(idx2word[token] for token in pred) for pred in batch_preds
            )
    return predictions

# Decode the test split with the same length caps as training and write one
# generated summary per line to a file in the current working directory.
test_dataset = CNNDMDataset(test_src, test_tgt, src_word2idx, tgt_word2idx, src_max_length=400, tgt_max_length=100)
predictions = generate_predictions(model, test_dataset, device, tgt_idx2word, max_length=100)
with open("summaries.txt", "w", encoding="utf-8") as f:
    for pred in predictions:
        f.write(pred + "\n")


In [None]:
# Persist the generated summaries to Google Drive (one per line), next to the dataset
# files, which is where the ROUGE cell reads them from.
save_path = "/content/drive/MyDrive/cnndm/data/summaries.txt"

with open(save_path, "w", encoding="utf-8") as f:
    for pred in predictions:
        f.write(pred + "\n")

print("The Ans file is stored in：", save_path)


In [None]:

# NOTE(review): this cell repeats the preceding cell verbatim. It rewrites the same
# file with the same content, so it appears to be redundant.
save_path = "/content/drive/MyDrive/cnndm/data/summaries.txt"

with open(save_path, "w", encoding="utf-8") as f:
    for pred in predictions:
        f.write(pred + "\n")

print("The Ans file is stored in：", save_path)


In [None]:
from rouge import Rouge, FilesRouge

rouge = Rouge()

# Hypotheses are the model-generated summaries; references are the gold test targets.
# The two paths were previously swapped, so precision and recall were computed in the
# wrong direction.
hyp_path = '/content/drive/MyDrive/cnndm/data/summaries.txt'
ref_path = '/content/drive/MyDrive/cnndm/data/test.txt.tgt'

with open(hyp_path, 'r', encoding='utf-8') as f:
    hypothesis = [line.strip() for line in f]

with open(ref_path, 'r', encoding='utf-8') as f:
    reference = [line.strip() for line in f]

# Scores are computed pairwise, so both files must have one line per test example.
if len(hypothesis) != len(reference):
    raise ValueError(
        f"Line count mismatch: {len(hypothesis)} hypotheses vs {len(reference)} references"
    )

# Rouge.get_scores takes the hypotheses first and the references second.
scores = rouge.get_scores(hypothesis, reference, avg=True)
print("ROUGE Evaluation Scores:")
print("ROUGE-1 F-score: {:.4f}".format(scores["rouge-1"]["f"]))
print("ROUGE-2 F-score: {:.4f}".format(scores["rouge-2"]["f"]))
print("ROUGE-L F-score: {:.4f}".format(scores["rouge-l"]["f"]))


In [None]:
# def init_weights(m):
#     for name, param in m.named_parameters():
#         if 'weight' in name:
#             nn.init.normal_(param.data, mean=0, std=0.01)
#         else:
#             nn.init.constant_(param.data, 0)

# model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(119237, 128)
    (rnn): GRU(128, 256, bidirectional=True)
    (fc): Linear(in_features=512, out_features=256, bias=True)
    (dropout): Dropout(p=0.3, inplace=False)
  )
  (decoder): Decoder(
    (attention): Attention(
      (attn): Linear(in_features=768, out_features=256, bias=True)
      (v): Linear(in_features=256, out_features=1, bias=False)
    )
    (embedding): Embedding(96947, 128)
    (rnn): GRU(640, 256)
    (fc_out): Linear(in_features=896, out_features=96947, bias=True)
    (dropout): Dropout(p=0.3, inplace=False)
  )
)

In [None]:
# N_EPOCHS = 10
# CLIP = 1

# best_valid_loss = float('inf')

# for epoch in range(N_EPOCHS):

#     start_time = time.time()

#     train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
#     valid_loss = evaluate(model, valid_iterator, criterion)

#     end_time = time.time()

#     epoch_mins, epoch_secs = epoch_time(start_time, end_time)

#     if valid_loss < best_valid_loss:
#         best_valid_loss = valid_loss
#         torch.save(model.state_dict(), 'tut4-model.pt')

#     print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
#     print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
#     print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

Training: 100%|██████████| 35904/35904 [4:46:44<00:00,  2.09it/s]
Evaluating: 100%|██████████| 1671/1671 [06:24<00:00,  4.35it/s]


Epoch: 01 | Time: 293m 8s
	Train Loss: 6.126 | Train PPL: 457.464
	 Val. Loss: 6.609 |  Val. PPL: 741.920


Training: 100%|██████████| 35904/35904 [4:45:04<00:00,  2.10it/s]
Evaluating: 100%|██████████| 1671/1671 [06:21<00:00,  4.38it/s]


Epoch: 02 | Time: 291m 25s
	Train Loss: 5.661 | Train PPL: 287.536
	 Val. Loss: 6.579 |  Val. PPL: 720.005


Training: 100%|██████████| 35904/35904 [4:45:22<00:00,  2.10it/s]
Evaluating: 100%|██████████| 1671/1671 [06:24<00:00,  4.35it/s]


Epoch: 03 | Time: 291m 47s
	Train Loss: 5.500 | Train PPL: 244.584
	 Val. Loss: 6.581 |  Val. PPL: 721.437


Training: 100%|██████████| 35904/35904 [4:46:55<00:00,  2.09it/s]
Evaluating: 100%|██████████| 1671/1671 [06:26<00:00,  4.33it/s]


Epoch: 04 | Time: 293m 21s
	Train Loss: 5.391 | Train PPL: 219.327
	 Val. Loss: 6.575 |  Val. PPL: 717.190


Training:  91%|█████████ | 32653/35904 [4:20:30<26:57,  2.01it/s]

Finally, we load the parameters from our best validation loss and get our results on the test set.

We get an improved test perplexity while being almost twice as fast!