In [None]:
import re,io,os,sys
# Debugging aid: ask CUDA to launch kernels synchronously so that a device-side
# error is reported at the call that triggered it. This cell runs before the
# cell that imports torch.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [1]:
import re,io,os,sys
import torch
import torch.nn as nn
import torch.optim as optim
import random
import torchtext,spacy
from torch.utils.tensorboard import SummaryWriter  # to print to tensorboard
from torchtext.data.metrics import bleu_score
from torchtext.data.utils import get_tokenizer
from collections import Counter
from torchtext.vocab import vocab
import pandas as pd
import numpy as np


from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

In [None]:
# Mount Google Drive so that the dataset and checkpoint paths under
# /content/drive used later in the notebook resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Download the spaCy pipelines that the Chinese and English tokenizers load.
!python -m spacy download zh_core_web_sm
!python -m spacy download en_core_web_sm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting zh-core-web-sm==3.4.0
  Downloading https://github.com/explosion/spacy-models/releases/download/zh_core_web_sm-3.4.0/zh_core_web_sm-3.4.0-py3-none-any.whl (48.4 MB)
[K     |████████████████████████████████| 48.4 MB 1.5 MB/s 
Collecting spacy-pkuseg<0.1.0,>=0.0.27
  Downloading spacy_pkuseg-0.0.32-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.4 MB)
[K     |████████████████████████████████| 2.4 MB 15.0 MB/s 
Installing collected packages: spacy-pkuseg, zh-core-web-sm
Successfully installed spacy-pkuseg-0.0.32 zh-core-web-sm-3.4.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('zh_core_web_sm')
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-sm==3.4.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.

In [None]:

def getJsonFile(filePath,dir_path = "/content/drive/MyDrive/Colab Notebooks/ithome/torchtext_anki/"):
    """Load a JSON Lines file located in ``dir_path`` into a DataFrame.

    Args:
        filePath: File name (or relative path) inside ``dir_path``. An
            absolute path is used as is.
        dir_path: Directory that holds the dataset; a trailing slash is
            optional.

    Returns:
        pandas.DataFrame with one row per JSON record (one record per line).
    """
    # os.path.join adds the separator only when it is missing, so a directory
    # given without a trailing slash no longer produces a mangled path.
    return pd.read_json(os.path.join(dir_path, filePath), lines=True, orient='records')

# Load the train and test splits from the default dataset directory.
train_json = getJsonFile("anki_train.json")
test_json = getJsonFile("anki_test.json")

# Stack both splits row-wise; the vocabularies are built from this combined frame.
total_json = pd.concat([train_json, test_json], axis=0)

# spaCy-backed tokenizers for the Chinese and English sides.
zh_tokenizer = get_tokenizer('spacy', language='zh_core_web_sm')
en_tokenizer = get_tokenizer('spacy', language='en_core_web_sm')

def tokenize_eng(text):
    """Tokenize an English sentence.

    The text is lower-cased and a space is inserted in front of every
    '.', '!' and '?' before it is handed to the English tokenizer.
    """
    lowered = text.lower()
    spaced = re.sub(r"([.!?])", r" \1", lowered)
    tokens = en_tokenizer(spaced)
    return tokens

def tokenize_zh(text):
    """Tokenize a Chinese sentence.

    The text is lower-cased, and every character that is neither an ASCII
    letter or digit nor inside the U+4E00..U+9FA5 range is replaced by a
    space before the Chinese tokenizer runs.
    """
    cleaned = re.sub(r'[^\u4e00-\u9fa5A-Za-z0-9]', ' ', text.lower())
    return zh_tokenizer(cleaned)


In [None]:

def build_vocab(sentence_list, tokenizer):
  """Build a torchtext vocabulary from raw sentences.

  Args:
    sentence_list: Iterable of sentence strings.
    tokenizer: Callable that turns one sentence into a list of tokens.

  Returns:
    The object returned by ``vocab`` for the token counts, with the special
    tokens '<unk>', '<bos>', '<eos>' and '<pad>' and a minimum frequency of 1.
  """
  token_counts = Counter(
      token for sentence in sentence_list for token in tokenizer(sentence)
  )
  special_tokens = ['<unk>', '<bos>', '<eos>', '<pad>']
  return vocab(token_counts, min_freq=1, specials=special_tokens)

# chinese.build_vocab(train_data, max_size=50000, min_freq=50, vectors="glove.6B.100d")
# Build one vocabulary per language from the combined train + test sentences.
en_vocab = build_vocab(total_json.English, tokenize_eng)
zh_vocab = build_vocab(total_json.Chinese, tokenize_zh)


# Report both vocabulary sizes (the message is printed in Chinese).
print ("中文語料的字元表長度: " , len(zh_vocab.vocab) , ", 英文的字元表長度: " ,len(en_vocab.vocab))

# Spot checks: the token stored at index 1 and the index assigned to 'arabic'.
print(en_vocab.vocab.get_itos()[1])
print(en_vocab.vocab.get_stoi()['arabic'])

中文語料的字元表長度:  14461 , 英文的字元表長度:  6933
<bos>
4291


In [None]:

def data_process(sentence_list):
  """Tokenize every row of a frame that has ``Chinese`` and ``English`` columns.

  Returns:
    A list with one ``(chinese_tokens, english_tokens)`` tuple per row, in
    row order.
  """
  return [
      (tokenize_zh(row.Chinese), tokenize_eng(row.English))
      for _, row in sentence_list.iterrows()
  ]

# Tokenize both splits into (Chinese tokens, English tokens) pairs.
train_data = data_process(train_json)
test_data = data_process(test_json)


# Element 0 of a pair is the Chinese side and element 1 the English side, so
# the labels have to say so.
print ("Sample Chinese:", test_data[0][0] , "=> English:", test_data[0][1])


Sample English: ['我', '最近', '忙', '得', '很'] => Chinese: ['i', "'ve", 'been', 'very', 'busy', 'lately', '.']


In [None]:

def translate_sentence(model, sentence_token, zh_vocab, en_vocab, device, max_length=25):
    """Greedily decode a translation for one tokenized source sentence.

    Args:
        model: Callable as ``model(src, trg)`` where ``src`` has shape
            (src_len, 1) and ``trg`` has shape (trg_len, 1); it must return
            scores of shape (trg_len, 1, target_vocab_size). The model is not
            switched to eval mode here; the caller does that.
        sentence_token: Source tokens without ``<bos>`` / ``<eos>``.
        zh_vocab: Source vocabulary exposing ``.vocab.get_stoi()``.
        en_vocab: Target vocabulary exposing ``.vocab.get_stoi()`` and
            ``.vocab.get_itos()``.
        device: Device on which the index tensors are created.
        max_length: Maximum number of tokens to generate.

    Returns:
        The generated target tokens without the leading ``<bos>``. The
        trailing ``<eos>`` is included when the model produced it within
        ``max_length`` steps.

    Raises:
        KeyError: If a source token is unknown and the source vocabulary has
            no ``<unk>`` entry to fall back to.
    """
    # Fetch every lookup table once instead of once per token / decoding step.
    src_stoi = zh_vocab.vocab.get_stoi()
    trg_stoi = en_vocab.vocab.get_stoi()
    trg_itos = en_vocab.vocab.get_itos()
    unk_idx = src_stoi.get("<unk>")
    eos_idx = trg_stoi["<eos>"]

    def to_index(token):
        # Unknown tokens fall back to <unk>; without an <unk> entry the plain
        # lookup raises KeyError.
        if token in src_stoi or unk_idx is None:
            return src_stoi[token]
        return unk_idx

    # Wrap the sentence in the begin/end markers.
    sentence = ['<bos>'] + list(sentence_token) + ['<eos>']

    # Map the tokens to vocabulary indices ...
    text_to_indices = [to_index(token) for token in sentence]

    # ... and turn the index list into a (src_len, 1) tensor.
    sentence_tensor = torch.LongTensor(text_to_indices).unsqueeze(1).to(device)

    # The decoder input starts with the begin marker only.
    outputs = [trg_stoi["<bos>"]]

    for _ in range(max_length):
        trg_tensor = torch.LongTensor(outputs).unsqueeze(1).to(device)

        with torch.no_grad():
            output = model(sentence_tensor, trg_tensor)

        # Keep the highest-scoring token of the last position.
        best_guess = output.argmax(2)[-1, :].item()
        outputs.append(best_guess)

        # Stop at the end marker, otherwise decoding would run to max_length.
        if best_guess == eos_idx:
            break

    # Map the indices back to tokens (itos = integer to string).
    translated_sentence = [trg_itos[idx] for idx in outputs]
    # Remove the start token.
    return translated_sentence[1:]



def bleu(data, model, tokenize_zh, zh_vocab, en_vocab, device):
    """Corpus-level BLEU of greedy translations against single references.

    Args:
        data: Iterable of ``(source_tokens, target_tokens)`` pairs.
        model: Translation model; it is switched to eval mode and left there.
        tokenize_zh: Unused; kept so existing call sites keep working.
        zh_vocab: Source vocabulary passed on to ``translate_sentence``.
        en_vocab: Target vocabulary passed on to ``translate_sentence``.
        device: Device passed on to ``translate_sentence``.

    Returns:
        The value ``bleu_score`` computes for the collected predictions and
        references.
    """
    model.eval()
    targets_corpus = []
    outputs_corpus = []

    for src_token, trg_token in data:
        prediction = translate_sentence(model, src_token, zh_vocab, en_vocab, device)

        # Drop the end marker only when it is really there: decoding that
        # stopped at max_length carries no <eos>, and an unconditional slice
        # would throw away a genuine token.
        if prediction and prediction[-1] == '<eos>':
            prediction = prediction[:-1]

        # One reference per candidate, hence the extra list level.
        targets_corpus.append([trg_token])
        outputs_corpus.append(prediction)

    return bleu_score(outputs_corpus, targets_corpus)

def save_checkpoint(state, filename="/content/drive/MyDrive/Colab Notebooks/ithome/checkpoints/seq2seq_transformer.pth.tar"):
    """Serialize ``state`` to ``filename`` with ``torch.save``.

    Args:
        state: Object to store (the notebook passes a dict holding the model
            and optimizer state dicts).
        filename: Destination path. Missing parent directories are created.
    """
    print("=> Saving checkpoint")
    # torch.save does not create directories, so a fresh Drive folder layout
    # would otherwise make the very first save fail.
    if isinstance(filename, (str, os.PathLike)):
        target_dir = os.path.dirname(filename)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)
    torch.save(state, filename)


def load_checkpoint(checkpoint, model, optimizer):
    """Restore ``model`` and ``optimizer`` from a checkpoint mapping.

    ``checkpoint["state_dict"]`` is loaded into the model first, then
    ``checkpoint["optimizer"]`` into the optimizer.
    """
    print("=> Loading checkpoint")
    for target, key in ((model, "state_dict"), (optimizer, "optimizer")):
        target.load_state_dict(checkpoint[key])

In [None]:
class Transformer(nn.Module):
    """Seq2seq model: learned token + position embeddings around ``nn.Transformer``.

    All tensors are sequence-first: ``src`` is (src_len, N) and ``trg`` is
    (trg_len, N), both holding vocabulary indices.
    """

    def __init__(
        self,
        embedding_size,
        src_vocab_size,
        trg_vocab_size,
        src_pad_idx,
        num_heads,
        num_encoder_layers,
        num_decoder_layers,
        forward_expansion,
        dropout,
        max_len,
        device,
    ):
        """Create the embeddings, the transformer core and the output layer.

        Args:
            embedding_size: Width of the embeddings and of the transformer.
            src_vocab_size: Number of entries in the source vocabulary.
            trg_vocab_size: Number of entries in the target vocabulary.
            src_pad_idx: Source index that marks padding positions.
            num_heads: Number of attention heads.
            num_encoder_layers: Number of encoder layers.
            num_decoder_layers: Number of decoder layers.
            forward_expansion: Handed to ``nn.Transformer`` unchanged as
                ``dim_feedforward`` (it is not multiplied by
                ``embedding_size``).
            dropout: Dropout probability for the embeddings and the core.
            max_len: Number of positions in the position embeddings, i.e. the
                longest sequence the model accepts.
            device: Kept as ``self.device`` for existing users; ``forward``
                creates its helper tensors on the device of its inputs.
        """
        super(Transformer, self).__init__()
        self.src_word_embedding = nn.Embedding(src_vocab_size, embedding_size)
        self.src_position_embedding = nn.Embedding(max_len, embedding_size)
        self.trg_word_embedding = nn.Embedding(trg_vocab_size, embedding_size)
        self.trg_position_embedding = nn.Embedding(max_len, embedding_size)

        self.device = device
        self.max_len = max_len

        # Built-in transformer. Keyword arguments make the mapping explicit.
        # NOTE(review): forward_expansion becomes the feed-forward width
        # itself. Changing that would alter parameter shapes and invalidate
        # existing checkpoints, so it is intentionally left as is.
        self.transformer = nn.Transformer(
            d_model=embedding_size,
            nhead=num_heads,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            dim_feedforward=forward_expansion,
            dropout=dropout,
        )
        self.fc_out = nn.Linear(embedding_size, trg_vocab_size)
        self.dropout = nn.Dropout(dropout)
        self.src_pad_idx = src_pad_idx

    def make_src_mask(self, src):
        """Return a boolean (N, src_len) mask that is True at padding positions."""
        # src is (src_len, N); the key padding mask is batch-first.
        return src.transpose(0, 1) == self.src_pad_idx

    def forward(self, src, trg):
        """Score every target position.

        Args:
            src: Source indices, shape (src_len, N).
            trg: Target input indices, shape (trg_len, N).

        Returns:
            Tensor of shape (trg_len, N, trg_vocab_size) with unnormalized
            scores.

        Raises:
            ValueError: If either sequence is longer than ``max_len``.
        """
        src_seq_length, src_batch = src.shape
        trg_seq_length, trg_batch = trg.shape

        # Fail with a readable message instead of an embedding index error
        # (which surfaces as a device-side assert on CUDA).
        if max(src_seq_length, trg_seq_length) > self.max_len:
            raise ValueError(
                f"sequence length (src={src_seq_length}, trg={trg_seq_length}) "
                f"exceeds max_len={self.max_len}"
            )

        # Position ids 0..len-1 for every batch column, created directly on
        # the device of the inputs.
        src_positions = (
            torch.arange(0, src_seq_length, device=src.device)
            .unsqueeze(1)
            .expand(src_seq_length, src_batch)
        )
        trg_positions = (
            torch.arange(0, trg_seq_length, device=trg.device)
            .unsqueeze(1)
            .expand(trg_seq_length, trg_batch)
        )

        embed_src = self.dropout(
            self.src_word_embedding(src) + self.src_position_embedding(src_positions)
        )
        embed_trg = self.dropout(
            self.trg_word_embedding(trg) + self.trg_position_embedding(trg_positions)
        )

        src_padding_mask = self.make_src_mask(src)
        # Causal mask: without it a target position could look at the tokens
        # that follow it, i.e. at the answer.
        trg_mask = self.transformer.generate_square_subsequent_mask(trg_seq_length).to(
            trg.device
        )

        out = self.transformer(
            embed_src,
            embed_trg,
            src_key_padding_mask=src_padding_mask,
            tgt_mask=trg_mask,
        )
        return self.fc_out(out)


In [None]:
checkPointPath = "/content/drive/MyDrive/Colab Notebooks/ithome/checkpoints/seq2seq_transformer.pth.tar"

# Train on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# load_model = True
save_model = True

# Training hyperparameters
num_epochs = 10000
learning_rate = 3e-4
batch_size = 32

# Model hyperparameters
src_vocab_size = len(zh_vocab.vocab)
trg_vocab_size = len(en_vocab.vocab)
embedding_size = 512
num_heads = 8
num_encoder_layers = 3
num_decoder_layers = 3
dropout = 0.10
max_len = 100
forward_expansion = 4
# The source language is Chinese, so the source padding id is read from the
# Chinese vocabulary. Both vocabularies are built with the same specials, so
# the value is the same as before; this just uses the matching table.
src_pad_idx = zh_vocab.vocab.get_stoi()["<pad>"]

# Tensorboard to get nice loss plot
writer = SummaryWriter("runs/loss_plot")
# Global batch counter used as the x-axis of the loss plot.
step = 0

In [None]:

# Fetch the token -> index tables once, so batching does not re-evaluate
# get_stoi() for every single token.
ZH_STOI = zh_vocab.vocab.get_stoi()
EN_STOI = en_vocab.vocab.get_stoi()

# Special-token ids of the source (Chinese) vocabulary; names kept as before.
PAD_IDX = ZH_STOI['<pad>']
BOS_IDX = ZH_STOI['<bos>']
EOS_IDX = ZH_STOI['<eos>']

# Special-token ids of the target (English) vocabulary, so the English batch
# does not depend on both vocabularies sharing the same special-token order.
EN_PAD_IDX = EN_STOI['<pad>']
EN_BOS_IDX = EN_STOI['<bos>']
EN_EOS_IDX = EN_STOI['<eos>']


def _encode_tokens(tokens, stoi, bos_idx, eos_idx):
  """Map tokens to ids and wrap them in the begin/end markers (1-D long tensor)."""
  ids = [bos_idx] + [stoi[token] for token in tokens] + [eos_idx]
  return torch.tensor(ids, dtype=torch.long)


def generate_batch(data_batch):
  """Collate (Chinese tokens, English tokens) pairs into two padded tensors.

  Args:
    data_batch: List of ``(zh_tokens, en_tokens)`` pairs.

  Returns:
    ``(zh_batch, en_batch)``; each has shape (longest_sequence, batch_size)
    and is padded with the padding id of its own vocabulary.
  """
  zh_batch, en_batch = [], []
  for zh_token, en_token in data_batch:
    zh_batch.append(_encode_tokens(zh_token, ZH_STOI, BOS_IDX, EOS_IDX))
    en_batch.append(_encode_tokens(en_token, EN_STOI, EN_BOS_IDX, EN_EOS_IDX))

  zh_batch = pad_sequence(zh_batch, padding_value=PAD_IDX)
  en_batch = pad_sequence(en_batch, padding_value=EN_PAD_IDX)
  return zh_batch, en_batch

# Training batches are reshuffled every epoch.
train_iter = DataLoader(train_data, batch_size=batch_size,
                        shuffle=True, collate_fn=generate_batch)


# Evaluation batches keep the dataset order so that results are repeatable.
test_iter = DataLoader(test_data, batch_size=batch_size,
                        shuffle=False, collate_fn=generate_batch)


In [None]:

model = Transformer(
    embedding_size,
    src_vocab_size,
    trg_vocab_size,
    src_pad_idx,
    num_heads,
    num_encoder_layers,
    num_decoder_layers,
    forward_expansion,
    dropout,
    max_len,
    device,
).to(device)

optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Cut the learning rate to a tenth once the monitored loss has stopped
# improving for 10 scheduler steps.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, factor=0.1, patience=10, verbose=True
)

# Padding positions of the English targets do not contribute to the loss.
pad_idx = en_vocab.vocab.get_stoi()["<pad>"]
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

# Resume from an earlier run when a checkpoint exists. map_location lets a
# checkpoint written on a GPU runtime be restored on a CPU-only one.
if os.path.isfile(checkPointPath):
    load_checkpoint(torch.load(checkPointPath, map_location=device), model, optimizer)

model.eval()
# running on entire test data takes a while
# (the slice scores test pairs 1..99 only)


score = bleu(test_data[1:100], model, tokenize_zh, zh_vocab, en_vocab, device)
print(f"Bleu score {score*100:.2f}")

=> Loading checkpoint
Bleu score 26.37


In [None]:

# Fixed example sentence, translated at the start of every epoch as a quick
# qualitative progress check.
sentence = "你想要来我家看猫吗?"
example_token = tokenize_zh(sentence)


for epoch in range(num_epochs):
    print(f"[Epoch {epoch} / {num_epochs}]")

    model.eval()

    translated_sentence = translate_sentence(
        model, example_token, zh_vocab, en_vocab, device, max_length=50
    )

    print(f"Translated example sentence: \n {translated_sentence}")
    model.train()
    losses = []

    for inp_data, target in train_iter:
        # Get input and targets and get to cuda
        inp_data = inp_data.to(device)
        target = target.to(device)

        # Forward prop: the decoder sees every target token except the last.
        output = model(inp_data, target[:-1, :])

        # Output is (trg_len, batch_size, output_dim), but the loss expects
        # (positions, classes) against (positions,), so both are flattened.
        # The targets are shifted by one, which also drops the start token.
        output = output.reshape(-1, output.shape[2])
        target = target[1:].reshape(-1)

        optimizer.zero_grad()

        loss = criterion(output, target)
        loss_value = loss.item()
        losses.append(loss_value)

        # Back prop
        loss.backward()
        # Clip to avoid exploding gradient issues, makes sure grads are
        # within a healthy range
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

        # Gradient descent step
        optimizer.step()

        # Plot to tensorboard; log the plain float rather than the tensor.
        writer.add_scalar("Training loss", loss_value, global_step=step)
        step += 1

    mean_loss = sum(losses) / len(losses)
    scheduler.step(mean_loss)

    # Save after the epoch has trained, so the checkpoint always holds the
    # most recent weights (including those of the final epoch).
    if save_model:
        checkpoint = {
            "state_dict": model.state_dict(),
            "optimizer": optimizer.state_dict(),
        }
        save_checkpoint(checkpoint)

# Make sure buffered loss points reach the event file.
writer.flush()

# running on entire test data takes a while
score = bleu(test_data[1:100], model, tokenize_zh, zh_vocab, en_vocab, device)
print(f"Bleu score {score*100:.2f}")

[Epoch 0 / 10000]
=> Saving checkpoint
Translated example sentence: 
 ['would', 'you', 'like', 'to', 'go', 'to', 'my', 'family', '?', '<eos>']
[Epoch 1 / 10000]
=> Saving checkpoint
Translated example sentence: 
 ['would', 'you', 'like', 'to', 'have', 'a', 'supermarket', '?', '<eos>']
[Epoch 2 / 10000]
=> Saving checkpoint
Translated example sentence: 
 ['would', 'you', 'like', 'to', 'have', 'a', 'cat', '?', '<eos>']
[Epoch 3 / 10000]
=> Saving checkpoint
Translated example sentence: 
 ['would', 'you', 'like', 'to', 'go', 'to', 'my', 'house', '?', '<eos>']
[Epoch 4 / 10000]
=> Saving checkpoint
Translated example sentence: 
 ['would', 'you', 'like', 'to', 'come', 'to', 'my', 'house', '?', '<eos>']
[Epoch 5 / 10000]
=> Saving checkpoint
Translated example sentence: 
 ['do', 'you', 'want', 'to', 'come', 'to', 'my', 'house', 'on', 'a', 'cat', '?', '<eos>']
[Epoch 6 / 10000]
=> Saving checkpoint
Translated example sentence: 
 ['would', 'you', 'like', 'to', 'come', 'to', 'my', 'place', '?',

# Next 
