In [None]:
pip install torchtext==0.6.0

In [1]:
!python -m spacy download en
!python -m spacy download de

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.7/dist-packages/en_core_web_sm -->
/usr/local/lib/python3.7/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('de_core_news_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.7/dist-packages/de_core_news_sm -->
/usr/local/lib/python3.7/dist-packages/spacy/data/de
You can now load the model via spacy.load('de')


In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import spacy
from torch.utils.tensorboard import SummaryWriter
from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator
import warnings
warnings.filterwarnings('ignore')

In [3]:
from torchtext.data.metrics import bleu_score
import sys

In [4]:
spacy_ger = spacy.load('de')
spacy_eng = spacy.load('en')

### DATA PREPROCESSING

In [5]:
def tokenize_ger(text):
  return [tok.text for tok in spacy_ger.tokenizer(text)]
def tokenize_eng(text):
  return [tok.text for tok in spacy_eng.tokenizer(text)]

In [6]:
german = Field(tokenize = tokenize_ger, lower=True, init_token='<sos>', eos_token='<eos>')
english = Field(tokenize = tokenize_eng, lower=True, init_token='<sos>', eos_token='<eos>')

In [7]:
train_data, valid_data, test_data = Multi30k.splits(
    exts = ('.de','.en'), fields = (german, english)
)

downloading training.tar.gz


training.tar.gz: 100%|██████████| 1.21M/1.21M [00:01<00:00, 1.04MB/s]


downloading validation.tar.gz


validation.tar.gz: 100%|██████████| 46.3k/46.3k [00:00<00:00, 275kB/s]


downloading mmt_task1_test2016.tar.gz


mmt_task1_test2016.tar.gz: 100%|██████████| 66.2k/66.2k [00:00<00:00, 262kB/s]


In [8]:
german.build_vocab(train_data, max_size=10000, min_freq=2)
english.build_vocab(train_data, max_size=10000, min_freq=2)

### Helper Fucntions

In [9]:



def translate_sentence(model, sentence, german, english, device, max_length=50):
    spacy_ger = spacy.load("de")


    if type(sentence) == str:
        tokens = [token.text.lower() for token in spacy_ger(sentence)]
    else:
        tokens = [token.lower() for token in sentence]

    tokens.insert(0, german.init_token)
    tokens.append(german.eos_token)

    # Go through each german token and convert to an index
    text_to_indices = [german.vocab.stoi[token] for token in tokens]
    sentence_tensor = torch.LongTensor(text_to_indices).unsqueeze(1).to(device)

    outputs = [english.vocab.stoi["<sos>"]]
    for i in range(max_length):
        trg_tensor = torch.LongTensor(outputs).unsqueeze(1).to(device)

        with torch.no_grad():
            output = model(sentence_tensor, trg_tensor)

        best_guess = output.argmax(2)[-1, :].item()
        outputs.append(best_guess)

        if best_guess == english.vocab.stoi["<eos>"]:
            break

    translated_sentence = [english.vocab.itos[idx] for idx in outputs]
    # remove start token
    return translated_sentence[1:]


def bleu(data, model, german, english, device):
    targets = []
    outputs = []

    for example in data:
        src = vars(example)["src"]
        trg = vars(example)["trg"]

        prediction = translate_sentence(model, src, german, english, device)
        prediction = prediction[:-1]  

        targets.append([trg])
        outputs.append(prediction)

    return bleu_score(outputs, targets)

### MODEL

In [10]:
class Transformer(nn.Module):
    def __init__(self,
               embedding_size, 
               src_vocab_size,
               trg_vocab_size,
               src_pad_idx,
               num_heads,
               num_encoder_layers, 
               num_decoder_layers,
               forward_nodes,
               dropout,
               max_len,
               device):
    
      super(Transformer, self).__init__()
      self.src_word_embedding = nn.Embedding(src_vocab_size, embedding_size)
      self.src_position_embedding = nn.Embedding(max_len, embedding_size)
      self.trg_word_embedding = nn.Embedding(trg_vocab_size, embedding_size)
      self.trg_position_embedding = nn.Embedding(max_len, embedding_size)
      self.device = device 
      self.transformer = nn.Transformer(
          embedding_size,
          num_heads,
          num_encoder_layers,
          num_decoder_layers,
          forward_nodes,
          dropout,
      )

      self.fc_out = nn.Linear(embedding_size, trg_vocab_size)
      self.dropout = nn.Dropout(dropout)
      self.src_pad_idx = src_pad_idx
    
    def make_src_mask(self, src):
      src_mask = src.transpose(0,1) == self.src_pad_idx
      return src_mask
    
    def forward(self, src, trg):
      src_seq_length, N = src.shape
      trg_seq_length, N = trg.shape
      
      src_positions = (torch.arange(0, src_seq_length).unsqueeze(1).expand(src_seq_length, N).to(self.device))
      trg_positions = (torch.arange(0, trg_seq_length).unsqueeze(1).expand(trg_seq_length, N).to(self.device))

      embed_src = self.dropout(self.src_word_embedding(src)+ self.src_position_embedding(src_positions))
      embed_trg = self.dropout(self.trg_word_embedding(trg)+ self.trg_position_embedding(trg_positions))

      src_padding_mask = self.make_src_mask(src)
      trg_mask = self.transformer.generate_square_subsequent_mask(trg_seq_length).to(self.device)

      out = self.transformer(
          embed_src,
          embed_trg,
          src_key_padding_mask = src_padding_mask,
          tgt_mask = trg_mask
      )
      out = self.fc_out(out)

      return out




### Training

In [11]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
num_epoch = 5
learning_rate = 3e-4
batch_size = 32

src_vocab_size = len(german.vocab)
trg_vocab_size = len(english.vocab)
embedding_size = 512
num_heads = 8
num_encoder_layers = 3
num_decoder_layers = 3
dropout = 0.10
max_len = 100 # for positional embedding
forward_nodes = 4 * embedding_size
src_pad_idx = english.vocab.stoi['<pad>']

writer = SummaryWriter("runs/loss_plot")
step = 0

In [12]:
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size = batch_size,
    sort_within_batch = True,
    sort_key = lambda x: len(x.src)
)

In [13]:
model = Transformer(
    embedding_size,
    src_vocab_size,
    trg_vocab_size,
    src_pad_idx,
    num_heads,
    num_encoder_layers,
    num_decoder_layers,
    forward_nodes,
    dropout,
    max_len,
    device
).to(device)


optimizer = optim.Adam(model.parameters(), lr= learning_rate)
pad_idx = english.vocab.stoi['<pad>']
criterion = nn.CrossEntropyLoss(ignore_index= pad_idx)

In [14]:
ex_sentence = 'ein pferd geht unter einer brücke neben einem boot'
original = 'a horse goes besides a bridge next to a boat'
for epoch in range(num_epoch):
  print(f"[Epoch {epoch} / {num_epoch}]")

  model.eval()
  translated_sentence = translate_sentence(model, ex_sentence, german, english, device, max_length=50)
  print(f"Translated example is {translated_sentence[:-1]}")

  model.train()

  for batch_idx, batch in enumerate(train_iterator):
    inp_data = batch.src.to(device)
    target = batch.trg.to(device)

    output = model(inp_data, target[:-1])
    output = output.reshape(-1, output.shape[2])
    target = target[1:].reshape(-1)
    optimizer.zero_grad()
    
    loss = criterion(output, target)
    loss.backward()

    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

    optimizer.step()

    writer.add_scalar("Training loss", loss, global_step=step)
score = bleu(test_data, model, german, english, device)
print(f"Bleu score {score*100:.2f}")




[Epoch 0 / 5]
Translated example is ['flood', 'dancing', 'footbridge', 'flood', 'dishes', 'paper', 'color', 'lunch', 'twigs', 'buy', 'twigs', 'lunch', 'wares', 'lunch', 'clouds', 'transport', 'gambling', 'theater', 'dancing', 'chopped', 'touching', 'mannequin', 'company', 'screams', 'twigs', 'gambling', 'footbridge', 'dreads', 'marathon', 'twigs', 'twigs', 'role', 'handrail', 'pulling', 'twigs', 'buy', 'sunbathing', 'marathon', 'gambling']
[Epoch 1 / 5]
Translated example is ['a', 'brown', 'and', 'a', 'woman', 'are', 'walking', 'next', 'to', 'a', 'building', '.']
[Epoch 2 / 5]
Translated example is ['a', 'horse', 'walks', 'under', 'a', 'boat', 'next', 'to', 'a', 'boat', '.']
[Epoch 3 / 5]
Translated example is ['a', 'horse', 'is', 'walking', 'under', 'a', 'bridge', 'next', 'to', 'a', 'boat', '.']
[Epoch 4 / 5]
Translated example is ['a', 'horse', 'is', 'walking', 'under', 'a', 'boat', 'beside', 'a', 'boat', '.']
Bleu score 31.64
