In [1]:
!pip install torchtext==0.6.0

Collecting torchtext==0.6.0
[?25l  Downloading https://files.pythonhosted.org/packages/f2/17/e7c588245aece7aa93f360894179374830daf60d7ed0bbb59332de3b3b61/torchtext-0.6.0-py3-none-any.whl (64kB)
[K     |████████████████████████████████| 71kB 4.1MB/s 
[?25hCollecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/e5/2d/6d4ca4bef9a67070fa1cac508606328329152b1df10bdf31fb6e4e727894/sentencepiece-0.1.94-cp36-cp36m-manylinux2014_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 7.2MB/s 
Installing collected packages: sentencepiece, torchtext
  Found existing installation: torchtext 0.3.1
    Uninstalling torchtext-0.3.1:
      Successfully uninstalled torchtext-0.3.1
Successfully installed sentencepiece-0.1.94 torchtext-0.6.0


In [2]:
import torch
import spacy
from torchtext.data.metrics import bleu_score
import sys


def translate_sentence(model, sentence, german, english, device, max_length=50):
    """Greedily translate one German sentence into a list of English tokens.

    Args:
        model: Module exposing ``encoder`` and ``decoder`` sub-modules.
        sentence: Either a raw string (tokenized here with spaCy's German
            pipeline) or an iterable of already-split string tokens.
        german: Source field; its ``init_token``, ``eos_token`` and
            ``vocab.stoi`` are used to build the input indices.
        english: Target field; its ``vocab.stoi`` / ``vocab.itos`` are used
            to seed the decoder and map predictions back to tokens.
        device: Device the index tensors are moved to.
        max_length: Maximum number of decoder steps.

    Returns:
        The predicted tokens without the leading ``<sos>``. The trailing
        ``<eos>`` is included when the model emitted it within ``max_length``
        steps; otherwise the list simply ends after ``max_length`` tokens.
    """
    if isinstance(sentence, str):
        # Load the spaCy pipeline only for raw strings, and only once: it is
        # expensive and unnecessary when the caller passes token lists.
        spacy_ger = getattr(translate_sentence, "_spacy_ger", None)
        if spacy_ger is None:
            spacy_ger = spacy.load("de")
            translate_sentence._spacy_ger = spacy_ger
        # The vocabulary is lower-cased, so lower-case the tokens as well.
        tokens = [token.text.lower() for token in spacy_ger(sentence)]
    else:
        tokens = [token.lower() for token in sentence]

    # Wrap the sentence in the start / end markers of the source field.
    tokens = [german.init_token, *tokens, german.eos_token]

    # Map each source token to its vocabulary index.
    text_to_indices = [german.vocab.stoi[token] for token in tokens]

    # Shape (seq_len, 1): a batch containing a single sentence.
    sentence_tensor = torch.LongTensor(text_to_indices).unsqueeze(1).to(device)

    eos_idx = english.vocab.stoi["<eos>"]
    outputs = [english.vocab.stoi["<sos>"]]

    # Decode in eval mode so dropout is disabled, then restore the caller's
    # mode so calling this in the middle of training has no lasting effect.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            hidden, cell = model.encoder(sentence_tensor)

            for _ in range(max_length):
                previous_word = torch.LongTensor([outputs[-1]]).to(device)
                output, hidden, cell = model.decoder(previous_word, hidden, cell)
                best_guess = output.argmax(1).item()
                outputs.append(best_guess)

                # Model predicts it's the end of the sentence
                if best_guess == eos_idx:
                    break
    finally:
        model.train(was_training)

    # Drop the leading start token before mapping indices back to words.
    return [english.vocab.itos[idx] for idx in outputs[1:]]


def bleu(data, model, german, english, device):
    """Compute the corpus BLEU score of ``model`` over ``data``.

    Args:
        data: Iterable of examples exposing ``src`` and ``trg`` attributes.
        model: Translation model forwarded to ``translate_sentence``.
        german: Source field forwarded to ``translate_sentence``.
        english: Target field forwarded to ``translate_sentence``.
        device: Device forwarded to ``translate_sentence``.

    Returns:
        The value returned by ``bleu_score`` for all predictions, each paired
        with its single reference translation.
    """
    targets = []
    outputs = []

    for example in data:
        src = vars(example)["src"]
        trg = vars(example)["trg"]

        prediction = translate_sentence(model, src, german, english, device)
        # Strip the end marker only when it is really there: a translation cut
        # off at max_length has no <eos>, and blindly dropping the last token
        # would throw away a real word.
        if prediction and prediction[-1] == "<eos>":
            prediction = prediction[:-1]

        # bleu_score expects a list of references per candidate.
        targets.append([trg])
        outputs.append(prediction)

    return bleu_score(outputs, targets)


def save_checkpoint(state, filename="my_checkpoint.pth.tar"):
    """Announce the save on stdout, then serialize ``state`` to ``filename``.

    Args:
        state: Object handed to ``torch.save``.
        filename: Destination path of the checkpoint file.
    """
    print("=> Saving checkpoint")
    torch.save(obj=state, f=filename)


def load_checkpoint(checkpoint, model, optimizer):
    """Restore ``model`` and ``optimizer`` from a loaded checkpoint mapping.

    Args:
        checkpoint: Mapping with a ``"state_dict"`` entry for the model and an
            ``"optimizer"`` entry for the optimizer.
        model: Object whose ``load_state_dict`` receives the model weights.
        optimizer: Object whose ``load_state_dict`` receives the optimizer state.
    """
    print("=> Loading checkpoint")
    # Model first, optimizer second.
    for receiver, key in ((model, "state_dict"), (optimizer, "optimizer")):
        receiver.load_state_dict(checkpoint[key])


In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.datasets import Multi30k
from torchtext.data import Field,BucketIterator
import numpy as np
import spacy
import random
from torch.utils.tensorboard import SummaryWriter

In [None]:
!python -m spacy download de

In [6]:
# Load the spaCy pipelines used for tokenization: German first, then English.
spacy_ger, spacy_eng = (spacy.load(lang_code) for lang_code in ('de', 'en'))

In [7]:
def tokenizer_ger(text):
  """Split ``text`` into a list of token strings with the ``spacy_ger`` tokenizer."""
  pieces = []
  for tok in spacy_ger.tokenizer(text):
    pieces.append(tok.text)
  return pieces
def tokenizer_eng(text):
  """Split ``text`` into a list of token strings with the ``spacy_eng`` tokenizer."""
  pieces = []
  for tok in spacy_eng.tokenizer(text):
    pieces.append(tok.text)
  return pieces

In [8]:
# Source (German) field: custom tokenizer, lower-casing, and explicit
# start / end-of-sentence markers.
german = Field(
  tokenize=tokenizer_ger,
  lower=True,
  init_token='<sos>',
  eos_token='<eos>',
)
# Target (English) field, configured the same way with the English tokenizer.
english = Field(
  tokenize=tokenizer_eng,
  lower=True,
  init_token='<sos>',
  eos_token='<eos>',
)

In [9]:
# Multi30k splits: '.de' files are parsed with the german field (src) and
# '.en' files with the english field (trg).
train_data, validation_data, test_data = Multi30k.splits(
  exts=('.de', '.en'),
  fields=(german, english),
)

downloading training.tar.gz


training.tar.gz: 100%|██████████| 1.21M/1.21M [00:01<00:00, 620kB/s]


downloading validation.tar.gz


validation.tar.gz: 100%|██████████| 46.3k/46.3k [00:00<00:00, 224kB/s]


downloading mmt_task1_test2016.tar.gz


mmt_task1_test2016.tar.gz: 100%|██████████| 66.2k/66.2k [00:00<00:00, 215kB/s]


In [10]:
# Build both vocabularies from the training split only, using the same
# max_size / min_freq settings for the source and target languages.
german.build_vocab(
  train_data,
  max_size=10000,
  min_freq=2,
)
english.build_vocab(
  train_data,
  max_size=10000,
  min_freq=2,
)

In [11]:
class Encoder(nn.Module):
  """LSTM encoder: embeds source token indices and returns the final LSTM state.

  Args:
    input_size: Number of rows in the embedding table (source vocabulary size).
    embedding_size: Width of each embedding vector.
    hidden_size: Hidden width of the LSTM.
    num_layers: Number of stacked LSTM layers.
    p: Dropout probability, applied to the embeddings and passed to the LSTM.
  """

  def __init__(self, input_size, embedding_size, hidden_size, num_layers, p):
    super().__init__()
    self.hidden_size, self.num_layers = hidden_size, num_layers

    # Sub-modules are created in the same order as before so parameter
    # initialization and registration order are unchanged.
    self.dropout = nn.Dropout(p)
    self.embedding = nn.Embedding(input_size, embedding_size)
    self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers, dropout=p)

  def forward(self, x):
    """Return the ``(hidden, cell)`` state produced by the LSTM for index tensor ``x``."""
    embedded = self.dropout(self.embedding(x))
    # The per-step outputs are not needed; only the final state is returned.
    _, (hidden, cell) = self.rnn(embedded)
    return hidden, cell

In [12]:
class Decoder(nn.Module):
  """LSTM decoder that predicts one target token per call.

  Args:
    input_size: Number of rows in the embedding table (target vocabulary size).
    embedding_size: Width of each embedding vector.
    hidden_size: Hidden width of the LSTM.
    output_size: Number of scores produced per input token.
    num_layers: Number of stacked LSTM layers.
    p: Dropout probability applied to the embeddings (the LSTM itself is
      created without a dropout argument).
  """

  def __init__(self, input_size, embedding_size, hidden_size, output_size, num_layers, p):
    super().__init__()
    self.hidden_size, self.num_layers = hidden_size, num_layers

    # Sub-modules are created in the same order as before so parameter
    # initialization and registration order are unchanged.
    self.dropout = nn.Dropout(p)
    self.embedding = nn.Embedding(input_size, embedding_size)
    self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers)
    self.fc = nn.Linear(hidden_size, output_size)

  def forward(self, x, hidden, cell):
    """Run a single decoding step.

    ``x`` gets a leading length-1 dimension before embedding, and that
    dimension is squeezed out of the scores again before returning.

    Returns:
      ``(predictions, hidden, cell)`` with the updated LSTM state.
    """
    step_input = self.dropout(self.embedding(x.unsqueeze(0)))
    step_output, (hidden, cell) = self.rnn(step_input, (hidden, cell))
    scores = self.fc(step_output).squeeze(0)
    return scores, hidden, cell

In [13]:
class Seq2Seq(nn.Module):
  """Encoder-decoder wrapper that decodes the target one step at a time.

  Args:
    encoder: Module mapping a source tensor to a ``(hidden, cell)`` state.
    decoder: Module mapping ``(token, hidden, cell)`` to
      ``(scores, hidden, cell)``; it must expose its output layer as ``fc``.
  """

  def __init__(self, encoder, decoder):
    super().__init__()
    # Decoder is registered before the encoder; keep this order so the order
    # of ``parameters()`` stays compatible with saved optimizer state.
    self.decoder = decoder
    self.encoder = encoder

  def forward(self, source, target, teacher_force_ratio=0.5):
    """Produce decoder scores for every target position after the first.

    Args:
      source: Source index tensor; dimension 1 is used as the batch size.
      target: Target index tensor; dimension 0 is used as the target length
        and ``target[0]`` is fed to the decoder as the first input.
      teacher_force_ratio: Probability of feeding the ground-truth token
        (instead of the model's own best guess) as the next decoder input.

    Returns:
      Tensor of shape ``(target_len, batch_size, vocab_size)``. Row 0 is left
      at zero because decoding starts at position 1.
    """
    batch_size = source.shape[1]
    target_len = target.shape[0]
    # Take the vocabulary size from the decoder's output layer and the device
    # from the input, so the module does not depend on notebook globals.
    target_vocab_size = self.decoder.fc.out_features

    outputs = torch.zeros(
      target_len, batch_size, target_vocab_size, device=source.device
    )

    hidden, cell = self.encoder(source)

    # First decoder input is the first target row.
    x = target[0]

    for t in range(1, target_len):
      output, hidden, cell = self.decoder(x, hidden, cell)
      outputs[t] = output
      best_guess = output.argmax(1)
      # Teacher forcing: sometimes feed the reference token, sometimes the
      # model's own prediction.
      x = target[t] if random.random() < teacher_force_ratio else best_guess
    return outputs

In [14]:
# --- Training schedule ---
num_epochs = 20
learning_rate = 0.001
batch_size = 64

# --- Run configuration ---
load_model = False  # when True, a later cell restores a saved checkpoint
device = torch.device('cpu')

# --- Model dimensions ---
# Vocabulary-dependent sizes are read from the built vocabularies.
input_size_encoder = len(german.vocab)
input_size_decoder = len(english.vocab)
output_size = len(english.vocab)
encoder_embedding_size = 300
decoder_embedding_size = 300
hidden_size = 1024
num_layers = 2

# --- Regularization ---
enc_dropout = 0.5
dec_dropout = 0.5

In [15]:
# TensorBoard writer for the training-loss curve, plus the global step
# counter that the training loop increments once per batch.
writer = SummaryWriter('runs/loss_plot')
step = 0

In [16]:
# Batch iterators for the three splits. Examples are keyed by source length
# (sort_key) and sorted within each batch.
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
  (train_data, validation_data, test_data),
  batch_size=batch_size,
  sort_within_batch=True,
  sort_key=lambda example: len(example.src),
  device=device,
)

In [17]:
# Instantiate the encoder and decoder from the hyperparameters above and
# move each to the configured device.
encoder_net = Encoder(
  input_size=input_size_encoder,
  embedding_size=encoder_embedding_size,
  hidden_size=hidden_size,
  num_layers=num_layers,
  p=enc_dropout,
).to(device)
decoder_net = Decoder(
  input_size=input_size_decoder,
  embedding_size=decoder_embedding_size,
  hidden_size=hidden_size,
  output_size=output_size,
  num_layers=num_layers,
  p=dec_dropout,
).to(device)

In [18]:
# Wrap encoder and decoder into the full model and optimize all of its
# parameters with Adam.
model = Seq2Seq(encoder=encoder_net, decoder=decoder_net).to(device)
optimizer = optim.Adam(params=model.parameters(), lr=learning_rate)

In [19]:
# Cross-entropy loss that skips target positions holding the padding index.
pad_idx = english.vocab.stoi['<pad>']
criterion = nn.CrossEntropyLoss(
  ignore_index=pad_idx,
)

In [20]:
if load_model:
  # The path must match the default filename written by save_checkpoint
  # ("my_checkpoint.pth.tar"); map_location places the tensors on the
  # configured device regardless of where the checkpoint was saved.
  load_checkpoint(
    torch.load('my_checkpoint.pth.tar', map_location=device),
    model,
    optimizer,
  )

In [None]:
for epoch in range(num_epochs):
  print(f"Epoch [{epoch}/{num_epochs}]")

  # Make sure dropout is active even if an earlier cell left the model in
  # eval mode.
  model.train()

  for batch in train_iterator:
    inp_data = batch.src.to(device)
    target = batch.trg.to(device)

    output = model(inp_data, target)

    # Skip position 0 (never predicted by the model) and flatten the
    # sequence and batch dimensions so the loss sees one row per token.
    output = output[1:].reshape(-1, output.shape[2])
    target = target[1:].reshape(-1)

    optimizer.zero_grad()
    loss = criterion(output, target)
    loss.backward()
    # Clip the gradient norm before the optimizer step.
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
    optimizer.step()

    writer.add_scalar('Training Loss', loss.item(), global_step=step)
    step += 1

  # Checkpoint after the epoch's updates so the weights of the final epoch
  # are saved too (saving before training never persisted the last epoch).
  checkpoint = {'state_dict': model.state_dict(), 'optimizer': optimizer.state_dict()}
  save_checkpoint(checkpoint)



Epoch [0/20]
=> Saving checkpoint
Epoch [1/20]
=> Saving checkpoint
Epoch [2/20]
=> Saving checkpoint
Epoch [3/20]
=> Saving checkpoint
