In [24]:
import os 
import random

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader

import warnings
warnings.filterwarnings('ignore')

from data import TranslationDataset
from rnn import RNN, RNNTools
from transformers import Transformers, TransformersTools

In [2]:
# --- user-adjustable settings, edit as needed ---

# directory handed to TranslationDataset
data_dir = 'data'

# checkpoint files for the two models
rnn_model_save_path = 'models/rnn.pth'
tra_model_save_path = 'models/transformers.pth'

# True  -> load the existing model files above
# False -> train new models
skip_training = True


In [3]:
# Create the working directories if they do not exist yet.
# The checkpoint directories are derived from the configured save paths rather
# than a hard-coded 'models', so changing a save path in the config cell cannot
# make torch.save fail on a missing parent directory after training finishes.
os.makedirs(data_dir, exist_ok=True)
for checkpoint_path in (rnn_model_save_path, tra_model_save_path):
    # a bare filename has an empty dirname, which os.makedirs rejects -> use cwd
    os.makedirs(os.path.dirname(checkpoint_path) or '.', exist_ok=True)
os.makedirs('logs', exist_ok=True)

In [4]:
# Device choice: loading saved models always runs on the CPU; training uses
# the first CUDA device when one is available and the CPU otherwise.
use_gpu = (not skip_training) and torch.cuda.is_available()
device_type = 'cuda:0' if use_gpu else 'cpu'

# override by hand here if required, e.g. device_type = 'cpu'
print("Using device type:", device_type)
device = torch.device(device_type)

Using device type: cpu


In [5]:
# build the train split first, then the test split, from the same data directory
trainset, testset = (TranslationDataset(data_dir, train=is_train) for is_train in (True, False))

print('Number of sentence pairs in the training set: ', len(trainset))
print('Number of sentence pairs in the test set: ', len(testset))

Number of sentence pairs in the training set:  8682
Number of sentence pairs in the test set:  2171


## RNN

In [6]:
# Batches for the RNN model, assembled by RNNTools.collate.
# Pinned (page-locked) host memory only helps host-to-GPU copies, so it is
# requested only when the selected device is CUDA instead of unconditionally
# (the device is forced to CPU whenever skip_training is set).
trainloader = DataLoader(
    dataset=trainset,
    batch_size=64,
    shuffle=True,
    collate_fn=RNNTools.collate,
    pin_memory=(device.type == 'cuda'),
)
testloader = DataLoader(dataset=testset, batch_size=64, shuffle=False, collate_fn=RNNTools.collate)

In [7]:
# vocabulary sizes are taken from the training split's language objects
src_vocab_size = trainset.input_lang.n_words
tgt_vocab_size = trainset.output_lang.n_words

rnn = RNN(src_vocab_size, tgt_vocab_size, embed_size=256, hidden_size=256)
# last expression of the cell: moves the model to the device and shows its structure
rnn.to(device)

RNN(
  (encoder): Encoder(
    (embedding): Embedding(4489, 256)
    (gru): GRU(256, 256)
  )
  (decoder): Decoder(
    (embedding): Embedding(2925, 256)
    (gru): GRU(256, 256)
    (out): Linear(in_features=256, out_features=2925, bias=True)
  )
)

In [8]:
if not skip_training:
    PADDING_VALUE = 0
    teacher_forcing_ratio = 0.5
    num_epochs = 2

    # target positions equal to PADDING_VALUE do not contribute to the loss
    criterion = nn.NLLLoss(ignore_index=PADDING_VALUE)
    optimizer = torch.optim.Adam(rnn.parameters(), lr=0.001)

    rnn.train()

    for epoch in range(num_epochs):
        # running sums for a batch-size-weighted mean of the per-batch losses
        total_loss, total_data = 0, 0

        for src_seqs, src_seq_lengths, tgt_seqs in trainloader:
            src_seqs = src_seqs.to(device)
            tgt_seqs = tgt_seqs.to(device)
            # dim 1 indexes the sequences of the batch
            batch_size = src_seqs.shape[1]

            # one random draw per batch decides whether teacher forcing is used
            teacher_forcing = bool(torch.rand(1) < teacher_forcing_ratio)

            # forward pass; the last two dims of the output are swapped before the loss
            outputs = rnn(src_seqs, tgt_seqs, src_seq_lengths, teacher_forcing)
            loss = criterion(outputs.permute(0, 2, 1).to(device), tgt_seqs)

            # accumulate the loss metric
            total_loss += loss.item() * batch_size
            total_data += batch_size

            # backward pass and parameter update
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        print("epoch: {0} training loss: {1:.3f}".format(epoch, total_loss/total_data))

In [9]:
# persist the freshly trained RNN weights (state dict only); nothing is written
# when skip_training is set, because the existing file is loaded instead
if not skip_training:
    torch.save(rnn.state_dict(), rnn_model_save_path)

In [10]:
# When training was skipped, restore the RNN weights from the saved checkpoint.
if skip_training:
    # map_location keeps every loaded tensor on CPU storage; the model itself is
    # moved to the selected device right after
    rnn.load_state_dict(torch.load(rnn_model_save_path, map_location=lambda storage, loc: storage))
    print('RNN model loaded from: {}'.format(rnn_model_save_path))
    rnn.to(device)

# Switch to inference mode on BOTH paths: previously eval() ran only after
# loading, so a freshly trained model was evaluated while still in training mode.
# The trailing ';' keeps the notebook from echoing the module repr.
rnn.eval();

RNN model loaded from: models/rnn.pth


In [11]:
# helper object used in the cells below for translating, converting sequences
# to strings and computing BLEU scores with the RNN model
rnntools = RNNTools(device)

In [12]:
print('Translate test data:')
print('-----------------------------')
# translate the first batch of the test loader
pad_src_seqs, src_seq_lengths, pad_tgt_seqs = next(iter(testloader))
out_seqs = rnntools.translate(rnn, pad_src_seqs, src_seq_lengths)

# Draw the examples from the sequences actually present in this batch (dim 1)
# instead of a hard-coded 64, and cap the sample size, so changing the loader's
# batch size can neither index out of range nor make random.sample over-draw.
n_in_batch = pad_src_seqs.shape[1]
for i in random.sample(range(n_in_batch), min(20, n_in_batch)):
    print('SRC:', rnntools.seq_to_string(pad_src_seqs[:,i], testset.input_lang))
    print('TGT:', rnntools.seq_to_string(pad_tgt_seqs[:,i], testset.output_lang))
    print('OUT:', rnntools.seq_to_string(out_seqs[:,i], testset.output_lang))
    print('')

Translate test data:
-----------------------------
SRC: nous sommes des gagneurs .
TGT: we re winners .
OUT: we re classmates .

SRC: je suis juste une fille normale .
TGT: i m just an average girl .
OUT: i m just a bad girl .

SRC: je suis deja amoureux de tom .
TGT: i m already in love with tom .
OUT: i m much tom s mobile phone .

SRC: elle est plus vieille que lui .
TGT: she s older than him .
OUT: she s taller than him .

SRC: vous faites aller ca trop loin .
TGT: you re carrying this too far .
OUT: you re carrying this too far .

SRC: nous ne sommes pas coupables .
TGT: we re not guilty .
OUT: we re not dressed .

SRC: vous n y etes pas bonnes .
TGT: you re not good at this .
OUT: you re not good at this .

SRC: nous sommes en train de mourir .
TGT: we re dying .
OUT: we re dying .

SRC: il est tout sauf mort .
TGT: he is all but dead .
OUT: he is not tall .

SRC: nous sommes debout .
TGT: we re standing .
OUT: we re ruined .

SRC: je suis impatiente de te voir danser .
TGT: i m 

In [13]:
# both splits are scored against the training set's output language
bleu_lang = trainset.output_lang

# training split
score = rnntools.compute_bleu_score(rnn, trainloader, bleu_lang)
print(f'BLEU score on training data: {score*100}')

# test split
score = rnntools.compute_bleu_score(rnn, testloader, bleu_lang)
print(f'BLEU score on test data: {score*100}')

BLEU score on training data: 96.69817090034485
BLEU score on test data: 47.73730933666229


## Transformers

In [14]:
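# optional override for the Transformers section: uncomment the next line to
# load the saved Transformers model file instead of training a new one
# skip_training = True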

In [15]:
# Batches for the Transformers model, assembled by TransformersTools.collate
# (these rebind the loader names used in the RNN section).
# Pinned (page-locked) host memory only helps host-to-GPU copies, so it is
# requested only when the selected device is CUDA instead of unconditionally
# (the device is forced to CPU whenever skip_training is set).
trainloader = DataLoader(
    dataset=trainset,
    batch_size=64,
    shuffle=True,
    collate_fn=TransformersTools.collate,
    pin_memory=(device.type == 'cuda'),
)
testloader = DataLoader(dataset=testset, batch_size=64, shuffle=False, collate_fn=TransformersTools.collate)

In [16]:
# vocabulary sizes are taken from the training split's language objects
src_vocab_size = trainset.input_lang.n_words
tgt_vocab_size = trainset.output_lang.n_words

tra = Transformers(src_vocab_size, tgt_vocab_size, n_blocks=3, n_features=256, n_heads=16, n_hidden=1024)
# last expression of the cell: moves the model to the device and shows its structure
tra.to(device)

Transformers(
  (encoder): Encoder(
    (embedding): Embedding(4489, 256, padding_idx=0)
    (positional_encoding): PositionalEncoding(
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder_blocks): ModuleList(
      (0-2): 3 x EncoderBlock(
        (self_attention): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True)
        )
        (dropout1): Dropout(p=0.1, inplace=False)
        (layer_norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (0): Linear(in_features=256, out_features=1024, bias=True)
          (1): Dropout(p=0.1, inplace=False)
          (2): ReLU()
          (3): Linear(in_features=1024, out_features=256, bias=True)
        )
        (dropout2): Dropout(p=0.1, inplace=False)
        (layer_norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
      )
    )
  )
  (decoder): Decoder(
    (embedding): Embedding(2925, 256, padding_idx=0)
 

In [17]:
if not skip_training:
    PADDING_VALUE = 0
    num_epochs = 2

    optimizer = torch.optim.Adam(tra.parameters(), lr=0.001)
    # target positions equal to PADDING_VALUE do not contribute to the loss
    criterion = nn.NLLLoss(ignore_index=PADDING_VALUE)

    # Put the model in training mode explicitly, as the RNN loop does. The model
    # contains Dropout layers, so re-running this cell after tra.eval() would
    # otherwise silently train with dropout disabled.
    tra.train()

    for epoch in range(num_epochs):
        # running sums for a batch-size-weighted mean of the per-batch losses
        total_loss = 0
        total_data = 0
        for src_seqs, src_mask, tgt_seqs in trainloader:
            src_seqs, src_mask, tgt_seqs = src_seqs.to(device), src_mask.to(device), tgt_seqs.to(device)

            # forward
            outputs = tra(src_seqs, tgt_seqs, src_mask)

            # compute loss metric; the targets are the sequences without their first position
            loss = criterion(outputs.permute(0, 2, 1).to(device), tgt_seqs[1:])
            total_loss += (loss.item() * src_seqs.shape[1])
            total_data += src_seqs.shape[1]

            # backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        print("epoch: {0} training loss: {1:.3f}".format(epoch, total_loss/total_data))


In [18]:
# persist the freshly trained Transformers weights (state dict only); nothing is
# written when skip_training is set, because the existing file is loaded instead
if not skip_training:
    torch.save(tra.state_dict(), tra_model_save_path)

In [19]:
# When training was skipped, restore the Transformers weights from the saved checkpoint.
if skip_training:
    # map_location keeps every loaded tensor on CPU storage; the model itself is
    # moved to the selected device right after
    tra.load_state_dict(torch.load(tra_model_save_path, map_location=lambda storage, loc: storage))
    print('Transformers model loaded from: {}'.format(tra_model_save_path))
    tra.to(device)

# Switch to inference mode on BOTH paths: previously eval() ran only after
# loading, so a freshly trained model was translated and BLEU-scored with its
# Dropout layers still active.
# The trailing ';' keeps the notebook from echoing the module repr.
tra.eval();

Transformers model loaded from: models/transformers.pth


In [20]:
# helper object used in the cells below for translating, converting sequences
# to strings and computing BLEU scores with the Transformers model
tratools = TransformersTools(device)

In [26]:
print('Translate test data:')
print('-----------------------------')
# translate the first batch of the test loader
src_seqs, src_mask, tgt_seqs = next(iter(testloader))
out_seqs = tratools.translate(tra, src_seqs, src_mask)

# Draw the examples from the sequences actually present in this batch (dim 1)
# instead of a hard-coded 64, and cap the sample size, so changing the loader's
# batch size can neither index out of range nor make random.sample over-draw.
n_in_batch = src_seqs.shape[1]
for i in random.sample(range(n_in_batch), min(20, n_in_batch)):
    print('SRC:', tratools.seq_to_string(src_seqs[:,i], testset.input_lang))
    # the first target position is dropped before printing
    print('TGT:', tratools.seq_to_string(tgt_seqs[1:,i], testset.output_lang))
    print('OUT:', tratools.seq_to_string(out_seqs[:,i], testset.output_lang))
    print('')

Translate test data:
-----------------------------
SRC: elle rassemble du materiel pour un livre .
TGT: she s collecting material for a book .
OUT: she is collecting material for a book .

SRC: je suis ravi que vous ayez souleve ca .
TGT: i m glad you brought that up .
OUT: i m glad you brought that up .

SRC: nous sommes des gagneurs .
TGT: we re winners .
OUT: we re survivors .

SRC: je suis impatiente de te voir danser .
TGT: i m looking forward to seeing you dance .
OUT: i m looking forward to seeing you dance .

SRC: c est une tres belle fille .
TGT: she s a really nice girl .
OUT: she s a beauty .

SRC: je ne suis pas un toxico .
TGT: i m not a drug addict .
OUT: i m not a drug .

SRC: je suis un artiste .
TGT: i am an artist .
OUT: i m an artist .

SRC: je ne suis pas facilement offense .
TGT: i m not easily offended .
OUT: i m not easily offended .

SRC: elle est aimee de ses amies .
TGT: she s loved by her friends .
OUT: she is deaf friends with her friends .

SRC: je suis le 

In [22]:
# both splits are scored against the training set's output language
bleu_lang = trainset.output_lang

# training split
score = tratools.compute_bleu_score(tra, trainloader, bleu_lang)
print(f'BLEU score on training data: {score*100}')

# test split
score = tratools.compute_bleu_score(tra, testloader, bleu_lang)
print(f'BLEU score on test data: {score*100}')

BLEU score on training data: 95.7484686427289
BLEU score on test data: 58.79185315508608
