# Transformer complete code for English  to French translation

In [None]:
#@title Fetching English-French Dataset
!mkdir data
!wget -O data/train.en.txt https://raw.githubusercontent.com/multi30k/dataset/master/data/task1/tok/train.lc.norm.tok.en
!wget -O data/train.fr.txt https://raw.githubusercontent.com/multi30k/dataset/master/data/task1/tok/train.lc.norm.tok.fr
!wget -O data/val.en.txt https://raw.githubusercontent.com/multi30k/dataset/master/data/task1/tok/val.lc.norm.tok.en
!wget -O data/val.fr.txt https://raw.githubusercontent.com/multi30k/dataset/master/data/task1/tok/val.lc.norm.tok.fr
!wget -O data/test.en.txt https://raw.githubusercontent.com/multi30k/dataset/master/data/task1/tok/test_2017_mscoco.lc.norm.tok.en
!wget -O data/test.fr.txt https://raw.githubusercontent.com/multi30k/dataset/master/data/task1/tok/test_2017_mscoco.lc.norm.tok.fr

In [2]:
#@title Library imports
!pip install portalocker --quiet
!pip install sacrebleu --quiet
!pip install sentencepiece --quiet

from torchtext.datasets import Multi30k
import sentencepiece as spm
import numpy as np
from torch.utils.data import Dataset
import torch
import torch.nn as nn
import sacrebleu
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.3/106.3 kB[0m [31m628.8 kB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m16.2 MB/s[0m eta [36m0:00:00[0m
[?25h

**The class represents a multi-headed attention mechanism used in the Transformer model for capturing dependencies between different parts of the input sequence.**

In [3]:
class MultiHeadedAttention(nn.Module):
    def __init__(self, embed_size, heads):
        super(MultiHeadedAttention, self).__init__()
        self.embed_size = embed_size
        self.heads = heads
        self.head_dim = embed_size // heads
        self.values = nn.Linear(embed_size, embed_size)
        self.keys = nn.Linear(embed_size, embed_size)
        self.queries = nn.Linear(embed_size, embed_size)
        self.fc = nn.Linear(embed_size, embed_size)

    def forward(self, query, keys, values, mask):
        # Get number of training examples
        N = query.shape[0]
        value_len, key_len, query_len = values.shape[1], keys.shape[1], query.shape[1]

        # Linear transformations for values, keys, and queries
        values = self.values(values)  # (N, value_len, embed_size)
        keys = self.keys(keys)  # (N, key_len, embed_size)
        queries = self.queries(query)  # (N, query_len, embed_size)

        # Split the embedding into different heads pieces
        values = values.reshape(N, value_len, self.heads, self.head_dim) # (N, value_len, heads, embed_size)
        keys = keys.reshape(N, key_len, self.heads, self.head_dim) # (N, key_len, heads, embed_size)
        queries = queries.reshape(N, query_len, self.heads, self.head_dim) # (N, query_len, heads, embed_size)

        #N*H,Q,D
        queries_reshaped = queries.permute(0, 2, 1, 3).contiguous().view(-1, queries.size(1), queries.size(-1))
        #N*H,K,D
        keys_reshaped = keys.permute(0, 2, 1, 3).contiguous().view(-1, keys.size(1), keys.size(-1))

        # Compute energy (query-key interaction)
        #N*H,Q,K
        energy = torch.matmul(queries_reshaped, keys_reshaped.transpose(1, 2))
        #N,H,Q,K
        energy = energy.view(queries.size(0), queries.size(2), queries.size(1), keys.size(1))
        if mask is not None:
            energy = energy.masked_fill(mask == False, float("-inf"))

        # Apply softmax to obtain attention weights
        attention = torch.softmax(energy / (self.embed_size ** (1 / 2)), dim=3)

        #N,H,V,D
        values_reshaped = values.permute(0, 2, 1, 3).contiguous()

        # Compute the weighted sum using attention weights
        #N,Q,H,D
        out = torch.matmul(attention, values_reshaped).permute(0,2,1,3).contiguous().reshape(N, query_len,self.heads * self.head_dim)

        #N,Q,embed_size
        out = self.fc(out)

        return out

**This TransformerBlock represents a single block within the Transformer model, containing an attention mechanism, residual network, normalization, a feedforward network, and dropout for regularization.**

In [4]:
class TransformerBlock(nn.Module):
    def __init__(self, embed_size, heads, dropout, expansion):
        super(TransformerBlock, self).__init__()
        self.attention = MultiHeadedAttention(embed_size, heads)
        self.norm1 = nn.LayerNorm(embed_size)
        self.norm2 = nn.LayerNorm(embed_size)
        # Feedforward network with expansion for introducing non-linearity
        self.feed_forward = nn.Sequential(
            nn.Linear(embed_size, expansion * embed_size),
            nn.ReLU(),
            nn.Linear(expansion * embed_size, embed_size),
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, query, key, value, mask):
        # Attention mechanism using MultiHeadedAttention
        attention = self.attention(query, key, value, mask)
        # Residual connection and normalization for the first stage
        x = self.dropout(self.norm1(attention + query))
        # Feedforward network
        forward = self.feed_forward(x)
        # Residual connection and normalization for the second stage
        out = self.dropout(self.norm2(forward + x))
        return out

**This Encoder class represents the encoder component of a Transformer model, incorporating word embeddings, positional embeddings, and multiple layers of Transformer blocks for encoding the input sequence.**

In [5]:
class Encoder(nn.Module):
    def __init__( self, src_vocab_size, embed_size, num_layers,
                 heads, forward_expansion, dropout, max_length):

        super(Encoder, self).__init__()
        self.embed_size = embed_size
        self.word_embedding = nn.Embedding(src_vocab_size, embed_size)
        self.position_embedding = nn.Embedding(max_length, embed_size)

        # Stacked Transformer blocks for encoding the input sequence
        self.layers = nn.ModuleList([
                TransformerBlock(embed_size, heads, dropout, forward_expansion)
                for _ in range(num_layers)])

        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):

        N, seq_length = x.shape
        # Generate positional indices for the input sequence
        positions = torch.arange(0, seq_length).expand(N, seq_length).to(DEVICE)
         # Apply dropout to the sum of word and positional embeddings
        x = self.dropout(
            (self.word_embedding(x) + self.position_embedding(positions)))
        # Pass the input through stacked Transformer blocks
        for layer in self.layers:
            x = layer(x, x, x, mask)

        return x

**The DecoderBlock represents a block within the decoder, and the Decoder class represents the decoder component of a Transformer model.**

In [6]:
class DecoderBlock(nn.Module):
    def __init__(self, embed_size, heads, forward_expansion, dropout):
        super(DecoderBlock, self).__init__()
        self.norm = nn.LayerNorm(embed_size)
        self.attention = MultiHeadedAttention(embed_size, heads=heads)
        self.transformer_block = TransformerBlock(
            embed_size, heads, dropout, forward_expansion
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, query, value, key, src_mask, trg_mask):
        # Self-attention mechanism for decoding
        attention = self.attention(query, query, query, trg_mask)
        # Residual connection and normalization for the self-attention output
        query = self.dropout(self.norm(attention + query))
        # Pass through the transformer block for further processing
        out = self.transformer_block(query, key, value, src_mask)

        return out
class Decoder(nn.Module):
    def __init__(self, trg_vocab_size, embed_size, num_layers,
        heads, forward_expansion, dropout, max_length):

        super(Decoder, self).__init__()
        self.word_embedding = nn.Embedding(trg_vocab_size, embed_size)
        self.position_embedding = nn.Embedding(max_length, embed_size)
        # Stacked DecoderBlocks for decoding the target sequence
        self.layers = nn.ModuleList(
            [DecoderBlock(embed_size, heads, forward_expansion, dropout)
                for _ in range(num_layers)])

        self.fc_out = nn.Linear(embed_size, trg_vocab_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_out, src_mask, trg_mask):

        N, seq_length = x.shape
        # Generate positional indices for the input sequenc
        positions = torch.arange(0, seq_length).expand(N, seq_length).to(DEVICE)
        x = self.dropout((self.word_embedding(x) + self.position_embedding(positions)))
        # Pass through stacked DecoderBlocks for decoding
        for layer in self.layers:
            x = layer(x, enc_out, enc_out, src_mask, trg_mask)
        # Output through fully connected layer
        out = self.fc_out(x)
        return out

**The Transformer class represents the overall Transformer model, and the ModelBuilder class is a utility class for building models with specific configurations.**

In [7]:
class Transformer(nn.Module):
    def __init__( self, src_vocab_size, trg_vocab_size, src_pad_idx, trg_pad_idx,
                 embed_size=512, num_layers=6, forward_expansion=4, heads=8, dropout=0.1, max_length=100):

        super(Transformer, self).__init__()
        # Instantiate the Encoder and Decoder
        self.encoder = Encoder( src_vocab_size, embed_size, num_layers, heads,
            forward_expansion, dropout, max_length)

        self.decoder = Decoder( trg_vocab_size, embed_size, num_layers, heads,
            forward_expansion, dropout, max_length)

        self.src_pad_idx = src_pad_idx
        self.trg_pad_idx = trg_pad_idx

    def make_src_mask(self, src):
        # Create a mask to ignore padding tokens in the source sequence
        src_mask = (src != self.src_pad_idx).unsqueeze(1).unsqueeze(2)
        return src_mask.to(DEVICE)

    def make_trg_mask(self, trg):
        # Create a mask to ignore padding tokens and future tokens in the target sequence
        trg_mask1 = (trg != self.trg_pad_idx).unsqueeze(1).unsqueeze(2)
        N, trg_len = trg.shape
        trg_mask2 = torch.tril(torch.ones((trg_len, trg_len), dtype=bool)
        ).expand(N, 1, trg_len,trg_len).to(DEVICE)
        trg_mask = trg_mask1 & trg_mask2

        return trg_mask


    def forward(self, src, trg):
        # Generate source and target masks
        src_mask = self.make_src_mask(src)
        trg_mask = self.make_trg_mask(trg)
        # Pass the source sequence through the encoder
        enc_src = self.encoder(src, src_mask)
        # Pass the target sequence and encoded source sequence through the decoder
        out = self.decoder(trg, enc_src, src_mask, trg_mask)
        return out

class ModelBuilder:
    def __init__(self, model_config):
        self.model_config = model_config
    def make_model(self):
        model = Transformer(**self.model_config).to(DEVICE)
        # Initialize model parameters using Xavier uniform initialization
        for p in model.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)
        return model

**The following code blocks contain code for Multi30k English-French dataset preparation and translation training using the above model architecture.**

In [8]:
SRC = 'en'
TRG = 'fr'
en_vocab_size = 6261
fr_vocab_size = 6261
vocab_sizes = {"en": en_vocab_size, "fr": fr_vocab_size}

In [9]:
spm.SentencePieceTrainer.train\
(f'--input=data//train.en.txt --model_prefix=Multi30k_en --user_defined_symbols=<pad> --vocab_size=6261')
spm.SentencePieceTrainer.train\
(f'--input=data//train.fr.txt --model_prefix=Multi30k_fr --user_defined_symbols=<pad> --vocab_size=6261')

en_sp = spm.SentencePieceProcessor()
en_sp.load('Multi30k_en.model')

fr_sp = spm.SentencePieceProcessor()
fr_sp.load('Multi30k_fr.model')

tokenizers = {"en": en_sp.encode_as_ids, "fr": fr_sp.encode_as_ids}
detokenizers = {"en":en_sp.decode_ids, "fr":fr_sp.decode_ids}

In [10]:
# indexes of special symbols
UNK, BOS, EOS, PAD = 0, 1, 2, 3

In [11]:
train_iter_en = []
with open('data//train.en.txt', encoding='utf-8') as ip_file :
  train_iter_en = ip_file.readlines()
train_iter_fr = []
with open('data//train.fr.txt', encoding='utf-8') as ip_file :
  train_iter_fr = ip_file.readlines()

valid_iter_en = []
with open('data//val.en.txt', encoding='utf-8') as ip_file :
  valid_iter_en = ip_file.readlines()
valid_iter_fr = []
with open('data//val.fr.txt', encoding='utf-8') as ip_file :
  valid_iter_fr = ip_file.readlines()

test_iter_en = []
with open('data//test.en.txt', encoding='utf-8') as ip_file :
  test_iter_en = ip_file.readlines()
test_iter_fr = []
with open('data//test.fr.txt', encoding='utf-8') as ip_file :
  test_iter_fr = ip_file.readlines()

train_set = list(zip(train_iter_en, train_iter_fr))
valid_set = list(zip(valid_iter_en, valid_iter_fr))
test_set = list(zip(test_iter_en, test_iter_fr))

train_set = [(x.rstrip('\n'), y.rstrip('\n')) for x, y in train_set if x!='']
valid_set = [(x.rstrip('\n'), y.rstrip('\n')) for x, y in valid_set if x!='']
test_set = [(x.rstrip('\n'), y.rstrip('\n')) for x, y in test_set if x!='']
print(len(train_set), len(valid_set), len(test_set))

29000 1014 461


In [12]:
MAX_SEQ_LEN = 50
def tokenize_dataset(dataset):
    return [(torch.tensor([BOS]+tokenizers[SRC](src_text)[0:MAX_SEQ_LEN-2]+[EOS]),
             torch.tensor([BOS]+tokenizers[TRG](trg_text)[0:MAX_SEQ_LEN-2]+[EOS]))
            for src_text, trg_text in dataset]

train_tokenized = tokenize_dataset(train_set)
valid_tokenized = tokenize_dataset(valid_set)
test_tokenized  = tokenize_dataset(test_set)

In [13]:
class TranslationDataset(Dataset):
    def __init__(self, data):
        self.data = data
    def __len__(self):
        return len(self.data)
    def __getitem__(self, idx):
        return self.data[idx]


def pad_sequence(batch):
    src_seqs  = [src for src, trg in batch]
    trg_seqs  = [trg for src, trg in batch]
    src_padded = torch.nn.utils.rnn.pad_sequence(src_seqs,
                                batch_first=True, padding_value = PAD)
    trg_padded = torch.nn.utils.rnn.pad_sequence(trg_seqs,
                                batch_first=True, padding_value = PAD)
    return src_padded, trg_padded

In [14]:
class Dataloaders:
    def __init__(self):
        self.train_dataset = TranslationDataset(train_tokenized)
        self.valid_dataset = TranslationDataset(valid_tokenized)
        self.test_dataset  = TranslationDataset(test_tokenized)

        self.train_loader = torch.utils.data.DataLoader(self.train_dataset, batch_size=BATCH_SIZE,
                                                shuffle=True, collate_fn = pad_sequence)


        self.valid_loader = torch.utils.data.DataLoader(self.valid_dataset, batch_size=BATCH_SIZE,
                                                shuffle=True, collate_fn=pad_sequence)

        self.test_loader = torch.utils.data.DataLoader(self.test_dataset, batch_size=BATCH_SIZE,
                                                shuffle=True, collate_fn=pad_sequence)


**Training and Evaluation functions**

In [15]:
def prepare_batch(x, y):
    src = x.to(DEVICE)
    tgt = y[:, :-1].to(DEVICE)
    tgt_y = y[:, 1:].contiguous().view(-1).to(DEVICE)
    return src, tgt, tgt_y

In [16]:
def train_epoch(model, dataloaders):
    model.train()
    losses= []
    grad_norm_clip = 1.0
    for x, y  in  dataloaders.train_loader:
        optimizer.zero_grad()
        src, trg_in, trg_out = prepare_batch(x,y)
        out = model.forward(src, trg_in).to(DEVICE)
        out = out.contiguous().view(out.shape[0]*out.shape[1],  -1)
        loss = loss_fn(out, trg_out).to(DEVICE)
        loss.backward()
        optimizer.step()
        scheduler.step()
        losses.append(loss.item())
    return np.mean(losses)

def validate(model, dataloaders):
    model.eval()
    losses = []
    with torch.no_grad():
        for x, y in dataloaders.valid_loader:
            src, trg_in, trg_out = prepare_batch(x,y)
            out = model.forward(src, trg_in).to(DEVICE)
            out = out.contiguous().view(out.shape[0]*out.shape[1],  -1)
            loss = loss_fn(out, trg_out).to(DEVICE)
            losses.append(loss.item())
    return np.mean(losses)

def train(model, dataloaders, epochs, early_stop_count, warmup_steps):
    best_valid_loss = float('inf')
    train_size = len(dataloaders.train_loader)*BATCH_SIZE
    for ep in range(epochs):
        train_loss = train_epoch(model, dataloaders)
        valid_loss = validate(model, dataloaders)

        print(f'Epoch: {ep}: train_loss={train_loss:.5f}, valid_loss={valid_loss:.5f}')
        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            EARLY_STOP = 2
        else:
            if scheduler.last_epoch>2*warmup_steps:
                EARLY_STOP -= 1
                if EARLY_STOP<=0:
                    return train_loss, valid_loss
    return train_loss, valid_loss

In [17]:
BATCH_SIZE = 128
data_loaders = Dataloaders()
MAX_SEQ_LEN = 50

EPOCS = 10
EARLY_STOP = 1
config = {'src_pad_idx':PAD, 'trg_pad_idx':PAD, 'src_vocab_size':vocab_sizes[SRC],
          'trg_vocab_size':vocab_sizes[TRG], 'embed_size':512,'num_layers':2,
          'forward_expansion':2,'heads':8,'dropout':0.1,'max_length':MAX_SEQ_LEN}
model = ModelBuilder(config).make_model()

warmup_steps = 3*len(data_loaders.train_loader)
lr_fn = lambda step: 512**(-0.5) * min([(step+1)**(-0.5), (step+1)*warmup_steps**(-1.5)])
optimizer = torch.optim.Adam(model.parameters(), lr=0.5, betas=(0.9, 0.98), eps=1e-9)
scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lr_fn)
loss_fn = nn.CrossEntropyLoss(ignore_index=PAD)

train_loss, valid_loss = train(model, data_loaders, EPOCS, EARLY_STOP, warmup_steps)

Epoch: 0: train_loss=5.55131, valid_loss=3.53056
Epoch: 1: train_loss=2.85581, valid_loss=2.12786
Epoch: 2: train_loss=1.84443, valid_loss=1.54125
Epoch: 3: train_loss=1.35291, valid_loss=1.27453
Epoch: 4: train_loss=1.06789, valid_loss=1.14850
Epoch: 5: train_loss=0.89309, valid_loss=1.08078
Epoch: 6: train_loss=0.77116, valid_loss=1.05666
Epoch: 7: train_loss=0.67675, valid_loss=1.04649
Epoch: 8: train_loss=0.59974, valid_loss=1.02408
Epoch: 9: train_loss=0.53824, valid_loss=1.01756


In [18]:
def remove_pad(sent):
    if sent.count(EOS)>0:
      sent = sent[0:sent.index(EOS)+1]
    while sent and sent[-1] == PAD:
            sent = sent[:-1]
    return sent

def decode_sentence(detokenizer, sentence_ids):
    if not isinstance(sentence_ids, list):
        sentence_ids = sentence_ids.tolist()
    sentence_ids = remove_pad(sentence_ids)
    return detokenizer(sentence_ids).replace("<bos>", "")\
           .replace("<eos>", "").strip().replace(" .", ".")

def translate(model, x):
    with torch.no_grad():
        dB = x.size(0)
        y = torch.tensor([[BOS]*dB]).view(dB, 1).to(DEVICE)
        x_mask = (x != PAD).unsqueeze(1).unsqueeze(2)
        enc_op = model.encoder(x, x_mask)
        for i in range(MAX_SEQ_LEN):
            trg_mask1 = (y != PAD).unsqueeze(1).unsqueeze(2)
            N, trg_len = y.shape
            trg_mask2 = torch.tril(torch.ones((trg_len, trg_len), dtype=bool)).expand(
                N, 1, trg_len,trg_len
            ).to(DEVICE)
            y_mask = trg_mask1 & trg_mask2
            out = model.decoder(y, enc_op, x_mask, y_mask)
            out = torch.softmax(out, dim=-1 )
            logits = out
            last_output = logits.argmax(-1)[:, -1]
            last_output = last_output.view(dB, 1)
            y = torch.cat((y, last_output), 1).to(DEVICE)
    return y

def evaluate(model, dataloader, num_batch=None):
    model.eval()
    refs, cans, bleus = [], [], []
    with torch.no_grad():
        for idx, (x, y) in enumerate(dataloader):
            src, trg_in, trg_out = prepare_batch(x,y)
            out = model.forward(src, trg_in).to(DEVICE)
            out = out.contiguous().view(out.shape[0]*out.shape[1],  -1)

            # src, trg_in, trg_out, src_pad_mask, trg_pad_mask = make_batch_input(x,y)
            translation = translate(model, src)
            trg_out = trg_out.view(x.size(0), -1)
            refs = refs + [decode_sentence(detokenizers[TRG], trg_out[i]) for i in range(len(src))]
            cans = cans + [decode_sentence(detokenizers[TRG], translation[i]) for i in range(len(src))]
            if num_batch and idx>=num_batch:
                break
        print(min([len(x) for x in refs]))
        bleus.append(sacrebleu.corpus_bleu(cans, [refs]).score)
        # print some examples
        for i in range(3):
            print(f'src:  {decode_sentence(detokenizers[SRC], src[i])}')
            print(f'trg:  {decode_sentence(detokenizers[TRG], trg_out[i])}')
            print(f'pred: {decode_sentence(detokenizers[TRG], translation[i])}')
            print('*'*50)
        return np.mean(bleus)

In [19]:
valid_bleu  = evaluate(model, data_loaders.valid_loader)
test_bleu  = evaluate(model, data_loaders.test_loader)
print(f'valid_bleu: {valid_bleu:.4f}, test_bleu: {test_bleu:.4f}')

25
src:  a man in a brightly-colored ski jacket stands with others on a european street.
trg:  un homme avec une veste de ski de couleur vive est debout parmi d&apos; autres dans une rue d&apos; europe.
pred: un homme en veste de couleur vive est debout avec d&apos; autres personnes en europe.
**************************************************
src:  street scene of blond woman in gold coat and pink mini-skirt in front of a rear-facing police motorcycle.
trg:  une femme blonde dans une rue , avec un manteau doré et une mini-jupe rose devant une moto de police vue de dos.
pred: une scène blonde en manteaux en manteaux doré et rouge rayé devant une moto , faisant de la moto.
**************************************************
src:  a woman in a restaurant is drinking out of a coconut , using a straw.
trg:  une femme dans un restaurant , est en train de boire une noix de coco , à l&apos; aide d&apos; une paille.
pred: une femme dans un restaurant boit une noix de coco , en utilisant une pai

In [20]:
def translate_this_sentence(text: str):
    input = torch.tensor([[BOS] + tokenizers[SRC](text) + [EOS]]).to(DEVICE)
    output = translate(model, input)
    return decode_sentence(detokenizers[TRG], output[0])

translate_this_sentence("a man is opening a present and posing with it for a picture")

'un homme ouvre un cadeau et pose avec celui-ci pour une photo'

# Thank You🦊