In [2]:
# Fetch the course repository; later cells install its requirements.txt and
# change into its coursedocs/ directory.
!git clone https://github.com/anilbhatt1/ERA1_S16_transformers_speedup.git

Cloning into 'ERA1_S16_transformers_speedup'...
remote: Enumerating objects: 19, done.[K
remote: Counting objects: 100% (19/19), done.[K
remote: Compressing objects: 100% (12/12), done.[K
remote: Total 19 (delta 5), reused 19 (delta 5), pack-reused 0[K
Receiving objects: 100% (19/19), 9.75 KiB | 9.75 MiB/s, done.
Resolving deltas: 100% (5/5), done.


In [3]:
import os
os.chdir('/content/ERA1_S16_transformers_speedup/')
!pip install -r requirements.txt -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m22.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.6/519.6 kB[0m [31m50.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m83.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m24.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m81.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m63.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━

In [4]:
# Switch into the directory holding the course sources and list its contents
# (relies on `os` imported by an earlier cell).
os.chdir('/content/ERA1_S16_transformers_speedup/coursedocs')
!ls

config.py  dataset.py  model.py  train.ipynb  train_ocp.ipynb  train.py


In [None]:
# Colab-only: mount Google Drive under /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [5]:
from pathlib import Path

import torch
import torch.nn as nn
from torch.utils.data import Dataset
import os

import math

from dataset import BilingualDataset, causal_mask
import torchtext.datasets as datasets
from torch.utils.data import Dataset, DataLoader, random_split
import warnings
from tqdm import tqdm
from datasets import load_dataset
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer
from tokenizers.pre_tokenizers import Whitespace
import torchmetrics
from torch.utils.tensorboard import SummaryWriter
from torch.nn.utils.rnn import pad_sequence

# NOTE: autocast is a context manager; merely instantiating it (as this cell used
# to do) has no effect. Mixed precision is enabled with `with torch.autocast(...)`
# around the forward pass inside train_model.
# Report whether this torch build exposes torch.nn.functional.scaled_dot_product_attention.
print(hasattr(torch.nn.functional, 'scaled_dot_product_attention'))

True


### config.py

In [6]:
def get_config():
    """Return the default hyper-parameters and file-naming settings.

    A fresh dictionary is built on every call, so callers can override
    entries (e.g. ``config['num_epochs'] = 1``) without affecting each other.

    Returns:
        dict: Training configuration keyed by option name.
    """
    batch_size = 2048 * 2  #s16
    return {
        # ``get_ds`` reads ``config['batch_size']``. The hyphenated key is the
        # original (misspelled) name and is kept so existing lookups still work.
        "batch_size": batch_size,
        "batch-size": batch_size,
        "num_epochs": 20,
        "lr": 10**-3,   #s16
        "seq_len": 160, #s16
        "d_model": 512,
        "lang_src": "en",
        "lang_tgt": "fr", #s16
        "model_folder": "weights",
        "model_basename": "tmodel_",
        # Falsy means "start from scratch"; otherwise train_model passes the value
        # to get_weights_file_path to locate the checkpoint to load.
        "preload": False,   #s16
        # Formatted with the language code to get one tokenizer file per language.
        "tokenizer_file": "tokenizer_{0}.json",
        "experiment_name": "runs/tmodel"
    }

def get_weights_file_path(config, epoch:str):
    """Build the relative path of the checkpoint file for a given epoch tag.

    Args:
        config: Mapping providing ``model_folder`` and ``model_basename``.
        epoch: Tag appended to the basename (before the ``.pt`` suffix).

    Returns:
        str: ``./<model_folder>/<model_basename><epoch>.pt`` rendered via ``Path``.
    """
    folder = config["model_folder"]
    filename = "{}{}.pt".format(config["model_basename"], epoch)
    return str(Path('.').joinpath(folder, filename))

### dataset.py

In [87]:
class BilingualDataset(Dataset):
    """Wrap translation pairs as fixed-length, padded tensors for a seq2seq model.

    Every item of ``ds`` is indexed as ``item['translation'][lang]``. Both
    sentences are tokenized, framed with the special tokens and padded to
    ``seq_len`` positions.
    """

    def __init__(self, ds, tokenizer_src, tokenizer_tgt, src_lang, tgt_lang, seq_len):
        super().__init__()
        self.seq_len = seq_len

        self.ds = ds
        self.tokenizer_src = tokenizer_src
        self.tokenizer_tgt = tokenizer_tgt
        self.src_lang = src_lang
        self.tgt_lang = tgt_lang

        # Special-token ids are looked up in the target tokenizer and stored as
        # one-element int64 tensors so they can be concatenated directly.
        self.sos_token, self.eos_token, self.pad_token = (
            torch.tensor([tokenizer_tgt.token_to_id(name)], dtype=torch.int64)
            for name in ("[SOS]", "[EOS]", "[PAD]")
        )

    def __len__(self):
        """Number of sentence pairs in the wrapped dataset."""
        return len(self.ds)

    def __getitem__(self, index):
        """Return the padded tensors, masks and raw texts for pair ``index``.

        Raises:
            ValueError: If either sentence does not fit into ``seq_len`` once the
                special tokens are added.
        """
        pair = self.ds[index]
        src_text = pair['translation'][self.src_lang]
        tgt_text = pair['translation'][self.tgt_lang]

        # Transform the text into token ids
        enc_input_tokens = self.tokenizer_src.encode(src_text).ids
        dec_input_tokens = self.tokenizer_tgt.encode(tgt_text).ids

        # The encoder input carries both [SOS] and [EOS] (two extra slots); the
        # decoder input carries only [SOS] and the label only [EOS] (one extra slot).
        enc_num_padding_tokens = self.seq_len - len(enc_input_tokens) - 2
        dec_num_padding_tokens = self.seq_len - len(dec_input_tokens) - 1

        # A negative padding count means the sentence is too long for seq_len
        if min(enc_num_padding_tokens, dec_num_padding_tokens) < 0:
            diagnostics = (
                f'Issue index : {index}',
                f'Issue src_text : {src_text}',
                f'Issue tgt_text : {tgt_text}',
                f'Issue len(enc_input_tokens) : {len(enc_input_tokens)}',
                f'Issue len(dec_input_tokens) : {len(dec_input_tokens)}',
                f'Issue enc_num_padding_tokens : {enc_num_padding_tokens}',
                f'Issue dec_num_padding_tokens : {dec_num_padding_tokens}',
                f'self.seq_len : {self.seq_len}',
            )
            for line in diagnostics:
                print(line)
            raise ValueError("Sentence too long")

        src_ids = torch.tensor(enc_input_tokens, dtype=torch.int64)
        tgt_ids = torch.tensor(dec_input_tokens, dtype=torch.int64)
        enc_padding = self.pad_token.repeat(enc_num_padding_tokens)
        dec_padding = self.pad_token.repeat(dec_num_padding_tokens)

        # [SOS] tokens [EOS] pad...
        encoder_input = torch.cat([self.sos_token, src_ids, self.eos_token, enc_padding], dim=0)
        # [SOS] tokens pad...  ([EOS] is never fed to the decoder)
        decoder_input = torch.cat([self.sos_token, tgt_ids, dec_padding], dim=0)
        # tokens [EOS] pad...  (the decoder has to predict [EOS])
        label = torch.cat([tgt_ids, self.eos_token, dec_padding], dim=0)

        # Double check that all three tensors are exactly seq_len long
        for padded in (encoder_input, decoder_input, label):
            assert padded.size(0) == self.seq_len

        # (1, 1, seq_len): marks the non-padding encoder positions
        encoder_mask = (encoder_input != self.pad_token).unsqueeze(0).unsqueeze(0).int()
        # (1, seq_len) & (1, seq_len, seq_len): non-padding positions combined
        # with the causal mask
        decoder_mask = (decoder_input != self.pad_token).unsqueeze(0).int() & causal_mask(decoder_input.size(0))

        return {
            "encoder_input": encoder_input,  # (seq_len)
            "decoder_input": decoder_input,  # (seq_len)
            "encoder_mask": encoder_mask,
            "decoder_mask": decoder_mask,
            "label": label,  # (seq_len)
            "src_text": src_text,  #s16
            "tgt_text": tgt_text,
            "enc_token_len": len(enc_input_tokens),  #s16
            "dec_token_len": len(dec_input_tokens),  #s16
        }



### model.py

In [8]:
class LayerNormalization(nn.Module):
    """Normalize the last dimension, then apply a learnable scalar scale and shift."""

    def __init__(self, eps:float=10**-6) -> None:
        super().__init__()
        self.eps = eps  # added to the standard deviation to avoid division by zero
        self.alpha = nn.Parameter(torch.ones(1))   # multiplicative term
        self.bias = nn.Parameter(torch.zeros(1))   # additive term

    def forward(self, x):
        """Return ``alpha * (x - mean) / (std + eps) + bias`` over the last axis."""
        centered = x - x.mean(dim=-1, keepdim=True)
        spread = x.std(dim=-1, keepdim=True) + self.eps
        return self.alpha * centered / spread + self.bias

class FeedForwardBlock(nn.Module):
    """Two linear layers with a ReLU and dropout in between (d_model -> d_ff -> d_model)."""

    def __init__(self, d_model: int, d_ff: int, dropout: float) -> None:
        super().__init__()
        self.linear_1 = nn.Linear(d_model, d_ff)
        self.dropout = nn.Dropout(dropout)
        self.linear_2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        """Expand to ``d_ff``, apply ReLU and dropout, project back to ``d_model``."""
        hidden = torch.relu(self.linear_1(x))
        hidden = self.dropout(hidden)
        return self.linear_2(hidden)

class InputEmbeddings(nn.Module):
    """Token-id embedding lookup whose output is scaled by ``sqrt(d_model)``."""

    def __init__(self, d_model: int, vocab_size: int) -> None:
        super().__init__()
        self.d_model = d_model
        self.vocab_size = vocab_size
        self.embedding = nn.Embedding(vocab_size, d_model)

    def forward(self, x):
        """Embed the ids in ``x`` and multiply the vectors by ``sqrt(d_model)``."""
        scale = math.sqrt(self.d_model)
        vectors = self.embedding(x)
        return vectors * scale

class PositionalEncoding(nn.Module):
    """Add fixed sine/cosine position signals to the input and apply dropout."""

    def __init__(self, d_model: int, seq_len: int, dropout: float) -> None:
        super().__init__()
        self.d_model = d_model
        self.seq_len = seq_len
        self.dropout = nn.Dropout(dropout)

        # One row per position, one column per sin/cos frequency pair.
        positions = torch.arange(0, seq_len, dtype=torch.float).unsqueeze(1)  # (seq_len, 1)
        inv_freq = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = positions * inv_freq

        table = torch.zeros(seq_len, d_model)
        table[:, 0::2] = torch.sin(angles)  # even feature indices
        table[:, 1::2] = torch.cos(angles)  # odd feature indices

        # Stored as a buffer (not a parameter) with a leading batch axis: (1, seq_len, d_model)
        self.register_buffer('pe', table.unsqueeze(0))

    def forward(self, x):
        """Add the encodings for the first ``x.shape[1]`` positions, then apply dropout."""
        length = x.shape[1]
        encodings = self.pe[:, :length, :].requires_grad_(False)
        return self.dropout(x + encodings)

class ResidualConnection(nn.Module):
    """Pre-norm skip connection: ``x + dropout(sublayer(norm(x)))``."""

    def __init__(self, dropout: float) -> None:
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        self.norm = LayerNormalization()

    def forward(self, x, sublayer):
        """Normalize ``x``, run ``sublayer`` on it, apply dropout and add ``x`` back."""
        normalized = self.norm(x)
        branch = self.dropout(sublayer(normalized))
        return x + branch

class MultiHeadAttentionBlock(nn.Module):
    """Multi-head scaled dot-product attention with bias-free linear projections."""

    def __init__(self, d_model: int, h: int, dropout: float) -> None:
        super().__init__()
        self.d_model = d_model
        self.h = h
        assert d_model % h == 0, "d_model not divisible by h"

        self.d_k = d_model // h  # width of each head
        self.w_q = nn.Linear(d_model, d_model, bias=False)
        self.w_k = nn.Linear(d_model, d_model, bias=False)
        self.w_v = nn.Linear(d_model, d_model, bias=False)
        self.w_o = nn.Linear(d_model, d_model, bias=False)  # output projection
        self.dropout = nn.Dropout(dropout)

    @staticmethod
    def attention(query, key, value, mask, dropout: nn.Dropout):
        """Compute softmax(Q K^T / sqrt(d_k)) V.

        Positions where ``mask == 0`` are filled with a large negative value
        before the softmax. Returns ``(output, attention_weights)``.
        """
        head_dim = query.shape[-1]
        scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(head_dim)
        # -1e30 is only used for float32 scores; other dtypes get -1e4  #s16
        fill_value = -1e30 if scores.dtype == torch.float32 else -1e4
        if mask is not None:
            scores.masked_fill_(mask == 0, value=fill_value)  #s16
        weights = scores.softmax(dim=-1)
        if dropout is not None:
            weights = dropout(weights)
        return torch.matmul(weights, value), weights

    def _split_heads(self, projected):
        """Reshape (batch, seq, d_model) into (batch, h, seq, d_k)."""
        batch, length = projected.shape[0], projected.shape[1]
        return projected.view(batch, length, self.h, self.d_k).transpose(1, 2)

    def forward(self, q, k, v, mask):
        """Project q/k/v, attend per head, merge the heads and apply ``w_o``.

        The attention weights of the latest call are kept in ``self.attention_scores``.
        """
        query = self._split_heads(self.w_q(q))
        key = self._split_heads(self.w_k(k))
        value = self._split_heads(self.w_v(v))

        per_head, self.attention_scores = MultiHeadAttentionBlock.attention(query, key, value, mask, self.dropout)

        # (batch, h, seq, d_k) -> (batch, seq, h * d_k)
        batch = per_head.shape[0]
        merged = per_head.transpose(1, 2).contiguous().view(batch, -1, self.h * self.d_k)

        return self.w_o(merged)

class EncoderBlock(nn.Module):
    """One encoder layer: self-attention followed by feed-forward, each in a residual wrapper."""

    def __init__(self, self_attention_block: MultiHeadAttentionBlock, feed_forward_block: FeedForwardBlock, dropout):
        super().__init__()
        self.self_attention_block = self_attention_block
        self.feed_forward_block = feed_forward_block
        self.residual_connections = nn.ModuleList([ResidualConnection(dropout) for _ in range(2)])

    def forward(self, x, src_mask):
        """Run ``x`` through self-attention (masked by ``src_mask``) and the feed-forward block."""
        attention_residual, feed_forward_residual = self.residual_connections

        def self_attend(normed):
            # queries, keys and values all come from the same (normalized) input
            return self.self_attention_block(normed, normed, normed, src_mask)

        attended = attention_residual(x, self_attend)
        return feed_forward_residual(attended, self.feed_forward_block)

class Encoder(nn.Module):
    """Apply a sequence of encoder layers and a final layer normalization."""

    def __init__(self, layers: nn.ModuleList) -> None:
        super().__init__()
        self.layers = layers
        self.norm = LayerNormalization()

    def forward(self, x, mask):
        """Feed ``x`` through every layer in order (each receives ``mask``), then normalize."""
        hidden = x
        for block in self.layers:
            hidden = block(hidden, mask)
        return self.norm(hidden)

class DecoderBlock(nn.Module):
    """One decoder layer: self-attention, cross-attention over the encoder output, feed-forward.

    Each of the three sub-layers is wrapped in its own ``ResidualConnection``.

    Args:
        self_attention_block: Attention module applied to the decoder stream itself.
        cross_attention_block: Attention module whose keys/values are the encoder output.
        feed_forward_block: Position-wise feed-forward module.
        dropout: Dropout probability handed to every ``ResidualConnection``.
    """

    def __init__(self, self_attention_block: "MultiHeadAttentionBlock", cross_attention_block: "MultiHeadAttentionBlock", feed_forward_block, dropout):
        super().__init__()
        self.self_attention_block = self_attention_block
        self.cross_attention_block = cross_attention_block
        self.feed_forward_block = feed_forward_block
        self.residual_connections = nn.ModuleList([ResidualConnection(dropout) for _ in range(3)])

    def forward(self, x, encoder_output, src_mask, tgt_mask):
        """Return the decoder stream after the three sub-layers.

        Args:
            x: Decoder-side activations.
            encoder_output: Encoder result used as keys and values for cross-attention.
            src_mask: Mask applied in cross-attention.
            tgt_mask: Mask applied in self-attention.
        """
        x = self.residual_connections[0](x, lambda x: self.self_attention_block(x, x, x, tgt_mask))
        # Cross-attention must go through its own module. The earlier code reused
        # self_attention_block here, which left cross_attention_block unused and untrained.
        # Checkpoints produced with the old wiring will behave differently after this fix.
        x = self.residual_connections[1](x, lambda x: self.cross_attention_block(x, encoder_output, encoder_output, src_mask))
        x = self.residual_connections[2](x, self.feed_forward_block)
        return x

class Decoder(nn.Module):
    """Apply a sequence of decoder layers and a final layer normalization."""

    def __init__(self, layers: nn.ModuleList) -> None:
        super().__init__()
        self.layers = layers
        self.norm = LayerNormalization()

    def forward(self, x, encoder_output, src_mask, tgt_mask):
        """Feed ``x`` through every layer in order, then normalize the result.

        Every layer receives the same ``encoder_output`` and the same two masks.
        """
        hidden = x
        for block in self.layers:
            hidden = block(hidden, encoder_output, src_mask, tgt_mask)
        return self.norm(hidden)

class ProjectionLayer(nn.Module):
    """Map model activations to log-probabilities over the vocabulary."""

    def __init__(self, d_model, vocab_size) -> None:
        super().__init__()
        self.proj = nn.Linear(d_model, vocab_size)

    def forward(self, x) -> torch.Tensor:
        """Project the last axis to ``vocab_size`` and apply ``log_softmax`` over it.

        (b, seq_len, d_model) -> (b, seq_len, vocab_size)
        """
        # The return annotation used to be ``None`` although a tensor is returned.
        return torch.log_softmax(self.proj(x), dim=-1)

class Transformer(nn.Module):
    """Container tying together embeddings, positional encodings, encoder, decoder and projection."""

    def __init__(self, encoder, decoder, src_embed, tgt_embded, src_pos, tgt_pos, projection_layer):
        super().__init__()
        # The assignment order fixes the sub-module registration order; keep it stable.
        self.encoder = encoder
        self.decoder = decoder
        self.src_embed = src_embed
        self.tgt_embed = tgt_embded
        self.src_pos = src_pos
        self.tgt_pos = tgt_pos
        self.projection_layer = projection_layer

    def encode(self, src, src_mask):
        """Embed ``src``, add positional information and run the encoder."""
        embedded = self.src_pos(self.src_embed(src))
        return self.encoder(embedded, src_mask)

    def decode(self, encoder_output, src_mask, tgt, tgt_mask):
        """Embed ``tgt``, add positional information and run the decoder."""
        embedded = self.tgt_pos(self.tgt_embed(tgt))
        return self.decoder(embedded, encoder_output, src_mask, tgt_mask)

    def project(self, x):
        """Apply the projection layer to ``x``."""
        return self.projection_layer(x)


def build_transformer(src_vocab_size, tgt_vocab_size, src_seq_len, tgt_seq_len, d_model=512, N=6, h=8, dropout=0.1, d_ff=2048):
    """Assemble the encoder-decoder transformer with mirrored weight sharing.

    Only ``N // 2`` distinct encoder blocks and ``N // 2`` distinct decoder blocks
    are created. Each stack applies them forwards and then in reverse
    (``b1, b2, ..., bk, bk, ..., b2, b1``), so a stack has ``2 * (N // 2)`` layers
    but only ``N // 2`` sets of weights.  #S16

    Args:
        src_vocab_size: Size of the source vocabulary.
        tgt_vocab_size: Size of the target vocabulary.
        src_seq_len: Length covered by the source positional encoding.
        tgt_seq_len: Length covered by the target positional encoding.
        d_model: Model width.
        N: Nominal number of layers per stack (must be at least 2).
        h: Number of attention heads.
        dropout: Dropout probability used throughout.
        d_ff: Hidden width of the feed-forward blocks.

    Returns:
        Transformer: The assembled model. Its parameter count is printed as a side effect.

    Raises:
        ValueError: If ``N`` is smaller than 2 (no block could be created).
    """
    n_unique = N // 2  #S16
    if n_unique < 1:
        raise ValueError(f"N must be at least 2, got {N}")

    src_embed = InputEmbeddings(d_model, src_vocab_size)
    tgt_embed = InputEmbeddings(d_model, tgt_vocab_size)

    src_pos = PositionalEncoding(d_model, src_seq_len, dropout)
    tgt_pos = PositionalEncoding(d_model, tgt_seq_len, dropout)

    encoder_blocks = []
    for _ in range(n_unique):
        encoder_self_attention_block = MultiHeadAttentionBlock(d_model, h, dropout)
        feed_forward_block = FeedForwardBlock(d_model, d_ff, dropout)
        encoder_blocks.append(EncoderBlock(encoder_self_attention_block, feed_forward_block, dropout))

    decoder_blocks = []
    for _ in range(n_unique):
        decoder_self_attention_block = MultiHeadAttentionBlock(d_model, h, dropout)
        decoder_cross_attention_block = MultiHeadAttentionBlock(d_model, h, dropout)
        feed_forward_block = FeedForwardBlock(d_model, d_ff, dropout)
        decoder_blocks.append(DecoderBlock(decoder_self_attention_block, decoder_cross_attention_block, feed_forward_block, dropout))

    # Mirror the unique blocks. For N=6 this is exactly [b1, b2, b3, b3, b2, b1],
    # but unlike a fixed three-way unpacking it works for any N >= 2.  #S16
    mirrored_encoder_blocks = encoder_blocks + encoder_blocks[::-1]
    mirrored_decoder_blocks = decoder_blocks + decoder_blocks[::-1]

    encoder = Encoder(nn.ModuleList(mirrored_encoder_blocks))
    decoder = Decoder(nn.ModuleList(mirrored_decoder_blocks))

    projection_layer = ProjectionLayer(d_model, tgt_vocab_size)

    transformer = Transformer(encoder, decoder, src_embed, tgt_embed, src_pos, tgt_pos, projection_layer)

    # Re-initialise every tensor with more than one dimension; 1-D parameters
    # keep the initialisation given by their modules.
    for p in transformer.parameters():
        if p.dim() > 1:
            # nn.init.xavier_uniform_(p)  #s16
            nn.init.normal_(p, std=0.02)  #s16

    n_params = sum(p.numel() for p in transformer.parameters())
    print(f'Total parameters : {n_params}')

    return transformer

### train.py

In [108]:
def causal_mask(size):
    """Return a ``(1, size, size)`` boolean mask that is True on and below the diagonal.

    Row ``i`` is True for columns ``0..i``, i.e. every position may look at
    itself and at earlier positions only.
    """
    # Ones strictly above the diagonal mark the "future" positions ...
    future = torch.triu(torch.ones((1, size, size)), diagonal=1).type(torch.int)
    # ... and everything that is not "future" is allowed.
    allowed = future == 0
    return allowed

def build_sos_eos_pad_token(tokenizer_tgt):
    """Look up the [SOS], [EOS] and [PAD] ids in ``tokenizer_tgt``.

    Returns:
        tuple: ``(sos_token, eos_token, pad_token)``, each a one-element int64 tensor.
    """
    def as_tensor(token):
        return torch.tensor([tokenizer_tgt.token_to_id(token)], dtype=torch.int64)

    return tuple(as_tensor(token) for token in ("[SOS]", "[EOS]", "[PAD]"))

# Part of validation
def greedy_decode(model, source, source_mask, tokenizer_src, tokenizer_tgt, max_len, device):
    """Generate a target sequence one token at a time, always taking the top-scoring token.

    Decoding starts from [SOS] and stops once [EOS] has been appended or the
    sequence holds ``max_len`` tokens. ``tokenizer_src`` is accepted but not used.

    Returns:
        1-D tensor with the generated token ids (including the leading [SOS]).
    """
    sos_idx = tokenizer_tgt.token_to_id("[SOS]")
    eos_idx = tokenizer_tgt.token_to_id("[EOS]")

    # The encoder runs once; its output is reused at every decoding step.
    encoder_output = model.encode(source, source_mask)
    # Sequence generated so far, shape (1, current_length), seeded with [SOS].
    generated = torch.empty(1, 1).fill_(sos_idx).type_as(source).to(device)

    while generated.size(1) != max_len:
        # Causal mask matching the current length
        step_mask = causal_mask(generated.size(1)).type_as(source_mask).to(device)

        decoded = model.decode(encoder_output, source_mask, generated, step_mask)

        # Score the vocabulary for the last position only and pick the best token
        scores = model.project(decoded[:, -1])
        next_word = torch.max(scores, dim=1).indices
        token_id = next_word.item()

        appended = torch.empty(1, 1).type_as(source).fill_(token_id).to(device)
        generated = torch.cat([generated, appended], dim=1)

        if token_id == eos_idx:
            break

    return generated.squeeze(0)

# Part of validation
def run_validation(model, validation_ds, tokenizer_src, tokenizer_tgt, max_len, device, print_msg, global_step, writer, num_examples=2):
    """Greedy-decode a few validation samples, print them and log text metrics.

    Args:
        model: Model to evaluate; it is switched to eval mode and left there.
        validation_ds: Iterable of batches with ``encoder_input``, ``encoder_mask``,
            ``src_text`` and ``tgt_text``; every batch must have size 1.
        tokenizer_src: Passed through to ``greedy_decode``.
        tokenizer_tgt: Used to decode the generated ids back to text.
        max_len: Maximum generated length handed to ``greedy_decode``.
        device: Device the batch tensors are moved to.
        print_msg: Callable that receives each output line.
        global_step: Step value attached to the logged scalars.
        writer: TensorBoard-style writer; if falsy no metrics are computed.
        num_examples: Number of samples to decode before stopping.
    """
    model.eval()
    count = 0

    source_texts = []
    expected = []
    predicted = []

    # Best-effort terminal width for the separator lines. Only ordinary errors
    # are swallowed, so Ctrl-C / SystemExit still propagate (a bare ``except``
    # would have hidden them).
    try:
        with os.popen('stty size', 'r') as console:
            _, console_width = console.read().split()
            console_width = int(console_width)
    except Exception:
        console_width = 80

    with torch.no_grad():
        for batch in validation_ds:
            count += 1
            encoder_input = batch['encoder_input'].to(device)
            encoder_mask = batch['encoder_mask'].to(device)

            assert encoder_input.size(0) == 1, "Batch size must be 1 for validation"

            model_out = greedy_decode(model, encoder_input, encoder_mask, tokenizer_src, tokenizer_tgt, max_len, device)

            source_text = batch["src_text"][0]
            target_text = batch["tgt_text"][0]
            model_out_text = tokenizer_tgt.decode(model_out.detach().cpu().numpy())

            source_texts.append(source_text)
            expected.append(target_text)
            predicted.append(model_out_text)

            print_msg('-'*console_width)
            print_msg(f"Source: {source_text}")
            print_msg(f"Target: {target_text}")
            print_msg(f"Predicted: {model_out_text}")

            if count == num_examples:
                print_msg('-'*console_width)
                break

    if writer:
        # Character error rate, word error rate and BLEU over the decoded
        # samples, logged in that order.
        metric_factories = (
            ('validation/cer', torchmetrics.CharErrorRate),
            ('validation/wer', torchmetrics.WordErrorRate),
            ('validation/bleu', torchmetrics.BLEUScore),
        )
        for tag, make_metric in metric_factories:
            metric = make_metric()
            writer.add_scalar(tag, metric(predicted, expected), global_step)
            writer.flush()

# part of data module
def get_all_sentences(ds, lang):
    """Lazily yield the ``lang`` sentence of every item in ``ds``.

    Each item is indexed as ``item['translation'][lang]``.
    """
    yield from (item['translation'][lang] for item in ds)

# part of data module
def get_or_build_tokenizer(config, ds, lang):
    """Load the tokenizer for ``lang`` from disk, or train and save one if the file is missing.

    The file name is ``config['tokenizer_file']`` formatted with ``lang``.
    """
    tokenizer_path = Path(config['tokenizer_file'].format(lang))

    # Reuse a previously saved tokenizer when there is one.
    if tokenizer_path.exists():
        return Tokenizer.from_file(str(tokenizer_path))

    # Most code taken from https://huggingface.co/docs/tokenizers/quicktour
    tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
    tokenizer.pre_tokenizer = Whitespace()
    trainer = WordLevelTrainer(special_tokens=["[UNK]", "[PAD]", "[SOS]", "[EOS]"], min_frequency=2)
    tokenizer.train_from_iterator(get_all_sentences(ds, lang), trainer=trainer)
    tokenizer.save(str(tokenizer_path))
    return tokenizer

# part of data module
def get_ds(config):
    """Load opus_books, filter and split it, and build the train/validation loaders.

    Returns:
        tuple: ``(train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt)``.
    """
    lang_src = config['lang_src']
    lang_tgt = config['lang_tgt']

    # The dataset only ships a train split, so it is divided below.
    ds_raw = load_dataset('opus_books', f"{lang_src}-{lang_tgt}", split='train')

    # Build (or load) one tokenizer per language
    tokenizer_src = get_or_build_tokenizer(config, ds_raw, lang_src)
    tokenizer_tgt = get_or_build_tokenizer(config, ds_raw, lang_tgt)

    def src_of(pair):
        return pair['translation'][lang_src]

    def tgt_of(pair):
        return pair['translation'][lang_tgt]

    #s16 - order the pairs by the character length of the source sentence
    sorted_ds = sorted(ds_raw, key=lambda pair: len(src_of(pair)))

    #s16 - keep a pair only if all four conditions hold (evaluated in this order):
    #   fewer than 150 source tokens,
    #   target at most 9 characters longer than the source,
    #   source longer than 40 characters,
    #   target longer than 40 characters.
    def keep(pair):
        return (
            len(tokenizer_src.encode(src_of(pair)).ids) < 150
            and (len(src_of(pair)) + 10) > len(tgt_of(pair))
            and len(src_of(pair)) > 40
            and len(tgt_of(pair)) > 40
        )

    filtered_sorted_ds = [pair for pair in sorted_ds if keep(pair)]

    # Keep 90% for training, 10% for validation
    train_ds_size = int(len(filtered_sorted_ds) * 0.9)
    val_ds_size = len(filtered_sorted_ds) - train_ds_size
    train_ds_filt, val_ds_filt = random_split(filtered_sorted_ds, [train_ds_size, val_ds_size])

    #s16
    seq_len = config['seq_len']
    train_ds = BilingualDataset(train_ds_filt, tokenizer_src, tokenizer_tgt, lang_src, lang_tgt, seq_len)
    val_ds = BilingualDataset(val_ds_filt, tokenizer_src, tokenizer_tgt, lang_src, lang_tgt, seq_len)
    print(f'len(sorted_ds) : {len(sorted_ds)}')
    print(f'len(filtered_sorted_ds) : {len(filtered_sorted_ds)}')
    print(f'len(train_ds) : {len(train_ds)}')
    print(f'len(val_ds) : {len(val_ds)}')

    # Longest tokenized source / target sentence among the kept pairs (reported only)
    max_len_src = 0
    max_len_tgt = 0
    for pair in filtered_sorted_ds:
        max_len_src = max(max_len_src, len(tokenizer_src.encode(src_of(pair)).ids))
        max_len_tgt = max(max_len_tgt, len(tokenizer_tgt.encode(tgt_of(pair)).ids))

    print(f'Max length of source sentence: {max_len_src}')
    print(f'Max length of target sentence: {max_len_tgt}')

    # Training batches are trimmed by collate_fn; validation uses single-sample batches  #s16
    train_dataloader = DataLoader(train_ds, batch_size=config['batch_size'], shuffle=True, collate_fn=collate_fn)
    val_dataloader = DataLoader(val_ds, batch_size=1, shuffle=True)

    return train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt

#s16
def collate_fn(batch):
    """Trim every sample to the longest sentence in the batch and stack the results.  #s16

    The cut-off is ``max(enc_token_len, dec_token_len)`` over the batch plus 2
    (room for the special tokens). Samples are expected to be the dicts
    produced by ``BilingualDataset.__getitem__``.

    Returns:
        dict: Stacked ``encoder_input``, ``decoder_input``, ``encoder_mask``,
        ``decoder_mask`` and ``label`` tensors plus the ``src_text`` /
        ``tgt_text`` lists.
    """
    longest_src = max(sample["enc_token_len"] for sample in batch)
    longest_tgt = max(sample["dec_token_len"] for sample in batch)
    cut = max([longest_src, longest_tgt]) + 2

    def trim_encoder_mask(sample):
        # stored as (1, 1, seq_len) -> trimmed to (1, 1, 1, cut)
        return sample["encoder_mask"][0, 0, :cut].unsqueeze(0).unsqueeze(0).unsqueeze(0).int()

    def trim_decoder_mask(sample):
        # stored as (1, seq_len, seq_len) -> trimmed to (1, 1, cut, cut)
        return sample["decoder_mask"][0, :cut, :cut].unsqueeze(0).unsqueeze(0)

    return {
        "encoder_input": torch.vstack([sample["encoder_input"][:cut] for sample in batch]),
        "decoder_input": torch.vstack([sample["decoder_input"][:cut] for sample in batch]),
        "encoder_mask": torch.vstack([trim_encoder_mask(sample) for sample in batch]),
        "decoder_mask": torch.vstack([trim_decoder_mask(sample) for sample in batch]),
        "label": torch.vstack([sample["label"][:cut] for sample in batch]),
        "src_text": [sample["src_text"] for sample in batch],
        "tgt_text": [sample["tgt_text"] for sample in batch],
    }

# part of pytorch model
def get_model(config, vocab_src_len, vocab_tgt_len):
    """Build the transformer for the given vocabulary sizes.

    ``config['seq_len']`` is used for both the source and the target sequence
    length and ``config['d_model']`` for the model width.
    """
    seq_len = config['seq_len']
    return build_transformer(vocab_src_len, vocab_tgt_len, seq_len, seq_len, d_model=config['d_model'])

# part of litmodel
def train_model(config):
    """Train the translation model with mixed precision, then run one validation pass.

    Args:
        config (dict): Settings as produced by ``get_config``. This function
            itself reads ``model_folder``, ``experiment_name``, ``lr``,
            ``num_epochs``, ``preload`` and ``seq_len``; ``get_ds`` and
            ``get_model`` read further keys.

    Side effects:
        Creates the model folder, writes TensorBoard scalars under
        ``config['experiment_name']`` and prints progress information.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f'device : {device}')

    Path(config['model_folder']).mkdir(parents=True, exist_ok=True)

    train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt = get_ds(config)
    model = get_model(config, tokenizer_src.get_vocab_size(), tokenizer_tgt.get_vocab_size()).to(device)
    writer = SummaryWriter(config['experiment_name'])
    optimizer = torch.optim.Adam(model.parameters(), lr=config['lr'], eps=1e-9)

    # One-cycle schedule, stepped once per batch. The peak rate comes from the
    # configuration instead of a duplicated literal (same value, 1e-3, by default).
    scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer,
                                                    max_lr=config['lr'],
                                                    steps_per_epoch=len(train_dataloader),
                                                    epochs=config['num_epochs'],
                                                    pct_start=1/10 if config['num_epochs'] !=1 else 0.5,
                                                    div_factor=10,
                                                    three_phase=True,
                                                    final_div_factor=10,
                                                    anneal_strategy='linear'
                                                )

    initial_epoch=0
    global_step=0
    if config['preload']:
        model_filename = get_weights_file_path(config, config['preload'])
        # map_location lets a checkpoint saved on one device load on another
        state = torch.load(model_filename, map_location=device)
        model.load_state_dict(state['model_state_dict'])
        initial_epoch=state['epoch'] + 1
        optimizer.load_state_dict(state['optimizer_state_dict'])
        global_step=state['global_step']
        print('Model preloaded')

    # Labels are target-side token ids, so the padding id to ignore has to come
    # from the target tokenizer (the source tokenizer was used here before).
    loss_fn = nn.CrossEntropyLoss(ignore_index=tokenizer_tgt.token_to_id("[PAD]"), label_smoothing=0.1).to(device)

    scaler = torch.cuda.amp.GradScaler()
    tgt_vocab_size = tokenizer_tgt.get_vocab_size()

    for epoch in range(initial_epoch, config['num_epochs']):
        print(f'Total Epochs {epoch}')
        torch.cuda.empty_cache()
        model.train()
        batch_iterator = tqdm(train_dataloader, desc=f'Processing epoch {epoch: 02d}')
        for idx, batch in enumerate(batch_iterator):
            optimizer.zero_grad(set_to_none=True)
            encoder_input = batch['encoder_input'].to(device)
            decoder_input = batch['decoder_input'].to(device)
            encoder_mask = batch['encoder_mask'].to(device)
            decoder_mask = batch['decoder_mask'].to(device)

            with torch.autocast(device_type='cuda', dtype=torch.float16):
                encoder_output = model.encode(encoder_input, encoder_mask)
                decoder_output = model.decode(encoder_output, encoder_mask, decoder_input, decoder_mask)
                proj_output = model.project(decoder_output)

                label = batch['label'].to(device)

                # Flatten to (batch * seq_len, vocab) against (batch * seq_len)
                loss = loss_fn(proj_output.view(-1, tgt_vocab_size), label.view(-1))

            batch_iterator.set_postfix({"loss": f"{loss.item():6.3f}"})

            writer.add_scalar('train_loss', loss.item(), global_step)
            writer.flush()

            scaler.scale(loss).backward()
            scale = scaler.get_scale()
            scaler.step(optimizer)
            scaler.update()
            # If the scale shrank, the scaler skipped the optimizer step for
            # this batch, so the schedule must not advance either.
            skip_lr_sched = (scale > scaler.get_scale())
            if not skip_lr_sched:
                scheduler.step()

            global_step += 1

    # tqdm.write is a class-level helper, so the final validation does not depend
    # on a progress bar having been created (the epoch loop may not run at all).
    run_validation(model, val_dataloader, tokenizer_src, tokenizer_tgt, config['seq_len'], device, lambda msg: tqdm.write(msg), global_step, writer)
    writer.close()

    # Checkpoint saving is currently disabled:
    # model_filename = get_weights_file_path(config, f"{epoch:02d}")
    # torch.save({
    #     'epoch': epoch,
    #     'model_state_dict': model.state_dict(),
    #     'optimizer_state_dict': optimizer.state_dict(),
    #     'global_step': global_step
    # }, model_filename)

def get_lr(optimizer):
    """Return the 'lr' entry of the optimizer's first parameter group.

    Only the first element of ``optimizer.param_groups`` is consulted; when
    that sequence is empty the function returns ``None``.
    """
    return next((group['lr'] for group in optimizer.param_groups), None)

### Training

In [109]:
# from config import get_config, get_weights_file_path
import warnings
from tqdm import tqdm
import os
from pathlib import Path
import torchmetrics
from torch.utils.tensorboard import SummaryWriter
import torch.nn as nn

# Start from the project defaults and override a few entries for this run.
config = get_config()
config['batch_size'] = 32
config['preload'] = None  # presumably "do not resume from a saved checkpoint" - TODO confirm against train_model
config['num_epochs'] = 1

# from train import get_model, get_ds, run_validation
import torch
# A bare `torch.cuda.amp.autocast(enabled=True)` call used to sit here. It only
# constructed a context manager and discarded it, so it enabled nothing; mixed
# precision is applied by the `with torch.autocast(...)` block in the training step.

# The config carries both a 'batch-size' and a 'batch_size' key; print both so
# it is visible that the override above landed on the underscore key.
# NOTE(review): confirm which of the two keys the data pipeline actually reads.
print(f"{config['batch-size']} & {config['batch_size']}")

4096 & 32


In [110]:
# Run training with the `config` dict prepared in the previous cell.
train_model(config)

device : cuda
len(sorted_ds) : 127085
len(filtered_sorted_ds) : 66787
len(train_ds) : 60108
len(val_ds) : 6679
Max length of source sentence: 149
Max length of target sentence: 157
Total parameters : 68145490
Total Epochs 0


Processing epoch  0: 100%|██████████| 1879/1879 [07:06<00:00,  4.41it/s, loss=4.284]


--------------------------------------------------------------------------------
Source: I lived pleasantly enough, kept good company, that is to say, gay, fine company; but had the discouragement to find this way of living sunk me exceedingly, and that as I had no settled income, so spending upon the main stock was but a certain kind of bleeding to death; and this gave me many sad reflections in the interval of my other thoughts.
Target: Je vivais en agrément, recevais de la bonne société, je veux dire une société délicate et joyeuse; mais je découvris avec découragement que cette façon de vivre me ferait rapidement sombrer, et que n'ayant point de revenu fixe, en dépensant sur le capital, je ne faisais que m'assurer de saigner à mort et ceci me donna beaucoup de tristes réflexions.
Predicted: Je me , bien que tout ce que , c ' est à dire , et que le grand m ' avait fait de cette façon , et je n ' avais pas été , et je n ' avais pas de , mais je me de ma mort .
-----------------------



In [20]:
# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'device : {device}')

# Make sure the folder named by config['model_folder'] exists.
Path(config['model_folder']).mkdir(parents=True, exist_ok=True)

train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt = get_ds(config)
model = get_model(config, tokenizer_src.get_vocab_size(), tokenizer_tgt.get_vocab_size()).to(device)

# TensorBoard writer for the run named by config['experiment_name'].
writer = SummaryWriter(config['experiment_name'])
# The loss is computed over target-vocabulary logits (the training step reshapes
# them to tokenizer_tgt.get_vocab_size()), so the padding id to ignore must be
# looked up in the *target* tokenizer. The source tokenizer's "[PAD]" id is only
# guaranteed to match if both vocabularies happen to assign it the same index.
loss_fn = nn.CrossEntropyLoss(ignore_index=tokenizer_tgt.token_to_id("[PAD]"), label_smoothing=0.1).to(device)
print(f'config-lr is {config["lr"]}')
optimizer = torch.optim.Adam(model.parameters(), lr=config['lr'], eps=1e-9)

device : cuda
Max length of source sentence: 471
Max length of target sentence: 482
Total parameters : 68145490
config-lr is 0.001


In [12]:
# Name of the reduced-precision dtype available on this machine: 'bfloat16'
# when CUDA is present and reports bf16 support, otherwise 'float16'.
# NOTE(review): the training loop further down hard-codes torch.float16 and
# does not read this value.
if torch.cuda.is_available() and torch.cuda.is_bf16_supported():
    dtype = 'bfloat16'
else:
    dtype = 'float16'
dtype

'float16'

In [13]:
def get_lr(optimizer):
    """Return the 'lr' entry of the optimizer's first parameter group.

    Returns ``None`` when ``optimizer.param_groups`` is empty. This repeats the
    `get_lr` helper already defined right after `train_model` earlier in the
    notebook; the two definitions behave the same.
    """
    return next((group['lr'] for group in optimizer.param_groups), None)

In [14]:
# Inputs for the OneCycleLR scheduler built in the next cell.
EPOCHS = 3                               # number of passes over the training set
STEPS_PER_EPOCH = len(train_dataloader)  # batches in one pass
MAX_LR = 1e-3                            # peak learning rate of the cycle

In [15]:
# One-cycle learning-rate schedule with linear ramps over EPOCHS * STEPS_PER_EPOCH steps.
# The rising phase takes half of the steps for a single-epoch run and a tenth otherwise.
scheduler = torch.optim.lr_scheduler.OneCycleLR(
    optimizer,
    max_lr=MAX_LR,
    epochs=EPOCHS,
    steps_per_epoch=STEPS_PER_EPOCH,
    pct_start=0.5 if EPOCHS == 1 else 1/10,
    anneal_strategy='linear',
    three_phase=True,
    div_factor=10,        # starting lr = max_lr / div_factor
    final_div_factor=10,  # final lr = starting lr / final_div_factor
)

In [21]:
initial_epoch = 0
global_step = 0

# Gradient scaler for the float16 autocast region used below.
scaler = torch.cuda.amp.GradScaler()
# Learning-rate history: seeded with 0.0, then one `scheduler.get_last_lr()`
# result (a list, one value per param group) appended per training step.
lr = [0.0]

for epoch in range(initial_epoch, EPOCHS):
    torch.cuda.empty_cache()
    model.train()
    batch_iterator = tqdm(train_dataloader, desc=f'Processing epoch {epoch: 02d}')
    for batch in batch_iterator:
        optimizer.zero_grad(set_to_none=True)

        # Move the batch tensors onto the training device.
        encoder_input = batch['encoder_input'].to(device)
        decoder_input = batch['decoder_input'].to(device)
        encoder_mask = batch['encoder_mask'].to(device)
        decoder_mask = batch['decoder_mask'].to(device)
        label = batch['label'].to(device)

        # Forward pass and loss under float16 autocast.
        with torch.autocast(device_type='cuda', dtype=torch.float16):
            encoder_output = model.encode(encoder_input, encoder_mask)
            decoder_output = model.decode(encoder_output, encoder_mask, decoder_input, decoder_mask)
            proj_output = model.project(decoder_output)

            # Flatten logits to (N, target_vocab) and labels to (N,) for the loss.
            loss = loss_fn(proj_output.view(-1, tokenizer_tgt.get_vocab_size()), label.view(-1))

        # Read the scalar once: every .item() call forces a device-to-host sync.
        loss_value = loss.item()
        batch_iterator.set_postfix({"loss": f"{loss_value:6.3f}"})

        writer.add_scalar('train_loss', loss_value, global_step)
        writer.flush()

        scaler.scale(loss).backward()

        # If the scale shrank across update(), the scaler found inf/NaN gradients
        # and skipped optimizer.step(); skip the scheduler too so the two stay in step.
        scale = scaler.get_scale()
        scaler.step(optimizer)
        scaler.update()
        skip_lr_sched = (scale > scaler.get_scale())

        if not skip_lr_sched:
            scheduler.step()
        lr.append(scheduler.get_last_lr())

        global_step += 1

    # Validate after each epoch; messages go through tqdm's write so they do not
    # clobber the progress bar.
    run_validation(model, val_dataloader, tokenizer_src, tokenizer_tgt, config['seq_len'], device, batch_iterator.write, global_step, writer)

Processing epoch  0:   0%|          | 0/1247 [00:00<?, ?it/s]

src_text : "My Lord!" cried d’Artagnan, enlightened by a sudden idea, "my Lord! Pardon me, monsieur, but you are not--"
tgt_text : -- Milord! s'écria d'Artagnan illuminé d'une idée subite, Milord! pardon, monsieur; mais est-ce que vous seriez...
len(enc_input_tokens) : 29
len(dec_input_tokens) : 30
enc_num_padding_tokens : 129
dec_num_padding_tokens : 129
src_text : What shall we do to rouse her?"
tgt_text : Comment faire pour la réveiller ? »
len(enc_input_tokens) : 8
len(dec_input_tokens) : 7
enc_num_padding_tokens : 150
dec_num_padding_tokens : 152
src_text : Next morning we awoke half frozen by the sharp keen air, but with the light of a splendid sun.
tgt_text : Le lendemain on se réveilla à demi gelé par un air très vif, aux rayons d'un beau soleil.
len(enc_input_tokens) : 21
len(dec_input_tokens) : 22
enc_num_padding_tokens : 137
dec_num_padding_tokens : 137
src_text : Mathilde showed him a number of letters from the Ministry of War, addressed to M. Julien Sorel de La Vernaye.
tg

Processing epoch  0:   0%|          | 1/1247 [00:00<13:38,  1.52it/s, loss=10.461]

src_text : Drebber déclarait qu’il avait une petite affaire personnelle a régler et demandait a l’autre de l’attendre la, lui promettant de le rejoindre avant peu.
tgt_text : Drebber said that he had a little business of his own to do, and that if the other would wait for him he would soon rejoin him.
len(enc_input_tokens) : 32
len(dec_input_tokens) : 29
enc_num_padding_tokens : 126
dec_num_padding_tokens : 130
src_text : Arrivé a la porte de la maison, je l’ouvris et j’introduisis mon compagnon dans la premiere piece.
tgt_text : When we came to the door, I opened it, and led him into the front room.
len(enc_input_tokens) : 23
len(dec_input_tokens) : 19
enc_num_padding_tokens : 135
dec_num_padding_tokens : 140
src_text : When he was twelve years old his mother had her own way; he began lessons.
tgt_text : À douze ans, sa mère obtint que l’on commençât ses études. On en chargea le curé.
len(enc_input_tokens) : 17
len(dec_input_tokens) : 21
enc_num_padding_tokens : 141
dec_num_padding_to

Processing epoch  0:   0%|          | 2/1247 [00:01<11:19,  1.83it/s, loss=9.594]

src_text : Julien was unable to speak.
tgt_text : Julien ne pouvait parler.
len(enc_input_tokens) : 6
len(dec_input_tokens) : 5
enc_num_padding_tokens : 152
dec_num_padding_tokens : 154
src_text : I then learnt that my providential fall had brought me exactly to the extremity of an almost perpendicular shaft; and as I had landed in the midst of an accompanying torrent of stones, the least of which would have been enough to crush me, the conclusion was that a loose portion of the rock had come down with me.
tgt_text : J'appris alors que ma chute providentielle m'avait précisément amené à l'extrémité d'une galerie presque perpendiculaire; comme j'étais arrivé au milieu d'un torrent de pierres, dont la moins grosse eût suffi à m'écraser, il fallait en conclure qu'une partie du massif avait glissé avec moi.
len(enc_input_tokens) : 63
len(dec_input_tokens) : 64
enc_num_padding_tokens : 95
dec_num_padding_tokens : 95
src_text : You can imagine how eagerly I accepted.
tgt_text : Vous jugez av

Processing epoch  0:   0%|          | 3/1247 [00:01<13:31,  1.53it/s, loss=9.028]

src_text : "Armentieres," read Porthos; "Armentieres?
tgt_text : --»Armentières», lut Porthos.
len(enc_input_tokens) : 9
len(dec_input_tokens) : 6
enc_num_padding_tokens : 149
dec_num_padding_tokens : 153
src_text : In June the wheat was already high, of a blue green, which contrasted with the black green of the beetroots.
tgt_text : En juin, les blés étaient grands déja, d'un vert bleu qui tranchait sur le vert noir des betteraves.
len(enc_input_tokens) : 23
len(dec_input_tokens) : 23
enc_num_padding_tokens : 135
dec_num_padding_tokens : 136
src_text : This simple-minded woman was evidently agitated: a feeling of constraint and even of resentment marred that expression of profound serenity, as though raised above all the common interests of life, which gave such charm to that heavenly face.
tgt_text : Cette femme si naïve était évidemment agitée : un sentiment de contrainte et même de colère altérait cette expression de sérénité profonde et comme au-dessus de tous les vulgaires intérê

Processing epoch  0:   0%|          | 4/1247 [00:02<13:32,  1.53it/s, loss=8.712]

src_text : These were curled with scrupulous care, not a hair stood out from the rest.
tgt_text : Ils étaient frisés avec beaucoup de soin, pas un cheveu ne dépassait l’autre.
len(enc_input_tokens) : 16
len(dec_input_tokens) : 17
enc_num_padding_tokens : 142
dec_num_padding_tokens : 142
src_text : The captain approached and stuck his dagger vertically between the shells to discourage any ideas about closing; then with his hands he raised the fringed, membrane-filled tunic that made up the animal's mantle.
tgt_text : Le capitaine s'approcha et introduisit son poignard entre les coquilles pour les empêcher de se rabattre ; puis, de la main, il souleva la tunique membraneuse et frangée sur ses bords qui formait le manteau de l'animal.
len(enc_input_tokens) : 40
len(dec_input_tokens) : 44
enc_num_padding_tokens : 118
dec_num_padding_tokens : 115
src_text : HARRIS (WITH KINDLY ENCOURAGEMENT): "It's all right. You're doing it very well, indeed - go on."
tgt_text : HARRIS (l’encourageant aima

Processing epoch  0:   0%|          | 5/1247 [00:03<12:19,  1.68it/s, loss=8.182]

src_text : "Oh, yes, sir! Everybody knew your errand."
tgt_text : -- Oh! oui, monsieur, tout le monde le savait.
len(enc_input_tokens) : 12
len(dec_input_tokens) : 13
enc_num_padding_tokens : 146
dec_num_padding_tokens : 146
src_text : Obviously it was some hard, impenetrable substance, not the soft matter that makes up the bodies of our big marine mammals.
tgt_text : C'était évidemment un corps dur, impénétrable, et non pas cette substance molle qui forme la masse des grands mammifères marins.
len(enc_input_tokens) : 24
len(dec_input_tokens) : 25
enc_num_padding_tokens : 134
dec_num_padding_tokens : 134
src_text : Sometimes it stood on end, the three of us along with it!
tgt_text : Parfois il se dressait, et nous avec lui !
len(enc_input_tokens) : 14
len(dec_input_tokens) : 10
enc_num_padding_tokens : 144
dec_num_padding_tokens : 149
src_text : In the fabrication of these pieces, everything depends on employing a metal with the highest possible power of resistance, and steel is incont

Processing epoch  0:   0%|          | 6/1247 [00:03<11:10,  1.85it/s, loss=7.911]

src_text : But this long, iron cylinder lying in the bay, with no masts or funnels--what were they to make of it?
tgt_text : Mais ce long cylindre de fer allongé dans la baie, sans mâts, sans cheminée, que devaient-ils en penser ?
len(enc_input_tokens) : 25
len(dec_input_tokens) : 24
enc_num_padding_tokens : 133
dec_num_padding_tokens : 135
src_text : He thought he understood why so many people felt a passionate hatred of his friend.
tgt_text : Il crut comprendre pourquoi tant de gens haïssaient passionnément son ami.
len(enc_input_tokens) : 16
len(dec_input_tokens) : 12
enc_num_padding_tokens : 142
dec_num_padding_tokens : 147
src_text : I don't know what fate is now against me; but if I stay here, I feel sure she will never come - that it is utterly impossible she will presently appear at the end of this road.'
tgt_text : Je ne sais ce qu’il y a maintenant contre moi : mais si je reste là, je sens qu’elle ne viendra jamais – qu’il est impossible qu’au bout de ce chemin, tout à l’heur

Processing epoch  0:   1%|          | 7/1247 [00:03<10:49,  1.91it/s, loss=7.667]

src_text : "I do not believe you," cried the soldier, and he expired amid horrible tortures.
tgt_text : -- Je ne vous crois pas», dit le soldat. Et il expira dans un redoublement de tortures.
len(enc_input_tokens) : 18
len(dec_input_tokens) : 20
enc_num_padding_tokens : 140
dec_num_padding_tokens : 139
src_text : The matter was soon cleared: the man carried off upon a shutter, and I borne in much state and solemnity to a special bedroom, where the small bone of my leg was set by Surgeon Purdie, the younger of the two brothers of that name.
tgt_text : L'homme fut placé sur un volet et emporté. Quant à moi, on me transporta en triomphe, et solennellement dans une chambre à coucher spéciale, où le chirurgien Purdle, le cadet des deux qui portent ce nom, me remit en place le péroné.
len(enc_input_tokens) : 50
len(dec_input_tokens) : 51
enc_num_padding_tokens : 108
dec_num_padding_tokens : 108
src_text : "And the ghost walks?"
tgt_text : -- Et le fantôme se promène.
len(enc_input_tokens) : 

Processing epoch  0:   1%|          | 8/1247 [00:04<11:11,  1.85it/s, loss=7.499]

src_text : Of course nobody knew; nobody at Waterloo ever does know where a train is going to start from, or where a train when it does start is going to, or anything about it.
tgt_text : Bien entendu, personne ne le savait ; personne a la gare de Waterloo ne sait jamais d’ou part un train, ni meme ou il va.
len(enc_input_tokens) : 37
len(dec_input_tokens) : 30
enc_num_padding_tokens : 121
dec_num_padding_tokens : 129
src_text : D’abord il ne voulut point avoir l’air d’un enfant qui s’effraie à propos de rien.
tgt_text : For one thing, he did not wish to assume the air of a boy who takes fright at nothing.
len(enc_input_tokens) : 24
len(dec_input_tokens) : 21
enc_num_padding_tokens : 134
dec_num_padding_tokens : 138
src_text : I bethought myself, however, that, perhaps the skin of him might, one way or other, be of some value to us; and I resolved to take off his skin if I could.
tgt_text : Cependant je réfléchis que sa peau pourrait sans doute, d'une façon ou d'une autre, nous être de

Processing epoch  0:   1%|          | 9/1247 [00:05<11:16,  1.83it/s, loss=7.541]

src_text : However, it was not probable that the convicts would have yet left the plateau of Prospect Heights.
tgt_text : Toutefois, il n'était pas probable que les convicts eussent encore abandonné le plateau de Grande-vue.
len(enc_input_tokens) : 19
len(dec_input_tokens) : 21
enc_num_padding_tokens : 139
dec_num_padding_tokens : 138
src_text : A lad of fourteen, with a bright, keen face, had obeyed the summons of the manager.
tgt_text : Un garçonnet de quatorze ans, au visage éveillé, intelligent, arriva bientôt.
len(enc_input_tokens) : 20
len(dec_input_tokens) : 15
enc_num_padding_tokens : 138
dec_num_padding_tokens : 144
src_text : "Oh, nothing whatever."
tgt_text : -- Oh! rien du tout.
len(enc_input_tokens) : 6
len(dec_input_tokens) : 7
enc_num_padding_tokens : 152
dec_num_padding_tokens : 152
src_text : It was a remarkable thing−−all that physical strength which had reached in Quasimodo such an extraordinary development, and which was placed by him blindly at the disposition of a

Processing epoch  0:   1%|          | 10/1247 [00:05<11:51,  1.74it/s, loss=7.255]


src_text : Is he not in love?" she asked herself; "but with whom?
tgt_text : N’aime-t-il pas? se demanda-t-elle.
len(enc_input_tokens) : 15
len(dec_input_tokens) : 16
enc_num_padding_tokens : 143
dec_num_padding_tokens : 143
src_text : I will restore you to liberty this moment; I will proclaim you a piece of immaculate virtue; I will name you the Lucretia of England.’
tgt_text : Je vous rends la liberté à l'instant même, je vous proclame une vertu, je vous surnomme la Lucrèce de l'Angleterre.
len(enc_input_tokens) : 28
len(dec_input_tokens) : 27
enc_num_padding_tokens : 130
dec_num_padding_tokens : 132
src_text : I went towards them, just two steps. - Lord ! there be these lads starting off at full-speed towards here.
tgt_text : Je m’avance : je fais deux pas – Hip ! les voilà partis au grand galop du côté de chez vous.
len(enc_input_tokens) : 25
len(dec_input_tokens) : 24
enc_num_padding_tokens : 133
dec_num_padding_tokens : 135
src_text : 'Tis true; he gave my place in the nation of 

ValueError: ignored