In [33]:
import torch
from torch import nn 
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader


import numpy as np
from tqdm import tqdm

from model import create_transformer_model


In [5]:
en_raw = "../data/english.txt"
fr_raw = "../data/french.txt"

In [None]:
START_TOKEN = '<SOS>'
PADDING_TOKEN = '<PAD>' # Used for padding and ignored in loss
END_TOKEN = '<EOS>'
UNKNOWN_TOKEN = '<UNK>' # For handling unknown chars during evaluation


In [7]:
base_french_chars = [' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', 
                     '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
                     ':', '<', '=', '>', '?', '@', 
                     'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 
                     'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 
                     'Y', 'Z',
                     '[', '\\', ']', '^', '_', '`', 
                     'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l',
                     'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 
                     'y', 'z', 
                     '{', '|', '}', '~',
                     'à', 'â', 'ä', 'æ', 'ç', 'é', 'è', 'ê', 'ë', 'î', 'ï', 'ô', 'œ', 'ù', 'û', 'ü', 'ÿ',
                     'À', 'Â', 'Ä', 'Æ', 'Ç', 'É', 'È', 'Ê', 'Ë', 'Î', 'Ï', 'Ô', 'Œ', 'Ù', 'Û', 'Ü', 'Ÿ',
                     ]

In [8]:
base_english_chars = [' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', 
                        '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
                        ':', '<', '=', '>', '?', '@', 
                        'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 
                        'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 
                        'Y', 'Z',
                        '[', '\\', ']', '^', '_', '`', 
                        'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l',
                        'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 
                        'y', 'z', 
                        '{', '|', '}', '~']

In [9]:
# PADDING_TOKEN usually gets index 0
french_vocabulary = [PADDING_TOKEN, START_TOKEN, END_TOKEN, UNKNOWN_TOKEN] + base_french_chars
english_vocabulary = [PADDING_TOKEN, START_TOKEN, END_TOKEN, UNKNOWN_TOKEN] + base_english_chars


In [10]:
with open(en_raw, 'r') as file:
    en_sentences = file.readlines()
with open(fr_raw, 'r') as file:
    fr_sentences = file.readlines()

In [11]:
en_sentences = [sentence.rstrip('\n') for sentence in en_sentences]
fr_sentences = [sentence.rstrip('\n') for sentence in fr_sentences]

# Removes the narrow non-breaking space (\u202f) and replace it with a normal space
fr_sentences = [sentence.replace('\u202f', ' ') for sentence in fr_sentences]


In [12]:
print(f"English: {en_sentences[:10]} \nFrench: {fr_sentences[:10]}\n")

English: ['Go.', 'Hi.', 'Hi.', 'Run!', 'Run!', 'Who?', 'Wow!', 'Fire!', 'Help!', 'Jump.'] 
French: ['Va !', 'Salut !', 'Salut.', 'Cours !', 'Courez !', 'Qui ?', 'Ça alors !', 'Au feu !', "À l'aide !", 'Saute.']



In [13]:
index_to_fr = {k:v for k,v in enumerate(french_vocabulary)}
fr_to_index = {v:k for k,v in enumerate(french_vocabulary)}
index_to_en = {k:v for k,v in enumerate(english_vocabulary)}
en_to_index = {v:k for k,v in enumerate(english_vocabulary)}

In [16]:
# --- Define Padding Index Constant ---
# IMPORTANT: This index must match the PADDING_TOKEN position in vocab
IGNORE_IDX = fr_to_index[PADDING_TOKEN] # Used for loss function
EN_PADDING_IDX = en_to_index[PADDING_TOKEN] # Used for padding source batches
FR_PADDING_IDX = fr_to_index[PADDING_TOKEN] # Used for padding target batches
# Verify these are likely 0 if PADDING_TOKEN is first in vocab list
print(f"Padding Index (Ignore Loss / FR Pad): {IGNORE_IDX}")
print(f"EN Padding Index: {EN_PADDING_IDX}")

Padding Index (Ignore Loss / FR Pad): 0
EN Padding Index: 0


In [17]:
#TOTAL_SENTENCES = 400000
en = en_sentences[:]
fr = fr_sentences[:]

In [18]:
print(f"{len(en)} sentences")
print(en[:10])
print(fr[:10])

170651 sentences
['Go.', 'Hi.', 'Hi.', 'Run!', 'Run!', 'Who?', 'Wow!', 'Fire!', 'Help!', 'Jump.']
['Va !', 'Salut !', 'Salut.', 'Cours !', 'Courez !', 'Qui ?', 'Ça alors !', 'Au feu !', "À l'aide !", 'Saute.']


In [19]:
max(len(x) for x in fr), max(len(x) for x in en),

(349, 286)

In [20]:
PERCENTILE = 99
print( f"{PERCENTILE}th percentile length french: {np.percentile([len(x) for x in fr], PERCENTILE)}" )
print( f"{PERCENTILE}th percentile length English: {np.percentile([len(x) for x in en], PERCENTILE)}" )


99th percentile length french: 85.0
99th percentile length English: 70.0


In [21]:
max_sequence_length = 100

def is_valid_tokens(sentence, vocab):
    for token in list(set(sentence)):
        if token not in vocab:
            return False
    return True

def is_valid_length(sentence, max_sequence_length):
    return len(list(sentence)) < (max_sequence_length - 1) # need to re-add the end token so leaving 1 space

valid_sentence_indicies = []
for index in range(len(fr)):
    french_sentence, english_sentence = fr[index], en[index]
    if is_valid_length(french_sentence, max_sequence_length) \
      and is_valid_length(english_sentence, max_sequence_length) \
      and is_valid_tokens(french_sentence, french_vocabulary):
        valid_sentence_indicies.append(index)

print(f"Number of sentences: {len(fr)}")
print(f"Number of valid sentences: {len(valid_sentence_indicies)}")

Number of sentences: 170651
Number of valid sentences: 164317


In [22]:
french = [fr[i] for i in valid_sentence_indicies]
english = [en[i] for i in valid_sentence_indicies]

In [23]:
len(english)
#164317

164317

In [24]:
french[::50000]

['Va !',
 "Quiconque l'a-t-il remarqué ?",
 'Nous parlâmes de divers sujets.',
 'Le match fut annulé à cause de la pluie.']

In [25]:
english[::50000]

['Go.',
 'Did anybody notice this?',
 'We talked about various topics.',
 'The game was called off on account of the rain.']

In [None]:
transformer = create_transformer_model(
    src_vocab_size=len(english_vocabulary),
    tgt_vocab_size=len(french_vocabulary),
    d_model=512,
    num_heads=8,
    num_encoder_layers=3,
    num_decoder_layers=3,
    d_ff=2048,
    max_seq_len= 100,
    dropout=0.1
)

In [27]:
transformer

Transformer(
  (src_embedding): Embedding(98, 512)
  (tgt_embedding): Embedding(132, 512)
  (positional_encoding): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder_layers): ModuleList(
    (0-2): 3 x EncoderLayer(
      (self_attention): MultiHeadAttention(
        (W_q): Linear(in_features=512, out_features=512, bias=True)
        (W_k): Linear(in_features=512, out_features=512, bias=True)
        (W_v): Linear(in_features=512, out_features=512, bias=True)
        (W_o): Linear(in_features=512, out_features=512, bias=True)
      )
      (feed_forward): PositionWiseFeedForward(
        (fc1): Linear(in_features=512, out_features=2048, bias=True)
        (fc2): Linear(in_features=2048, out_features=512, bias=True)
        (relu): ReLU()
      )
      (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
  )
  (decoder_layers): 

In [60]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f"Trainable parameters: {count_parameters(transformer):,}")


Trainable parameters: 22,254,724


In [28]:
from sklearn.model_selection import train_test_split

filtered_english = [en[i] for i in valid_sentence_indicies]
filtered_french = [fr[i] for i in valid_sentence_indicies]

# 90% train, 10% validation
train_english, val_english, train_french, val_french = train_test_split(
    filtered_english, filtered_french, test_size=0.1, random_state=42 
)
print(f"Training samples: {len(train_english)}, Validation samples: {len(val_english)}")


Training samples: 147885, Validation samples: 16432


In [34]:
def tokenize(sentence, vocab):
    return [char for char in sentence if char in vocab]

class TextDataset(Dataset): 
    def __init__(self, english, french, en_to_index, fr_to_index):
        self.english = english
        self.french = french
        self.en_to_index = en_to_index
        self.fr_to_index = fr_to_index

    def __len__(self):
        return len(self.english)

    def __getitem__(self, idx):
        eng_sentence, french_sentence = self.english[idx], self.french[idx]

        eng_tokenized = [START_TOKEN] + tokenize(eng_sentence, english_vocabulary) + [END_TOKEN]
        fr_tokenized = [START_TOKEN] + tokenize(french_sentence, french_vocabulary) + [END_TOKEN]

        eng_numericalized = [self.en_to_index[token] for token in eng_tokenized]
        fr_numericalized = [self.fr_to_index[token] for token in fr_tokenized]
        return torch.tensor(eng_numericalized), torch.tensor(fr_numericalized)

In [35]:
train_dataset = TextDataset(train_english, train_french, en_to_index, fr_to_index)
val_dataset = TextDataset(val_english, val_french, en_to_index, fr_to_index)


In [36]:
def padding_batch(batch, en_padding_idx, fr_padding_idx): # Accept indices
    src_list, tgt_list = zip(*batch)
    # Check if lists are empty before calculating max
    if not src_list or not tgt_list:
         return torch.empty(0), torch.empty(0) # Or handle as appropriate

    src_max_len = max(src.size(0) for src in src_list) if src_list else 0
    tgt_max_len = max(tgt.size(0) for tgt in tgt_list) if tgt_list else 0

    padded_src_list = []
    padded_tgt_list = []
    for src, tgt in zip(src_list, tgt_list):
        pad_len_src = src_max_len - src.size(0)
        pad_len_tgt = tgt_max_len - tgt.size(0)
        # Use the passed indices directly
        padded_src = F.pad(src, (0, pad_len_src), value=en_padding_idx)
        padded_tgt = F.pad(tgt, (0, pad_len_tgt), value=fr_padding_idx)
        padded_src_list.append(padded_src)
        padded_tgt_list.append(padded_tgt)

    # Only stack if lists are not empty
    final_src = torch.stack(padded_src_list) if padded_src_list else torch.empty(0)
    final_tgt = torch.stack(padded_tgt_list) if padded_tgt_list else torch.empty(0)
    return final_src, final_tgt


In [37]:
batch_size = 32

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,
                          collate_fn=lambda batch: padding_batch(batch, EN_PADDING_IDX, FR_PADDING_IDX),
                          drop_last=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False,
                        collate_fn=lambda batch: padding_batch(batch, EN_PADDING_IDX, FR_PADDING_IDX),
                        drop_last=True)

In [38]:
criterion = nn.CrossEntropyLoss(ignore_index=IGNORE_IDX) # Use the defined constant
optim = torch.optim.AdamW(transformer.parameters(), lr=1e-4, betas=(0.9, 0.98), eps=1e-9)
clip_grad_norm = 1.0


In [None]:
def loss_fn(output, target, criterion):

    # Output: [batch_size * seq_len, vocab_size]
    # Target: [batch_size * seq_len]
    output_flat = output.contiguous().view(-1, output.shape[-1])
    target_flat = target.contiguous().view(-1)

    # Criterion handles ignoring padding index and reduction (usually mean)
    loss = criterion(output_flat, target_flat)
    return loss



In [None]:
def train_loop():
    transformer.train()
    transformer.to(device)
    #total_loss = 0
    num_epochs = 2 


    print(f"Training on device: {device}") 

    for epoch in range(num_epochs):
        print(f"--- Epoch {epoch+1}/{num_epochs} ---")
        transformer.train() 
        total_epoch_loss = 0
        progress_bar = tqdm(train_loader, desc=f'Epoch {epoch+1} Training')

        for batch_num, batch in enumerate(progress_bar):
            english_batch, french_batch = batch
            english_batch = english_batch.to(device) 
            french_batch = french_batch.to(device) 

            optim.zero_grad()

            # 2. Forward Pass 
            output = transformer(english_batch, french_batch)
            # output shape: [batch_size, tgt_len-1, tgt_vocab_size]

            loss = loss_fn(output, french_batch[:, 1:], criterion)

            # 4. Backward Pass
            loss.backward()

            # 5. Gradient Clipping
            torch.nn.utils.clip_grad_norm_(transformer.parameters(), clip_grad_norm)

            # 6. Optimizer Step
            optim.step()

            total_epoch_loss += loss.item()
            progress_bar.set_postfix({'batch_loss': f'{loss.item():.4f}'})

            if batch_num > 0 and batch_num % 200 == 0: 
                print(f"\n  Batch {batch_num} Loss: {loss.item():.4f}")
                transformer.eval()
                with torch.no_grad():
  
                    input_sentence = "should we go to the mall?" # a fixed sentence for eval
                    input_tokenized = [START_TOKEN] + tokenize(input_sentence, english_vocabulary) + [END_TOKEN]
                    input_numericalized = [en_to_index.get(token, en_to_index.get(UNKNOWN_TOKEN, 0)) for token in input_tokenized]
                    input_tensor = torch.tensor(input_numericalized).unsqueeze(0).to(device)

                    start_symbol_idx = fr_to_index.get(START_TOKEN)
                    if start_symbol_idx is None:
                       raise ValueError("START_TOKEN not found in fr_to_index mapping")

                    output_indices = transformer.greedy_decode(input_tensor, max_len=max_sequence_length, start_symbol=start_symbol_idx)[0].cpu().numpy()

                  
                    pad_idx = fr_to_index.get(PADDING_TOKEN)
                    end_idx = fr_to_index.get(END_TOKEN)

                    predicted_sentence_chars = [
                        index_to_fr[idx] for idx in output_indices
                        if idx != pad_idx and idx != start_symbol_idx and idx != end_idx and idx in index_to_fr
                    ]
                    predicted_sentence = "".join(predicted_sentence_chars)
                    print(f"  Eval (should we go...): {predicted_sentence}")
                    print("  -------------------------------------------")
                transformer.train() # Set back to train mode

        avg_train_loss = total_epoch_loss / len(train_loader)
        avg_val_loss = val_loop() 
        print(f"Epoch {epoch+1} Summary: Avg Train Loss: {avg_train_loss:.4f}, Avg Val Loss: {avg_val_loss:.4f}")



def val_loop(): # Validation loop
    transformer.eval()
    total_val_loss = 0
    with torch.no_grad():
        progress_bar_val = tqdm(val_loader, desc=f'Validation')
        for batch_idx, batch in enumerate(progress_bar_val):
            english_batch, french_batch = batch
            english_batch = english_batch.to(device)
            french_batch = french_batch.to(device)

            output = transformer(english_batch, french_batch)
            val_loss = loss_fn(output, french_batch[:, 1:], criterion) 

            total_val_loss += val_loss.item()
            progress_bar_val.set_postfix({'val_loss': val_loss.item()})

    avg_val_loss = total_val_loss / len(val_loader)
    return avg_val_loss


if __name__ == "__main__":
    device = torch.device('mps' if torch.backends.mps.is_available() else 'cpu') 
    train_loop() 

Training on device: mps
--- Epoch 1/2 ---


Epoch 1 Training:   4%|▍         | 200/4621 [03:54<38:32,  1.91it/s, batch_loss=2.1278]  


  Batch 200 Loss: 2.1278


Epoch 1 Training:   4%|▍         | 201/4621 [04:17<9:07:20,  7.43s/it, batch_loss=2.1278]

  Eval (should we go...): Quellle de de de de de de ?re ?re ?le ?le ?
  -------------------------------------------


Epoch 1 Training:   9%|▊         | 400/4621 [06:05<29:09,  2.41it/s, batch_loss=1.8342]  


  Batch 400 Loss: 1.8342


Epoch 1 Training:   9%|▊         | 401/4621 [06:10<2:00:09,  1.71s/it, batch_loss=1.8342]

  Eval (should we go...): Combien avous avous avent ce ?re ?re ? ? ?re ?ure ?re ?ure ? ? ?re ?re ?un ?re ?re ?
  -------------------------------------------


Epoch 1 Training:  13%|█▎        | 600/4621 [07:46<38:30,  1.74it/s, batch_loss=1.7987]  


  Batch 600 Loss: 1.7987


Epoch 1 Training:  13%|█▎        | 601/4621 [07:50<2:03:25,  1.84s/it, batch_loss=1.7987]

  Eval (should we go...): Quelle est en pas as le tre ?re ?r ? ? ? ?re ?res ?r?re ? ? ?res ?re ?re ?re ?re ?
  -------------------------------------------


Epoch 1 Training:  17%|█▋        | 800/4621 [09:27<31:39,  2.01it/s, batch_loss=1.6200]  


  Batch 800 Loss: 1.6200


Epoch 1 Training:  17%|█▋        | 801/4621 [09:31<1:48:12,  1.70s/it, batch_loss=1.6200]

  Eval (should we go...): Quelle est le mont le mon pas le mon ?e ? ?e ?e ?e ?le ?on ?e ? ?e ?e ? ?e ?e ?re ?
  -------------------------------------------


Epoch 1 Training:  22%|██▏       | 1000/4621 [11:16<31:42,  1.90it/s, batch_loss=1.5447] 


  Batch 1000 Loss: 1.5447


Epoch 1 Training:  22%|██▏       | 1001/4621 [11:20<1:56:18,  1.93s/it, batch_loss=1.5447]

  Eval (should we go...): Pourquoi ais-tu le main ? ?a ? ?a ? ? ? ?a ?ailere ?e ?
  -------------------------------------------


Epoch 1 Training:  26%|██▌       | 1200/4621 [12:55<43:51,  1.30it/s, batch_loss=1.4542]  


  Batch 1200 Loss: 1.4542


Epoch 1 Training:  26%|██▌       | 1201/4621 [12:59<1:45:58,  1.86s/it, batch_loss=1.4542]

  Eval (should we go...): Est-ce que tu ais avec de chanter ? ? ? ?e ??e ?e ?
  -------------------------------------------


Epoch 1 Training:  30%|███       | 1400/4621 [24:28<25:39,  2.09it/s, batch_loss=1.4259]     


  Batch 1400 Loss: 1.4259


Epoch 1 Training:  30%|███       | 1401/4621 [24:32<1:30:36,  1.69s/it, batch_loss=1.4259]

  Eval (should we go...): Avez-vous de ma cher ? ? ?a ? ? ? ? ? ??? ???
  -------------------------------------------


Epoch 1 Training:  35%|███▍      | 1600/4621 [26:01<23:07,  2.18it/s, batch_loss=1.3853]  


  Batch 1600 Loss: 1.3853


Epoch 1 Training:  35%|███▍      | 1601/4621 [26:05<1:21:58,  1.63s/it, batch_loss=1.3853]

  Eval (should we go...): Comment le mont de te prop ?our ? ?
  -------------------------------------------


Epoch 1 Training:  39%|███▉      | 1800/4621 [27:57<27:22,  1.72it/s, batch_loss=1.2294]  


  Batch 1800 Loss: 1.2294


Epoch 1 Training:  39%|███▉      | 1801/4621 [28:01<1:21:40,  1.74s/it, batch_loss=1.2294]

  Eval (should we go...): Pourquoi le m'aider l'autre ? ???
  -------------------------------------------


Epoch 1 Training:  43%|████▎     | 2000/4621 [29:37<24:31,  1.78it/s, batch_loss=1.2871]  


  Batch 2000 Loss: 1.2871


Epoch 1 Training:  43%|████▎     | 2001/4621 [29:41<1:17:07,  1.77s/it, batch_loss=1.2871]

  Eval (should we go...): Aucun devrais-tu de me dire ?0000000000000000000000000000000 ? ? ? ? ? ? ? ? ?
  -------------------------------------------


Epoch 1 Training:  48%|████▊     | 2200/4621 [31:19<16:10,  2.50it/s, batch_loss=1.2373]  


  Batch 2200 Loss: 1.2373


Epoch 1 Training:  48%|████▊     | 2201/4621 [31:24<1:12:21,  1.79s/it, batch_loss=1.2373]

  Eval (should we go...): De me demander le me faire ? ? ? ? ? ? ?le ?e le ?la ?e ?e ?a ?aila ?la ?a ?a ?la 
  -------------------------------------------


Epoch 1 Training:  52%|█████▏    | 2400/4621 [33:03<15:59,  2.31it/s, batch_loss=1.1846]  


  Batch 2400 Loss: 1.1846


Epoch 1 Training:  52%|█████▏    | 2401/4621 [33:07<1:02:58,  1.70s/it, batch_loss=1.1846]

  Eval (should we go...): De la maison de la maison ?ourd ??
  -------------------------------------------


Epoch 1 Training:  56%|█████▋    | 2600/4621 [34:45<20:36,  1.63it/s, batch_loss=1.2816]  


  Batch 2600 Loss: 1.2816


Epoch 1 Training:  56%|█████▋    | 2601/4621 [34:50<1:09:41,  2.07s/it, batch_loss=1.2816]

  Eval (should we go...): La mois-tu aller ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?
  -------------------------------------------


Epoch 1 Training:  61%|██████    | 2800/4621 [36:36<18:03,  1.68it/s, batch_loss=1.1817]  


  Batch 2800 Loss: 1.1817


Epoch 1 Training:  61%|██████    | 2801/4621 [36:41<1:02:09,  2.05s/it, batch_loss=1.1817]

  Eval (should we go...): Dites-ce que je veux parler ? ? ? ? ? ? ? ? ? ? ?a ? ?a ? ? ?ai ? ? ? ?ai ? ?
  -------------------------------------------


Epoch 1 Training:  65%|██████▍   | 3000/4621 [38:34<14:18,  1.89it/s, batch_loss=1.2006]  


  Batch 3000 Loss: 1.2006


Epoch 1 Training:  65%|██████▍   | 3001/4621 [38:39<56:07,  2.08s/it, batch_loss=1.2006]

  Eval (should we go...): Voudriez-vous de me dire ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?
  -------------------------------------------


Epoch 1 Training:  69%|██████▉   | 3200/4621 [40:31<11:43,  2.02it/s, batch_loss=1.3065]


  Batch 3200 Loss: 1.3065


Epoch 1 Training:  69%|██████▉   | 3201/4621 [40:36<48:38,  2.06s/it, batch_loss=1.3065]

  Eval (should we go...): Dis-nous le montrer ? ? ? ? ? ? ? ? ? ? ?ous ? ? ?ous ? ? ?ou ? ? ?ou ?l'ai
  -------------------------------------------


Epoch 1 Training:  74%|███████▎  | 3400/4621 [42:29<11:21,  1.79it/s, batch_loss=1.2233]


  Batch 3400 Loss: 1.2233


Epoch 1 Training:  74%|███████▎  | 3401/4621 [42:35<50:41,  2.49s/it, batch_loss=1.2233]

  Eval (should we go...): Dis-tu ce que ce soit ? Tom ? ? ? ? ?
  -------------------------------------------


Epoch 1 Training:  78%|███████▊  | 3600/4621 [45:02<12:36,  1.35it/s, batch_loss=1.0117]


  Batch 3600 Loss: 1.0117


Epoch 1 Training:  78%|███████▊  | 3601/4621 [45:07<40:08,  2.36s/it, batch_loss=1.0117]

  Eval (should we go...): Dis-moi commencer ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?
  -------------------------------------------


Epoch 1 Training:  82%|████████▏ | 3800/4621 [47:04<07:46,  1.76it/s, batch_loss=1.0356]


  Batch 3800 Loss: 1.0356


Epoch 1 Training:  82%|████████▏ | 3801/4621 [47:09<27:30,  2.01s/it, batch_loss=1.0356]

  Eval (should we go...): Devrais-tu quelque chose ? ? ? ? ? ? ? ? ? ? ? ? ?
  -------------------------------------------


Epoch 1 Training:  87%|████████▋ | 4000/4621 [48:59<05:30,  1.88it/s, batch_loss=1.1064]


  Batch 4000 Loss: 1.1064


Epoch 1 Training:  87%|████████▋ | 4001/4621 [49:04<19:11,  1.86s/it, batch_loss=1.1064]

  Eval (should we go...): Dis-moi ce que je me rendre ? ? ? ? ? ? ? ?a ? ? ? ? ? ? ?
  -------------------------------------------


Epoch 1 Training:  91%|█████████ | 4200/4621 [50:47<03:34,  1.96it/s, batch_loss=1.1734]


  Batch 4200 Loss: 1.1734


Epoch 1 Training:  91%|█████████ | 4201/4621 [50:51<12:08,  1.73s/it, batch_loss=1.1734]

  Eval (should we go...): Tout le monde partir le manger ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? 
  -------------------------------------------


Epoch 1 Training:  95%|█████████▌| 4400/4621 [52:45<01:40,  2.20it/s, batch_loss=1.1094]


  Batch 4400 Loss: 1.1094


Epoch 1 Training:  95%|█████████▌| 4401/4621 [52:50<06:44,  1.84s/it, batch_loss=1.1094]

  Eval (should we go...): Trai au moins de la maison ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?
  -------------------------------------------


Epoch 1 Training: 100%|█████████▉| 4600/4621 [54:30<00:09,  2.25it/s, batch_loss=0.8502]


  Batch 4600 Loss: 0.8502


Epoch 1 Training: 100%|█████████▉| 4601/4621 [54:34<00:35,  1.77s/it, batch_loss=0.8502]

  Eval (should we go...): Dites-moi aller le montement ? ? ? ?ous ? ? ? ?
  -------------------------------------------


Epoch 1 Training: 100%|██████████| 4621/4621 [54:44<00:00,  1.41it/s, batch_loss=0.8715]
Validation: 100%|██████████| 513/513 [01:33<00:00,  5.50it/s, val_loss=0.939]


Epoch 1 Summary: Avg Train Loss: 1.3595, Avg Val Loss: 0.9313
--- Epoch 2/2 ---


Epoch 2 Training:   4%|▍         | 200/4621 [01:53<39:38,  1.86it/s, batch_loss=1.0956]  


  Batch 200 Loss: 1.0956


Epoch 2 Training:   4%|▍         | 201/4621 [01:58<2:23:22,  1.95s/it, batch_loss=1.0956]

  Eval (should we go...): Demainde-tu ce qui est la maison ? ? ? ? ? ? ? ?a ? ?a ?a ? ? ?a ? ? ? ?a ? ? ?
  -------------------------------------------


Epoch 2 Training:   9%|▊         | 400/4621 [03:41<35:00,  2.01it/s, batch_loss=0.8570]  


  Batch 400 Loss: 0.8570


Epoch 2 Training:   9%|▊         | 401/4621 [03:46<2:08:46,  1.83s/it, batch_loss=0.8570]

  Eval (should we go...): Sortez-vous la plus laisse ? ? ? ? ? ? ? ? ? ? ? ? ?ou ? ? ? ? ? ?ou ?a ? ?
  -------------------------------------------


Epoch 2 Training:  13%|█▎        | 600/4621 [05:27<36:03,  1.86it/s, batch_loss=1.0272]  


  Batch 600 Loss: 1.0272


Epoch 2 Training:  13%|█▎        | 601/4621 [05:31<2:00:52,  1.80s/it, batch_loss=1.0272]

  Eval (should we go...): Devrait-il quelque chose ? ? ? ? ? ? ? ? ? ? ? ? ?a ?a ? ? ?a ? ? ?a ?ai ? ?
  -------------------------------------------


Epoch 2 Training:  17%|█▋        | 800/4621 [07:15<28:22,  2.24it/s, batch_loss=1.0387]  


  Batch 800 Loss: 1.0387


Epoch 2 Training:  17%|█▋        | 801/4621 [07:19<1:49:20,  1.72s/it, batch_loss=1.0387]

  Eval (should we go...): Devrait-il le temps ? ? ? ? ? ? ? ? ? ? ? ? ?ous ? ? ? ? ? ? ? ? ? ? 
  -------------------------------------------


Epoch 2 Training:  22%|██▏       | 1000/4621 [09:00<30:58,  1.95it/s, batch_loss=0.8755] 


  Batch 1000 Loss: 0.8755


Epoch 2 Training:  22%|██▏       | 1001/4621 [09:05<1:57:51,  1.95s/it, batch_loss=0.8755]

  Eval (should we go...): Devrions-nous la maison ? ? ? ? ? ? ? ? ? ? ? ?ous ? ?on ? ?ou ? ? ?ous ? ?
  -------------------------------------------


Epoch 2 Training:  26%|██▌       | 1200/4621 [10:50<42:46,  1.33it/s, batch_loss=0.9993]  


  Batch 1200 Loss: 0.9993


Epoch 2 Training:  26%|██▌       | 1201/4621 [10:56<2:04:06,  2.18s/it, batch_loss=0.9993]

  Eval (should we go...): La vie de la plus laisse ? ? ? ? ? ? ? ? ? ? ?ou ? ?ou ? ?ou ? ? ?ou ? ? 
  -------------------------------------------


Epoch 2 Training:  30%|███       | 1400/4621 [12:53<27:39,  1.94it/s, batch_loss=0.8875]  


  Batch 1400 Loss: 0.8875


Epoch 2 Training:  30%|███       | 1401/4621 [12:58<1:47:19,  2.00s/it, batch_loss=0.8875]

  Eval (should we go...): Devrions-nous aller la maison ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?
  -------------------------------------------


Epoch 2 Training:  35%|███▍      | 1600/4621 [15:02<31:18,  1.61it/s, batch_loss=0.9212]  


  Batch 1600 Loss: 0.9212


Epoch 2 Training:  35%|███▍      | 1601/4621 [15:07<1:45:49,  2.10s/it, batch_loss=0.9212]

  Eval (should we go...): Devrait-il le temps la maison ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?
  -------------------------------------------


Epoch 2 Training:  39%|███▉      | 1800/4621 [17:01<25:27,  1.85it/s, batch_loss=0.9322]  


  Batch 1800 Loss: 0.9322


Epoch 2 Training:  39%|███▉      | 1801/4621 [17:05<1:31:23,  1.94s/it, batch_loss=0.9322]

  Eval (should we go...): Devrions-nous le plus tard ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? 
  -------------------------------------------


Epoch 2 Training:  43%|████▎     | 2000/4621 [18:59<25:03,  1.74it/s, batch_loss=0.9269]  


  Batch 2000 Loss: 0.9269


Epoch 2 Training:  43%|████▎     | 2001/4621 [19:04<1:26:40,  1.98s/it, batch_loss=0.9269]

  Eval (should we go...): Diter la maison de la maison ? ? ? ? ? ? ? ? ? ? ? ? ?
  -------------------------------------------


Epoch 2 Training:  48%|████▊     | 2200/4621 [20:48<24:12,  1.67it/s, batch_loss=0.8929]  


  Batch 2200 Loss: 0.8929


Epoch 2 Training:  48%|████▊     | 2201/4621 [20:53<1:19:49,  1.98s/it, batch_loss=0.8929]

  Eval (should we go...): Devrai-moi le perdre la maison ? ? soit ? ? ? ?out ? ?out ? ? ?out ? ? ?outout ? 
  -------------------------------------------


Epoch 2 Training:  52%|█████▏    | 2400/4621 [22:33<17:53,  2.07it/s, batch_loss=0.8656]  


  Batch 2400 Loss: 0.8656


Epoch 2 Training:  52%|█████▏    | 2401/4621 [22:38<1:06:54,  1.81s/it, batch_loss=0.8656]

  Eval (should we go...): Devrions-nous aller la maison ? ? ? ? ? ? ? ?out ? ? ?ou ? ? ? ?
  -------------------------------------------


Epoch 2 Training:  56%|█████▋    | 2600/4621 [24:25<16:59,  1.98it/s, batch_loss=0.7910]  


  Batch 2600 Loss: 0.7910


Epoch 2 Training:  56%|█████▋    | 2601/4621 [24:30<1:09:00,  2.05s/it, batch_loss=0.7910]

  Eval (should we go...): De nous allerons la vie ? ? ? ? ? ? ?
  -------------------------------------------


Epoch 2 Training:  61%|██████    | 2800/4621 [26:16<14:27,  2.10it/s, batch_loss=0.8722]  


  Batch 2800 Loss: 0.8722


Epoch 2 Training:  61%|██████    | 2801/4621 [26:21<59:15,  1.95s/it, batch_loss=0.8722]

  Eval (should we go...): Serait-il de la maison ? ? ? ? ? ? ?out ? ?oute ?oute ?oute ?
  -------------------------------------------


Epoch 2 Training:  65%|██████▍   | 3000/4621 [28:10<14:19,  1.89it/s, batch_loss=0.7846]


  Batch 3000 Loss: 0.7846


Epoch 2 Training:  65%|██████▍   | 3001/4621 [28:15<53:24,  1.98s/it, batch_loss=0.7846]

  Eval (should we go...): Dites-moi de la maison ? ? ? ? ? ? ? ? ?out ? ? ?ou ? ? ?ou ? ? ?ou ? ? 
  -------------------------------------------


Epoch 2 Training:  69%|██████▉   | 3200/4621 [29:58<11:52,  1.99it/s, batch_loss=0.7790]


  Batch 3200 Loss: 0.7790


Epoch 2 Training:  69%|██████▉   | 3201/4621 [30:02<43:30,  1.84s/it, batch_loss=0.7790]

  Eval (should we go...): Devrais-nous aller la maison ? ? ? ? ?ous ?
  -------------------------------------------


Epoch 2 Training:  74%|███████▎  | 3400/4621 [31:56<11:39,  1.75it/s, batch_loss=0.7898]


  Batch 3400 Loss: 0.7898


Epoch 2 Training:  74%|███████▎  | 3401/4621 [32:01<41:32,  2.04s/it, batch_loss=0.7898]

  Eval (should we go...): Devrait-il me trouver la même ? ? ?e ?outrier ?ous ?ouirir
  -------------------------------------------


Epoch 2 Training:  78%|███████▊  | 3600/4621 [33:45<08:42,  1.96it/s, batch_loss=0.7506]


  Batch 3600 Loss: 0.7506


Epoch 2 Training:  78%|███████▊  | 3601/4621 [33:49<32:07,  1.89s/it, batch_loss=0.7506]

  Eval (should we go...): Serait-on de la maison ? de la maison ? de la taille ?outer ?ondrire ?ondririre ?s ?
  -------------------------------------------


Epoch 2 Training:  82%|████████▏ | 3800/4621 [35:32<06:31,  2.10it/s, batch_loss=0.7837]


  Batch 3800 Loss: 0.7837


Epoch 2 Training:  82%|████████▏ | 3801/4621 [35:36<25:16,  1.85s/it, batch_loss=0.7837]

  Eval (should we go...): Sera-t-on de la même ? ? ? ? ? ? ? ?
  -------------------------------------------


Epoch 2 Training:  87%|████████▋ | 4000/4621 [37:19<04:33,  2.27it/s, batch_loss=0.7616]


  Batch 4000 Loss: 0.7616


Epoch 2 Training:  87%|████████▋ | 4001/4621 [37:24<19:02,  1.84s/it, batch_loss=0.7616]

  Eval (should we go...): Cela nous avons-nous rencontrer la maison ? ? ? ?out ? ? ? ?as ?oute ?
  -------------------------------------------


Epoch 2 Training:  91%|█████████ | 4200/4621 [39:05<03:48,  1.85it/s, batch_loss=0.6382]


  Batch 4200 Loss: 0.6382


Epoch 2 Training:  91%|█████████ | 4201/4621 [39:10<12:46,  1.82s/it, batch_loss=0.6382]

  Eval (should we go...): Devrions-nous aller le même ? ? ? ? ? ? ? tout perdrire ?u ?ontrert ? ? ?
  -------------------------------------------


Epoch 2 Training:  95%|█████████▌| 4400/4621 [40:55<01:44,  2.11it/s, batch_loss=0.7308]


  Batch 4400 Loss: 0.7308


Epoch 2 Training:  95%|█████████▌| 4401/4621 [41:00<06:48,  1.86s/it, batch_loss=0.7308]

  Eval (should we go...): Devrait-il aller la malle ? ? ? ? ?
  -------------------------------------------


Epoch 2 Training: 100%|█████████▉| 4600/4621 [42:50<00:09,  2.24it/s, batch_loss=0.7967]


  Batch 4600 Loss: 0.7967


Epoch 2 Training: 100%|█████████▉| 4601/4621 [42:55<00:39,  1.98s/it, batch_loss=0.7967]

  Eval (should we go...): Devrions-nous aller le mal ? ? ? ? ? ?
  -------------------------------------------


Epoch 2 Training: 100%|██████████| 4621/4621 [43:07<00:00,  1.79it/s, batch_loss=0.7895]
Validation: 100%|██████████| 513/513 [01:28<00:00,  5.78it/s, val_loss=0.646]

Epoch 2 Summary: Avg Train Loss: 0.8604, Avg Val Loss: 0.6674





In [64]:
transformer.eval()
def translate(eng_sentence_text): 
    """Translates an English sentence to French using greedy decoding."""
    with torch.no_grad(): # Disable gradients during inference
      
        input_tokenized = [START_TOKEN] + tokenize(eng_sentence_text, english_vocabulary) + [END_TOKEN] # Tokenize input sentence
        input_numericalized = [en_to_index[token] for token in input_tokenized] # Numericalize input tokens
        input_tensor = torch.tensor(input_numericalized).unsqueeze(0).to(device) # Create tensor, add batch dimension, move to device

        # Greedy Decode 
        output_indices = transformer.greedy_decode(
            input_tensor, max_len=max_sequence_length, start_symbol=fr_to_index[START_TOKEN]
        )[0].cpu().numpy() 

        # 3. Convert Output Indices to French Sentence
        predicted_sentence_chars = []
        for idx in output_indices:
            if idx == fr_to_index[END_TOKEN]: 
                break
            predicted_sentence_chars.append(index_to_fr[idx]) 

        predicted_sentence = "".join(predicted_sentence_chars) 
        return predicted_sentence



In [65]:
english_input = "what should we do when the day starts?"
french_translation = translate(english_input)
print(f"English Input: {english_input}")
print(f"French Translation: {french_translation}")

English Input: what should we do when the day starts?
French Translation: <SOS>Ce que devrais-nous faire le dire ?


In [66]:
english_input = "what do you think about the book?"
french_translation = translate(english_input)
print(f"English Input: {english_input}")
print(f"French Translation: {french_translation}")

English Input: what do you think about the book?
French Translation: <SOS>Ce que vous pensez-vous ?


In [67]:
english_input = "where are you going?"
french_translation = translate(english_input)
print(f"English Input: {english_input}")
print(f"French Translation: {french_translation}")

English Input: where are you going?
French Translation: <SOS>Où vous avez en train de matin ?


In [68]:
english_input = "I want a new book."
french_translation = translate(english_input)
print(f"English Input: {english_input}")
print(f"French Translation: {french_translation}")

English Input: I want a new book.
French Translation: <SOS>Je veux un livre un livre.
