In [1]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import torch
import torch.optim as optim
import torch.nn as nn
from utils import load_checkpoint, save_checkpoint, translate_sentence
import random
import string
import re
import numpy as np
import tensorflow as tf
from Transformers_from_scratch import Transformer
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [83]:
# NOTE(review): `path` is not referenced by any visible cell -- confirm before removing.
path = "english-french-data"
file_path = "eng-french-data/eng-french.csv"

# Load the parallel English/French corpus.
df = pd.read_csv(file_path)

# Derive the two columns the rest of the notebook works with and drop the raw ones:
#   - `source`: the English sentence, unchanged;
#   - `target`: the French sentence wrapped in an initial "seed" token ([start])
#     and a stop token ([end]).
df = df.assign(
    source=df['English words/sentences'],
    target=df['French words/sentences'].apply(lambda sentence: '[start] ' + sentence + ' [end]'),
).drop(columns=['English words/sentences', 'French words/sentences'])

# Peek at a handful of random rows.
df.sample(5)

Unnamed: 0,source,target
92314,The cat is watching the fish.,[start] Le chat regarde le poisson. [end]
8659,It's well done.,[start] C'est bien fait. [end]
51270,We need some help here.,[start] Nous avons besoin d'aide ici. [end]
129984,I'd like to cash a travelers' check.,[start] J'aimerais encaisser un chèque de voya...
33317,Are you freaking out?,[start] Vous avez les foies ? [end]


In [84]:
# Sanity check: (number of sentence pairs, number of columns) of the prepared frame.
df.shape

(175621, 2)

In [85]:
# Fixed seed: the subsample below decides which sentences end up in the train,
# validation and test splits, and the vocabularies are later adapted on the train
# split. Without a seed every kernel restart produces a different split and a
# different token <-> index mapping, so a previously saved checkpoint no longer
# matches the data and sentences seen during training can leak into the test split.
SPLIT_SEED = 42

# Keep a random 50% subsample of the corpus, in shuffled order.
# NOTE: this reassigns `df`; re-run the loading cell before re-running this one,
# otherwise the data is halved again.
df = df.sample(frac=0.5, random_state=SPLIT_SEED).reset_index(drop=True)

# Split the data into train (70%), validation (20%) and test (the remainder) sets.
train_size = int(len(df) * 0.7)
val_size = int(len(df) * 0.2)
# The test split takes every remaining row, so derive its size from the other two
# instead of rounding 10% independently (which can be off by a row or two).
test_size = len(df) - train_size - val_size

train_df = df[:train_size]
val_df = df[train_size:train_size+val_size]
test_df = df[train_size+val_size:]

#### Standardizing, tokenizing and data indexing

- First, we need to parse our raw text data and vectorize it. To keep things simple, we will first limit our vocabulary using the max_tokens parameter. We will also limit the length of each sentence using the sequence_length parameter.

- Each sentence will be standardized, tokenized by word, and then indexed by token.

- This will result in a batch of token-index vectors, stored in a 2D matrix of shape `(batch_size, sequence_length)`.

In [86]:
max_tokens = 25000
sequence_length = 30

# Characters removed from the target text: every ASCII punctuation mark except
# "[" and "]", which must survive so we can tell apart "start" from "[start]".
strip_chars = "".join(char for char in string.punctuation if char not in "[]")


def custom_standardization(input_string):
    """Lowercase ``input_string`` and delete every character listed in ``strip_chars``."""
    punctuation_class = f"[{re.escape(strip_chars)}]"
    lowered = tf.strings.lower(input_string)
    return tf.strings.regex_replace(lowered, punctuation_class, "")


# Source (English) side: no `standardize` argument is passed, so the layer's
# default standardization applies here, not `custom_standardization`.
source_vectorization = tf.keras.layers.TextVectorization(
    max_tokens=max_tokens,
    output_mode="int",
    output_sequence_length=sequence_length,
)

# Target (French) side: uses the custom standardization so the bracketed marker
# tokens are kept. One extra position is reserved because the target sentences
# are shifted right by 1 during training.
target_vectorization = tf.keras.layers.TextVectorization(
    max_tokens=max_tokens,
    output_mode="int",
    output_sequence_length=sequence_length + 1,
    standardize=custom_standardization,
)

# Build both vocabularies from the training split only.
train_source_texts = train_df['source'].values
source_vectorization.adapt(train_source_texts)

train_target_texts = train_df['target'].values
target_vectorization.adapt(train_target_texts)

In [88]:
# Vocabulary sizes learned by `adapt`: the target size is printed first,
# the source size is shown as the cell's value.
print(len(target_vectorization.get_vocabulary()))
len(source_vectorization.get_vocabulary())

20753


10134

In [89]:
# Display a random sample before and after vectorization just to test the vectorization.
# random.randrange excludes the upper bound; random.randint(0, n) can return n itself,
# which is one past the last valid index and raises IndexError.
random_sample = random.randrange(len(train_df))

# Vectorize the sample once and reuse the result below.
source_sample_vector = source_vectorization(train_source_texts[random_sample])
target_sample_vector = target_vectorization(train_target_texts[random_sample])

print("Source texts (one random sample):", train_source_texts[random_sample])
print("Target texts (one random sample):", train_target_texts[random_sample])
print("Source vectors (one random sample):", source_sample_vector)
print("Target vectors (one random sample):", target_sample_vector)

# Decode the vectors back to text (index -> token) to confirm the round trip.
# Each vocabulary list is fetched once instead of once per token.
source_vocabulary = source_vectorization.get_vocabulary()
source_decoded_text = ''.join(
    source_vocabulary[token_id] + ' ' for token_id in source_sample_vector.numpy()
)
print("Source decoded texts (one random sample):", source_decoded_text)

target_vocabulary = target_vectorization.get_vocabulary()
target_decoded_text = ''.join(
    target_vocabulary[token_id] + ' ' for token_id in target_sample_vector.numpy()
)
print("Target decoded texts (one random sample):", target_decoded_text)

Source texts (one random sample): He speaks really well.
Target texts (one random sample): [start] Il parle vraiment bien. [end]
Source vectors (one random sample): tf.Tensor(
[ 10 697  80 110   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0], shape=(30,), dtype=int64)
Target vectors (one random sample): tf.Tensor(
[  2  13 257  77  76   3   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0], shape=(31,), dtype=int64)
Source decoded texts (one random sample): he speaks really well                           
Target decoded texts (one random sample): [start] il parle vraiment bien [end]                          


In [90]:
# Vectorize the whole training split and hand it to PyTorch, reporting the
# resulting tensor shape for each side.

# Source side: one row of token ids per English sentence.
train_source_vectors = source_vectorization(train_source_texts)
src = torch.from_numpy(train_source_vectors.numpy())
print("Source vectors (shape):", src.shape)

# Target side: one row of token ids per French sentence.
train_target_vectors = target_vectorization(train_target_texts)
target = torch.from_numpy(train_target_vectors.numpy())
print("Target vectors (shape):", target.shape)

Source vectors (shape): torch.Size([61466, 30])
Target vectors (shape): torch.Size([61466, 31])


#### The model

In [91]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Both sides use id 0 for padding.
src_pad_idx = target_pad_idx = 0

# Vocabulary sizes, read from the adapted vectorization layers.
target_vocab_size = len(target_vectorization.get_vocabulary())
src_vocab_size = len(source_vectorization.get_vocabulary())

In [101]:
# Instantiate the project's Transformer with the vocabulary sizes and padding
# ids computed above, then place it on the selected device.
model = Transformer(
    src_voc_size=src_vocab_size,
    target_voc_size=target_vocab_size,
    src_pad_idx=src_pad_idx,
    target_pad_idx=target_pad_idx,
    embed_size=256,
    num_layers=3,
    forward_expansion=4,
    heads=4,
    dropout=0,
    device=device,
    max_length=sequence_length,
)
model = model.to(device)
print(model)

Transformer(
  (encoder): Encoder(
    (word_embedding): Embedding(10134, 256)
    (position_embedding): Embedding(30, 256)
    (layers): ModuleList(
      (0): TransformerBlock(
        (attention): SelfAttention(
          (values): Linear(in_features=64, out_features=64, bias=False)
          (keys): Linear(in_features=64, out_features=64, bias=False)
          (queries): Linear(in_features=64, out_features=64, bias=False)
          (fc_out): Linear(in_features=256, out_features=256, bias=True)
        )
        (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (feed_forward): Sequential(
          (0): Linear(in_features=256, out_features=1024, bias=True)
          (1): ReLU()
          (2): Linear(in_features=1024, out_features=256, bias=True)
        )
        (dropout): Dropout(p=0, inplace=False)
      )
      (1): TransformerBlock(
        (attention): SelfAttention(
          (values):

#### The DataLoader

In [102]:
# Mini-batch size and number of DataLoader worker processes.
batch_size = 64
num_workers = 6

# Build the dataset from the vectorized tensors. Each item is a triple:
#   1st: encoder inputs
#   2nd: decoder inputs
#   3rd: decoder outputs (the model target)
# The full target sequence is stored for both the 2nd and 3rd entries; the
# training loop drops the last token of the decoder inputs and the first token
# of the decoder outputs so the two stay the same length, shifted by 1.
dataset = TensorDataset(src, target, target)

# Build the DataLoader from the dataset.
train_loader = DataLoader(
    dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=num_workers,
)


#### The training function

In [103]:
# Upper bound on the number of passes over the training set.
num_epochs = 50
# Initial learning rate handed to the optimizer.
learning_rate = 1e-3
# When True, the training cell restores model/optimizer state from a checkpoint before training.
load_model = True
# When True, the training cell writes a checkpoint at the start of every epoch.
save_model = True

In [104]:
# The loss is computed against target-side token ids, so the padding positions to
# skip are those of the *target* vocabulary. The source pad index only happened to
# work because both pad indices are currently equal.
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=target_pad_idx)
optimizer = optim.RMSprop(model.parameters(), lr=learning_rate)

In [105]:
# Pick one training pair to translate after every epoch as a qualitative progress check.
# random.randrange excludes the upper bound; random.randint(0, n) can return n itself,
# which is one past the last valid index and raises IndexError.
random_sample = random.randrange(len(train_df))
sentence = train_source_texts[random_sample]
translation = train_target_texts[random_sample]
print("Source texts (one random sample):", sentence)
print("Target texts (one random sample):", translation)

Source texts (one random sample): I'm going to be your teacher.
Target texts (one random sample): [start] Je vais être votre professeur. [end]


In [97]:
def translate_sentence_(input_sentence, model, source_vectorization, target_vectorization, sequence_length):
    """Greedily decode a translation of ``input_sentence``.

    Starting from "[start]", the sentence decoded so far is fed back to the
    model and the highest-scoring token at the current position is appended,
    until "[end]" is produced or ``sequence_length`` tokens have been generated.

    Args:
        input_sentence: Raw source sentence (a string).
        model: ``torch.nn.Module`` called as ``model(source_ids, target_ids)``;
            its output is indexed as (batch, position, vocabulary).
        source_vectorization: Callable mapping a list of strings to an object
            whose ``.numpy()`` yields the source token ids.
        target_vectorization: Same for the target side; must also provide
            ``get_vocabulary()`` (index -> token list).
        sequence_length: Maximum number of tokens to generate.

    Returns:
        The decoded sentence as a string, beginning with "[start]".
    """
    # Fetch the index -> token table once instead of at every decoding step.
    target_vocabulary = target_vectorization.get_vocabulary()

    # Run on whatever device the model lives on (CPU if it has no parameters).
    first_parameter = next(model.parameters(), None)
    model_device = first_parameter.device if first_parameter is not None else torch.device("cpu")

    tokenized_input_sentence = torch.tensor(source_vectorization([input_sentence]).numpy()).to(model_device)
    decoded_sentence = "[start]"

    # Inference: switch to eval mode and disable autograd, then restore the
    # caller's train/eval mode so this can be called in the middle of training.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for i in range(sequence_length):
                # Drop the last position so the decoder input has the length the model expects.
                tokenized_target_sentence = torch.tensor(
                    target_vectorization([decoded_sentence])[:, :-1].numpy()
                ).to(model_device)
                output = model(tokenized_input_sentence, tokenized_target_sentence)
                # Greedy choice: most likely token at the position being generated.
                sampled_token_index = output.argmax(2)[0, i].item()
                sampled_token = target_vocabulary[sampled_token_index]
                decoded_sentence += " " + sampled_token
                if sampled_token == "[end]":
                    break
    finally:
        model.train(was_training)
    return decoded_sentence


In [106]:
# Divide the learning rate by 10 when the mean training loss has not improved for 5 epochs.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, factor=0.1, patience=5, verbose=True
)

checkpoint_path = "my_checkpoint.pth.tar"

# Early-stopping state. `last_loss` must live outside the epoch loop: when it is
# reset at the top of every epoch, the comparison below is always made against
# the reset value and early stopping can never trigger.
trigger_time = 0
patience = 3
last_loss = float("inf")

if load_model:
    if os.path.isfile(checkpoint_path):
        # map_location lets a checkpoint saved on one device be restored on another.
        load_checkpoint(torch.load(checkpoint_path, map_location=device), model, optimizer)
    else:
        # A fresh run has nothing to resume from; say so instead of crashing.
        print(f"=> No checkpoint found at '{checkpoint_path}', training from scratch")

# Translation of the monitoring sentence before any training in this session.
print(translate_sentence_(sentence, model, source_vectorization, target_vectorization, sequence_length))

for epoch in range(num_epochs):
    # Re-enter training mode every epoch in case the translation helper left
    # the model in eval mode.
    model.train()
    print(f"[Epoch {epoch} / {num_epochs}]")

    # save the model at each epoch, to prevent losing all parameters for early kernel stopping
    if save_model:
        checkpoint = {
            "state_dict": model.state_dict(),
            "optimizer": optimizer.state_dict()
        }
        save_checkpoint(checkpoint)

    losses = []

    for x_enc, x_dec, y in tqdm(train_loader):
        # The DataLoader yields CPU tensors; move them to the model's device.
        x_enc = x_enc.to(device)
        x_dec = x_dec.to(device)
        y = y.to(device)

        optimizer.zero_grad()
        # Forward prop: the decoder sees the target without its last token ...
        output = model(x_enc, x_dec[:, :-1])

        # ... and is scored against the target without its first token.
        # Flatten both so CrossEntropyLoss gets (N, vocab) logits and (N,) labels.
        output = output.reshape(-1, output.shape[2])
        y = y[:, 1:].reshape(-1)

        loss = loss_fn(output, y)
        losses.append(loss.item())

        # Back prop
        loss.backward()

        # Clip to avoid exploding gradient issues, makes sure grads are
        # within a healthy range
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

        # Gradient descent step
        optimizer.step()

    mean_loss = sum(losses) / len(losses)
    print("mean_loss :", mean_loss)
    # Report the rate the optimizer actually used this epoch; the `learning_rate`
    # constant goes stale as soon as the scheduler lowers it.
    print("learning_rate :", optimizer.param_groups[0]["lr"])
    # NOTE(review): this calls `translate_sentence` imported from utils, while the
    # call before the loop uses the notebook's own `translate_sentence_` -- confirm
    # the two are meant to be interchangeable.
    print(translate_sentence(sentence, model, source_vectorization, target_vectorization, sequence_length))
    scheduler.step(mean_loss)

    # early stopping: stop once the loss has risen for more than `patience` consecutive epochs
    if mean_loss > last_loss:
        trigger_time += 1
        if trigger_time > patience:
            break
    else:
        trigger_time = 0

    last_loss = mean_loss # update the last loss

=> Loading checkpoint
[start] je ne suis pas sûr que cest ton idée [end]
[Epoch 0 / 50]
=> Saving checkpoint


100%|██████████| 961/961 [26:46<00:00,  1.67s/it]


mean_loss : 3.321492270301954
learning_rate : 0.001
[start] je suis très heureux de vous avoir en colère [end]
[Epoch 1 / 50]
=> Saving checkpoint


100%|██████████| 961/961 [30:36<00:00,  1.91s/it]


mean_loss : 3.0649833681680163
learning_rate : 0.001
[start] je suis très heureux de ta part [end]
[Epoch 2 / 50]
=> Saving checkpoint


100%|██████████| 961/961 [29:44<00:00,  1.86s/it]


mean_loss : 2.845444804795947
learning_rate : 0.001
[start] je suis très heureux de votre avocat [end]
[Epoch 3 / 50]
=> Saving checkpoint


100%|██████████| 961/961 [27:45<00:00,  1.73s/it]


mean_loss : 2.646230079132858
learning_rate : 0.001
[start] je suis très heureux de votre assistance [end]
[Epoch 4 / 50]
=> Saving checkpoint


100%|██████████| 961/961 [28:28<00:00,  1.78s/it]


mean_loss : 2.4600819628394976
learning_rate : 0.001
[start] je vais avoir ton argent [end]
[Epoch 5 / 50]
=> Saving checkpoint


100%|██████████| 961/961 [28:21<00:00,  1.77s/it]


mean_loss : 2.287953255451929
learning_rate : 0.001
[start] je vais vous montrer une autre chance [end]
[Epoch 6 / 50]
=> Saving checkpoint


100%|██████████| 961/961 [32:01<00:00,  2.00s/it]


mean_loss : 2.1300605005825965
learning_rate : 0.001
[start] je vais avoir besoin de ton aide [end]
[Epoch 7 / 50]
=> Saving checkpoint


100%|██████████| 961/961 [28:53<00:00,  1.80s/it]


mean_loss : 1.9905434906420674
learning_rate : 0.001
[start] je vais le faire par ton voyage [end]
[Epoch 8 / 50]
=> Saving checkpoint


100%|██████████| 961/961 [29:00<00:00,  1.81s/it]


mean_loss : 1.8585658893376806
learning_rate : 0.001
[start] je vais vous payer à vos côtés [end]
[Epoch 9 / 50]
=> Saving checkpoint


100%|██████████| 961/961 [28:40<00:00,  1.79s/it]


mean_loss : 1.7447934852802542
learning_rate : 0.001
[start] je vais arrêter de faire ton travail [end]
[Epoch 10 / 50]
=> Saving checkpoint


100%|██████████| 961/961 [28:33<00:00,  1.78s/it]


mean_loss : 1.638229392941859
learning_rate : 0.001
[start] je vais être très honnête avec votre famille [end]
[Epoch 11 / 50]
=> Saving checkpoint


100%|██████████| 961/961 [29:11<00:00,  1.82s/it]


mean_loss : 1.548901650957709
learning_rate : 0.001
[start] je vais être très honnête [end]
[Epoch 12 / 50]
=> Saving checkpoint


100%|██████████| 961/961 [29:25<00:00,  1.84s/it]


mean_loss : 1.4735733288011739
learning_rate : 0.001
[start] je vais être ton professeur [end]
[Epoch 13 / 50]
=> Saving checkpoint


100%|██████████| 961/961 [29:27<00:00,  1.84s/it]


mean_loss : 1.4050562525639052
learning_rate : 0.001
[start] je vais te faire du mal [end]
[Epoch 14 / 50]
=> Saving checkpoint


100%|██████████| 961/961 [29:33<00:00,  1.85s/it]


mean_loss : 1.3462787898224424
learning_rate : 0.001
[start] je vais être votre institutrice [end]
[Epoch 15 / 50]
=> Saving checkpoint


100%|██████████| 961/961 [29:25<00:00,  1.84s/it]


mean_loss : 1.2950189976364714
learning_rate : 0.001
[start] je vais résoudre le problème [end]
[Epoch 16 / 50]
=> Saving checkpoint


100%|██████████| 961/961 [29:24<00:00,  1.84s/it]


mean_loss : 1.250757617149095
learning_rate : 0.001
[start] je vais aller chercher ton travail [end]
[Epoch 17 / 50]
=> Saving checkpoint


100%|██████████| 961/961 [29:20<00:00,  1.83s/it]


mean_loss : 1.2068704744407464
learning_rate : 0.001
[start] je vais être votre ami [end]
[Epoch 18 / 50]
=> Saving checkpoint


100%|██████████| 961/961 [29:26<00:00,  1.84s/it]


mean_loss : 1.1696308818369576
learning_rate : 0.001
[start] je vais aller à la figure [end]
[Epoch 19 / 50]
=> Saving checkpoint


100%|██████████| 961/961 [28:57<00:00,  1.81s/it]


mean_loss : 1.135838264047045
learning_rate : 0.001
[start] je vais y aller en votre présence [end]
[Epoch 20 / 50]
=> Saving checkpoint


100%|██████████| 961/961 [29:36<00:00,  1.85s/it]


mean_loss : 1.1021647711946367
learning_rate : 0.001
[start] je vais accepter votre offre [end]
[Epoch 21 / 50]
=> Saving checkpoint


100%|██████████| 961/961 [31:24<00:00,  1.96s/it]


mean_loss : 1.0727008858406828
learning_rate : 0.001
[start] je vais aller te chercher [end]
[Epoch 22 / 50]
=> Saving checkpoint


100%|██████████| 961/961 [29:20<00:00,  1.83s/it]


mean_loss : 1.0450327171867522
learning_rate : 0.001
[start] je vais aller chercher votre manteau [end]
[Epoch 23 / 50]
=> Saving checkpoint


 24%|██▍       | 233/961 [07:18<22:49,  1.88s/it]


KeyboardInterrupt: 

#### Translate sentence

In [107]:
# Restore model and optimizer state from the checkpoint file written during training.
load_checkpoint(torch.load("my_checkpoint.pth.tar"), model, optimizer)

=> Loading checkpoint


In [108]:
# Raw sentence pairs of the held-out test split, as two arrays aligned by position.
test_source_texts, test_target_texts = (
    test_df[column].values for column in ('source', 'target')
)

In [112]:
# Qualitative check: translate 10 randomly chosen test sentences and print each
# next to its reference translation.
# This is inference only, so put the model in eval mode and disable autograd;
# otherwise every decoding step would build a graph that is never used.
model.eval()

random_sample = np.random.randint(0, len(test_df), 10)
with torch.no_grad():
    for k, i in enumerate(random_sample):
        sentence = test_source_texts[i]
        model_translation = translate_sentence(sentence, model, source_vectorization, target_vectorization, sequence_length)
        real_translation = test_target_texts[i]
        print(f"Sentence {k}")
        print("Source texts :", sentence)
        print("Model translation :", model_translation)
        print("Target texts :", real_translation)
        print("\n")

Sentence 0
Source texts : He gave me his word.
Model translation : [start] il ma donné une montre ses devoirs [end]
Target texts : [start] Il m'a donné sa parole. [end]


Sentence 1
Source texts : I already told you that a hundred times.
Model translation : [start] je vous ai dit que ce nétait pas un endroit [end]
Target texts : [start] Je vous ai déjà dit cela cent fois. [end]


Sentence 2
Source texts : He took a coin out of his pocket.
Model translation : [start] il a pris un taxi à boston en vitesse [end]
Target texts : [start] Il sortit une pièce de sa poche. [end]


Sentence 3
Source texts : The water from this fountain is safe to drink.
Model translation : [start] la nourriture depuis que la vie est vraiment mary [end]
Target texts : [start] L'eau de cette fontaine est potable. [end]


Sentence 4
Source texts : I think I'll be busy this week.
Model translation : [start] je pense que je pourrais jouer avec elle [end]
Target texts : [start] Je pense que je vais être occupée cette 

=> Loading checkpoint
