In [None]:
import os
import requests
from tqdm import tqdm
from bs4 import BeautifulSoup

# URL to the directory containing the files to be downloaded
language = "en-es"
url = f"https://data.statmt.org/opus-100-corpus/v1.0/supervised/{language}/"
save_directory = f"../data/{language}"

# Create the save directory if it doesn't exist
os.makedirs(save_directory, exist_ok=True)

# Send a GET request to the URL
response = requests.get(url)

# Parse the HTML response
soup = BeautifulSoup(response.content, 'html.parser')

# Find all the anchor tags in the HTML
links = soup.find_all('a')

# Extract the href attribute from each anchor tag
file_links = [link['href'] for link in links if '.' in link['href']]

# Download each file
for file_link in tqdm(file_links):
    file_url = url + file_link
    save_path = os.path.join(save_directory, file_link)
    
    print(f"Downloading {file_url}")
    
    # Send a GET request for the file
    file_response = requests.get(file_url)
    if file_response.status_code == 404:
        print(f"Could not download {file_url}")
        continue
    
    # Save the file to the specified directory
    with open(save_path, 'wb') as file:
        file.write(file_response.content)
    
    print(f"Saved {file_link}")

print("All files have been downloaded.")

In [None]:
en_training_data_path = "../data/en-es/opus.en-es-train.en"
en_validation_data_path = "../data/en-es/opus.en-es-dev.en"
es_training_data_path = "../data/en-es/opus.en-es-train.es"
es_validation_data_path = "../data/en-es/opus.en-es-dev.es"

def read_files(path):
    with open(path, "r", encoding="utf-8") as f:
        en_train_dataset = f.read().split("\n")[:-1]
    return en_train_dataset

en_training_data = read_files(en_training_data_path)
en_validation_data = read_files(en_validation_data_path)
es_training_data = read_files(es_training_data_path)
es_validation_data = read_files(es_validation_data_path)

max_length = 500
train_dataset = [[es_sentence, en_sentence] for es_sentence, en_sentence in zip(es_training_data, en_training_data) if len(es_sentence) <= max_length and len(en_sentence) <= max_length]
val_dataset = [[es_sentence, en_sentence] for es_sentence, en_sentence in zip(es_validation_data, en_validation_data) if len(es_sentence) <= max_length and len(en_sentence) <= max_length]
es_training_data, en_training_data = zip(*train_dataset)
es_validation_data, en_validation_data = zip(*val_dataset)

print(len(es_training_data))
print(len(es_validation_data))
print(es_training_data[:3])
print(en_training_data[:3])

In [None]:
en_training_data = en_training_data[:100]
es_training_data = es_training_data[:100]

In [None]:
from custom_tokenizer import CustomTokenizer

# prepare Spanish tokenizer, this is the input language
tokenizer = CustomTokenizer(char_level=True)
tokenizer.fit_on_texts(es_training_data)
tokenizer.save("tokenizer.json")

# prepare English tokenizer, this is the output language
detokenizer = CustomTokenizer(char_level=True)
detokenizer.fit_on_texts(en_training_data)
detokenizer.save("detokenizer.json")

In [43]:
tokenized_sentence = detokenizer.texts_to_sequences(["Hello world, how are you?"])[0]
print(tokenized_sentence)

detokenized_sentence = detokenizer.detokenize([tokenized_sentence], remove_start_end=False)
print(detokenized_sentence)

detokenized_sentence = detokenizer.detokenize([tokenized_sentence])
print(detokenized_sentence)

[33, 51, 48, 55, 55, 58, 3, 66, 58, 61, 55, 47, 15, 3, 51, 58, 66, 3, 44, 61, 48, 3, 68, 58, 64, 36, 32]
['<start>hello world, how are you?<eos>']
['hello world, how are you?']


In [None]:
from mltu.tensorflow.dataProvider import DataProvider
import numpy as np


def preprocess_inputs(data_batch, label_batch):
    encoder_input = np.zeros((len(data_batch), tokenizer.max_length)).astype(np.int64)
    decoder_input = np.zeros((len(label_batch), detokenizer.max_length)).astype(
        np.int64
    )
    decoder_output = np.zeros((len(label_batch), detokenizer.max_length)).astype(
        np.int64
    )

    data_batch_tokens = tokenizer.texts_to_sequences(data_batch)
    label_batch_tokens = detokenizer.texts_to_sequences(label_batch)

    for index, (data, label) in enumerate(zip(data_batch_tokens, label_batch_tokens)):
        encoder_input[index][: len(data)] = data
        decoder_input[index][: len(label) - 1] = label[:-1]  # Drop the [END] tokens
        decoder_output[index][: len(label) - 1] = label[1:]  # Drop the [START] tokens

    return (encoder_input, decoder_input), decoder_output


train_dataProvider = DataProvider(
    train_dataset,
    batch_size=4,
    batch_postprocessors=[preprocess_inputs],
    use_cache=True,
)

val_dataProvider = DataProvider(
    val_dataset, batch_size=4, batch_postprocessors=[preprocess_inputs], use_cache=True
)

In [None]:
for data_batch in train_dataProvider:
    (encoder_inputs, decoder_inputs), decoder_outputs = data_batch

    encoder_inputs_str = tokenizer.detokenize(encoder_inputs)
    decoder_inputs_str = detokenizer.detokenize(decoder_inputs, remove_start_end=False)
    decoder_outputs_str = detokenizer.detokenize(
        decoder_outputs, remove_start_end=False
    )
    print(encoder_inputs_str)
    print(decoder_inputs_str)
    print(decoder_outputs_str)

    break

In [39]:
from model import Transformer
from configs import ModelConfigs
from custom_tokenizer import CustomTokenizer

import numpy as np

import tensorflow as tf

try:
    [
        tf.config.experimental.set_memory_growth(gpu, True)
        for gpu in tf.config.experimental.list_physical_devices("GPU")
    ]
except:
    pass

from keras.callbacks import (
    EarlyStopping,
    ModelCheckpoint,
    ReduceLROnPlateau,
    TensorBoard,
)
from mltu.tensorflow.callbacks import Model2onnx, WarmupCosineDecay

from mltu.tensorflow.dataProvider import DataProvider
from mltu.tokenizers import CustomTokenizer

from mltu.tensorflow.transformer.utils import MaskedAccuracy, MaskedLoss
from mltu.tensorflow.transformer.callbacks import EncDecSplitCallback

configs = ModelConfigs()

# Path to dataset
en_training_data_path = "../data/en-es/opus.en-es-train.en"
en_validation_data_path = "../data/en-es/opus.en-es-dev.en"
es_training_data_path = "../data/en-es/opus.en-es-train.es"
es_validation_data_path = "../data/en-es/opus.en-es-dev.es"


def read_files(path):
    with open(path, "r", encoding="utf-8") as f:
        en_train_dataset = f.read().split("\n")[:-1]
    return en_train_dataset


en_training_data = read_files(en_training_data_path)[:200]
en_validation_data = read_files(en_validation_data_path)[:200]
es_training_data = read_files(es_training_data_path)[:200]
es_validation_data = read_files(es_validation_data_path)[:200]

# Consider only sentences with length <= 500
max_length = 500
train_dataset = [
    [es_sentence, en_sentence]
    for es_sentence, en_sentence in zip(es_training_data, en_training_data)
    if len(es_sentence) <= max_length and len(en_sentence) <= max_length
]
val_dataset = [
    [es_sentence, en_sentence]
    for es_sentence, en_sentence in zip(es_validation_data, en_validation_data)
    if len(es_sentence) <= max_length and len(en_sentence) <= max_length
]
es_training_data, en_training_data = zip(*train_dataset)
es_validation_data, en_validation_data = zip(*val_dataset)

# prepare spanish tokenizer, this is the input language
tokenizer = CustomTokenizer(char_level=True)
tokenizer.fit_on_texts(es_training_data)
tokenizer.save(configs.model_path + "/tokenizer.json")

# prepare english tokenizer, this is the output language
detokenizer = CustomTokenizer(char_level=True)
detokenizer.fit_on_texts(en_training_data)
detokenizer.save(configs.model_path + "/detokenizer.json")


def preprocess_inputs(data_batch, label_batch):
    encoder_input = np.zeros((len(data_batch), tokenizer.max_length)).astype(np.int64)
    decoder_input = np.zeros((len(label_batch), detokenizer.max_length)).astype(
        np.int64
    )
    decoder_output = np.zeros((len(label_batch), detokenizer.max_length)).astype(
        np.int64
    )

    

    data_batch_tokens = tokenizer.texts_to_sequences(data_batch)
    label_batch_tokens = detokenizer.texts_to_sequences(label_batch)

    max_label_length = max(len(label) for label in label_batch_tokens)

    decoder_input = np.zeros((len(label_batch), max_label_length - 1)).astype(np.int64)
    decoder_output = np.zeros((len(label_batch), max_label_length - 1)).astype(np.int64)

    for index, (data, label) in enumerate(zip(data_batch_tokens, label_batch_tokens)):
        encoder_input[index][:len(data)] = data
        decoder_input[index][:len(label) - 1] = label[:-1]  # Drop the [END] tokens
        decoder_output[index][:len(label) - 1] = label[1:]  # Drop the [START] tokens

    return (encoder_input, decoder_input), decoder_output


# Create Training Data Provider
train_dataProvider = DataProvider(
    train_dataset,
    batch_size=configs.batch_size,
    batch_postprocessors=[preprocess_inputs],
    use_cache=True,
)

# Create Validation Data Provider
val_dataProvider = DataProvider(
    val_dataset,
    batch_size=configs.batch_size,
    batch_postprocessors=[preprocess_inputs],
    use_cache=True,
)

# Create TensorFlow Transformer Model
transformer = Transformer(
    num_layers=configs.num_layers,
    d_model=configs.d_model,
    num_heads=configs.num_heads,
    dff=configs.dff,
    input_vocab_size=len(tokenizer) + 1,
    target_vocab_size=len(detokenizer) + 1,
    dropout_rate=configs.dropout_rate,
    encoder_input_size=tokenizer.max_length,
    decoder_input_size=detokenizer.max_length,
)

transformer.summary()


Fitting tokenizer: 100%|██████████| 200/200 [00:00<00:00, 210928.04it/s]
Fitting tokenizer: 100%|██████████| 200/200 [00:00<00:00, 356052.97it/s]


In [40]:
optimizer = tf.keras.optimizers.Adam(learning_rate=configs.init_lr, beta_1=0.9, beta_2=0.98, epsilon=1e-9)

# Compile the model
transformer.compile(
    loss=MaskedLoss(),
    optimizer=optimizer,
    metrics=[MaskedAccuracy()],
    run_eagerly=False
    )

In [41]:
from tensorflow.keras.optimizers.schedules import LearningRateSchedule

class WarmupCosineDecay(LearningRateSchedule):

    def __init__(self,
                 initial_learning_rate,
                 decay_steps,
                 alpha,
                 warmup_epochs):
        super().__init__()

        self.initial_learning_rate = initial_learning_rate
        self.decay_steps = decay_steps
        self.alpha = alpha
        self.warmup_epochs = warmup_epochs

    def __call__(self, step):
        lr = self.initial_learning_rate
        if step < self.warmup_epochs:
            lr = lr * step / self.warmup_epochs
        else:
            lr = lr * 0.5 * (1 + math.cos(math.pi * (step - self.warmup_epochs) / self.decay_steps))
        return lr * self.alpha

warmupCosineDecay = WarmupCosineDecay(
    initial_learning_rate=configs.init_lr,
    decay_steps=configs.decay_epochs,
    alpha=configs.final_lr / configs.init_lr,
    warmup_epochs=configs.warmup_epochs,
)

earlystopper = EarlyStopping(monitor="val_masked_accuracy", patience=5, verbose=1, mode="max")
checkpoint = ModelCheckpoint(f"{configs.model_path}/model.keras", monitor="val_masked_accuracy", verbose=1, save_best_only=True, mode="max", save_weights_only=False)
tb_callback = TensorBoard(f"{configs.model_path}/logs")
reduceLROnPlat = ReduceLROnPlateau(monitor="val_masked_accuracy", factor=0.9, min_delta=1e-10, patience=2, verbose=1, mode="max")
model2onnx = Model2onnx(f"{configs.model_path}/model.keras", metadata={"tokenizer": tokenizer.dict(), "detokenizer": detokenizer.dict()}, save_on_epoch_end=False)
encDecSplitCallback = EncDecSplitCallback(configs.model_path, encoder_metadata={"tokenizer": tokenizer.dict()}, decoder_metadata={"detokenizer": detokenizer.dict()})

In [42]:

from tensorflow.keras.callbacks import Callback

class WarmupCosineDecayCallback(Callback):
    def __init__(self, warmupCosineDecay):
        super().__init__()
        self.warmupCosineDecay = warmupCosineDecay

    def on_epoch_begin(self, epoch, logs=None):
        lr = self.warmupCosineDecay(epoch)
        self.model.optimizer.learning_rate = lr

warmupCosineDecayCallback = WarmupCosineDecayCallback(warmupCosineDecay)

transformer.fit(
    train_dataProvider, 
    validation_data=val_dataProvider, 
    epochs=configs.train_epochs,
    callbacks=[
        warmupCosineDecayCallback,
        checkpoint, 
        tb_callback, 
        reduceLROnPlat,
        model2onnx,
        encDecSplitCallback
    ]
)


Epoch 1/50
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2s/step - loss: 5.0007 - masked_accuracy: 0.0124

2024-03-22 23:00:40.042433: W tensorflow/core/framework/op_kernel.cc:1827] INVALID_ARGUMENT: ValueError: could not broadcast input array from shape (395,) into shape (371,)
Traceback (most recent call last):

  File "/home/thanhan/.local/lib/python3.10/site-packages/tensorflow/python/ops/script_ops.py", line 270, in __call__
    ret = func(*args)

  File "/home/thanhan/.local/lib/python3.10/site-packages/tensorflow/python/autograph/impl/api.py", line 643, in wrapper
    return func(*args, **kwargs)

  File "/home/thanhan/.local/lib/python3.10/site-packages/tensorflow/python/data/ops/from_generator_op.py", line 198, in generator_py_func
    values = next(generator_state.get_iterator(iterator_id))

  File "/home/thanhan/.local/lib/python3.10/site-packages/keras/src/trainers/data_adapters/py_dataset_adapter.py", line 250, in _get_iterator
    for i, batch in enumerate(gen_fn()):

  File "/home/thanhan/.local/lib/python3.10/site-packages/keras/src/trainers/data_adapters/py_dataset_adapter.

InvalidArgumentError: Graph execution error:

Detected at node PyFunc defined at (most recent call last):
<stack traces unavailable>
ValueError: could not broadcast input array from shape (395,) into shape (371,)
Traceback (most recent call last):

  File "/home/thanhan/.local/lib/python3.10/site-packages/tensorflow/python/ops/script_ops.py", line 270, in __call__
    ret = func(*args)

  File "/home/thanhan/.local/lib/python3.10/site-packages/tensorflow/python/autograph/impl/api.py", line 643, in wrapper
    return func(*args, **kwargs)

  File "/home/thanhan/.local/lib/python3.10/site-packages/tensorflow/python/data/ops/from_generator_op.py", line 198, in generator_py_func
    values = next(generator_state.get_iterator(iterator_id))

  File "/home/thanhan/.local/lib/python3.10/site-packages/keras/src/trainers/data_adapters/py_dataset_adapter.py", line 250, in _get_iterator
    for i, batch in enumerate(gen_fn()):

  File "/home/thanhan/.local/lib/python3.10/site-packages/keras/src/trainers/data_adapters/py_dataset_adapter.py", line 244, in generator_fn
    yield self.py_dataset[i]

  File "/home/thanhan/.local/lib/python3.10/site-packages/mltu/dataProvider.py", line 282, in __getitem__
    batch_data, batch_annotations = batch_postprocessor(batch_data, batch_annotations)

  File "/tmp/ipykernel_96834/932217772.py", line 97, in preprocess_inputs
    encoder_input[index][:len(data)] = data

ValueError: could not broadcast input array from shape (395,) into shape (371,)


	 [[{{node PyFunc}}]]
	 [[IteratorGetNext]] [Op:__inference_one_step_on_iterator_224962]