In [3]:
import os
import requests
from tqdm import tqdm
from bs4 import BeautifulSoup

# URL to the directory containing the files to be downloaded
language = "en-es"
url = f"https://data.statmt.org/opus-100-corpus/v1.0/supervised/{language}/"
save_directory = f"../data/{language}"

# Create the save directory if it doesn't exist
os.makedirs(save_directory, exist_ok=True)

# Send a GET request to the URL
response = requests.get(url)

# Parse the HTML response
soup = BeautifulSoup(response.content, 'html.parser')

# Find all the anchor tags in the HTML
links = soup.find_all('a')

# Extract the href attribute from each anchor tag
file_links = [link['href'] for link in links if '.' in link['href']]

# Download each file
for file_link in tqdm(file_links):
    file_url = url + file_link
    save_path = os.path.join(save_directory, file_link)
    
    print(f"Downloading {file_url}")
    
    # Send a GET request for the file
    file_response = requests.get(file_url)
    if file_response.status_code == 404:
        print(f"Could not download {file_url}")
        continue
    
    # Save the file to the specified directory
    with open(save_path, 'wb') as file:
        file.write(file_response.content)
    
    print(f"Saved {file_link}")

print("All files have been downloaded.")

  0%|          | 0/7 [00:00<?, ?it/s]

Downloading https://data.statmt.org/opus-100-corpus/v1.0/supervised/en-es//opus-100-corpus/v1.0/supervised/


 14%|█▍        | 1/7 [00:02<00:12,  2.01s/it]

Could not download https://data.statmt.org/opus-100-corpus/v1.0/supervised/en-es//opus-100-corpus/v1.0/supervised/
Downloading https://data.statmt.org/opus-100-corpus/v1.0/supervised/en-es/opus.en-es-dev.en


 29%|██▊       | 2/7 [00:04<00:11,  2.35s/it]

Saved opus.en-es-dev.en
Downloading https://data.statmt.org/opus-100-corpus/v1.0/supervised/en-es/opus.en-es-dev.es


 43%|████▎     | 3/7 [00:09<00:13,  3.49s/it]

Saved opus.en-es-dev.es
Downloading https://data.statmt.org/opus-100-corpus/v1.0/supervised/en-es/opus.en-es-test.en


 57%|█████▋    | 4/7 [00:12<00:10,  3.49s/it]

Saved opus.en-es-test.en
Downloading https://data.statmt.org/opus-100-corpus/v1.0/supervised/en-es/opus.en-es-test.es


 71%|███████▏  | 5/7 [00:22<00:11,  5.69s/it]

Saved opus.en-es-test.es
Downloading https://data.statmt.org/opus-100-corpus/v1.0/supervised/en-es/opus.en-es-train.en


 86%|████████▌ | 6/7 [02:34<00:48, 48.75s/it]

Saved opus.en-es-train.en
Downloading https://data.statmt.org/opus-100-corpus/v1.0/supervised/en-es/opus.en-es-train.es


100%|██████████| 7/7 [05:03<00:00, 43.43s/it]

Saved opus.en-es-train.es
All files have been downloaded.





In [2]:
en_training_data_path = "../data/en-es/opus.en-es-train.en"
en_validation_data_path = "../data/en-es/opus.en-es-dev.en"
es_training_data_path = "../data/en-es/opus.en-es-train.es"
es_validation_data_path = "../data/en-es/opus.en-es-dev.es"

def read_files(path):
    with open(path, "r", encoding="utf-8") as f:
        en_train_dataset = f.read().split("\n")[:-1]
    return en_train_dataset

en_training_data = read_files(en_training_data_path)
en_validation_data = read_files(en_validation_data_path)
es_training_data = read_files(es_training_data_path)
es_validation_data = read_files(es_validation_data_path)

max_lenght = 500
train_dataset = [[es_sentence, en_sentence] for es_sentence, en_sentence in zip(es_training_data, en_training_data) if len(es_sentence) <= max_lenght and len(en_sentence) <= max_lenght]
val_dataset = [[es_sentence, en_sentence] for es_sentence, en_sentence in zip(es_validation_data, en_validation_data) if len(es_sentence) <= max_lenght and len(en_sentence) <= max_lenght]
es_training_data, en_training_data = zip(*train_dataset)
es_validation_data, en_validation_data = zip(*val_dataset)

print(len(es_training_data))
print(len(es_validation_data))
print(es_training_data[:3])
print(en_training_data[:3])

995249
1990
('Fueron los asbestos aquí. ¡Eso es lo que ocurrió!', 'Me voy de aquí.', 'Una vez, juro que cagué una barra de tiza.')
("It was the asbestos in here, that's what did it!", "I'm out of here.", 'One time, I swear I pooped out a stick of chalk.')


In [3]:
from custom_tokenizer import CustomTokenizer

# prepare Spanish tokenizer, this is the input language
tokenizer = CustomTokenizer(char_level=True)
tokenizer.fit_on_texts(es_training_data)
tokenizer.save("tokenizer.json")

# prepare English tokenizer, this is the output language
detokenizer = CustomTokenizer(char_level=True)
detokenizer.fit_on_texts(en_training_data)
detokenizer.save("detokenizer.json")

Fitting tokenizer:   0%|          | 0/995249 [00:00<?, ?it/s]

Fitting tokenizer: 100%|██████████| 995249/995249 [00:10<00:00, 95892.27it/s] 
Fitting tokenizer: 100%|██████████| 995249/995249 [00:10<00:00, 98450.65it/s] 


In [4]:
tokenized_sentence = detokenizer.texts_to_sequences(["Hello world, how are you?"])[0]
print(tokenized_sentence)

detokenized_sentence = detokenizer.detokenize([tokenized_sentence], remove_start_end=False)
print(detokenized_sentence)

detokenized_sentence = detokenizer.detokenize([tokenized_sentence])
print(detokenized_sentence)

[33, 51, 48, 55, 55, 58, 3, 66, 58, 61, 55, 47, 15, 3, 51, 58, 66, 3, 44, 61, 48, 3, 68, 58, 64, 36, 32]
['<start>hello world, how are you?<eos>']
['hello world, how are you?']


In [5]:
from mltu.tensorflow.dataProvider import DataProvider
import numpy as np


def preprocess_inputs(data_batch, label_batch):
    encoder_input = np.zeros((len(data_batch), tokenizer.max_length)).astype(np.int64)
    decoder_input = np.zeros((len(label_batch), detokenizer.max_length)).astype(
        np.int64
    )
    decoder_output = np.zeros((len(label_batch), detokenizer.max_length)).astype(
        np.int64
    )

    data_batch_tokens = tokenizer.texts_to_sequences(data_batch)
    label_batch_tokens = detokenizer.texts_to_sequences(label_batch)

    for index, (data, label) in enumerate(zip(data_batch_tokens, label_batch_tokens)):
        encoder_input[index][: len(data)] = data
        decoder_input[index][: len(label) - 1] = label[:-1]  # Drop the [END] tokens
        decoder_output[index][: len(label) - 1] = label[1:]  # Drop the [START] tokens

    return (encoder_input, decoder_input), decoder_output


train_dataProvider = DataProvider(
    train_dataset,
    batch_size=4,
    batch_postprocessors=[preprocess_inputs],
    use_cache=True,
)

val_dataProvider = DataProvider(
    val_dataset, batch_size=4, batch_postprocessors=[preprocess_inputs], use_cache=True
)

TypeError: Descriptors cannot be created directly.
If this call came from a _pb2.py file, your generated code is out of date and must be regenerated with protoc >= 3.19.0.
If you cannot immediately regenerate your protos, some other possible workarounds are:
 1. Downgrade the protobuf package to 3.20.x or lower.
 2. Set PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python (but this will use pure-Python parsing and will be much slower).

More information: https://developers.google.com/protocol-buffers/docs/news/2022-05-06#python-updates

In [10]:
for data_batch in train_dataProvider:
    (encoder_inputs, decoder_inputs), decoder_outputs = data_batch

    encoder_inputs_str = tokenizer.detokenize(encoder_inputs)
    decoder_inputs_str = detokenizer.detokenize(decoder_inputs, remove_start_end=False)
    decoder_outputs_str = detokenizer.detokenize(
        decoder_outputs, remove_start_end=False
    )
    print(encoder_inputs_str)
    print(decoder_inputs_str)
    print(decoder_outputs_str)

    break

['fueron los asbestos aquí. ¡eso es lo que ocurrió!', 'me voy de aquí.', 'una vez, juro que cagué una barra de tiza.', 'y prefiero mudarme, ¿entiendes?']
["<start>it was the asbestos in here, that's what did it!", "<start>i'm out of here.", '<start>one time, i swear i pooped out a stick of chalk.', '<start>and i will move, do you understand me?']
["it was the asbestos in here, that's what did it!<eos>", "i'm out of here.<eos>", 'one time, i swear i pooped out a stick of chalk.<eos>', 'and i will move, do you understand me?<eos>']


In [22]:
from model import Transformer
from configs import ModelConfigs
from custom_tokenizer import CustomTokenizer

import numpy as np

import tensorflow as tf

try:
    [
        tf.config.experimental.set_memory_growth(gpu, True)
        for gpu in tf.config.experimental.list_physical_devices("GPU")
    ]
except:
    pass

from keras.callbacks import (
    EarlyStopping,
    ModelCheckpoint,
    ReduceLROnPlateau,
    TensorBoard,
)
from mltu.tensorflow.callbacks import Model2onnx, WarmupCosineDecay

from mltu.tensorflow.dataProvider import DataProvider
from mltu.tokenizers import CustomTokenizer

from mltu.tensorflow.transformer.utils import MaskedAccuracy, MaskedLoss
from mltu.tensorflow.transformer.callbacks import EncDecSplitCallback

configs = ModelConfigs()

# Path to dataset
en_training_data_path = "../data/en-es/opus.en-es-train.en"
en_validation_data_path = "../data/en-es/opus.en-es-dev.en"
es_training_data_path = "../data/en-es/opus.en-es-train.es"
es_validation_data_path = "../data/en-es/opus.en-es-dev.es"


def read_files(path):
    with open(path, "r", encoding="utf-8") as f:
        en_train_dataset = f.read().split("\n")[:-1]
    return en_train_dataset


en_training_data = read_files(en_training_data_path)
en_validation_data = read_files(en_validation_data_path)
es_training_data = read_files(es_training_data_path)
es_validation_data = read_files(es_validation_data_path)

# Consider only sentences with length <= 500
max_lenght = 500
train_dataset = [
    [es_sentence, en_sentence]
    for es_sentence, en_sentence in zip(es_training_data, en_training_data)
    if len(es_sentence) <= max_lenght and len(en_sentence) <= max_lenght
]
val_dataset = [
    [es_sentence, en_sentence]
    for es_sentence, en_sentence in zip(es_validation_data, en_validation_data)
    if len(es_sentence) <= max_lenght and len(en_sentence) <= max_lenght
]
es_training_data, en_training_data = zip(*train_dataset)
es_validation_data, en_validation_data = zip(*val_dataset)

# prepare spanish tokenizer, this is the input language
tokenizer = CustomTokenizer(char_level=True)
tokenizer.fit_on_texts(es_training_data)
tokenizer.save(configs.model_path + "/tokenizer.json")

# prepare english tokenizer, this is the output language
detokenizer = CustomTokenizer(char_level=True)
detokenizer.fit_on_texts(en_training_data)
detokenizer.save(configs.model_path + "/detokenizer.json")


def preprocess_inputs(data_batch, label_batch):
    encoder_input = np.zeros((len(data_batch), tokenizer.max_length)).astype(np.int64)
    decoder_input = np.zeros((len(label_batch), detokenizer.max_length)).astype(
        np.int64
    )
    decoder_output = np.zeros((len(label_batch), detokenizer.max_length)).astype(
        np.int64
    )

    data_batch_tokens = tokenizer.texts_to_sequences(data_batch)
    label_batch_tokens = detokenizer.texts_to_sequences(label_batch)

    for index, (data, label) in enumerate(zip(data_batch_tokens, label_batch_tokens)):
        encoder_input[index][: len(data)] = data
        decoder_input[index][: len(label) - 1] = label[:-1]  # Drop the [END] tokens
        decoder_output[index][: len(label) - 1] = label[1:]  # Drop the [START] tokens

    return (encoder_input, decoder_input), decoder_output


# Create Training Data Provider
train_dataProvider = DataProvider(
    train_dataset,
    batch_size=configs.batch_size,
    batch_postprocessors=[preprocess_inputs],
    use_cache=True,
)

# Create Validation Data Provider
val_dataProvider = DataProvider(
    val_dataset,
    batch_size=configs.batch_size,
    batch_postprocessors=[preprocess_inputs],
    use_cache=True,
)

# Create TensorFlow Transformer Model
transformer = Transformer(
    num_layers=configs.num_layers,
    d_model=configs.d_model,
    num_heads=configs.num_heads,
    dff=configs.dff,
    input_vocab_size=len(tokenizer) + 1,
    target_vocab_size=len(detokenizer) + 1,
    dropout_rate=configs.dropout_rate,
    encoder_input_size=tokenizer.max_length,
    decoder_input_size=detokenizer.max_length,
)

transformer.summary()


Fitting tokenizer: 100%|██████████| 995249/995249 [00:10<00:00, 96045.31it/s] 
Fitting tokenizer: 100%|██████████| 995249/995249 [00:09<00:00, 108906.88it/s]


In [27]:
optimizer = tf.keras.optimizers.Adam(learning_rate=configs.init_lr, beta_1=0.9, beta_2=0.98, epsilon=1e-9)

# Compile the model
transformer.compile(
    loss=MaskedLoss(),
    optimizer=optimizer,
    metrics=[MaskedAccuracy()],
    run_eagerly=False
    )

In [28]:
# Define callbacks
warmupCosineDecay = WarmupCosineDecay(
    lr_after_warmup=configs.lr_after_warmup,
    final_lr=configs.final_lr,
    warmup_epochs=configs.warmup_epochs,
    decay_epochs=configs.decay_epochs,
    initial_lr=configs.init_lr,
    )
earlystopper = EarlyStopping(monitor="val_masked_accuracy", patience=5, verbose=1, mode="max")
checkpoint = ModelCheckpoint(f"{configs.model_path}/model.keras", monitor="val_masked_accuracy", verbose=1, save_best_only=True, mode="max", save_weights_only=False)
tb_callback = TensorBoard(f"{configs.model_path}/logs")
reduceLROnPlat = ReduceLROnPlateau(monitor="val_masked_accuracy", factor=0.9, min_delta=1e-10, patience=2, verbose=1, mode="max")
model2onnx = Model2onnx(f"{configs.model_path}/model.keras", metadata={"tokenizer": tokenizer.dict(), "detokenizer": detokenizer.dict()}, save_on_epoch_end=False)
encDecSplitCallback = EncDecSplitCallback(configs.model_path, encoder_metadata={"tokenizer": tokenizer.dict()}, decoder_metadata={"detokenizer": detokenizer.dict()})

In [35]:
# Train the model
transformer.fit(
    train_dataProvider, 
    validation_data=val_dataProvider, 
    epochs=configs.train_epochs,
    callbacks=[
        warmupCosineDecay,
        checkpoint, 
        tb_callback, 
        reduceLROnPlat,
        model2onnx,
        encDecSplitCallback
        ]
    )

AttributeError: 'Adam' object has no attribute 'lr'