In [None]:
import os
import requests
from tqdm import tqdm
from bs4 import BeautifulSoup

# URL to the directory containing the files to be downloaded
language = "en-es"
url = f"https://data.statmt.org/opus-100-corpus/v1.0/supervised/{language}/"
save_directory = f"../data/{language}"

# Create the save directory if it doesn't exist
os.makedirs(save_directory, exist_ok=True)

# Send a GET request to the URL
response = requests.get(url)

# Parse the HTML response
soup = BeautifulSoup(response.content, 'html.parser')

# Find all the anchor tags in the HTML
links = soup.find_all('a')

# Extract the href attribute from each anchor tag
file_links = [link['href'] for link in links if '.' in link['href']]

# Download each file
for file_link in tqdm(file_links):
    file_url = url + file_link
    save_path = os.path.join(save_directory, file_link)
    
    print(f"Downloading {file_url}")
    
    # Send a GET request for the file
    file_response = requests.get(file_url)
    if file_response.status_code == 404:
        print(f"Could not download {file_url}")
        continue
    
    # Save the file to the specified directory
    with open(save_path, 'wb') as file:
        file.write(file_response.content)
    
    print(f"Saved {file_link}")

print("All files have been downloaded.")

In [2]:
import numpy as np
import os
from custom_tokenizer import CustomTokenizer
from configs import ModelConfigs
from mltu.tensorflow.dataProvider import DataProvider
from transformer import Transformer
import tensorflow as tf

2024-03-24 16:01:56.448522: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-03-24 16:01:56.829213: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
file_dir = os.path.join(os.getcwd(), "../data")


def read_data(path):
    sentences = []
    with open(path, "r", encoding="utf-8") as f:
        sentences = f.read().split("\n")[:-1]
    return sentences


en_train_data = read_data(path=os.path.join(file_dir, "en-es/opus.en-es-train.en"))[:200]
es_train_data = read_data(path=os.path.join(file_dir, "en-es/opus.en-es-train.es"))[:200]
en_val_data = read_data(path=os.path.join(file_dir, "en-es/opus.en-es-dev.en"))[:200]
es_val_data = read_data(path=os.path.join(file_dir, "en-es/opus.en-es-dev.es"))[:200]

In [4]:
configs = ModelConfigs()

In [5]:
max_length = 20
train_dataset = [
    [es_sentence, en_sentence]
    for es_sentence, en_sentence in zip(es_train_data, en_train_data)
    if len(es_sentence) <= max_length and len(en_sentence) <= max_length
]
val_dataset = [
    [es_sentence, en_sentence]
    for es_sentence, en_sentence in zip(es_val_data, en_val_data)
    if len(es_sentence) <= max_length and len(en_sentence) <= max_length
]
es_train_data, en_train_data = zip(*train_dataset)
es_val_data, en_val_data = zip(*val_dataset)

In [6]:
tokenizer = CustomTokenizer(char_level=True)
detokenizer = CustomTokenizer(char_level=True)

tokenizer.fit_on_texts(es_train_data)
tokenizer.save("model/tokenize/tokenizer.json")

detokenizer.fit_on_texts(en_train_data)
detokenizer.save("model/tokenize/detokenizer.json")

Fitting tokenizer: 100%|██████████| 122/122 [00:00<00:00, 236353.39it/s]
Fitting tokenizer: 100%|██████████| 122/122 [00:00<00:00, 373234.93it/s]


In [7]:
def preprocess_inputs(data_batch, label_batch):
    encoder_input = np.zeros((len(data_batch), tokenizer.max_length)).astype(np.int64)
    decoder_input = np.zeros((len(label_batch), detokenizer.max_length)).astype(np.int64)
    decoder_output = np.zeros((len(label_batch), detokenizer.max_length)).astype(np.int64)

    data_batch_tokens = tokenizer.texts_to_sequences(data_batch)
    label_batch_tokens = detokenizer.texts_to_sequences(label_batch)

    for index, (data, label) in enumerate(zip(data_batch_tokens, label_batch_tokens)):
        encoder_input[index][:len(data)] = data
        decoder_input[index][:len(label)-1] = label[:-1] # Drop the [END] tokens
        decoder_output[index][:len(label)-1] = label[1:] # Drop the [START] tokens

    return (encoder_input, decoder_input), decoder_output

In [8]:
train_dataProvider = DataProvider(
    train_dataset,
    batch_size=configs.batch_size,
    batch_postprocessors=[preprocess_inputs],
    use_cache=True,
)

val_dataProvider = DataProvider(
    val_dataset,
    batch_size=configs.batch_size,
    batch_postprocessors=[preprocess_inputs],
    use_cache=True,
)

In [9]:
transformer = Transformer(
    num_layers=configs.num_layers,
    d_model=configs.d_model,
    num_heads=configs.num_heads,
    dff=configs.dff,
    input_vocab_size=len(tokenizer)+1,
    target_vocab_size=len(detokenizer)+1,
    dropout_rate=configs.dropout_rate,
    encoder_input_size=tokenizer.max_length,
    decoder_input_size=detokenizer.max_length
    )

transformer.summary()



In [10]:
from CustomSchedule import CustomSchedule

learning_rate = CustomSchedule(configs.d_model)

optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98,
                                     epsilon=1e-9)


In [11]:
from mltu.tensorflow.transformer.utils import MaskedLoss
from mltu.tensorflow.transformer.utils import MaskedAccuracy

transformer.compile(
    loss=MaskedLoss(),
    optimizer=optimizer,
    metrics=[MaskedAccuracy()])

In [12]:
transformer.fit(
    train_dataProvider,
    epochs=configs.train_epochs,
    validation_data=val_dataProvider
)

Epoch 1/20


  self._warn_if_super_not_called()


[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 465ms/step - loss: 4.8340 - masked_accuracy: 0.0177 - val_loss: 4.6506 - val_masked_accuracy: 0.0438
Epoch 2/20
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 255ms/step - loss: 4.6561 - masked_accuracy: 0.0121 - val_loss: 4.7772 - val_masked_accuracy: 0.0637
Epoch 3/20
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 234ms/step - loss: 4.4998 - masked_accuracy: 0.0369 - val_loss: 4.1421 - val_masked_accuracy: 0.0677
Epoch 4/20
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 253ms/step - loss: 4.2515 - masked_accuracy: 0.0412 - val_loss: 3.9337 - val_masked_accuracy: 0.0797
Epoch 5/20
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 238ms/step - loss: 3.9753 - masked_accuracy: 0.0663 - val_loss: 3.8098 - val_masked_accuracy: 0.0757
Epoch 6/20
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 243ms/step - loss: 3.7387 - masked_accuracy: 0.0959 - val_loss: 3.8

<keras.src.callbacks.history.History at 0x7c5845485480>

In [13]:
for data_batch in train_dataProvider:
    (encoder_inputs, decoder_inputs), decoder_outputs = data_batch

    encoder_inputs_str = tokenizer.detokenize(encoder_inputs)
    decoder_inputs_str = detokenizer.detokenize(decoder_inputs, remove_start_end=False)
    decoder_outputs_str = detokenizer.detokenize(decoder_outputs, remove_start_end=False)
    print(encoder_inputs_str)
    print(decoder_inputs_str)
    print(decoder_outputs_str)
    
    break

['el mal.', 'y luego ...', '¡no!', 'mira esto.', 'protestas en daca.', 'alguien me la jugó', 'ya...', '¿puedo fumar uno?', '- tres millones.', '[risa]', 'son los negocios.', 'espérenme.', 'puedo esperar.', '¡seif!', '¿ah, sí?', 'soy yo.']
['<start>evil.', '<start>and then...', '<start>no!', '<start>look at this.', '<start>protests in dhaka.', '<start>someone betrayed me.', '<start>i...', '<start>can i have one?', '<start>three million.', '<start>[laughter]', "<start>it's business.", '<start>wait for me.', '<start>i can wait.', '<start>prepare yourself!', '<start>oh, yeah?', "<start>it's me."]
['evil.<eos>', 'and then...<eos>', 'no!<eos>', 'look at this.<eos>', 'protests in dhaka.<eos>', 'someone betrayed me.<eos>', 'i...<eos>', 'can i have one?<eos>', 'three million.<eos>', '[laughter]<eos>', "it's business.<eos>", 'wait for me.<eos>', 'i can wait.<eos>', 'prepare yourself!<eos>', 'oh, yeah?<eos>', "it's me.<eos>"]


In [21]:
sentence = 'el mal.'
sequences = tokenizer.texts_to_sequences([sentence])

# print(sequences)
transformer.predict([None, np.array(sequences)])

AttributeError: 'NoneType' object has no attribute 'shape'

In [14]:
class Translator(tf.Module):
  def __init__(self, tokenizers, transformer):
    self.tokenizers = tokenizers
    self.transformer = transformer

  def __call__(self, sentence, max_length=20):
    # The input sentence is Portuguese, hence adding the `[START]` and `[END]` tokens.
    assert isinstance(sentence, tf.Tensor)
    if len(sentence.shape) == 0:
      sentence = sentence[tf.newaxis]

    sentence = self.tokenizers.pt.tokenize(sentence).to_tensor()

    encoder_input = sentence

    # As the output language is English, initialize the output with the
    # English `[START]` token.
    start_end = self.tokenizers.en.tokenize([''])[0]
    start = start_end[0][tf.newaxis]
    end = start_end[1][tf.newaxis]

    # `tf.TensorArray` is required here (instead of a Python list), so that the
    # dynamic-loop can be traced by `tf.function`.
    output_array = tf.TensorArray(dtype=tf.int64, size=0, dynamic_size=True)
    output_array = output_array.write(0, start)

    for i in tf.range(max_length):
      output = tf.transpose(output_array.stack())
      predictions = self.transformer([encoder_input, output], training=False)

      # Select the last token from the `seq_len` dimension.
      predictions = predictions[:, -1:, :]  # Shape `(batch_size, 1, vocab_size)`.

      predicted_id = tf.argmax(predictions, axis=-1)

      # Concatenate the `predicted_id` to the output which is given to the
      # decoder as its input.
      output_array = output_array.write(i+1, predicted_id[0])

      if predicted_id == end:
        break

    output = tf.transpose(output_array.stack())
    # The output shape is `(1, tokens)`.
    text = tokenizers.en.detokenize(output)[0]  # Shape: `()`.

    tokens = tokenizers.en.lookup(output)[0]

    # `tf.function` prevents us from using the attention_weights that were
    # calculated on the last iteration of the loop.
    # So, recalculate them outside the loop.
    self.transformer([encoder_input, output[:,:-1]], training=False)
    attention_weights = self.transformer.decoder.last_attn_scores

    return text, tokens, attention_weights

NameError: name 'MAX_TOKENS' is not defined

In [None]:
translator = Translator(
    transformer=transformer, tokenizer=tokenizer, detokenizer=detokenizer
)

val_examples = [
    [es_sentence, en_sentence]
    for es_sentence, en_sentence in zip(es_val_data, en_val_data)
    if len(es_sentence) <= max_length and len(en_sentence) <= max_length
]
val_dataset = []
for es, en in val_examples:
    print(es)
    results, duration = translator.predict(es)
    print("Spanish:     ", es.lower())
    print("English:     ", en.lower())
    print("English pred:", results)
    print(duration)
    print()

Sí, Joe.


AttributeError: 'NoneType' object has no attribute 'shape'