In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import pandas as pd
import random
import unicodedata
import re



In [2]:
train_path = "final_parallel_train.csv"
val_path = "final_parallel_val.csv"
test_path = "final_parallel_test.csv"

In [3]:
train_df = pd.read_csv(train_path)
val_df = pd.read_csv(val_path)
test_df = df = pd.read_csv(test_path)

train_pairs = list(zip(train_df["warao_sentence"], train_df["spanish_sentence"]))
val_pairs = list(zip(val_df["warao_sentence"], val_df["spanish_sentence"]))
test_pairs = list(zip(test_df["warao_sentence"], test_df["spanish_sentence"]))

print(f"Loaded {len(train_pairs)} translation pairs.")
print("Example:", train_pairs[0])

Loaded 6541 translation pairs.
Example: ('Jesú ribane: Yatusike katukane ribia? Ine sina tai? taeyama. Takore Pedro riboto ribane: Ijisike Akirito, taeyama.', 'El les preguntó de nuevo: “ Pero ustedes, ¿quién dicen que soy Yo?” “Tú eres el Cristo (el Mesías),” Le respondió Pedro.')


In [None]:
# def tokenize(text):
#     return text.lower().split()

In [None]:
# def tokenize(text, add_special_tokens=True):
#     text = unicodedata.normalize('NFD', text).encode('ascii', 'ignore').decode('utf-8')

#     text = text.lower().strip()

#     text = re.sub(r'([.!?,"()\'¿¡])', r' \1 ', text)

#     text = re.sub(r'\s+', ' ', text).strip()

#     tokens = text.split()

#     if add_special_tokens:
#         tokens.insert(0, "<SOS>")
#         tokens.append("<EOS>")

#     return tokens

In [4]:
def tokenize(text, lowercase=True):
    text = unicodedata.normalize('NFC', text.strip())

    if lowercase:
        text = text.lower()

    text = re.sub(r'([.!?,"()\'¿¡:;])', r' \1 ', text)
    text = re.sub(r'\s+', ' ', text).strip()

    tokens = text.split()

    # tokens = ["<SOS>"] + tokens + ["<EOS>"] (this is handled in sentence_to_tensor already)

    return tokens

In [5]:
def build_vocab(sentences):
    vocab = {"<PAD>": 0, "<SOS>": 1, "<EOS>": 2, "<UNK>": 3}
    for s in sentences:
        for w in tokenize(s):
            if w not in vocab:
                vocab[w] = len(vocab)
    return vocab

In [6]:
warao_vocab = build_vocab([src for src, _ in train_pairs])
spanish_vocab = build_vocab([tgt for _, tgt in train_pairs])

In [7]:
def sentence_to_tensor(sentence, vocab):
    tokens = tokenize(sentence)
    ids = [vocab.get(w, vocab["<UNK>"]) for w in tokens]
    ids = [vocab["<SOS>"]] + ids + [vocab["<EOS>"]]
    return torch.tensor(ids, dtype=torch.long)

In [8]:
class TranslationDataset(Dataset):
    def __init__(self, pairs):
        self.pairs = pairs

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        src, tgt = self.pairs[idx]
        return sentence_to_tensor(src, warao_vocab), sentence_to_tensor(tgt, spanish_vocab)

In [9]:
train_dataset = TranslationDataset(train_pairs)
val_dataset   = TranslationDataset(val_pairs)
test_dataset  = TranslationDataset(test_pairs)

In [10]:
def collate_fn(batch):
    src_batch, tgt_batch = zip(*batch)

    src_lengths = torch.tensor([len(s) for s in src_batch], dtype=torch.long)

    src_batch = pad_sequence(src_batch, batch_first=True, padding_value=warao_vocab["<PAD>"])
    tgt_batch = pad_sequence(tgt_batch, batch_first=True, padding_value=spanish_vocab["<PAD>"])
    return src_batch, tgt_batch, src_lengths

batch_size = 16
# dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn, drop_last=True)
val_loader   = DataLoader(val_dataset, batch_size=batch_size, collate_fn=collate_fn, drop_last=True)
test_loader  = DataLoader(test_dataset, batch_size=batch_size, collate_fn=collate_fn, drop_last=True)

In [None]:
# class EncoderRNN(nn.Module):
#     def __init__(self, input_size, hidden_size, dropout=0.2):
#         super().__init__()
#         self.embedding = nn.Embedding(input_size, hidden_size)
#         self.rnn = nn.LSTM(hidden_size, hidden_size)  # GRU → LSTM
#         self.dropout = nn.Dropout(dropout)

#     def forward(self, x):
#         embedded = self.embedding(x)
#         outputs, (hidden, cell) = self.rnn(embedded)
#         return hidden, cell  # return both states for LSTM


In [11]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers=1, dropout=0.2):
        super().__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.dropout = nn.Dropout(dropout)

        self.rnn = nn.LSTM(
            hidden_size,
            hidden_size,
            num_layers=num_layers,
            dropout=dropout if num_layers > 1 else 0,
        )

    def forward(self, src, src_lengths):
        embedded = self.embedding(src)
        embedded = self.dropout(embedded)
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, src_lengths.cpu(), enforce_sorted=False)


        packed_outputs, (hidden, cell) = self.rnn(packed_embedded)


        return hidden, cell


In [13]:
class DecoderRNN(nn.Module):
    def __init__(self, output_size, hidden_size, num_layers=1, dropout=0.2):
        super().__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.dropout = nn.Dropout(dropout)

        self.rnn = nn.LSTM(
            hidden_size,
            hidden_size,
            num_layers=num_layers,
            dropout=dropout if num_layers > 1 else 0,
        )

        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x, hidden, cell):
        x = x.unsqueeze(0)

        embedded = self.embedding(x)
        embedded = self.dropout(embedded)

        output, (hidden, cell) = self.rnn(embedded, (hidden, cell))

        output = self.dropout(output)

        prediction = self.fc(output.squeeze(0))
        return prediction, hidden, cell


In [18]:
def evaluate(encoder, decoder, dataloader, criterion, device):
    encoder.eval()
    decoder.eval()
    total_loss = 0.0

    with torch.no_grad():
        for src, tgt, src_lengths in dataloader:
            src, tgt = src.transpose(0, 1).to(device), tgt.transpose(0, 1).to(device)

            src_lengths = src_lengths.to('cpu')

            src_lengths, sorted_indices = torch.sort(src_lengths, descending=True)

            src = src[:, sorted_indices]
            tgt = tgt[:, sorted_indices]

            batch_size = src.size(1)

            hidden, cell = encoder(src, src_lengths)

            dec_input = torch.full(
                (batch_size,),
                spanish_vocab["<SOS>"],
                dtype=torch.long,
                device=device
            )

            loss = 0.0
            for t in range(1, tgt.size(0)):
                output, hidden, cell = decoder(dec_input, hidden, cell)
                loss += criterion(output, tgt[t])
                dec_input = tgt[t]  # teacher forcing

            loss = loss / (tgt.size(0) - 1)
            total_loss += loss.item()

    return total_loss / len(dataloader)


In [19]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

hidden_size = 256
encoder = EncoderRNN(len(warao_vocab), hidden_size).to(device)
decoder = DecoderRNN(len(spanish_vocab), hidden_size).to(device)

pad_idx = spanish_vocab["<PAD>"]
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx, reduction='mean')
optimizer = optim.Adam(list(encoder.parameters()) + list(decoder.parameters()), lr=0.0003, weight_decay=1e-5)

# after dropout the best model was at epoch 30 so increasing it to 50
EPOCHS = 40

best_val_loss = float('inf')
patience = 10
counter = 0

for epoch in range(EPOCHS):
    encoder.train()
    decoder.train()
    total_train_loss = 0
    for src, tgt, src_lengths in train_loader:
      optimizer.zero_grad()

      src, tgt = src.transpose(0, 1).to(device), tgt.transpose(0, 1).to(device)
      batch_size = src.size(1)

      src_lengths = src_lengths.to('cpu')

      # hidden, cell = encoder(src)

      src_lengths, sorted_indices = torch.sort(src_lengths, descending=True)

      src = src[:, sorted_indices]

      tgt = tgt[:, sorted_indices]

      hidden, cell = encoder(src, src_lengths)

      dec_input = torch.full(
          (batch_size,),
          spanish_vocab["<SOS>"],
          dtype=torch.long,
          device=device
      )

      loss = 0.0
      for t in range(1, tgt.size(0)):
          output, hidden, cell = decoder(dec_input, hidden, cell)
          loss += criterion(output, tgt[t])
          dec_input = tgt[t]

      loss = loss / (tgt.size(0) - 1)
      loss.backward()

      torch.nn.utils.clip_grad_norm_(encoder.parameters(), 1.0)
      torch.nn.utils.clip_grad_norm_(decoder.parameters(), 1.0)

      optimizer.step()

      total_train_loss += loss.item()


    val_loss = evaluate(encoder, decoder, val_loader, criterion, device)
    avg_train_loss = total_train_loss / len(train_loader)

    print(f"Epoch {epoch+1}/{EPOCHS}, Train Loss: {avg_train_loss:.4f}, Val Loss: {val_loss:.4f}")

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save({
            "encoder_state_dict": encoder.state_dict(),
            "decoder_state_dict": decoder.state_dict(),
            "epoch": epoch,
            "val_loss": val_loss,
        }, "best_model.pt")
        print(f"  --> New best model saved at epoch {epoch} (val_loss={val_loss:.4f})")

    # if val_loss < best_val_loss:
    #     best_val_loss = val_loss
    #     counter = 0
    #     torch.save({
    #         'encoder': encoder.state_dict(),
    #         'decoder': decoder.state_dict()
    #     }, "best_model.pt")
    # else:
    #     counter += 1
    #     if counter >= patience:
    #         print("Early stopping triggered.")
    #         break

Epoch 1/40, Train Loss: 6.0270, Val Loss: 5.4438
  --> New best model saved at epoch 0 (val_loss=5.4438)
Epoch 2/40, Train Loss: 5.1622, Val Loss: 5.2019
  --> New best model saved at epoch 1 (val_loss=5.2019)
Epoch 3/40, Train Loss: 4.9157, Val Loss: 5.0280
  --> New best model saved at epoch 2 (val_loss=5.0280)
Epoch 4/40, Train Loss: 4.7140, Val Loss: 4.8984
  --> New best model saved at epoch 3 (val_loss=4.8984)
Epoch 5/40, Train Loss: 4.5779, Val Loss: 4.7947
  --> New best model saved at epoch 4 (val_loss=4.7947)
Epoch 6/40, Train Loss: 4.4322, Val Loss: 4.7089
  --> New best model saved at epoch 5 (val_loss=4.7089)
Epoch 7/40, Train Loss: 4.3075, Val Loss: 4.6396
  --> New best model saved at epoch 6 (val_loss=4.6396)
Epoch 8/40, Train Loss: 4.2006, Val Loss: 4.5731
  --> New best model saved at epoch 7 (val_loss=4.5731)
Epoch 9/40, Train Loss: 4.1006, Val Loss: 4.5226
  --> New best model saved at epoch 8 (val_loss=4.5226)
Epoch 10/40, Train Loss: 4.0021, Val Loss: 4.4730
  -->

In [20]:
checkpoint = torch.load("best_model.pt", map_location=device)
encoder.load_state_dict(checkpoint["encoder_state_dict"])
decoder.load_state_dict(checkpoint["decoder_state_dict"])
print("Loaded best model from epoch", checkpoint["epoch"], "with val_loss", checkpoint["val_loss"])

Loaded best model from epoch 30 with val_loss 4.174581077363756


In [21]:
# checkpoint = torch.load("best_model.pt")
# encoder.load_state_dict(checkpoint['encoder'])
# decoder.load_state_dict(checkpoint['decoder'])

test_loss = evaluate(encoder, decoder, test_loader, criterion, device)
print(f"Final Test Loss: {test_loss:.4f}")


Final Test Loss: 4.0261


In [44]:
def detokenize(text: str):
    text = text.strip()

    text = re.sub(r'\s+([,.!?;:])', r'\1', text)

    text = re.sub(r'\s+([\)\]\}])', r'\1', text)

    text = re.sub(r'\s+(”)', r'\1', text)
    text = re.sub(r'\s+(“)', r'\1', text)

    text = re.sub(r'\s+(\'|")', r'\1', text)

    return text

In [45]:
def translate(sentence, max_len=1000):
    encoder.eval()
    decoder.eval()

    with torch.no_grad():
        src_tensor = sentence_to_tensor(sentence, warao_vocab).to(device)

        src_lengths = torch.tensor([src_tensor.size(0)], dtype=torch.long)

        src_tensor = src_tensor.unsqueeze(1)

        hidden, cell = encoder(src_tensor, src_lengths)

        dec_input = torch.tensor([spanish_vocab["<SOS>"]], device=device)
        output_sentence = []

        for _ in range(max_len):
            output, hidden, cell = decoder(dec_input, hidden, cell)
            top1 = output.argmax(1).item()

            if top1 == spanish_vocab["<EOS>"]:
                break

            output_sentence.append(top1)
            dec_input = torch.tensor([top1], device=device)

        inv_vocab = {v: k for k, v in spanish_vocab.items()}
        return " ".join(inv_vocab[i] for i in output_sentence)


In [48]:
print(detokenize(translate("Aroko isia najoro-yakutai tai warao wisi tane abanaja. Tiarone aroko ekumo ejobo-yakutai taisike wisi tane abaya, tae.")))

“y si alguien me ha dado a un hombre, y le ha dado a conocer el hijo del hombre, y el que me envió.”


In [49]:
input_sentence = "Aroko isia najoro-yakutai tai warao wisi tane abanaja. Tiarone aroko ekumo ejobo-yakutai taisike wisi tane abaya, tae."

exists = any(input_sentence.strip() == src.strip() for src, _ in train_pairs)
print("In training data:", exists)

In training data: True


In [50]:
print(translate("Bajuka sabuka"))

no tengo comer


In [51]:

random.seed(21)


num_samples = 500
sampled_train_pairs = random.sample(train_pairs, min(num_samples, len(train_pairs)))

print(f"Selected {len(sampled_train_pairs)} random training pairs.")
print(sampled_train_pairs)

Selected 500 random training pairs.
[('Ribane: Airamo sabuka ina utirajasiata naruae. Tata airamowitu tane abakore atae ekiajase bajinae oriaina airamo takitane.', 'Por eso dijo: “Cierto hombre de familia noble fue a un país lejano a recibir un reino para sí y después volver.'), ('Jono jisaka isia ribu manamo ribia. Dio Karima jakutai tai yakeraja jae tane ribia. Tiarone warao Dio monuka nónae tiakutai kajono isia taisi yaribia.', 'Con ella bendecimos a nuestro Señor y Padre, y con ella maldecimos a los hombres, que han sido hechos a la imagen de Dios.'), ('Kwarika ejoboyakutai jiaka jokeraja taisi isia abate. Obojona jiro jajasi awai karata eku jabátae tiaja ja. Kwarika ejoboyakutai taisi awai eneberenajine. Tiarone Marima amujaba Marimasi ainatabatuma amujaba tane yakeraja ribatine. Tatuma ine jaja, tatine.', '“Así el vencedor será vestido de vestiduras blancas y no borraré su nombre del Libro de la Vida, y reconoceré su nombre delante de Mi Padre y delante de Sus ángeles.'), ('Ainat

In [52]:
random.seed(21)


num_samples = 500
sampled_test_pairs = random.sample(test_pairs, min(num_samples, len(test_pairs)))

print(f"Selected {len(sampled_test_pairs)} random training pairs.")
print(sampled_test_pairs)

Selected 500 random training pairs.
[('Tane Dio yakeraja jae, tane ribubuae, maobojona.', 'Y glorificaban a Dios por causa de mí.'), ('Taisi rao takore najamutuata ribu rijisawitu nokoae. Ribu rijisa jakutai Dio warao taisi rewaramejerenaja.', 'que fue arrebatado al paraíso, y escuchó palabras inefables que al hombre no se le permite expresar.'), ('Takore Pedro aobojona ejobane ribane: Maisia nonajakutai nomeraji bajama! Airamo oriainataba inatabae Herodes amojekumo mejeromejerei. Judio maisiko nonakitane obonoajakutai kokotuka taisi ekumo mejeronae, taeyama.', 'Cuando Pedro volvió en sí, dijo: “Ahora sé en verdad que el Señor ha enviado a Su ángel, y me ha rescatado de la mano de Herodes (Agripa I) y de todo lo que esperaba el pueblo de los Judíos.”'), ('Yatu sanamatayakutai Dio yatu kamonuka tuwaramejerete. Airamo Jesú oriemusabakore Dio yatu tuwara-mejerete. Jesú najamutuatamo oriemusabate. Ainatabatuma tae-rajasi isiko naote.', 'Pero que El les dé alivio a ustedes que son afligidos

In [53]:
def output(pairs):
  translated_results = []

  for warao_sentence, spanish_sentence in pairs:
      try:
          predicted_translation = detokenize(translate(warao_sentence))
      except Exception as e:
          predicted_translation = f"Error: {e}"

      translated_results.append((warao_sentence, predicted_translation, spanish_sentence.lower()))
  return translated_results

# print(f"Generated translations for {len(translated_results)} pairs.")
# print(translated_results[1])

# df_results = pd.DataFrame(translated_results, columns=["warao", "predicted_spanish", "actual_spanish"])
# df_results.to_csv("warao_test_translation_results.csv", index=False)


In [54]:
# train_output = output(sampled_train_pairs)
test_output = output(test_pairs)

# train_results = pd.DataFrame(train_output, columns=["warao", "predicted_spanish", "actual_spanish"])
# train_results.to_csv("warao_train_translation_results.csv", index=False)

test_results = pd.DataFrame(test_output, columns=["warao", "predicted_spanish", "actual_spanish"])
test_results.to_csv("warao_test_translation_results.csv", index=False)