In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import pandas as pd
import random



In [None]:
# Locations of the three parallel-corpus splits (CSV files, relative to the
# notebook's working directory).
train_path, val_path, test_path = (
    "parallel_train.csv",
    "parallel_val.csv",
    "parallel_test.csv",
)

In [None]:
def _load_pairs(csv_path):
    """Read one corpus split and return ``(frame, pairs)``.

    Rows whose Warao or Spanish sentence is missing or only whitespace are
    dropped: an empty CSV field is read as NaN (a float), which would make
    ``tokenize`` fail on ``.lower()``.

    Args:
        csv_path: path to a CSV file with ``warao_sentence`` and
            ``spanish_sentence`` columns.

    Returns:
        The cleaned DataFrame and a list of ``(warao, spanish)`` tuples.
    """
    columns = ["warao_sentence", "spanish_sentence"]
    frame = pd.read_csv(csv_path).dropna(subset=columns)
    for column in columns:
        # astype(str) keeps the .str accessor usable even if pandas inferred
        # a non-string dtype for the column.
        frame = frame[frame[column].astype(str).str.strip() != ""]
    frame = frame.reset_index(drop=True)
    pairs = list(zip(frame["warao_sentence"], frame["spanish_sentence"]))
    return frame, pairs


train_df, train_pairs = _load_pairs(train_path)
val_df, val_pairs = _load_pairs(val_path)
test_df, test_pairs = _load_pairs(test_path)

# Kept only for backward compatibility: the original cell also bound the
# test frame to the generic name `df`.
df = test_df

print(f"✅ Loaded {len(train_pairs)} translation pairs.")
print("Example:", train_pairs[0])

✅ Loaded 6403 translation pairs.
Example: ('Jesú ribane: Yatusike katukane ribia? Ine sina tai? taeyama. Takore Pedro riboto ribane: Ijisike Akirito, taeyama.', 'El les preguntó de nuevo: “ Pero ustedes, ¿quién dicen que soy Yo?” “Tú eres el Cristo (el Mesías),” Le respondió Pedro.')


In [None]:
def tokenize(text):
    """Lower-case ``text`` and split it into tokens on runs of whitespace.

    Punctuation is not separated, so it stays attached to the neighbouring
    word.
    """
    lowered = text.lower()
    return lowered.split()

In [None]:
def build_vocab(sentences):
    """Build a token -> integer-id mapping from an iterable of sentences.

    Ids 0-3 are reserved for the special tokens ``<PAD>``, ``<SOS>``,
    ``<EOS>`` and ``<UNK>``; every other token receives the next free id in
    order of first appearance.
    """
    vocab = dict(zip(("<PAD>", "<SOS>", "<EOS>", "<UNK>"), range(4)))
    for sentence in sentences:
        for token in tokenize(sentence):
            # setdefault only inserts unseen tokens; len(vocab) is the next free id.
            vocab.setdefault(token, len(vocab))
    return vocab

In [None]:
# Vocabularies are built from the training split only, so words that appear
# solely in validation/test data map to <UNK>.
warao_vocab = build_vocab([pair[0] for pair in train_pairs])
spanish_vocab = build_vocab([pair[1] for pair in train_pairs])

In [None]:
def sentence_to_tensor(sentence, vocab):
    """Encode ``sentence`` as a 1-D ``torch.long`` tensor of vocabulary ids.

    The sentence is tokenized, tokens missing from ``vocab`` are replaced by
    the ``<UNK>`` id, and the sequence is wrapped in ``<SOS>`` ... ``<EOS>``.
    """
    word_ids = []
    for word in tokenize(sentence):
        word_ids.append(vocab.get(word, vocab["<UNK>"]))
    framed = [vocab["<SOS>"], *word_ids, vocab["<EOS>"]]
    return torch.tensor(framed, dtype=torch.long)

In [None]:
class TranslationDataset(Dataset):
    """Map-style dataset over ``(warao_sentence, spanish_sentence)`` pairs.

    Items are encoded lazily on access with the module-level ``warao_vocab``
    and ``spanish_vocab``.
    """

    def __init__(self, pairs):
        # Sequence of (source_text, target_text) tuples.
        self.pairs = pairs

    def __len__(self):
        """Number of sentence pairs."""
        return len(self.pairs)

    def __getitem__(self, idx):
        """Return ``(source_ids, target_ids)`` tensors for pair ``idx``."""
        warao_text, spanish_text = self.pairs[idx]
        source_ids = sentence_to_tensor(warao_text, warao_vocab)
        target_ids = sentence_to_tensor(spanish_text, spanish_vocab)
        return source_ids, target_ids

In [None]:
# One dataset per split, in train / validation / test order.
train_dataset, val_dataset, test_dataset = (
    TranslationDataset(split_pairs)
    for split_pairs in (train_pairs, val_pairs, test_pairs)
)

In [None]:
def collate_fn(batch):
    """Collate ``(source_ids, target_ids)`` items into two padded batches.

    Each side is right-padded with its vocabulary's ``<PAD>`` id to the
    longest sequence in the batch; results are batch-first, i.e.
    ``(batch, max_len)``.
    """
    source_seqs, target_seqs = zip(*batch)
    source_pad = warao_vocab["<PAD>"]
    padded_sources = pad_sequence(source_seqs, batch_first=True, padding_value=source_pad)
    target_pad = spanish_vocab["<PAD>"]
    padded_targets = pad_sequence(target_seqs, batch_first=True, padding_value=target_pad)
    return padded_sources, padded_targets

batch_size = 32

# Training: shuffle every epoch and drop the final partial batch so every
# optimisation step sees a full batch.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn, drop_last=True)

# Evaluation: keep the final partial batch. With drop_last=True the trailing
# samples would never be scored, and a split smaller than one batch would
# produce a loader with zero batches.
val_loader   = DataLoader(val_dataset, batch_size=batch_size, collate_fn=collate_fn, drop_last=False)
test_loader  = DataLoader(test_dataset, batch_size=batch_size, collate_fn=collate_fn, drop_last=False)

In [None]:
class EncoderRNN(nn.Module):
    """Single-layer GRU encoder: embeds source token ids and returns the final hidden state.

    Args:
        input_size: number of entries in the source vocabulary.
        hidden_size: size of both the embeddings and the GRU hidden state.
        dropout: dropout probability applied to the embedded tokens
            (active only in training mode).
    """

    def __init__(self, input_size, hidden_size, dropout=0.2):
        super().__init__()
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.rnn = nn.GRU(hidden_size, hidden_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Encode a batch of token ids.

        Args:
            x: long tensor of token ids laid out ``(seq_len, batch)``; the GRU
                is built with the default ``batch_first=False``.

        Returns:
            The GRU's final hidden state, ``(1, batch, hidden_size)``.
        """
        # The dropout layer was constructed but never used; apply it to the
        # embeddings so the `dropout` argument actually regularises training.
        embedded = self.dropout(self.embedding(x))
        _, hidden = self.rnn(embedded)
        return hidden


In [None]:
class DecoderRNN(nn.Module):
    """Single-layer GRU decoder that predicts one target token per call.

    Args:
        output_size: number of entries in the target vocabulary.
        hidden_size: size of both the embeddings and the GRU hidden state.
        dropout: dropout probability applied to the embedded input token
            (active only in training mode).
    """

    def __init__(self, output_size, hidden_size, dropout=0.2):
        super().__init__()
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.rnn = nn.GRU(hidden_size, hidden_size)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x, hidden):
        """Run one decoding step.

        Args:
            x: long tensor of shape ``(batch,)`` holding the previous target
                token id of every sequence.
            hidden: GRU hidden state, ``(1, batch, hidden_size)``.

        Returns:
            ``(prediction, hidden)``: unnormalised vocabulary scores of shape
            ``(batch, output_size)`` and the updated hidden state.
        """
        # Add a length-1 time axis: (batch,) -> (1, batch).
        x = x.unsqueeze(0)
        # The dropout layer was constructed but never used; apply it to the
        # embeddings so the `dropout` argument actually regularises training.
        embedded = self.dropout(self.embedding(x))
        output, hidden = self.rnn(embedded, hidden)
        prediction = self.fc(output.squeeze(0))
        return prediction, hidden


In [None]:
def evaluate(encoder, decoder, dataloader, criterion, device):
    """Compute the average teacher-forced loss of the model over ``dataloader``.

    Both modules are switched to eval mode and gradients are disabled. For
    every batch the decoder is fed the gold previous token at each step and
    the per-step losses are summed.

    Args:
        encoder: module mapping ``(seq_len, batch)`` source ids to a hidden state.
        decoder: module called as ``decoder(prev_tokens, hidden)`` and
            returning ``(scores, hidden)``.
        dataloader: sized iterable of batch-first ``(src, tgt)`` id tensors.
        criterion: loss applied to ``(scores, gold_tokens)`` at every step.
        device: device the batches are moved to.

    Returns:
        Mean over batches of (summed step loss / padded target length).

    Raises:
        ValueError: if ``dataloader`` yields no batches (previously this
            surfaced as an unexplained ``ZeroDivisionError``).
    """
    num_batches = len(dataloader)
    if num_batches == 0:
        raise ValueError(
            "evaluate() received a dataloader with no batches; "
            "check the dataset size and the loader's drop_last setting"
        )

    encoder.eval()
    decoder.eval()
    total_loss = 0

    with torch.no_grad():
        for src, tgt in dataloader:
            # Batches arrive batch-first; the GRUs expect (seq_len, batch).
            src, tgt = src.transpose(0, 1).to(device), tgt.transpose(0, 1).to(device)
            num_sequences = src.size(1)

            hidden = encoder(src)
            dec_input = torch.full((num_sequences,), spanish_vocab["<SOS>"], dtype=torch.long, device=device)

            loss = 0
            # Position 0 of tgt is <SOS>, so prediction starts at position 1.
            for t in range(1, tgt.size(0)):
                output, hidden = decoder(dec_input, hidden)
                loss += criterion(output, tgt[t])
                dec_input = tgt[t]  # teacher forcing: feed the gold token

            # Normalised by the padded target length, matching the training
            # loop so the two losses are directly comparable.
            total_loss += loss.item() / tgt.size(0)

    return total_loss / num_batches


In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

hidden_size = 256
encoder = EncoderRNN(len(warao_vocab), hidden_size).to(device)
decoder = DecoderRNN(len(spanish_vocab), hidden_size).to(device)

# Padded target positions contribute nothing to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=spanish_vocab["<PAD>"])
optimizer = optim.Adam(list(encoder.parameters()) + list(decoder.parameters()), lr=0.0005, weight_decay=1e-5)


EPOCHS = 100

# Early-stopping state: stop once validation loss has failed to improve for
# `patience` consecutive epochs, and keep the weights of the best epoch.
best_val_loss = float('inf')
patience = 10
counter = 0
best_state = None

for epoch in range(EPOCHS):
    encoder.train()
    decoder.train()
    total_train_loss = 0
    for src, tgt in train_loader:
        optimizer.zero_grad()
        # Batches arrive batch-first; the GRUs expect (seq_len, batch).
        src = src.transpose(0, 1).to(device)
        tgt = tgt.transpose(0, 1).to(device)
        hidden = encoder(src)
        # Separate name so the module-level `batch_size` setting is not overwritten.
        num_sequences = src.size(1)
        dec_input = torch.full((num_sequences,), spanish_vocab["<SOS>"], dtype=torch.long, device=device)

        loss = 0
        # Position 0 of tgt is <SOS>, so prediction starts at position 1.
        for t in range(1, tgt.size(0)):
            output, hidden = decoder(dec_input, hidden)
            loss += criterion(output, tgt[t])
            dec_input = tgt[t]  # teacher forcing: feed the gold token
        loss.backward()
        optimizer.step()

        total_train_loss += loss.item() / tgt.size(0)

    val_loss = evaluate(encoder, decoder, val_loader, criterion, device)
    avg_train_loss = total_train_loss / len(train_loader)

    print(f"Epoch {epoch+1}/{EPOCHS}, Train Loss: {avg_train_loss:.4f}, Val Loss: {val_loss:.4f}")

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        counter = 0
        # Clone the tensors: state_dict() returns references that later
        # optimizer steps would otherwise keep modifying.
        best_state = {
            'encoder': {k: v.detach().clone() for k, v in encoder.state_dict().items()},
            'decoder': {k: v.detach().clone() for k, v in decoder.state_dict().items()},
        }
        torch.save(best_state, "best_model.pt")
    else:
        counter += 1
        if counter >= patience:
            print("Early stopping triggered.")
            break

# Continue with the best-validation weights rather than the last (possibly
# overfit) epoch.
if best_state is not None:
    encoder.load_state_dict(best_state['encoder'])
    decoder.load_state_dict(best_state['decoder'])

Epoch 1/100, Train Loss: 6.8884, Val Loss: 6.4728
Epoch 2/100, Train Loss: 5.9005, Val Loss: 6.1882
Epoch 3/100, Train Loss: 5.4720, Val Loss: 6.0358
Epoch 4/100, Train Loss: 5.1190, Val Loss: 5.9227
Epoch 5/100, Train Loss: 4.8165, Val Loss: 5.8489
Epoch 6/100, Train Loss: 4.5308, Val Loss: 5.7830
Epoch 7/100, Train Loss: 4.2623, Val Loss: 5.7722
Epoch 8/100, Train Loss: 4.0142, Val Loss: 5.7459
Epoch 9/100, Train Loss: 3.7669, Val Loss: 5.7191
Epoch 10/100, Train Loss: 3.5475, Val Loss: 5.7485
Epoch 11/100, Train Loss: 3.3351, Val Loss: 5.7361
Epoch 12/100, Train Loss: 3.1282, Val Loss: 5.7559
Epoch 13/100, Train Loss: 2.9393, Val Loss: 5.7825
Epoch 14/100, Train Loss: 2.7671, Val Loss: 5.7897
Epoch 15/100, Train Loss: 2.5984, Val Loss: 5.8114
Epoch 16/100, Train Loss: 2.4575, Val Loss: 5.8268
Epoch 17/100, Train Loss: 2.3153, Val Loss: 5.8689
Epoch 18/100, Train Loss: 2.1909, Val Loss: 5.8880
Epoch 19/100, Train Loss: 2.0911, Val Loss: 5.8702
Epoch 20/100, Train Loss: 1.9707, Val Lo

In [None]:
# checkpoint = torch.load("best_model.pt")
# encoder.load_state_dict(checkpoint['encoder'])
# decoder.load_state_dict(checkpoint['decoder'])

# Score the held-out test split with the encoder/decoder currently in memory.
test_loss = evaluate(
    encoder=encoder,
    decoder=decoder,
    dataloader=test_loader,
    criterion=criterion,
    device=device,
)
print(f"Final Test Loss: {test_loss:.4f}")


Final Test Loss: 8.4204


In [None]:
def translate(sentence, max_len=100):
    """Greedy-decode a Spanish translation of a Warao ``sentence``.

    Uses the module-level ``encoder``, ``decoder``, vocabularies and
    ``device``. Decoding stops at the first ``<EOS>`` prediction or after
    ``max_len`` steps, whichever comes first.

    Returns:
        The predicted tokens joined by single spaces.
    """
    id_to_word = {idx: word for word, idx in spanish_vocab.items()}
    eos_id = spanish_vocab["<EOS>"]
    decoded_words = []

    with torch.no_grad():
        # (len,) -> (1, len) -> (len, 1): a batch holding one sequence.
        source = sentence_to_tensor(sentence, warao_vocab).unsqueeze(0).to(device)
        source = source.transpose(0, 1)
        state = encoder(source)

        step_input = torch.tensor([spanish_vocab["<SOS>"]], device=device)
        steps_taken = 0
        while steps_taken < max_len:
            scores, state = decoder(step_input, state)
            best_id = scores.argmax(1).item()
            if best_id == eos_id:
                break
            decoded_words.append(id_to_word[best_id])
            # Feed the model's own prediction back in as the next input.
            step_input = torch.tensor([best_id], device=device)
            steps_taken += 1

    return " ".join(decoded_words)


In [None]:
# Spot check: translate one full Warao sentence.
print(
    translate(
        sentence="Aroko isia najoro-yakutai tai warao wisi tane abanaja. Tiarone aroko ekumo ejobo-yakutai taisike wisi tane abaya, tae."
    )
)

no es lo que entra en la boca lo que contamina al hombre; sino lo que sale de la boca, eso es lo que contamina al hombre.”


In [None]:
# Check whether the spot-check sentence is part of the training split
# (compared after stripping surrounding whitespace on both sides).
input_sentence = "Aroko isia najoro-yakutai tai warao wisi tane abanaja. Tiarone aroko ekumo ejobo-yakutai taisike wisi tane abaya, tae."

exists = input_sentence.strip() in (source.strip() for source, _ in train_pairs)
print("In training data:", exists)

In training data: True


In [None]:
# Spot check: translate a short two-word phrase.
print(translate(sentence="Bajuka sabuka"))

no tener comer.


In [None]:

# Fixed seed so the same sample is drawn on every run.
random.seed(21)

num_samples = 500
# Sample without replacement; capped at the split size so small splits work.
sampled_train_pairs = random.sample(train_pairs, min(num_samples, len(train_pairs)))

print(f"Selected {len(sampled_train_pairs)} random training pairs.")
# Show a short preview only; printing all sampled pairs floods the cell output.
print(sampled_train_pairs[:3])

NameError: name 'random' is not defined

In [None]:
# Fixed seed so the same sample is drawn on every run.
random.seed(21)

num_samples = 500
# Sample without replacement; capped at the split size so small splits work.
sampled_test_pairs = random.sample(test_pairs, min(num_samples, len(test_pairs)))

# The message previously said "training pairs" (copy-paste slip).
print(f"Selected {len(sampled_test_pairs)} random test pairs.")
# Show a short preview only; printing all sampled pairs floods the cell output.
print(sampled_test_pairs[:3])

In [None]:
def output(pairs):
    """Translate every Warao sentence in ``pairs`` and keep the reference beside it.

    Args:
        pairs: iterable of ``(warao_sentence, spanish_sentence)`` tuples.

    Returns:
        A list of ``(warao_sentence, predicted_translation, spanish_sentence)``
        tuples, one per input pair, in input order. If translating a sentence
        raises, the prediction slot holds ``"Error: <message>"`` instead.
    """
    translated_results = []

    # Iterate the argument: the loop previously read the global
    # `sampled_train_pairs`, so every call returned training-set results.
    for warao_sentence, spanish_sentence in pairs:
        try:
            predicted_translation = translate(warao_sentence)
        except Exception as e:
            # Best effort: record the failure and carry on with the rest.
            predicted_translation = f"Error: {e}"

        translated_results.append((warao_sentence, predicted_translation, spanish_sentence))
    return translated_results

# print(f"Generated translations for {len(translated_results)} pairs.")
# print(translated_results[1])

# df_results = pd.DataFrame(translated_results, columns=["warao", "predicted_spanish", "actual_spanish"])
# df_results.to_csv("warao_test_translation_results.csv", index=False)


Generated translations for 100 pairs.
('Jono jisaka isia ribu manamo ribia. Dio Karima jakutai tai yakeraja jae tane ribia. Tiarone warao Dio monuka nónae tiakutai kajono isia taisi yaribia.', 'con ella bendecimos a nuestro señor y padre, y con ella maldecimos a los hombres, que han sido hechos a la imagen de dios.', 'Con ella bendecimos a nuestro Señor y Padre, y con ella maldecimos a los hombres, que han sido hechos a la imagen de Dios.')


In [None]:
# Translate both samples, then write one CSV per split with the source
# sentence, the model's prediction and the reference translation.
train_output, test_output = output(sampled_train_pairs), output(sampled_test_pairs)

train_results = pd.DataFrame(
    data=train_output,
    columns=["warao", "predicted_spanish", "actual_spanish"],
)
train_results.to_csv(path_or_buf="warao_train_translation_results.csv", index=False)

test_results = pd.DataFrame(
    data=test_output,
    columns=["warao", "predicted_spanish", "actual_spanish"],
)
test_results.to_csv(path_or_buf="warao_test_translation_results.csv", index=False)