# Machine translation Nahuatl - Spanish

COURSE PROJECT LT2326
November 2021

### Part 1 - Data preparation

In [None]:
# pip install elotl

In [1]:
import csv
import string
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from torchtext.legacy.data import Field, TabularDataset, BucketIterator
from torchtext.data.metrics import bleu_score
import numpy as np
import pandas as pd
import elotl.corpus
import elotl.nahuatl.orthography
import random

In [2]:
# Training configuration read by the data iterators, the model constructors,
# the optimizer and the epoch loop further down.
hyperparameters = dict(
    epochs=20,
    batch_size=64,
    embedding_size=256,
    hidden_size=1024,
    learning_rate=0.001,
    num_layers=2,
    dropout=0.5,
)

In [3]:
# Pick the compute device. GPU index 3 is preferred (as in the original run),
# but only when that many GPUs exist; otherwise fall back to the default CUDA
# device, or to the CPU when CUDA is unavailable.
_PREFERRED_GPU = 3
if torch.cuda.is_available():
    device = f"cuda:{_PREFERRED_GPU}" if torch.cuda.device_count() > _PREFERRED_GPU else "cuda"
else:
    device = "cpu"

In [None]:
# Load the parallel corpus; the following cells read its "Nah" and "Esp" columns.
corpus = pd.read_csv("Axolotl.csv")

In [None]:
# These functions remove punctuation and normalize Nahuan spelling.

def normalize_nahuatl(x):
    """Return ``x`` as rewritten by elotl's Nahuatl ``Normalizer`` set up with "inali"."""
    return elotl.nahuatl.orthography.Normalizer("inali").normalize(x)

def remove_punct(x):
    """Return ``x`` without ASCII punctuation and without the inverted question mark.

    Args:
        x: The string to clean.

    Returns:
        A copy of ``x`` with every character of ``string.punctuation`` and
        every '¿' removed; all other characters are kept in order.
    """
    # The deletion table is built from a local string. The previous version
    # appended '¿' to the global ``string.punctuation`` on every call, which
    # made that module constant grow without bound and leaked the change to
    # every other user of the ``string`` module.
    return x.translate(str.maketrans('', '', string.punctuation + '¿'))

In [None]:
# Nahuatl side: normalize the orthography first, then strip punctuation.
corpus['Nah'] = corpus['Nah'].apply(normalize_nahuatl).apply(remove_punct)
# Spanish side: only punctuation is stripped.
corpus['Esp'] = corpus['Esp'].apply(remove_punct)

In [None]:
corpus

In [None]:
s = corpus.Esp.str.len().sort_values().index
s

In [None]:
corpus.reindex(s)

In [None]:
# Print every Spanish sentence with more than 75 words, in the order given by
# the index `s`, each followed by a blank line, and count them in `n`.
n = 0
for esp_text in corpus.reindex(s)["Esp"]:
    if len(esp_text.split()) <= 75:
        continue
    n += 1
    print(esp_text)
    print()
n

In [None]:
# This code removes the sentence pairs (824 in the original run) with more than
# 75 words in their Spanish version.
#
# A single boolean mask is used instead of calling `corpus.drop(idx)` once per
# long sentence: every `drop` copies the whole frame, so the loop was quadratic
# in practice. The surviving rows and their order are unchanged.
spanish_word_counts = corpus["Esp"].str.split().str.len()
corpus = corpus[spanish_word_counts <= 75]

In [None]:
# Check the filter: print any remaining Spanish sentence with more than
# 75 words (each followed by a blank line) and count them in `n`.
n = 0
for esp_text in corpus["Esp"]:
    if len(esp_text.split()) <= 75:
        continue
    n += 1
    print(esp_text)
    print()
n

In [None]:
# Shuffle with a fixed seed so that re-running the notebook reproduces the same
# train/validation/test split. With an unseeded shuffle every run produced a
# different split, so a previously saved model could end up being evaluated on
# sentences it was trained on.
corpus = shuffle(corpus, random_state=42)
corpus.reset_index(inplace=True, drop=True)

In [None]:
len(corpus)

In [None]:
# Positional split of the shuffled corpus: the first 16000 rows are for
# training, the next 500 for validation and whatever remains for testing.
train_df = corpus.iloc[:16000]
val_df = corpus.iloc[16000:16500]
test_df = corpus.iloc[16500:]

In [None]:
# Row counts of the train, validation and test splits, one per line.
for split_df in (train_df, val_df, test_df):
    print(len(split_df))

In [None]:
corpus

In [None]:
# Write each split to its own CSV file without the index column.
for split_df, csv_name in ((train_df, "train.csv"), (val_df, "val.csv"), (test_df, "test.csv")):
    split_df.to_csv(csv_name, index=False)

In [None]:
# Number of whitespace-separated words in every Spanish sentence.
x = np.asarray([len(sent.split()) for sent in corpus["Esp"]])

# Bin edges: steps of 5 up to 50, steps of 10 up to 150, steps of 25 up to 300.
bin_edges = [*range(0, 50, 5), *range(50, 150, 10), *range(150, 301, 25)]

fig, ax = plt.subplots()
ax.hist(x, bins=bin_edges)
ax.set_title("Lengths of Spanish sentences")
ax.set_xlabel("Number of words in sentence")
ax.set_ylabel("Number of sentences")
plt.show()

longest = x.max()
print(f"Longest sentence in corpus: {longest} words")

In [1]:
# This function opens the csv-files with the datasets and creates train, validation and test iterators.
# The Spanish and Nahuan sentences are tokenized and their vocabularies created where each unique token is assigned an index.

def get_data():
    """Load the three CSV splits and wrap them in bucketed torchtext iterators.

    Both languages use the same field settings: text is split on single
    spaces, lowercased, and wrapped in "<start>" / "<end>" tokens. The
    vocabularies are built from the training and validation sets and keep
    tokens that occur at least twice. Every iterator shuffles its data and
    sorts each batch by the length of the Nahuatl sentence. The batch size is
    read from the global ``hyperparameters`` and tensors are placed on the
    global ``device``.

    Returns:
        A tuple ``(train_iter, val_iter, test_iter, NAHUATL, SPANISH)``.
    """
    def split_on_spaces(text):
        return text.split(' ')

    def make_field():
        # Identical configuration for the Spanish and the Nahuatl field.
        return Field(
            tokenize=split_on_spaces,
            lower=True,
            batch_first=False,
            init_token="<start>",
            eos_token="<end>",
        )

    def make_iterator(dataset):
        # Identical configuration for the train, validation and test iterators.
        return BucketIterator(
            dataset,
            batch_size=hyperparameters["batch_size"],
            sort_within_batch=True,
            sort_key=lambda example: len(example.nahuatl),
            shuffle=True,
            device=device,
        )

    SPANISH = make_field()
    NAHUATL = make_field()

    # NOTE(review): the fields are given in the order (spanish, nahuatl), so
    # the CSV files are presumably expected to hold the Spanish text in the
    # first column and the Nahuatl text in the second - confirm against the
    # column order of Axolotl.csv.
    train, val, test = TabularDataset.splits(
        path="./",
        train="train.csv",
        validation="val.csv",
        test="test.csv",
        format="csv",
        fields=[("spanish", SPANISH), ("nahuatl", NAHUATL)],
        skip_header=True,
    )

    for field in (SPANISH, NAHUATL):
        field.build_vocab(train, val, min_freq=2)

    train_iter = make_iterator(train)
    val_iter = make_iterator(val)
    test_iter = make_iterator(test)

    return train_iter, val_iter, test_iter, NAHUATL, SPANISH

In [5]:
train_iter, val_iter, test_iter, NAHUATL, SPANISH = get_data()

In [6]:
print(NAHUATL.vocab.stoi["nikan"])

19


In [19]:
print(NAHUATL.vocab.itos[19])

nikan


In [8]:
print(SPANISH.vocab.stoi["aguacate"])

3543


In [9]:
print(SPANISH.vocab.itos[3543])

aguacate


In [10]:
len(SPANISH.vocab)

10451

In [11]:
len(NAHUATL.vocab)

12845

In [21]:
print(len(train_iter))

250


In [22]:
print(len(val_iter))

8


In [23]:
print(len(test_iter))

9


### Part 2 - Building and training the model

In [None]:
class Encoder(nn.Module):
    """Embed source token indices and run them through a multi-layer LSTM.

    Only the final hidden and cell states of the LSTM are returned; its
    per-step outputs are discarded.
    """

    def __init__(self, vocab_size, emb_size, hidden_size, num_layers, drop):
        super().__init__()
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        # The same dropout probability is applied to the embeddings and
        # between the stacked LSTM layers.
        self.dropout = nn.Dropout(p=drop)
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=emb_size)
        self.lstm = nn.LSTM(
            input_size=emb_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=drop,
        )

    def forward(self, x):
        """Return the ``(hidden, cell)`` states after reading the index tensor ``x``."""
        embedded = self.dropout(self.embedding(x))
        _, (hidden, cell) = self.lstm(embedded)
        return hidden, cell

In [None]:
class Decoder(nn.Module):
    """Predict scores over the target vocabulary for one decoding step.

    Each call consumes one token index per batch element together with the
    previous LSTM state, and returns the vocabulary scores for the next token
    plus the updated state.
    """

    def __init__(self, vocab_size, emb_size, hidden_size, num_layers, drop):
        super().__init__()
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.dropout = nn.Dropout(p=drop)
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=emb_size)
        self.lstm = nn.LSTM(
            input_size=emb_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=drop,
        )
        # Projects the LSTM output onto the target vocabulary.
        self.fc = nn.Linear(in_features=hidden_size, out_features=vocab_size)

    def forward(self, x, hidden, cell):
        """Run one step and return ``(scores, hidden, cell)``."""
        # Add a leading length-1 dimension so the LSTM sees a one-step sequence.
        step_input = self.dropout(self.embedding(x.unsqueeze(0)))
        output, (hidden, cell) = self.lstm(step_input, (hidden, cell))
        # Drop the length-1 dimension again after the projection.
        scores = self.fc(output).squeeze(0)
        return scores, hidden, cell

In [None]:
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes a target sequence step by step."""

    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, source, target, teacher_force_ratio=0.5):
        """Encode ``source`` and decode one score vector per target position.

        Args:
            source: Index tensor whose dimension 1 is the batch dimension.
            target: Index tensor whose dimension 0 is the sequence position
                and whose dimension 1 is the batch dimension. ``target[0]``
                is used as the first decoder input.
            teacher_force_ratio: Probability, drawn independently at every
                step, of feeding the ground-truth token ``target[t]`` to the
                next step instead of the model's own prediction.

        Returns:
            A tensor of shape ``(target_length, batch_size, vocab_size)``
            holding the decoder scores. Position 0 is never written and stays
            all zeros.
        """
        batch_size = source.shape[1]
        target_length = target.shape[0]
        target_vocab_size = self.decoder.vocab_size

        # Allocate on the device of the input rather than on a module-level
        # `device` global, so the model works wherever its inputs live and
        # does not depend on a name defined elsewhere in the notebook.
        sentence = torch.zeros(
            target_length, batch_size, target_vocab_size, device=source.device
        )

        hidden, cell = self.encoder(source)
        x = target[0]

        for t in range(1, target_length):
            output, hidden, cell = self.decoder(x, hidden, cell)
            sentence[t] = output
            teacher_force = random.random() < teacher_force_ratio
            predicted_word = output.argmax(1)
            x = target[t] if teacher_force else predicted_word

        return sentence

In [None]:
def plot_loss(epochs, train_loss, val_loss):
    """Plot the training and validation loss against ``epochs`` and show the figure."""
    curves = ((train_loss, "Training Loss"), (val_loss, "Validation Loss"))
    for values, legend_label in curves:
        plt.plot(epochs, values, label=legend_label)
    plt.title("Training and Validation Loss")
    plt.xlabel("Epochs")
    plt.ylabel("Loss")
    plt.legend()
    plt.show()

In [None]:
# Build the model: the encoder reads Nahuatl, the decoder writes Spanish.
encoder = Encoder(len(NAHUATL.vocab),
                  hyperparameters["embedding_size"],
                  hyperparameters["hidden_size"],
                  hyperparameters["num_layers"],
                  hyperparameters["dropout"]).to(device)

decoder = Decoder(len(SPANISH.vocab),
                  hyperparameters["embedding_size"],
                  hyperparameters["hidden_size"],
                  hyperparameters["num_layers"],
                  hyperparameters["dropout"]).to(device)

seq2seq = Seq2Seq(encoder, decoder).to(device)

# Padding positions in the Spanish targets are excluded from the loss.
padding_index = SPANISH.vocab.stoi[SPANISH.pad_token]
loss_fn = nn.CrossEntropyLoss(ignore_index=padding_index)
optimizer = optim.Adam(
    seq2seq.parameters(),
    lr=hyperparameters["learning_rate"]
)

epoch_list = []
val_loss_list = []
train_loss_list = []
total_loss = 0  # running sum of every training batch loss over all epochs

for epoch in range(hyperparameters["epochs"]):

    # TRAIN LOOP
    training_loss = 0
    seq2seq.train()

    for batch in train_iter:

        source = batch.nahuatl.to(device)
        target = batch.spanish.to(device)

        # Clear gradients before the forward/backward pass of this batch.
        optimizer.zero_grad()

        output = seq2seq(source, target)
        # Position 0 holds the start token and is never predicted, so it is
        # skipped on both sides before flattening for the loss.
        output_reshaped = output[1:].reshape(-1, output.shape[2])
        target_reshaped = target[1:].reshape(-1)
        loss = loss_fn(output_reshaped, target_reshaped)

        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        total_loss += batch_loss
        training_loss += batch_loss

    # VALIDATION LOOP
    validation_loss = 0
    seq2seq.eval()

    # No gradients are needed here; without no_grad() every validation batch
    # built an autograd graph that was never used.
    with torch.no_grad():
        for batch in val_iter:
            source = batch.nahuatl.to(device)
            target = batch.spanish.to(device)
            # Teacher forcing is switched off so the validation loss reflects
            # the model's own predictions and does not vary with the random
            # teacher-forcing draws.
            output = seq2seq(source, target, teacher_force_ratio=0)
            output_reshaped = output[1:].reshape(-1, output.shape[2])
            target_reshaped = target[1:].reshape(-1)
            loss = loss_fn(output_reshaped, target_reshaped)
            validation_loss += loss.item()

    epoch_list.append(epoch+1)
    training_loss_avg = training_loss/len(train_iter)
    train_loss_list.append(training_loss_avg)
    validation_loss_avg = validation_loss/len(val_iter)
    val_loss_list.append(validation_loss_avg)

    print("Epoch: {}".format(epoch+1))
    print("Training loss: {}".format(training_loss_avg))
    print("Validation loss: {}".format(validation_loss_avg))

plot_loss(epoch_list, train_loss_list, val_loss_list)

In [None]:
# torch.save(seq2seq.state_dict(), "Seq2Seq")
# torch.save(encoder.state_dict(), "Encoder")
# torch.save(decoder.state_dict(), "Decoder")

### Part 3 - Evaluating the model

The correct hyperparameters for loading the model are:

hyperparameters = {"epochs":30,
                   "batch_size":64,
                   "embedding_size":256,
                   "hidden_size":1024,
                   "learning_rate":0.001,
                   "num_layers":2,
                   "dropout":0.5}

In [None]:
# Set load_model to True if you want to load the saved model
load_model = False

In [None]:
if load_model:
    # Rebuild the architecture with the same sizes the checkpoints were
    # saved with, then restore the weights.
    encoder = Encoder(len(NAHUATL.vocab),
                  hyperparameters["embedding_size"],
                  hyperparameters["hidden_size"],
                  hyperparameters["num_layers"],
                  hyperparameters["dropout"]).to(device)
    decoder = Decoder(len(SPANISH.vocab),
                  hyperparameters["embedding_size"],
                  hyperparameters["hidden_size"],
                  hyperparameters["num_layers"],
                  hyperparameters["dropout"]).to(device)

    # map_location remaps the stored tensors onto the current device, so a
    # checkpoint saved on a GPU can also be loaded on a machine without that
    # GPU (or without CUDA at all).
    encoder.load_state_dict(torch.load("Encoder", map_location=device))
    decoder.load_state_dict(torch.load("Decoder", map_location=device))
    seq2seq = Seq2Seq(encoder, decoder).to(device)
    seq2seq.load_state_dict(torch.load("Seq2Seq", map_location=device))

In [None]:
# Average cross-entropy loss of the model on the test set.
seq2seq.eval()
total_loss = 0

# Padding positions in the Spanish targets are excluded from the loss.
padding_index = SPANISH.vocab.stoi[SPANISH.pad_token]
loss_fn = nn.CrossEntropyLoss(ignore_index = padding_index)

with torch.no_grad():
    for i, batch in enumerate(test_iter):

        source = batch.nahuatl.to(device)
        target = batch.spanish.to(device)
        # Teacher forcing is switched off: with the default ratio of 0.5 the
        # decoder was fed ground-truth tokens at random steps, which made the
        # test loss non-deterministic and not representative of inference.
        output = seq2seq(source, target, teacher_force_ratio=0)
        # Position 0 holds the start token and is never predicted.
        output_reshaped = output[1:].reshape(-1, output.shape[2])
        target_reshaped = target[1:].reshape(-1)

        loss = loss_fn(output_reshaped, target_reshaped)
        total_loss += loss.item()
        # Running mean over the batches seen so far, overwritten in place.
        print(f"Loss is: {total_loss/(i+1)}", end='\r')

In [None]:
# Below are some sentences from the test data to test the model on

In [None]:
test_sentence_1 = "miak otik yekitayah noso se tonali oixpolo"
# correct: la queríamos mucho pero un día desapareció

In [None]:
test_sentence_2 = "se kitoka iteyotsin"
# correct: se siembra la semillita

In [None]:
test_sentence_3 = "wan itakilo tein chikawak semi welik mah se kimana wan mah se kitsopeli ika panela"
# correct: su fruto recio es muy sabroso hervido y endulzado con piloncillo

In [None]:
# This function translates a Nahuan sentence to Spanish

def nahuatl_to_spanish(sentence, max_length=None):
    """Translate one Nahuatl sentence into Spanish tokens by greedy decoding.

    The sentence is normalized with elotl's "inali" normalizer, split on
    single spaces, lowercased and wrapped in the Nahuatl field's start/end
    tokens before being encoded by the global ``seq2seq`` model. The decoder
    is then run one step at a time, always taking the highest-scoring token,
    until it emits the end token or the step limit is reached.

    Args:
        sentence: The Nahuatl sentence as a plain string.
        max_length: Maximum number of decoding steps. Defaults to three times
            the number of characters in ``sentence``, as before.

    Returns:
        A tuple ``(source_tokens, translated_tokens)``: the preprocessed
        Nahuatl tokens and the generated Spanish tokens, both without start
        and end tokens.
    """
    normalizer = elotl.nahuatl.orthography.Normalizer("inali")
    normalized_sentence = normalizer.normalize(sentence)
    # `token` instead of `string` so the `string` module is not shadowed.
    tokenized_sentence = [token.lower() for token in normalized_sentence.split(' ')]
    tokenized_sentence.insert(0, NAHUATL.init_token)
    tokenized_sentence.append(NAHUATL.eos_token)
    indiced_sentence = [NAHUATL.vocab.stoi[token] for token in tokenized_sentence]

    sentence_tensor = torch.LongTensor(indiced_sentence).unsqueeze(1).to(device)

    if max_length is None:
        max_length = len(sentence) * 3

    # The decoder works in the Spanish vocabulary, so its start and end
    # indices are looked up there (the start index used to be taken from the
    # Nahuatl vocabulary).
    start_index = SPANISH.vocab.stoi[SPANISH.init_token]
    end_index = SPANISH.vocab.stoi[SPANISH.eos_token]
    outputs = [start_index]

    with torch.no_grad():
        hidden, cell = seq2seq.encoder(sentence_tensor)

        for _ in range(max_length):
            previous_word = torch.LongTensor([outputs[-1]]).to(device)
            output, hidden, cell = seq2seq.decoder(previous_word, hidden, cell)
            best_guess = output.argmax(1).item()

            # Stop before storing the end token. The old code always sliced
            # off the last element, which discarded a real word whenever
            # decoding stopped at the step limit instead of at the end token.
            if best_guess == end_index:
                break

            outputs.append(best_guess)

    # Skip the leading start token; no end token was ever appended.
    translated_sentence = [SPANISH.vocab.itos[idx] for idx in outputs[1:]]

    return tokenized_sentence[1:-1], translated_sentence

In [None]:
source, target = nahuatl_to_spanish(test_sentence_1)

In [None]:
source

In [None]:
target

In [None]:
# This function calculates BLEU score on the test data by comparing the generated translations 
# with the authentic ones from the corpus.

def calculate_bleu_score(test_data):
    """Translate every row of ``test_data`` and score the output with BLEU.

    Args:
        test_data: A DataFrame with a "Nah" column (Nahuatl source text) and
            an "Esp" column (Spanish reference translation).

    Returns:
        A tuple ``(candidate_corpus, reference_corpus, sources, bleu)``: the
        generated token lists, the reference token lists (one reference per
        sentence), the preprocessed source token lists and the BLEU score.
    """
    candidate_corpus = []
    reference_corpus = []
    sources = []

    for _, row in test_data.iterrows():
        source, target = nahuatl_to_spanish(row["Nah"])
        sources.append(source)
        candidate_corpus.append(target)
        # The Spanish field is built with lower=True, so generated tokens are
        # always lowercase. The reference is lowercased as well; otherwise
        # every capitalized reference word counts as a mismatch and lowers
        # the score.
        reference_corpus.append([row["Esp"].lower().split()])

    bleu = bleu_score(candidate_corpus, reference_corpus)

    print(f"Bleu score: {bleu}")

    return candidate_corpus, reference_corpus, sources, bleu

In [None]:
# Reload the test split written during data preparation and score the model on it.
test_data = pd.read_csv("test.csv")
candidate_corpus, reference_corpus, sources, bleu = calculate_bleu_score(test_data)

The code below is used to investigate the lengths of the sentences in the corpus and to calculate the BLEU score for sentences of different lengths.

In [None]:
corpus = pd.read_csv("Axolotl.csv")

In [None]:
# Word count of every Spanish sentence in the freshly loaded corpus; print the largest.
x = np.asarray(list(map(lambda sent: len(sent.split()), corpus["Esp"])))
print(x.max())

In [None]:
# One entry (the value 1) per Spanish sentence with more than 200 words.
extremely_long_sentences = np.asarray(
    [1 for word_count in (len(sent.split()) for sent in corpus["Esp"]) if word_count > 200]
)

In [None]:
extremely_long_sentences.sum()

In [None]:
# One entry (the value 1) per Spanish sentence with more than 100 words.
very_long_sentences = np.asarray(
    [1 for word_count in (len(sent.split()) for sent in corpus["Esp"]) if word_count > 100]
)

In [None]:
very_long_sentences.sum()

In [None]:
# One entry (the value 1) per Spanish sentence with more than 50 words.
long_sentences = np.asarray(
    [1 for word_count in (len(sent.split()) for sent in corpus["Esp"]) if word_count > 50]
)

In [None]:
long_sentences.sum()

In [None]:
# One entry (the value 1) per Spanish sentence consisting of exactly one word.
one_word_sentences = np.asarray(
    [1 for word_count in (len(sent.split()) for sent in corpus["Esp"]) if word_count == 1]
)

In [None]:
one_word_sentences.sum()

In [None]:
# Row labels of test_data ordered by the character length of the Spanish text, shortest first.
s = test_data.Esp.str.len().sort_values().index
s

In [None]:
test_data.reindex(s)

In [None]:
# Row labels of test_data ordered by the character length of the Spanish text, shortest first.
s = test_data.Esp.str.len().sort_values().index
s

In [None]:
# Number of test sentences whose Spanish side has more than 75 words.
n = sum(1 for esp_text in test_data.reindex(s)["Esp"] if len(esp_text.split()) > 75)
n

In [None]:
calculate_bleu_score(test_data.reindex(s)[:100])

In [None]:
calculate_bleu_score(test_data.reindex(s)[100:200])

In [None]:
calculate_bleu_score(test_data.reindex(s)[200:300])

In [None]:
calculate_bleu_score(test_data.reindex(s)[300:400])

In [None]:
calculate_bleu_score(test_data.reindex(s)[400:500])

In [None]:
calculate_bleu_score(test_data.reindex(s)[500:])

### References

Grupo de Ingeniería Lingüística GIL, UNAM. (2015). Corpus paralelo español-náhuatl. [Dataset]. http://www.corpus.unam.mx/axolotl

Trevett, B. (2018). Sequence to Sequence Learning with Neural Networks. [Jupyter Notebook]. https://github.com/bentrevett/pytorch-seq2seq/blob/master/1%20-%20Sequence%20to%20Sequence%20Learning%20with%20Neural%20Networks.ipynb