# Machine translation Nahuatl - Spanish

Data from Axolotl parallel corpus: https://axolotl-corpus.mx/

COURSE PROJECT LT2326

Oct 2021

Klara Båstedt

### Part 1 - Data preparation

The data is a parallel corpus of texts in Spanish and Nahuatl consisting of almost 18000 parallel sentences.

Spelling normalization: https://pypi.org/project/elotl/

In [None]:
import csv
import string
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from torchtext.data import Field, BucketIterator, TabularDataset
import numpy as np
import pandas as pd
import elotl.corpus
import elotl.nahuatl.orthography
import random

In [None]:
# Install the elotl package, which provides the elotl.corpus and
# elotl.nahuatl.orthography modules imported in the cell above.
# On a fresh environment this cell has to run before that import cell.
pip install elotl

In [None]:
# Training and model-size settings; the data, model and optimizer cells
# read them by key.
hyperparameters = dict(
    epochs=3,
    batch_size=16,
    embedding_size=128,
    hidden_size=1024,
    learning_rate=0.001,
    num_layers=2,
    dropout=0.5,
)

In [None]:
# Load the parallel corpus into a DataFrame; later cells read and rewrite
# its 'Nah' and 'Esp' columns.
corpus = pd.read_csv("Axolotl.csv")

In [None]:
def normalize_nahuatl(x):
    """Return *x* after spelling normalization by elotl's Nahuatl Normalizer.

    A fresh ``Normalizer("sep")`` is constructed on every call.
    """
    return elotl.nahuatl.orthography.Normalizer("sep").normalize(x)

def remove_punct(x, extra_chars='¿'):
    """Return *x* lowercased with punctuation characters removed.

    Args:
        x: The string to clean.
        extra_chars: Characters to strip in addition to the ASCII set in
            ``string.punctuation``. Defaults to the Spanish inverted
            question mark; pass e.g. ``'¿¡'`` to also strip the inverted
            exclamation mark.

    Returns:
        The cleaned, lowercased string.
    """
    # Build the deletion table locally so the shared ``string.punctuation``
    # constant is never modified (it used to grow by one character per call).
    deletion_table = str.maketrans('', '', string.punctuation + extra_chars)
    return x.translate(deletion_table).lower()

In [None]:
# The Spanish side only has punctuation stripped and is lowercased.
corpus['Esp'] = corpus['Esp'].apply(remove_punct)
# The Nahuatl side is spelling-normalized first, then cleaned the same way.
corpus['Nah'] = corpus['Nah'].apply(normalize_nahuatl).apply(remove_punct)

In [None]:
# Put the rows in random order, then replace the old index with a fresh
# 0..n-1 one (the old index is dropped, not kept as a column).
corpus = shuffle(corpus).reset_index(drop=True)

In [None]:
# Positional split of the shuffled corpus: the first 16995 rows train the
# model, the next 450 are for validation, and whatever remains is the test set.
train_df = corpus.iloc[:16995]
val_df = corpus.iloc[16995:17445]
test_df = corpus.iloc[17445:]

In [None]:
# Write each split to its own CSV file without the index column.
for split_frame, split_path in (
    (train_df, "train.csv"),
    (val_df, "val.csv"),
    (test_df, "test.csv"),
):
    split_frame.to_csv(split_path, index=False)

In [None]:
# Use the first CUDA device when PyTorch reports one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

In [None]:
def get_data():
    """Build the torchtext fields, datasets and batch iterators.

    Reads ``train.csv``, ``val.csv`` and ``test.csv`` from the current
    directory (header row skipped), builds both vocabularies from the
    training and validation splits, and wraps each split in a
    ``BucketIterator`` placed on the module-level ``device``.

    Returns:
        The tuple ``(train_iter, val_iter, test_iter, NAHUATL, SPANISH)``.
    """
    def split_on_space(text):
        return text.split(' ')

    def make_field():
        # Both languages share the same settings: whitespace tokens,
        # lowercased, sequence-first tensors, explicit start/end markers.
        return Field(
            tokenize=split_on_space,
            lower=True,
            batch_first=False,
            init_token='<start>',
            eos_token='<end>',
        )

    SPANISH = make_field()
    NAHUATL = make_field()

    # NOTE(review): the fields are listed as (spanish, nahuatl); confirm this
    # matches the column order of the CSV files written earlier.
    train_set, val_set, test_set = TabularDataset.splits(
        path='./',
        train='train.csv',
        validation='val.csv',
        test='test.csv',
        format='csv',
        fields=[('spanish', SPANISH), ('nahuatl', NAHUATL)],
        skip_header=True,
    )

    for field in (SPANISH, NAHUATL):
        field.build_vocab(train_set, val_set)

    def bucketed(dataset):
        # Batches are sorted internally by the Nahuatl sentence length.
        return BucketIterator(
            dataset,
            batch_size=hyperparameters['batch_size'],
            sort_within_batch=True,
            sort_key=lambda example: len(example.nahuatl),
            shuffle=True,
            device=device,
        )

    train_iter = bucketed(train_set)
    val_iter = bucketed(val_set)
    test_iter = bucketed(test_set)

    return train_iter, val_iter, test_iter, NAHUATL, SPANISH

SPANISH.build_vocab(train, val, max_size=10000, min_freq=2)
NAHUATL.build_vocab(train, val, max_size=10000, min_freq=2)

With min_freq=2, the vocabulary sizes len(SPANISH.vocab) and len(NAHUATL.vocab) are 13982 and 16399, respectively.
Nahuatl is agglutinative while Spanish is not. This explains why the vocabulary sizes grow to 27313 and 49302 when min_freq=1. I don't think min_freq=2 works well when one of the languages is agglutinative: we need our network to have access to as many Nahuatl "words" as possible. For the same reason, it is not a good idea to cap the two vocabularies at the same size.

  

In [None]:
# Build the three batch iterators and the two fields; the fields carry the
# vocabularies that the model cells below size their embeddings from.
train_iter, val_iter, test_iter, NAHUATL, SPANISH = get_data()

In [None]:
# Sanity check: print the integer id the Nahuatl vocabulary maps "auakatl" to.
print(NAHUATL.vocab.stoi["auakatl"])

In [None]:
# Sanity check: print the Nahuatl token stored at vocabulary index 17656.
# NOTE(review): the index is hard-coded and presumably came from one run's vocabulary; confirm.
print(NAHUATL.vocab.itos[17656])

In [None]:
# Sanity check: print the integer id the Spanish vocabulary maps "aguacate" to.
print(SPANISH.vocab.stoi["aguacate"])

In [None]:
# Sanity check: print the Spanish token stored at vocabulary index 5543.
# NOTE(review): the index is hard-coded and presumably came from one run's vocabulary; confirm.
print(SPANISH.vocab.itos[5543])

In [None]:
# Number of entries in the Spanish vocabulary (the decoder's output size below).
len(SPANISH.vocab)

In [None]:
# Number of entries in the Nahuatl vocabulary (the encoder's embedding size below).
len(NAHUATL.vocab)

### Part 2 - Building and training the model

In [None]:
# TODO: try including validation in the training loop and stop training when the loss no longer decreases
# TODO: try including an example sentence and watch its translation improve with each epoch

In [None]:
class Encoder(nn.Module):
    """LSTM encoder: embeds source token ids and returns the final LSTM state.

    Args:
        vocab_size: Number of rows in the embedding table.
        emb_size: Width of each embedding vector.
        hidden_size: Width of the LSTM hidden and cell states.
        num_layers: Number of stacked LSTM layers.
        drop: Dropout probability, used on the embeddings and passed to the LSTM.
    """

    def __init__(self, vocab_size, emb_size, hidden_size, num_layers, drop):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.dropout = nn.Dropout(p=drop)
        self.embedding = nn.Embedding(vocab_size, emb_size)
        self.lstm = nn.LSTM(
            input_size=emb_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=drop,
        )

    def forward(self, x):
        """Encode a batch of token ids and return ``(hidden, cell)``.

        ``x`` is expected as (length, batch_size); the per-step LSTM outputs
        are discarded and only the final state is returned.
        """
        # (length, batch_size) -> (length, batch_size, emb_size)
        embedded = self.dropout(self.embedding(x))
        _, (hidden, cell) = self.lstm(embedded)
        return hidden, cell

In [None]:
class Decoder(nn.Module):
    """LSTM decoder that predicts one target token per call.

    Args:
        vocab_size: Size of the target vocabulary (embedding rows and output width).
        emb_size: Width of each embedding vector.
        hidden_size: Width of the LSTM hidden and cell states.
        num_layers: Number of stacked LSTM layers.
        drop: Dropout probability, used on the embeddings and passed to the LSTM.
    """

    def __init__(self, vocab_size, emb_size, hidden_size, num_layers, drop):
        super().__init__()
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size
        self.num_layers = num_layers
        self.dropout = nn.Dropout(p=drop)
        self.embedding = nn.Embedding(vocab_size, emb_size)
        self.lstm = nn.LSTM(
            input_size=emb_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=drop,
        )
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x, hidden, cell):
        """Run a single decoding step.

        Args:
            x: Token ids for the current step, one per batch element, shape (N,).
            hidden: LSTM hidden state from the previous step (or the encoder).
            cell: LSTM cell state from the previous step (or the encoder).

        Returns:
            ``(scores, hidden, cell)`` where ``scores`` has one row of
            vocabulary scores per batch element.
        """
        # The LSTM wants a leading sequence axis: (N,) -> (1, N).
        step_input = x.unsqueeze(0)
        embedded = self.dropout(self.embedding(step_input))
        lstm_out, (hidden, cell) = self.lstm(embedded, (hidden, cell))
        # Project to vocabulary scores and drop the length-1 sequence axis again.
        scores = self.fc(lstm_out).squeeze(0)
        return scores, hidden, cell

In [None]:
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes the target one step at a time.

    Args:
        encoder: Module whose call returns ``(hidden, cell)`` for a source batch.
        decoder: Module exposing ``vocab_size`` and whose call takes
            ``(tokens, hidden, cell)`` and returns ``(scores, hidden, cell)``.
    """

    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, source, target, teacher_force_ratio=0.5):
        """Return per-step vocabulary scores for the target sequence.

        Args:
            source: Source token ids, indexed as (length, batch_size).
            target: Target token ids, indexed as (length, batch_size); row 0
                is fed to the decoder as the first input.
            teacher_force_ratio: Probability, drawn once per step, of feeding
                the reference token instead of the model's own prediction
                as the next decoder input.

        Returns:
            A tensor of shape (target_length, batch_size, vocab_size). Row 0
            is never written and stays all zeros.
        """
        batch_size = source.shape[1]
        target_length = target.shape[0]
        target_vocab_size = self.decoder.vocab_size

        # Allocate on the input's device instead of a module-level global, so
        # the class works wherever the model and its batch happen to live.
        outputs = torch.zeros(
            target_length, batch_size, target_vocab_size, device=source.device
        )

        hidden, cell = self.encoder(source)
        step_input = target[0]

        for t in range(1, target_length):
            scores, hidden, cell = self.decoder(step_input, hidden, cell)
            outputs[t] = scores
            use_reference = random.random() < teacher_force_ratio
            predicted_word = scores.argmax(1)
            step_input = target[t] if use_reference else predicted_word

        return outputs

In [None]:
# The encoder reads Nahuatl and the decoder writes Spanish, so each one is
# sized by its own vocabulary.
encoder = Encoder(len(NAHUATL.vocab),
                  hyperparameters["embedding_size"],
                  hyperparameters["hidden_size"],
                  hyperparameters["num_layers"],
                  hyperparameters["dropout"]).to(device)

decoder = Decoder(len(SPANISH.vocab),
                  hyperparameters["embedding_size"],
                  hyperparameters["hidden_size"],
                  hyperparameters["num_layers"],
                  hyperparameters["dropout"]).to(device)

seq2seq = Seq2Seq(encoder, decoder).to(device)

# Exclude padding positions from the loss: they only exist to make the
# sentences in a batch equally long.
pad_index = SPANISH.vocab.stoi[SPANISH.pad_token]
loss_fn = nn.CrossEntropyLoss(ignore_index=pad_index)
optimizer = optim.Adam(
    seq2seq.parameters(),
    lr=hyperparameters['learning_rate']
)

# start training loop
total_loss = 0
for epoch in range(hyperparameters['epochs']):
    # Make sure dropout is active, even if an evaluation cell was run before.
    seq2seq.train()
    # Restart the running sum so the printed value is this epoch's mean loss.
    total_loss = 0
    for i, batch in enumerate(train_iter):
        source = batch.nahuatl.to(device)
        target = batch.spanish.to(device)
        optimizer.zero_grad()
        output = seq2seq(source, target)
        # shape (trglength, batchsize, outputdim)
        # Position 0 is skipped on both sides: the model never writes output[0].
        loss = loss_fn(output[1:].reshape(-1, output.shape[2]), target[1:].reshape(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        print(f"Loss in epoch {epoch+1} is:", total_loss/(i+1), end='\r')

In [None]:
# Save the trained weights. The file name lists every hyperparameter as
# "key_value", joined by "|".
# The previous names lstm_model / model_hyperparameters are not defined anywhere
# in this notebook; the model is `seq2seq` and the settings are `hyperparameters`.
torch.save(seq2seq.state_dict(), "|".join([f"{k}_{v}" for k, v in hyperparameters.items()]))

### Part 3 - Evaluating the model

In [None]:
# TODO: evaluate with BLEU score
# TODO: evaluate with back-translation

In [None]:
# Switch off dropout for evaluation (the model object is named `seq2seq`).
seq2seq.eval()
total_loss = 0

with torch.no_grad():
    for i, batch in enumerate(test_iter):
        source = batch.nahuatl.to(device)
        target = batch.spanish.to(device)
        # No teacher forcing here: the decoder must run on its own predictions,
        # otherwise the reference translation leaks into the test loss.
        output = seq2seq(source, target, teacher_force_ratio=0)
        # shape (trglength, batchsize, outputdim)
        loss = loss_fn(output[1:].reshape(-1, output.shape[2]), target[1:].reshape(-1))
        total_loss += loss.item()
        # Running mean over the test batches seen so far.
        print("Test loss is:", total_loss/(i+1), end='\r')
        
    

### References

https://axolotl-corpus.mx/