# Nahuatl-Spanish Neural Machine Translation


## Data Pre-Processing

Import the necessary libraries for data preprocessing

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
from sklearn.model_selection import train_test_split
import os

Visualize the first rows of the corpus

In [65]:
# Load the parallel corpus from a CSV file located in the working directory.
# NOTE(review): the later cells read a frame named `df` (e.g. df['Náhuatl']),
# but no visible cell defines `df` -- confirm how `df` is derived from `file`
# so the notebook survives Restart Kernel -> Run All.
file = pd.read_csv('axolotl.csv')
# Show the first five rows as a quick sanity check of the columns.
file.head()

Unnamed: 0,Español,Náhuatl
0,De Porfirio Diaz a Zapata,De Porfirio Diaz a Zapata
1,Documentos nauas de la Ciudad de México del si...,Documentos nauas de la Ciudad de México del si...
2,Documentos nauas de la Ciudad de México del si...,Documentos nauas de la Ciudad de México del si...
3,Documentos nauas de la Ciudad de México del si...,Documentos nauas de la Ciudad de México del si...
4,Documentos nauas de la Ciudad de México del si...,Documentos nauas de la Ciudad de México del si...


### Clean data
A function that, given a list of strings, keeps only alphanumeric characters and spaces and converts the result to lower case

In [11]:
# Lower-case every sentence and strip everything except letters, digits and spaces.
def clean_dataset(data, numeric = True):
    """Return a cleaned, lower-cased copy of every sentence in ``data``.

    Args:
        data: iterable of strings (one sentence each).
        numeric: when truthy, letters and digits are kept; otherwise only
            letters are kept. Plain spaces are kept in both modes.

    Returns:
        A list with one cleaned string per input sentence, in input order.
    """
    cleaned = []
    for sentence in data:
        kept = []
        for ch in sentence:
            # The space is the only non-alphanumeric character that survives.
            if ch == ' ' or (ch.isalnum() if numeric else ch.isalpha()):
                kept.append(ch)
        cleaned.append("".join(kept).lower())
    return cleaned


A function that, given a list of strings, keeps only alphanumeric characters, spaces, and a fixed set of punctuation marks (each mark is padded with spaces so that it becomes its own token)

In [12]:
def further_clean_dataset(data):
    """Clean sentences while keeping a fixed set of punctuation marks.

    Letters, digits and spaces are kept as they are; each punctuation mark
    from the list below is surrounded by one space on each side; every other
    character is dropped. The result is lower-cased.

    Args:
        data: iterable of strings (one sentence each).

    Returns:
        A list with one cleaned string per input sentence, in input order.
    """
    marks = ('.', ',', "'", "'", '?', '!', '¿', '(', ')', '¡', '-')

    def _render(ch):
        # Punctuation is padded so that it is separated from adjacent words.
        if ch in marks:
            return " " + ch + " "
        # Letters, digits and plain spaces pass through untouched.
        if ch.isalnum() or ch == ' ':
            return ch
        # Anything else is discarded.
        return ""

    return ["".join(_render(ch) for ch in sentence).lower() for sentence in data]
    

### Writing clean Data
A function that writes the cleaned data to text files, one sentence per line, in the format that will be used by the deep learning algorithm

In [13]:
def write_file(source_data, target_data, tokenization, set_type, path = '../'):
    """Write one split of the parallel corpus to two line-aligned text files.

    The files are named ``<tokenization>-<set_type>-src.txt`` and
    ``<tokenization>-<set_type>-tgt.txt`` and are placed under ``path``.
    Each sentence is written on its own line.

    Args:
        source_data: iterable of source-side sentences (strings).
        target_data: iterable of target-side sentences (strings).
        tokenization: tokenization label used as the file-name prefix.
        set_type: split label (e.g. 'train', 'dev', 'test') used in the name.
        path: output directory; it is created if it does not exist yet.

    Returns:
        None.
    """
    # Create the output directory on demand so a fresh checkout does not fail
    # with FileNotFoundError before anything is written.
    if path:
        os.makedirs(path, exist_ok=True)

    prefix = path + "/" + tokenization + '-' + set_type
    for suffix, sentences in (('-src.txt', source_data), ('-tgt.txt', target_data)):
        # `with` closes the handle even if a write raises; the explicit UTF-8
        # encoding keeps accented characters independent of the platform default.
        with open(prefix + suffix, 'w', encoding='utf-8') as handle:
            handle.writelines(sentence + "\n" for sentence in sentences)

## Simple Tokenization Techniques

### LeNu Tokenization

Preprocess the sentences so that only letters and numbers are left 

In [15]:
# Clean both sides of the corpus (letters, digits and spaces only),
# then pair them row by row into a two-column frame.
nah = clean_dataset(df['Náhuatl'])
spa = clean_dataset(df['Español'])
parallel_data_lenu = pd.DataFrame(
    list(zip(nah, spa)),
    columns=['Nahuatl', 'Spanish'],
)
parallel_data_lenu.head()

Unnamed: 0,Nahuatl,Spanish
0,amanteca toyauan qn iyaoan in aquique in canin...,amanteca yoyauan quiere decir sus enemigos son...
1,anomatia qn quitoz nequi nixpan in omito yauyu...,anomatia quiere decir no delante de mí se habl...
2,ca nican tica nopiltzé xolé ca o toyollo on pa...,aquí estás hijo mío chaval ha quedado satisfec...
3,ca nican tonca nopiltzé nopiltziné notelpuchtl...,aquí estás hijo mío hijito mío muchacho mío mi...
4,ca nican tica tle ticmati,aquí estás qué es lo que piensas


#### Split data

In [18]:

# Hold out 20% of the sentence pairs; the rest becomes the training set.
X_train, X_val, y_train, y_val = train_test_split(
    parallel_data_lenu['Nahuatl'], parallel_data_lenu['Spanish'],
    test_size=0.2, random_state=30,
)

# Divide the held-out pairs in half: a development set and a test set.
X_dev, X_test, y_dev, y_test = train_test_split(
    X_val, y_val,
    test_size=0.5, random_state=30,
)


In [29]:
# Report how many sentence pairs ended up in each split.
print(f'Size of training set: {len(X_train)}')
print(f'Size of development set: {len(X_dev)}')
print(f'Size of test set: {len(X_test)}')


Size of training set: 14320
Size of development set: 1790
Size of test set: 1791


#### Write files

In [33]:
# Write the training, development and test splits of the LeNu corpus.
lenu_dir = os.getcwd()+'/data/LeNu'
for split_name, split_src, split_tgt in (
        ('train', X_train, y_train),
        ('dev', X_dev, y_dev),
        ('test', X_test, y_test)):
    write_file(source_data = split_src,
               target_data = split_tgt,
               tokenization = 'LeNu',
               set_type = split_name,
               path = lenu_dir)

### Character-only Tokenization to use in Morfessor

Preprocess the sentences so that only letters are left.
This version will not be used for translation; it is only used in Morfessor.

In [14]:
# Clean both sides keeping letters and spaces only (digits are dropped),
# then pair them row by row into a two-column frame.
nah_charonly = clean_dataset(df['Náhuatl'], numeric=False)
spa_charonly = clean_dataset(df['Español'], numeric=False)
parallel_data_charonly = pd.DataFrame(
    list(zip(nah_charonly, spa_charonly)),
    columns=['Nahuatl', 'Spanish'],
)
parallel_data_charonly.head()


Unnamed: 0,Nahuatl,Spanish
0,amanteca toyauan qn iyaoan in aquique in canin...,amanteca yoyauan quiere decir sus enemigos son...
1,anomatia qn quitoz nequi nixpan in omito yauyu...,anomatia quiere decir no delante de mí se habl...
2,ca nican tica nopiltzé xolé ca o toyollo on pa...,aquí estás hijo mío chaval ha quedado satisfec...
3,ca nican tonca nopiltzé nopiltziné notelpuchtl...,aquí estás hijo mío hijito mío muchacho mío mi...
4,ca nican tica tle ticmati,aquí estás qué es lo que piensas


#### Split into Training, Development and Test sets

In [35]:

# Hold out 20% of the sentence pairs; the rest becomes the training set.
X_train, X_val, y_train, y_val = train_test_split(
    parallel_data_charonly['Nahuatl'], parallel_data_charonly['Spanish'],
    test_size=0.2, random_state=30,
)

# Divide the held-out pairs in half: a development set and a test set.
X_dev, X_test, y_dev, y_test = train_test_split(
    X_val, y_val,
    test_size=0.5, random_state=30,
)

#### Writing Files

In [36]:
# Write the training, development and test splits of the letters-only corpus.
charonly_dir = os.getcwd()+'/data/character_only'
for split_name, split_src, split_tgt in (
        ('train', X_train, y_train),
        ('dev', X_dev, y_dev),
        ('test', X_test, y_test)):
    write_file(source_data = split_src,
               target_data = split_tgt,
               tokenization = 'charonly',
               set_type = split_name,
               path = charonly_dir)

### LeNuP Tokenization
Filter the data so that only letters, numbers, and punctuation marks are left

In [15]:
# Clean both sides while keeping punctuation marks as separate tokens,
# then pair them row by row into a two-column frame.
nah_lenup = further_clean_dataset(df['Náhuatl'])
spa_lenup = further_clean_dataset(df['Español'])
parallel_data_lenup = pd.DataFrame(
    list(zip(nah_lenup, spa_lenup)),
    columns=['Nahuatl', 'Spanish'],
)
parallel_data_lenup.head()


Unnamed: 0,Nahuatl,Spanish
0,amanteca toyauan q . n . iyaoan in aquique in...,' amanteca yoyauan quiere decir ' sus enemig...
1,' anomatia q . n . ( quitoz nequi ) nixpan...,' anomatia quiere decir ' no delante de mí s...
2,ca nican tica nopiltzé xolé ca o toyollo on pa...,"' aquí estás , hijo mío , chaval ha quedado..."
3,"' ca nican tonca nopiltzé , nopiltziné , no...","' aquí estás , hijo mío , hijito mío , muc..."
4,' ca nican tica tle ticmati ?,"' aquí estás , ¿ qué es lo que piensas ?"


#### Split into Training, Development and Test sets

In [196]:

# Hold out 20% of the sentence pairs; the rest becomes the training set.
X_train, X_val, y_train, y_val = train_test_split(
    parallel_data_lenup['Nahuatl'], parallel_data_lenup['Spanish'],
    test_size=0.2, random_state=30,
)

# Divide the held-out pairs in half: a development set and a test set.
X_dev, X_test, y_dev, y_test = train_test_split(
    X_val, y_val,
    test_size=0.5, random_state=30,
)



#### Write

In [198]:
# Write the training, development and test splits of the LeNuP corpus.
lenup_dir = os.getcwd()+'/data/LeNuP'
for split_name, split_src, split_tgt in (
        ('train', X_train, y_train),
        ('dev', X_dev, y_dev),
        ('test', X_test, y_test)):
    write_file(source_data = split_src,
               target_data = split_tgt,
               tokenization = 'LeNuP',
               set_type = split_name,
               path = lenup_dir)

## Morfessor 2.0 Tokenization
Morfessor is an unsupervised morphological segmentation library. In the next cells, we will transform the data using 3 different Morfessor techniques, but first we will get the data into a format that Morfessor is able to read. Sentences from the character-only tokenization will be used to train the morphological segmentation.

In [90]:
import morfessor

def get_for_morfessor(data):
    """Flatten sentences into a list of ``(count, item)`` tuples.

    Every whitespace-separated word of a sentence becomes ``(1, word)``;
    after the last word of each sentence an ``(0, ())`` entry is appended
    (presumably the sentence-boundary marker Morfessor expects -- TODO confirm).

    Args:
        data: iterable of sentences (strings).

    Returns:
        A single flat list covering all sentences in input order.
    """
    entries = []
    for sentence in data:
        entries.extend((1, word) for word in sentence.split())
        entries.append((0, ()))
    return entries

def transform_sentence(sentence, word_idx):
    """Replace every known word of ``sentence`` by its segmentation.

    The sentence is split on whitespace. A word whose lower-cased form is a
    key of ``word_idx`` is replaced by the items stored under that key;
    any other word is kept unchanged.

    Args:
        sentence: the sentence to transform (string).
        word_idx: mapping from a lower-cased word to an iterable of pieces.

    Returns:
        The resulting pieces joined by single spaces.
    """
    pieces = []
    for word in sentence.split():
        key = word.lower()
        if key in word_idx:
            pieces.extend(word_idx[key])
        else:
            # Unknown words keep their original spelling and case.
            pieces.append(word)
    return ' '.join(pieces)
        
# Build the Morfessor training input from the letters-only Nahuatl sentences.
train_data = get_for_morfessor(nah_charonly)

### Train the tokenizer with only words
Train the morfessor morphological segmentator

In [97]:
%%time
#create model, fill in parameters and train
# The recorded output below shows this cell took roughly eight minutes to run.

model_word = morfessor.BaselineModel()
# The count modifier returns 1 for any input, so (presumably) each distinct word
# contributes equally regardless of how often it occurs -- TODO confirm against Morfessor docs.
model_word.load_data(train_data, count_modifier=lambda x: 1)
model_word.train_batch()


............................................................
............................................................
............................................................
............................................................
............................................................
............................................................
............................................................

CPU times: user 8min 8s, sys: 3.31 s, total: 8min 11s
Wall time: 8min 35s





Save the learned segmentations for use in later projects

In [99]:
import pickle
# Build a lookup from the second field of each segmentation tuple to the third;
# the first field is not needed here.
wordbased_segementations = {
    compound: pieces
    for _count, compound, pieces in model_word.get_segmentations()
}
# Persist the lookup so it can be reloaded without retraining.
with open('wordbased_segementations.pickle', 'wb') as handle:
    pickle.dump(
        wordbased_segementations,
        handle,
        protocol=pickle.HIGHEST_PROTOCOL,
    )



## LeNu+MS
### Morfessor, Letters, and Numbers 


Preprocess the sentences by applying the morphological segmentation model created above to the LeNu-tokenized sentences.

#### Transform data

In [100]:
# Segment every LeNu-cleaned Nahuatl sentence, then pair it with its Spanish side.
nah_LeNu_segmentations = [
    transform_sentence(sentence, word_idx=wordbased_segementations)
    for sentence in nah
]
parallel_LeNu_segmentations = pd.DataFrame(
    list(zip(nah_LeNu_segmentations, spa)),
    columns=['Nahuatl', 'Spanish'],
)
parallel_LeNu_segmentations.head()


Unnamed: 0,Nahuatl,Spanish
0,aman teca to ya uan q n iyao an in aquique in ...,amanteca yoyauan quiere decir sus enemigos son...
1,a no matia q n qui toz nequi n ixpan in omito ...,anomatia quiere decir no delante de mí se habl...
2,ca nican tica nopil tzé xolé ca o to yollo on ...,aquí estás hijo mío chaval ha quedado satisfec...
3,ca nican tonca nopil tzé nopiltzin é no telpuc...,aquí estás hijo mío hijito mío muchacho mío mi...
4,ca nican tica tle ticmati,aquí estás qué es lo que piensas


#### Split into Training, Development and Test sets

In [101]:

# Hold out 20% of the sentence pairs; the rest becomes the training set.
X_train, X_val, y_train, y_val = train_test_split(
    parallel_LeNu_segmentations['Nahuatl'], parallel_LeNu_segmentations['Spanish'],
    test_size=0.2, random_state=30,
)

# Divide the held-out pairs in half: a development set and a test set.
X_dev, X_test, y_dev, y_test = train_test_split(
    X_val, y_val,
    test_size=0.5, random_state=30,
)

#### Writing Files

In [104]:
# Write the training, development and test splits of the LeNu+MS corpus.
lenu_ms_dir = os.getcwd()+'/data/LeNu+MS'
for split_name, split_src, split_tgt in (
        ('train', X_train, y_train),
        ('dev', X_dev, y_dev),
        ('test', X_test, y_test)):
    write_file(source_data = split_src,
               target_data = split_tgt,
               tokenization = 'LeNu+MS',
               set_type = split_name,
               path = lenu_ms_dir)

## LeNuP+MS
### Morfessor, Characters, Numbers, Punctuation


Preprocess the sentences by applying the morphological segmentation model created above to the LeNuP-tokenized sentences.

#### Transform data

In [107]:
# transform the data
# Segment the punctuation-preserving (LeNuP) Nahuatl sentences and pair them with
# the LeNuP Spanish sentences. The inputs are `nah_lenup` / `spa_lenup`, the lists
# built in the LeNuP tokenization cell; the names `nah_cnm` / `spa_cnm` used here
# before are not defined anywhere in the notebook and fail on a fresh kernel.
nah_LeNuP_segmentations = [transform_sentence(sentence, word_idx=wordbased_segementations) for sentence in nah_lenup]
parallel_LeNuP_segmentations = pd.DataFrame([[i,j] for i,j in zip(nah_LeNuP_segmentations,spa_lenup)], columns = [ 'Nahuatl','Spanish'])
parallel_LeNuP_segmentations.head()


Unnamed: 0,Nahuatl,Spanish
0,aman teca to ya uan q . n . iyao an in aquique...,' amanteca yoyauan quiere decir ' sus enemig...
1,' a no matia q . n . ( qui toz nequi ) n ixpan...,' anomatia quiere decir ' no delante de mí s...
2,ca nican tica nopil tzé xolé ca o to yollo on ...,"' aquí estás , hijo mío , chaval ha quedado..."
3,"' ca nican tonca nopil tzé , nopiltzin é , no ...","' aquí estás , hijo mío , hijito mío , muc..."
4,' ca nican tica tle ticmati ?,"' aquí estás , ¿ qué es lo que piensas ?"


#### Split into Training, Development and Test sets

In [108]:

# Hold out 20% of the sentence pairs; the rest becomes the training set.
X_train, X_val, y_train, y_val = train_test_split(
    parallel_LeNuP_segmentations['Nahuatl'], parallel_LeNuP_segmentations['Spanish'],
    test_size=0.2, random_state=30,
)

# Divide the held-out pairs in half: a development set and a test set.
X_dev, X_test, y_dev, y_test = train_test_split(
    X_val, y_val,
    test_size=0.5, random_state=30,
)

#### Writing Files

In [110]:
# Write the training, development and test splits of the LeNuP+MS corpus.
lenup_ms_dir = os.getcwd()+'/data/LeNuP+MS'
for split_name, split_src, split_tgt in (
        ('train', X_train, y_train),
        ('dev', X_dev, y_dev),
        ('test', X_test, y_test)):
    write_file(source_data = split_src,
               target_data = split_tgt,
               tokenization = 'LeNuP+MS',
               set_type = split_name,
               path = lenup_ms_dir)

## LeNu + MS

In [106]:
# Segment every LeNu-cleaned Nahuatl sentence, then pair it with its Spanish side.
nah_LeNu_MS = [
    transform_sentence(sentence, word_idx=wordbased_segementations)
    for sentence in nah
]
parallel_LeNu_MS = pd.DataFrame(
    list(zip(nah_LeNu_MS, spa)),
    columns=['Nahuatl', 'Spanish'],
)
parallel_LeNu_MS.head()


Unnamed: 0,Nahuatl,Spanish
0,aman teca to ya uan q n iyao an in aquique in ...,amanteca yoyauan quiere decir sus enemigos son...
1,a no matia q n qui toz nequi n ixpan in omito ...,anomatia quiere decir no delante de mí se habl...
2,ca nican tica nopil tzé xolé ca o to yollo on ...,aquí estás hijo mío chaval ha quedado satisfec...
3,ca nican tonca nopil tzé nopiltzin é no telpuc...,aquí estás hijo mío hijito mío muchacho mío mi...
4,ca nican tica tle ticmati,aquí estás qué es lo que piensas


In [67]:

# Split Train and Validation Set
# NOTE(review): this cell used `parallel_mcnm_segmentations`, a name that no cell
# in the notebook defines. "cnm" is the older label for the LeNuP data (compare
# `nah_cnm` in the LeNuP+MS transform cell), so the Morfessor-segmented LeNuP frame
# is used here -- confirm this is the intended input, since the heading above says
# "LeNu + MS".
X_train, X_val, y_train, y_val = train_test_split(
    parallel_LeNuP_segmentations['Nahuatl'], 
    parallel_LeNuP_segmentations['Spanish'], 
    test_size=0.2, 
    random_state=30)

# Split the held-out 20% in half: development set and test set
X_dev, X_test, y_dev, y_test = train_test_split(
    X_val, 
    y_val, 
    test_size=0.5, 
    random_state=30)

In [66]:
# Write the training, development and test splits under the 'morf_cnm' label.
# NOTE(review): the files contain whatever X_train / X_dev / X_test currently hold;
# verify which corpus this cell is meant to export.
morf_cnm_dir = os.getcwd()+'/data/morf_cnm'
for split_name, split_src, split_tgt in (
        ('train', X_train, y_train),
        ('dev', X_dev, y_dev),
        ('test', X_test, y_test)):
    write_file(source_data = split_src,
               target_data = split_tgt,
               tokenization = 'morf_cnm',
               set_type = split_name,
               path = morf_cnm_dir)