# Normalization

The following tokenization function separates punction from words and removes extra whitespace. 

In [1]:
import re

In [2]:
def normwix(text):
    text = text.lower()
    text = re.sub(r"[`´‘’ʔ']", "'", text, flags=re.IGNORECASE)
    text = re.sub(r"'", "ʔ", text, flags=re.IGNORECASE)
    text = re.sub(r" +", " ", text, flags=re.IGNORECASE)
    text = re.sub(r"[üïɨ+]", "ɨ", text, flags=re.IGNORECASE)
    text = re.sub(r"ḱ", "k", text, flags=re.IGNORECASE)
    text = re.sub(r"(ẃ|ẁ)", "w", text, flags=re.IGNORECASE)
    text = re.sub(r"[ń]", "n", text, flags=re.IGNORECASE)
    text = re.sub(r"[áàäá]", "a", text, flags=re.IGNORECASE)
    text = re.sub(r"[éèëéë́]", "e", text, flags=re.IGNORECASE)
    text = re.sub(r"[íìií]", "i", text, flags=re.IGNORECASE)
    text = re.sub(r"[óòöó]", "o", text, flags=re.IGNORECASE)
    text = re.sub(r"[úùú]", "u", text, flags=re.IGNORECASE) 
    return text

#primarily for the bible
def aggressive_normwix(text):
    text.lower()
    text = normwix(text)
    text = re.sub(r"([a-z+])\1+", r"\1", text, flags=re.IGNORECASE)
    text = re.sub(r" ʔ", " ", text, flags=re.IGNORECASE)
    text = re.sub(r"v", "w", text, flags=re.IGNORECASE)
    text = re.sub(r"(c|qu)", "k", text, flags=re.IGNORECASE)
    #text = re.sub(r"[0-9]+", "", text, flags=re.IGNORECASE)
    text = re.sub(r"ch", "ts", text, flags=re.IGNORECASE)
    text = re.sub(r"rr", "x", text, flags=re.IGNORECASE)
    text = re.sub(r"(?<!t|\[)s", "ts", text, flags=re.IGNORECASE)
    text = re.sub(r"([a-z+])\1+", r"\1", text, flags=re.IGNORECASE)
    return text

In [3]:
def tokenize(text):
    text = re.sub(r"(?<![\s])([\)|\(|.|,|,\-,\"|:|;|¿|?|¡|!])", r" \1", text)
    text = re.sub(r"([\)|\(|.|,|,\-,\"|:|;|¿|?|¡|!])(?<![\s])", r"\1 ", text)
    text = re.sub(r"(ç|_)",'',text, flags=re.IGNORECASE)
    text = re.sub(r"	",' ',text, flags=re.IGNORECASE)
    text = re.sub(r"^ ", "", text, flags=re.IGNORECASE)
    return text

## Detect language to apply appropriate normalization. 

Since Spanish is present in the Wixárika portion of the data we need to detect it words so that we avoid normalizing Spanish with the orthographic rules of Wixarika.  In order to do this, we will create a synthetic dataset that will contain spanish and wixarika words rerpresented with character count vectors. We will use this dataset to fit a classifer to learn to distinguish between the two languages. Later on, we will use this classifier to apply normalization rules appropriately.

We begin by separating puncuation from words, removing extra whitespace, and applying a simple normalization shceme to wixárika to remove sporadic diacritics and character repetition. 


In [4]:
def get_set(path):
    return set(tokenize(open(path,'r').read()).lower().split())

In [5]:
spanish_words = get_set('data/es-hch/train.es')

we apply a simple normalization scheme to remove some of the variation. 

In [6]:
wixarika_words =  set(normwix(i) for i in get_set('data/es-hch/train.hch'))


Get the overlap of tokens to use as a proxy for code-switched text. 

In [7]:
overlap = wixarika_words.intersection(spanish_words)

Create a vocabulary with both languages. 

In [8]:
vocab = list(spanish_words.union(wixarika_words))

In [9]:
for i in vocab:
    if '_' in i:
        print(i)
    

We will use ``vocab`` above as input to a count vectorizer, where each word will be represented as a caracter count vector. 

In [10]:
from sklearn.feature_extraction.text import CountVectorizer

In [11]:
vectorizer = CountVectorizer(analyzer='char')
vecs = vectorizer.fit_transform(vocab)

Let's take a look at the names of the features, in this cases, the characters that a word could have  

In [12]:
feature_names = vectorizer.get_feature_names()
print(feature_names)

['!', '"', '(', ')', '+', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '¡', '¿', 'á', 'é', 'í', 'ñ', 'ó', 'ú', 'ü', 'ɨ', 'ʔ', '–']


In [13]:
def vectorize_new_word( word):
    
    return [word.count(i) for i in feature_names]

We now convert `vecs` to an array and create a dictionary that contains a word and its vector representation.

In [14]:
vectors = vecs.toarray()
word_vecs = {vocab[i]: vectors[i] for i in range(len(vocab))}

We now create a dataset consisting of a word, a label (0 for wixarika, 1 for spanish), and a vector and store them in a pandas data frame.

In [15]:
import pandas as pd

First, we take out the overlap from wixarika words so that they are not labeled as wixarika words

In [16]:
wix_words = wixarika_words-overlap
es_words = spanish_words-overlap

In [17]:
data = [[i,1,word_vecs[i]] for i in wix_words] +  [[i,0,word_vecs[i]] for i in es_words]
df = pd.DataFrame(data, columns=['word','label','vector'])

In [18]:
df

Unnamed: 0,word,label,vector
0,tekuni,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,nepɨʔena,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,tɨyari,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,mainɨkɨ,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,kaniwaye,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...
17689,apresurado,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
17690,soplar,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
17691,padres,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
17692,confeccionado,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


We now split our data for training and evaluation. 

In [19]:
import numpy as np
from sklearn.model_selection import train_test_split

In [20]:
X = [i for i in df.vector]
y = [i for i in df.label]
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=42)

We now use the data boave to fit a support vector machine. 

In [21]:
from sklearn import svm

In [22]:
clf = svm.SVC(decision_function_shape='ovo')
clf.fit(X_train, y_train)

SVC(decision_function_shape='ovo')

We now predict the labels for the held-out set and compute its,accuracy, precision,recall, and f1 scores. 

In [23]:
from sklearn.metrics import accuracy_score,balanced_accuracy_score,f1_score, precision_score, recall_score

In [24]:
predictions = clf.predict(X_test)

In [25]:
accuracy = accuracy_score(predictions, y_test)
precision = precision_score(predictions, y_test)
recall = recall_score(predictions, y_test)
f1 = f1_score(predictions, y_test)

In [26]:
print("Accuracy : {}\nPrecision : {}\nRecall : {}\nF1 : {}".format(accuracy, precision, recall, f1))

Accuracy : 0.980220401243289
Precision : 0.9823807477438762
Recall : 0.9874730021598273
F1 : 0.984920292977165


Let's take a look the labels for the overlapping words Overlapping words classified as spanish

In [27]:
cs_pred = clf.predict([word_vecs[i] for i in overlap])

In [28]:
accuracy_score(cs_pred, [0 for i in range(len(cs_pred))])

0.9402298850574713

In [29]:
for i,j in zip(cs_pred, overlap):
    print(j,i)

; 0
pasteles 0
anillo 0
repollos 0
tordo 0
botes 0
017 0
jacta 0
tras 0
antaño 0
hollinado 0
036 0
ti 1
serenamente 0
juancito 0
comadreja 0
foresta 0
016 0
barra 0
a 0
marfil 0
cera 0
trizas 0
haya 1
mismo 0
guerreros 0
madrina 0
046 0
molinero 0
cesto 0
? 0
cuatrocientos 0
panderetas 0
ovillo 0
colina 0
vagones 0
1 0
hark 1
sendero 0
diablillo 0
segadores 0
poder 0
iglesia 0
chorlito 0
mofarse 0
cristal 0
guadaña 0
cerezas 0
mate 1
ni 1
previos 0
paz 0
reverencia 0
- 0
toalla 0
segando 0
pajas 0
primera 0
la 0
militar 0
valla 0
castillo 0
lima 0
peldaño 0
bote 0
pomerania 0
bollo 0
arneses 0
remos 0
establo 0
sorbo 0
cornetas 0
maldijo 0
gesto 0
afamado 0
ya 1
mantel 0
elsie 0
cereales 0
he 1
006 0
perspicaz 0
cenicienta 0
3 0
molinera 0
hechicera 0
tirita 1
sirope 0
012 0
granuja 0
escuela 0
padre 0
como 0
novia 0
mirlo 0
hora 0
perezosa 0
media 0
cerradura 0
justicia 0
margarita 0
cegado 0
rey 1
014 0
hans 0
devoraba 0
fiel 0
031 0
splash 0
san 0
bremen 0
reyes 0
semsi 0
suerte 0
i

# Reconstructing  word initial glottal stops.

Now that we have trained our classifier to detect the language of a word, we can use it to reconstruct the word initial glottal stop in wixarika words only. 

In [30]:
def reconstruct_glottal(sentences,clf=clf, aggressive=False):
    sents= []
    norming = normwix
    if aggressive:
        norming = aggressive_normwix
    for j in sentences:
        
        sent = tokenize(norming(j)).split()
        sent_labels = [vectorize_new_word(word) for word in sent]
        pr = clf.predict(sent_labels)
        normed_sent = ["ʔ"+i if (j==1 and i[0] in 'ɨaeiu') else i for i,j in zip(sent,pr)]
        sents.append(" ".join(normed_sent))
    return sents

# Norming labialization of velar stops.

In [31]:
def collapse_labialization(sentence):
    sentence = re.sub('kw','ku', sentence)
    return sentence

In [32]:
def renorm(input_path,output,aggressive=False):
    normalized =  reconstruct_glottal(open(input_path,'r').readlines(),aggressive=aggressive)
    
    with open(output,'w') as f:
        f.write(collapse_labialization("\n".join(normalized)))
    print(f"{input_path} renormed and written to {output}" )
    return 

# The Spanish-Wixarika Parallel Corpus

Write normalized data for NMT 

In [33]:
renorm(input_path='data/es-hch/train.hch',
       output='clean/es-hch/train.hch')

data/es-hch/train.hch renormed and written to clean/es-hch/train.hch


In [34]:
renorm(input_path='data/es-hch/dev.hch',
       output='clean/es-hch/dev.hch')

data/es-hch/dev.hch renormed and written to clean/es-hch/dev.hch


In [35]:
renorm(input_path='data/es-hch/test.hch',
       output='clean/es-hch/test.hch')

data/es-hch/test.hch renormed and written to clean/es-hch/test.hch


In [36]:
for t in ['train','dev','test']:
    ss = f"es-hch/{t}.es"
    sen = open(f'data/{ss}','r').readlines()
    with open(f'clean/{ss}','w') as f:
        f.write("".join([tokenize(i).lower() for i in sen]))

# Descriptive Grammars Gomez (1998) and Ramos-Bierge(2017)

In [37]:
renorm(input_path='scratch/Gomez/gomez.hch',
       output='clean/Gomez/gomez.hch')

scratch/Gomez/gomez.hch renormed and written to clean/Gomez/gomez.hch


In [38]:
renorm(input_path='scratch/Ramos/ramos.hch',
       output='clean/Ramos/ramos.hch')

scratch/Ramos/ramos.hch renormed and written to clean/Ramos/ramos.hch


In [39]:
for t in ['Ramos','Gomez']:
    ss = f"{t}/{t.lower()}.es"
    sen = open(f'data/{ss}','r').readlines()
    with open(f'clean/{ss}','w') as f:
        f.write("".join([tokenize(i).lower() for i in sen]))

# AmericasNLI

[AmericasNLI](https://arxiv.org/pdf/2104.08726.pdf) is an extension of [XNLI](https://arxiv.org/pdf/1809.05053.pdf) to 10 Indigenous languages of the Americas. 

In [40]:
renorm(input_path='scratch/AmericasNLI/dev.hch',
       output='clean/AmericasNLI/dev.hch')

scratch/AmericasNLI/dev.hch renormed and written to clean/AmericasNLI/dev.hch


In [41]:
renorm(input_path='scratch/AmericasNLI/test.hch',
       output='clean/AmericasNLI/americasnli.hch')

scratch/AmericasNLI/test.hch renormed and written to clean/AmericasNLI/americasnli.hch


In [51]:
for t in ['test']:
    ss = f"AmericasNLI/americasnli.es"
    sen = open(f'data/{ss}','r').readlines()
    with open(f'clean/{ss}','w') as f:
        f.write("".join([tokenize(i).lower() for i in sen]))

# Bible

Most local languages around the world lack the resources to build robust machine translation systems. The bible may be a useful resource since it has been translated to more than 3000 languages. In two to three sentences describe some of the ethical implications of using the bible to train a machine translation system for a local language? 

In [30]:
### your answer goes here. ####
answer = ""
print(answer)




In [31]:
spanish_bible = open('scratch/Bible/spa-x-bible-hablahoi-latina.txt.jhubc','r').readlines()
wixarika_bible = open('scratch/Bible/hch-x-bible-hch-v1.txt','r').readlines()

In [33]:
bible = [(i,j) for i,j in zip(wixarika_bible,spanish_bible) if len(i) > 1 and len(j) > 1 ]

In [34]:
import pandas as pd

In [35]:
bible_df = pd.DataFrame(bible,columns=['wixarika','spanish'])

In [40]:
for i in range(bible_df.shape[0]):
    if "que'ane" in bible_df.wixarika[i]:
        print((bible_df[i-1:i]))

                                               wixarika  \
5333  Me xüca peneüqueni , 'aixüa cani'aneni , 'axa ...   

                                                spanish  
5333  Si te casas , no cometes pecado ; y si una muj...  
                                               wixarika  \
5334  'Ipaü nepaine ne'ivama , tucari canaye'aximeni...   

                                                spanish  
5334  Hermanos , lo que quiero decir es esto : Nos q...  
                                               wixarika  \
5335  mümeta memutisuana müme memüca'utisuana vahepa...   

                                                spanish  
5335  los que están de luto deben portarse como si e...  


In [36]:
bible_df[:10].wixarika

0    'Inüari canihücütüni 'icü . Nuivarite mecani'i...
1    'Apurahami 'Isahaqui cani'uquiyarieyatücaitüni...
2    Cura meta Parexi Sara püva'uquiyaritücai , Tam...
3    'Arami 'Aminaravi pü'uquiyarieyatücai . 'Amina...
4    Sarumuni Puhuxi pü'uquiyarieyatücai , Xahavi v...
5    Quisahi Raviri pü'uquiyarieyatücai , que mü'an...
6    Sarumuni Xupuhami pü'uquiyarieyatücai . Xupuha...
7    'Asa Cusapati pü'uquiyarieyatücai . Cusapati C...
8    'Usiyaxi Cutami pü'uquiyarieyatücai . Cutami '...
9    'Esequiyaxi Manasexi pü'uquiyarieyatücai . Man...
Name: wixarika, dtype: object

We select 1000 random sentences for tesing and roughly 10% for developemnt, note that we will only use the test set. Training NMT with the Bible is left for future work. 

In [46]:
hch_train, hch_eval, es_train, es_eval = train_test_split( bible_df.wixarika, bible_df.spanish, test_size=0.2260,  random_state=42)

hch_dev, hch_test, es_dev, es_test = train_test_split( hch_eval, es_eval, test_size=0.559,  random_state=42)


In [47]:
bible_splits = {'train' : {'es':es_train,'hch':hch_train},
'dev' : {'es':es_dev,'hch':hch_dev },
'test' : {'es':es_test,'hch':hch_test}}

In [48]:
for split in ['train','dev','test']:
    for lang in ['es','hch']:
        with open(f'scratch/Bible/{split}.{lang}','w') as f:
            f.write("".join(bible_splits[split][lang]))


Wixárika

In [49]:
suffix='hch'
for split in ['test']:
    renorm(input_path=f'scratch/Bible/{split}.{suffix}', 
           output=f'clean/Bible/bible.{suffix}',aggressive=True)

scratch/Bible/test.hch renormed and written to clean/Bible/bible.hch


Spanish

In [50]:
suffix = 'es'
for t in ['test']:
    ss = f"Bible/bible.{suffix}"
    sen = open(f'scratch/{ss}','r').readlines()
    with open(f'clean/{ss}','w') as f:
        f.write("".join([tokenize(i).lower() for i in sen]))