# Evaluating Morphological Parsers

In [1]:
import sentencepiece as spm
import morfessor as morf
import re

In this notebook, we will evaluate different tokenizers on their ability to correctly parse morphological boundaries for wixárika and spanish. In order to do this, we will implement a variety of tokenization methods. Then we will evalute them on a 4 datasets, 2 in Spanish and 2 in Wixárika. The metrics that we will use are the Border F1 and associated Precision and Recall scores. 



In [2]:
import pandas as pd

## Gold Morphological Segmentations
The datasets we will use contain words annotated with morphological boundaries. They come from two grammars () and ().
Note that the words in the datasests are capitalized and the tokenizers were trained to only handle lowercased data. For the wixarika tokenizers, the choice in unicode characters is also really important. Make sure that you use appropriate characters when testing. 

In [55]:
import re 

In [50]:
def normwix(text):
    text = text.lower()
    text = re.sub(r"[`´‘’ʔ']", "'", text, flags=re.IGNORECASE)
    text = re.sub(r"'", "ʔ", text, flags=re.IGNORECASE)
    text = re.sub(r" +", " ", text, flags=re.IGNORECASE)
    text = re.sub(r"[üïɨ+]", "ɨ", text, flags=re.IGNORECASE)
    text = re.sub(r"ḱ", "k", text, flags=re.IGNORECASE)
    text = re.sub(r"(ẃ|ẁ)", "w", text, flags=re.IGNORECASE)
    text = re.sub(r"[ń]", "n", text, flags=re.IGNORECASE)
    text = re.sub(r"[áàäá]", "a", text, flags=re.IGNORECASE)
    text = re.sub(r"[éèëéë́]", "e", text, flags=re.IGNORECASE)
    text = re.sub(r"[íìií]", "i", text, flags=re.IGNORECASE)
    text = re.sub(r"[óòöó]", "o", text, flags=re.IGNORECASE)
    text = re.sub(r"[úùú]", "u", text, flags=re.IGNORECASE)
     
    return text

#primarily for the bible
def aggressive_normwix(text):
    text.lower()
    text = normwix(text)
    text = re.sub(r"([a-z+])\1+", r"\1", text, flags=re.IGNORECASE)
    text = re.sub(r" ʔ", " ", text, flags=re.IGNORECASE)
    text = re.sub(r"v", "w", text, flags=re.IGNORECASE)
    text = re.sub(r"(c|qu)", "k", text, flags=re.IGNORECASE)
    #text = re.sub(r"[0-9]+", "", text, flags=re.IGNORECASE)
    text = re.sub(r"ch", "ts", text, flags=re.IGNORECASE)
    text = re.sub(r"rr", "x", text, flags=re.IGNORECASE)
    text = re.sub(r"(?<!t|\[)s", "ts", text, flags=re.IGNORECASE)
    text = re.sub(r"([a-z+])\1+", r"\1", text, flags=re.IGNORECASE)
    return text

def tokenize(text):
    text = re.sub(r"(?<![\s])([\)|\(|.|,|,\-,\"|:|;|¿|?|¡|!])", r" \1", text)
    text = re.sub(r"([\)|\(|.|,|,\-,\"|:|;|¿|?|¡|!])(?<![\s])", r"\1 ", text)
    text = re.sub(r"(ç|_)",'',text, flags=re.IGNORECASE)
    text = re.sub(r"	",' ',text, flags=re.IGNORECASE)
    text = re.sub(r"^ ", "", text, flags=re.IGNORECASE)
    return text


In [45]:
def normalization(x):
    ### your code goes here ###
    return x

In [46]:
def dataset(path,normalization=normalization):
    df = pd.read_csv(path)
    df['word'] = df.segmentations.apply(lambda x: unmorph(x))
    df['normalized'] = df.word.apply(lambda x: normalization(x))
    return df
    

In [51]:
spanish_gomez = dataset('gold/spanish.gomez')
spanish_gomez.head()

Unnamed: 0,segmentations,word,normalized
0,A*,A,A
1,A*c|o|s|t|u|m|b|r*o*,Acostumbro,Acostumbro
2,A*g|a|r|r*e*n*,Agarren,Agarren
3,A*l*,Al,Al
4,A|c|u|é|r|d*a*t|e*,Acuérdate,Acuérdate


In [52]:
wixarika_gomez = dataset('gold/wixarika.gomez',aggressive_normwix)
wixarika_gomez.head()

Unnamed: 0,segmentations,word,normalized
0,'|+|k|w|a|i*,'+kwai,ʔɨkwai
1,'|+|p|a|r|i*t|s|i|e*,'+paritsie,ʔɨparitsie
2,'|+|x|a*,'+xa,ʔɨxa
3,'|a*'|i|w|a*m|a*,'a'iwama,ʔaʔiwama
4,'|a*k|a|w|a|y|u*,'akawayu,ʔakawayu


In [53]:
spanish_ramos  = dataset('gold/spanish.ramos')
spanish_ramos.head()

Unnamed: 0,segmentations,word,normalized
0,A|l|l*í*,Allí,Allí
1,A|n|d*a*b|a*n*,Andaban,Andaban
2,E*l*,El,El
3,E|s|t*e*,Este,Este
4,E|s|t*o*,Esto,Esto


In [54]:
wixarika_gramos  = dataset('gold/wixarika.ramos',aggressive_normwix)
wixarika_gramos.head()

Unnamed: 0,segmentations,word,normalized
0,'|+|i|m|a|r|i*,'+imari,ʔɨimari
1,'|+|k|+*,'+k+,ʔɨkɨ
2,'|+|k|i|t|s|i|k|a*,'+kitsika,ʔɨkitsika
3,'|+|p|a|r|i*t|s|i|e*,'+paritsie,ʔɨparitsie
4,'|+|p|i|n|a*,'+pina,ʔɨpina


In [None]:
import numpy as np
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score


In [9]:
# returns character boundaries as a vector with 
# 1 representing morpheme boundary, 0 otherwise

def as_ones(word):
    ones = []
    for i in word:
        if i == '*':
            ones.append(1)
        if i == '|':
            ones.append(0)
    return ones

# returns a word without character or morph
# boundaries
def unmorph(text):
    text = re.sub(r"[\|\*]", '', text)
    return text

# returns a word with morpheme boundaries
# as spaces, undoes morphprepare
def space_morph(text):
    text = re.sub(r"[\|]", '', text)
    text = re.sub(r"[\*]", ' ', text)
    return text

#returns the word form
def word_form(text):
    return re.sub(r" ","", space_morph(text))
    
def metrics(y, y_hat):
    assert len(y) == len(y_hat), (y,y_hat)
    ratio = y_hat.count('*')/y.count('*')
    tr,pr =  np.array([(i,j) for i,j in zip(y,y_hat) if "*" in (i,j)]).T
    tr,pr = as_ones(tr), as_ones(pr)
    precision = precision_score(tr,pr, zero_division = 1)
    recall = recall_score(tr,pr, zero_division = 1)
    f1 =  f1_score(tr,pr, zero_division = 1)
    token_accuracy = int(y==y_hat)
    return( precision, recall, f1, token_accuracy, ratio)

def morph_metrics(gold,segmentations):  
    prec, reca,f1,acc,ra = [], [], [], [], []
    for y,y_hat in zip(gold, segmentations):     
        a = metrics(y, y_hat)
        prec.append(a[0])
        reca.append(a[1])
        f1.append(a[2])
        acc.append(a[3])
        ra.append(a[4])
    m = {'precision': prec , 'recall': reca, 'f1': f1,'token_accuracy':acc, 'ratio':ra}
    return {i: np.array(m[i]).mean() for i in m}

In [None]:
("u|n*a*s*")
[0,1,1,1], 

## Word level tokenization

In word level tokenization, we separate words by spaces. 

In [11]:
word = spanish_gomez.segmentations.apply(lambda x : ''.join([i+'|' for i in unmorph(x.lower())])[:-1]+'*')

In [12]:
len(word)

856

In [13]:
w = [(i,j) for i,j in (set([i for  i in zip(spanish_gomez.segmentations.apply(lambda x: x.lower()), word)]))]

In [14]:
word_segementations = pd.DataFrame(w, columns=['gold','prediction'])

In [15]:
morph_metrics(word_segementations.gold, word_segementations.prediction)

{'precision': 1.0,
 'recall': 0.5234897870491091,
 'f1': 0.6620599739243808,
 'token_accuracy': 0.16297262059973924,
 'ratio': 0.5234897870491091}

In [16]:
### your code goes here ###

## Character level tokenization
In character level tokenization, we separte words by characters and use a special character to denote word boundaries. 

In [17]:
char = spanish_gomez.segmentations.apply(lambda x : ''.join([i+'*' for i in unmorph(x)]))
c = [(i,j) for i,j in (set([i for  i in zip(spanish_gomez.segmentations.apply(lambda x: x.lower()), char)]))]
char_segementations = pd.DataFrame(c, columns=['gold','prediction'])

In [18]:
morph_metrics(char_segementations.gold, char_segementations.prediction)

{'precision': 0.4403130277692194,
 'recall': 1.0,
 'f1': 0.5901873596767226,
 'token_accuracy': 0.036214953271028034,
 'ratio': 2.7003504672897196}

In [19]:
### your code goes here ###

# Subword Tokenization

In this task you will be training multiple tokenization methods by modifying the input and method-specific hyperparamters. 



1. Unnormalized: the data as is
2. Punctuation normalization : separating punction from words using white space
3. Aggressive normalization: punctuation normalization + langugage specific normalization 


## Morfessor

Morfessor is 

In [158]:
from UnsupervisedSegmenters import MorfessorTokenizer as morfessor

In [159]:
from collections import Counter

In [160]:
spanish_morf = morfessor()
spanish_morf.load_model('tokenizers/morf/spanish.pickle')

In [161]:
morfs =  spanish_gomez.segmentations.apply(lambda x: 
                                         "|".join([ i for i in "*".join(spanish_morf.segment_word(unmorph(x).lower(), n=1))+'*']
                                         ).replace('|*|','*').replace('|*','*'))

m = [(i,j) for i,j in (set([i for  i in zip(spanish_gomez.segmentations.apply(lambda x: x.lower()), morfs)]))]
morf_segementations = pd.DataFrame(m, columns=['gold','prediction'])



In [162]:
morph_metrics(morf_segementations.gold, morf_segementations.prediction)

{'precision': 0.7204476314645807,
 'recall': 0.6460669274228595,
 'f1': 0.6476324993274146,
 'token_accuracy': 0.16297262059973924,
 'ratio': 1.0280312907431552}

In [163]:
morf_segementations.head(10)

Unnamed: 0,gold,prediction
0,g|u|s|t*a*,g|u|s|t|a*
1,m|u|n|i|c|i|p*a|l*,m|u*n|i*c|i*p|a*l*
2,h|i|n|c|h*ó*,h|i*n*c|h*ó*
3,h|a|b|r*í*a*,h|a|b*r|í|a*
4,v|a|r|i*a*s*,v|a|r|i|a|s*
5,p|i|d*o*,p|i|d|o*
6,n|o|c|h|e*,n|o|c|h|e*
7,f|l|a|c*o*,f|l|a|c|o*
8,m*í*a*,m|í*a*
9,s|i|g*a*,s|i*g|a*


In [25]:
### your code goes here ###

In [92]:
def sentencepiece_encode(x,model):
    x = unmorph(x)
    x = " ".join(model.encode(x.lower(), out_type=str)).replace('▁ ','').replace('▁','')
    x = x.replace(' ','*')+'*'
    x = "|".join([i for i in x]).replace('|*|','*').replace('|*','*')
    return x

## Byte-Pair Encoding

Train byte-pair encoding tokenizers using the following configurations:

1. Normalization
    1. raw
    2. punctuation normalization 
    3. aggresive normalization


2. vocabulary size
    1. 500 
    2. 1000 
    3. 2000
    4. 3000
    5. 4000
    6. 5000

In [93]:
### your code goes here ###

In [94]:
spanish_bpe = spm.SentencePieceProcessor(model_file='tokenizers/bpe/spanish.5000.model')

In [95]:
bpe  = spanish_gomez.segmentations.apply(lambda x: sentencepiece_encode(x,model=spanish_bpe))                                         
b = [(i,j) for i,j in (set([i for  i in zip(spanish_gomez.segmentations.apply(lambda x: x.lower()), bpe)]))]
bpe_segementations = pd.DataFrame(b, columns=['gold','prediction'])



In [96]:
morph_metrics(bpe_segementations.gold, bpe_segementations.prediction)

{'precision': 0.8399174272055628,
 'recall': 0.5809865275966971,
 'f1': 0.649088388071439,
 'token_accuracy': 0.1694915254237288,
 'ratio': 0.7872229465449804}

In [97]:
bpe_segementations.head(10)

Unnamed: 0,gold,prediction
0,n|i|ñ*o*s*,n|i|ñ|o|s*
1,g|u|s|t*a*,g|u|s|t|a*
2,e|n*f|r|e|n|t*e*,e|n|f*r|e*n|t|e*
3,p|i|d*o*,p|i|d|o*
4,n|o|c|h|e*,n|o|c|h|e*
5,c|r|e*e*s*,c|r|e|e|s*
6,f|l|a|c*o*,f|l|a|c|o*
7,s|i|g*a*,s|i*g|a*
8,p|r|i|s|a*,p|r|i|s|a*
9,a|r|r|i|b*a*,a|r|r|i|b|a*


## Unigram Language Modeling 

Train unigram language modeling tokenizers using the following configurations:

1. Normalization
    1. raw
    2. punctuation normalization 
    3. aggresive normalization


2. vocabulary size
    1. 500 
    2. 1000 
    3. 2000
    4. 3000
    5. 4000

In [98]:
spanish_ulm = spm.SentencePieceProcessor(model_file='tokenizers/unigram/spanish.4000.model')

In [99]:
### your code goes here ###

In [100]:
ulm  = spanish_gomez.segmentations.apply(lambda x: sentencepiece_encode(x,model=spanish_ulm))                                         
u = [(i,j) for i,j in (set([i for  i in zip(spanish_gomez.segmentations.apply(lambda x: x.lower()), ulm)]))]
ulm_segementations = pd.DataFrame(u, columns=['gold','prediction'])


In [101]:
morph_metrics(ulm_segementations.gold, ulm_segementations.prediction)

{'precision': 0.8585614950021729,
 'recall': 0.6725771403737506,
 'f1': 0.715507750253513,
 'token_accuracy': 0.2242503259452412,
 'ratio': 0.8934159061277706}

In [102]:
ulm_segementations.head(10)

Unnamed: 0,gold,prediction
0,d|u|r|m*i*ó*,d|u|r|m|i*ó*
1,g|u|s|t*a*,g|u|s|t|a*
2,g|r|i|t*o*s*,g|r|i|t*o|s*
3,m|a|t*ó*,m|a|t*ó*
4,m|u|n|i|c|i|p*a|l*,m|u*n|i*c|i*p|a*l*
5,o|j|a|l|á*,o|j*a*l|á*
6,n|o|c|h|e*,n|o|c|h|e*
7,f|l|a|c*o*,f|l|a|c|o*
8,m*í*a*,m|í*a*
9,s|i|g*a*,s|i*g|a*
