In [1]:
import numpy as np
import pandas as pd
from collections import defaultdict

In [2]:
lang = 'uralic/fin'

In [3]:
with open('../sigmorphon2020/DEVELOPMENT-LANGUAGES/'+lang+'.trn', encoding='utf8') as f:
    data = f.read().split('\n')
    data = [sent.split('\t') for sent in data]
    data.pop(-1)
data = np.array(data,dtype='object')

In [4]:
df = pd.DataFrame(data,columns=['Lemma','Forms','Attrs'],dtype='object')
list(df.groupby('Attrs').groups.keys())

['ADJ;ACC;PL',
 'ADJ;ACC;SG',
 'ADJ;AT+ABL;PL',
 'ADJ;AT+ABL;SG',
 'ADJ;AT+ALL;PL',
 'ADJ;AT+ALL;SG',
 'ADJ;AT+ESS;PL',
 'ADJ;AT+ESS;SG',
 'ADJ;COM;PL',
 'ADJ;FRML;PL',
 'ADJ;FRML;SG',
 'ADJ;GEADJ;PL',
 'ADJ;GEADJ;SG',
 'ADJ;IN+ABL;PL',
 'ADJ;IN+ABL;SG',
 'ADJ;IN+ALL;PL',
 'ADJ;IN+ALL;SG',
 'ADJ;IN+ESS;PL',
 'ADJ;IN+ESS;SG',
 'ADJ;INS;PL',
 'ADJ;NOM;PL',
 'ADJ;NOM;SG',
 'ADJ;PRIV;PL',
 'ADJ;PRIV;SG',
 'ADJ;PRT;PL',
 'ADJ;PRT;SG',
 'ADJ;TRANS;PL',
 'ADJ;TRANS;SG',
 'N;ACC;PL',
 'N;ACC;SG',
 'N;AT+ABL;PL',
 'N;AT+ABL;SG',
 'N;AT+ALL;PL',
 'N;AT+ALL;SG',
 'N;AT+ESS;PL',
 'N;AT+ESS;SG',
 'N;COM;PL',
 'N;FRML;PL',
 'N;FRML;SG',
 'N;GEN;PL',
 'N;GEN;SG',
 'N;IN+ABL;PL',
 'N;IN+ABL;SG',
 'N;IN+ALL;PL',
 'N;IN+ALL;SG',
 'N;IN+ESS;PL',
 'N;IN+ESS;SG',
 'N;INS;PL',
 'N;NOM;PL',
 'N;NOM;SG',
 'N;PRIV;PL',
 'N;PRIV;SG',
 'N;PRT;PL',
 'N;PRT;SG',
 'N;TRANS;PL',
 'N;TRANS;SG',
 'V;COND;PL;1;POS;PRS;ACT',
 'V;COND;PL;2;POS;PRS;ACT',
 'V;COND;PL;3;POS;PRS;ACT',
 'V;COND;POS;PRS;PASS',
 'V;COND;SG;1;PO

In [5]:
class BytePairEncoder():
    '''
    BPE algorithm
    '''

    def __init__(self):
        self.ngrams  = {'UNK'}
        self.word_series = None
        self.most_freq_pair = None

    def get_most_freq_pair(self):
        pairs = defaultdict(int)
        for form in self.word_series:
            symbols = form.split()
            for i in range(len(symbols) -1):
                pairs[symbols[i],symbols[i+1]] += 1
        return max(pairs, key=pairs.get)

    def merge(self):
        ab = ''.join(self.most_freq_pair)
        # add 'ab' to the set of ngrams
        self.ngrams.add(ab)
        new_series = np.empty(len(self.word_series),dtype='object')
        for i,form in enumerate(self.word_series):
            # turn 'a b' to 'ab'
            new_series[i] = form.replace(' '.join(self.most_freq_pair),ab )
        self.word_series = new_series
    
    def fit(self,corpus,niter):
        self.word_series = [' '.join(list(w)) for w in corpus]
        for word in self.word_series:
            s = set(word)
            self.ngrams = self.ngrams | s

        for _ in range(niter):
            self.most_freq_pair = self.get_most_freq_pair()
            self.merge()
    



In [6]:
class NgramsExtractor():
    '''
    
    '''
    def __init__(self):
        self.train_data = None
        self.train_stems = None
        self.ntrains = 0
        self.mapping = dict()
    
    def fit(train_set,stems = None):
        if stems is None:
            stemmer = Stemmer()
            stemmer.fit(train_set)
            raise NotImplementedError
        else:
            if len(stems)!= data.shape[0]:
                raise ValueError('Length of stems list not matching dataset length.')
            self.train_stems = pd.Series(stems)
        
        self.train_data = pd.DataFrame(train_set,columns=['Lemma','Forms','Attrs'],dtype='object')
        self.ntrains = self.train_data.shape[0]
        grouped = self.train_data.groupby('Attrs')
        groups = grouped.groups.keys()
        for group in groups:
            
            grouped.get_group(group)
            
            

In [33]:
bpe = BytePairEncoder()
bpe.fit(["mangerai","mangions","goûterai","goûta","punirai","finissais", "finiras"],10)
print(np.sort(list(bpe.ngrams)))

[' ' 'UNK' 'a' 'e' 'erai' 'f' 'g' 'go' 'goû' 'goût' 'i' 'm' 'ma' 'man'
 'mang' 'n' 'ni' 'o' 'p' 'r' 'ra' 'rai' 's' 't' 'u' 'û']
