In [52]:
import enchant
import scipy.stats
import numpy as np
import pandas as pd

In [2]:
lang = 'austronesian/tgl'

In [3]:
with open('task0-data/DEVELOPMENT-LANGUAGES/'+lang+'.trn', encoding='utf8') as f:
    data = f.read().split('\n')
    data = [sent.split('\t') for sent in data]
    data.pop(-1)

In [4]:
data[:5]
for l in data:
    if len(l) != 3:
        print('problem: {}'.format(l))

In [102]:
dic = dict()
count = dict()
for sent in data:
    try :
        count[sent[0]]+=1
        dic[sent[0]].append( (sent[1],sent[2]) )
    except KeyError:
        try : 
            count[sent[0]] = 1
            dic[sent[0]] =  [ (sent[1],sent[2]) ,]
        except IndexError:
            raise IndexError(str(sent))

In [103]:
word = list(dic.keys())[2]
print('liban')
l = list(zip(['lib']*3,dic['liban']))
for s, (f, a) in l:
    print(s,f,a)

liban
lib nililibanan V;IPFV;PFOC
lib lumiban V;PFV;AGFOC
lib lumiliban V;IPFV;AGFOC


In [95]:
df = pd.DataFrame(data,columns=['Lemma','Forms','Attrs'],dtype='object')
grouped = df.groupby('Lemma')
d = {}
groups = grouped.groups.keys()
#list(grouped.get_group('liban')['Forms'])
for group in groups:
    d[group] = grouped.get_group(group)[['Forms','Attrs']]
d['abot']

Unnamed: 0,Forms,Attrs
194,aabutin,V;PFOC;LGSPEC1
195,abot,V;NFIN
516,umaabot,V;IPFV;AGFOC
1072,inabutan,V;PFV;PFOC
1218,umabot,V;PFV;AGFOC
1486,nag-aabot,V;IPFV;AGFOC
1505,inabot,V;PFV;PFOC
1850,inaabutan,V;IPFV;PFOC


Rule 1 : the lemma determines the first the grammatical class (N,V,etc.)

In [76]:
# function to find the stem (longest common substring) from the string array
def findstem(arr):
    accept = False
    res = ""
    s = arr[0] # Take first word from array as reference

    for i in range(len(s)):
        for j in range(i + 1, len(s) + 1):

            # generating all possible substrings of our reference string
            stem = s[i:j]
            k = 1
            for k in range(1, len(arr)):
                accept = True
                if stem not in arr[k]:
                    accept = False
                    break

            # If current substring is present in all strings and its length is greater than current result
            if (accept and len(res) < len(stem)):
                res = stem

    return res

def stemDict(dic):
    stem_dic = dict()
    for lemma in list(dic.keys()):
        lemma_plus_form = [lemma] + [a[0] for a in dic[lemma]]
        stem = findstem(lemma_plus_form)
        start = lemma.index(stem)
        end = len(lemma) - (start+len(stem))
        pos = (start, end)

        try :
            stem_dic[pos].append(lemma)
        except KeyError:
            stem_dic[pos] = [lemma ,]
    return stem_dic

def stemDictToXY(dic):
    X = []
    Y = []
    for key in dic.keys():
        X += dic[key]
        Y += [key]*len(dic[key])
    return np.array(X), np.array(Y)

In [86]:
stem_dict = stemDict(dic)
x, y = stemDictToXY(stem_dict)
print([(key, len(stem_dict[key])) for key in stem_dict.keys()])

[((1, 0), 141), ((0, 2), 13), ((0, 0), 112), ((1, 2), 21), ((2, 0), 5), ((2, 2), 1), ((1, 1), 16), ((1, 3), 5), ((0, 4), 8), ((0, 1), 12), ((4, 0), 1), ((0, 3), 7), ((1, 6), 1), ((3, 1), 1)]


In [83]:
def customMetric(x, y):
    return enchant.utils.levenshtein(x, y) / (len(x) + len(y))

class Knn:
    def __init__(self, n_neighbors):
        self.k = n_neighbors
        
    def fit(self, x, y):
        self.X = np.array(x)
        self.y = np.array(y)
     
    def predict(self, test):
        dists = np.zeros(len(self.X))
        for i in range(len(self.X)):
            dists[i] = customMetric(self.X[i], test)
        nn_ids = dists.argsort()[:self.k]
        return tuple(scipy.stats.mode(self.y[nn_ids])[0][0])

knn = Knn(n_neighbors=1)
knn.fit(x,y)
y_hat = knn.predict('liban')
y_hat

(1, 0)

In [51]:
customMetric(x[0,0],'test'), x[0,0]

(0.5555555555555556, 'patay')

In [10]:
gramm_classes = dict()
for lemma in dic.keys():
    gramm_classes[lemma] = dict()
    for form in dic[lemma]:
        gc = form[1].split(';')[0]
        try:
            gramm_classes[lemma][gc]+=1
        except KeyError:
            gramm_classes[lemma][gc]=1


In [11]:
#Number of exceptions
gramm_class_except = [lemma for lemma in gramm_classes.keys() if len(gramm_classes[lemma])>1]
print('Exceptions to Rule1 : ',len(gramm_class_except))
print('Nb of lemma : ', len(dic))
print('% of failure : ',round(len(gramm_class_except)/len(dic),4)*100)

Exceptions to Rule1 :  0
Nb of lemma :  344
% of failure :  0.0


In [12]:
with open('task0-data/DEVELOPMENT-LANGUAGES/'+lang+'.tst', encoding='utf8') as f:
    test_data = f.read().split('\n')
    test_data = [sent.split('\t') for sent in test_data]
    test_data.pop(-1)

In [13]:
# test_data

In [14]:
test_dic = dict()
test_count = dict()
for sent in test_data:
    try :
        test_count[sent[0]]+=1
        test_dic[sent[0]].append( (sent[1]) )
    except KeyError:
        try : 
            test_count[sent[0]] = 1
            test_dic[sent[0]] =  [ (sent[1]) ,]
        except IndexError:
            raise IndexError(str(sent))

In [15]:
not_in_train = 0
train_lemmas_list = list(dic.keys())
for lemma in list(test_dic.keys()):
    if lemma in train_lemmas_list:
        not_in_train +=1

In [16]:
len(list(test_dic.keys()))

266

In [17]:
not_in_train

266

## -------------------------------------

In [18]:
import numpy as np
data_np = np.array(data)
morpho_attribute_raw = data_np[:,2]
morpho_attribute_splitted  = [attr.split(';') for attr in morpho_attribute_raw]       
morpho_attribute_flat = np.array([item for sublist in morpho_attribute_splitted for item in sublist])

list_morpho_attribute = np.unique(morpho_attribute_flat)
print(len(list_morpho_attribute), list_morpho_attribute)

# onehot_dict = {}
# for i, m in enumerate(list_morpho_attribute):
#     #vect = [0]*len(list_morpho_attribute)
#     vect = np.zeros(len(list_morpho_attribute))
#     vect[i] = 1
#     onehot_dict[m] = vect
    
# onehot_dict

7 ['AGFOC' 'IPFV' 'LGSPEC1' 'NFIN' 'PFOC' 'PFV' 'V']
