# Welcome to pyiunstir!


This is a model to identify different words in iberian 6-symbol fragments

Exaple: "teitataŕeseŕaśoankeibonatintaneśte" should be read as "deitataŕes eŕaśoan geibon adintaneśde"


In [7]:
import json
import numpy as np
from pyiunstir.encoding import *

import itertools
from collections import Counter
from sklearn.preprocessing import LabelEncoder

from sklearn.ensemble import RandomForestClassifier


f_json = open('corpus/iberian.json', 'r')
symbols = json.load(f_json)
f_json.close()

f_json = open('corpus/NE_database.json', 'r')
corpus_NE = json.load(f_json)
f_json.close()



### Extract the NE corpus and remove the scriptio continua being used for validation

In [8]:
words = []

for instance in corpus_NE:
    line = instance['text']
    # Remove the validation example from the training set
    if instance['text_simplified'][0].find('teitata') == 0:
        continue
    for row in line:
        word = []
        for w in row:
            symbol = ' '
            for csymbol in symbols:
                if w == csymbol['id']:
                    symbol = csymbol['simplified']
                    word.append(symbol)
                    break
        words.append(word)

symbol_set = list(set(list(itertools.chain.from_iterable(words))))
labelencoder = LabelEncoder()
labelencoder = labelencoder.fit(symbol_set)

        
len(words)

2883

### Pre-process the training set of sequence for the segmentation model

In [9]:
X = []
y = []
n1 = 0
for w0 in words:
 #print(len(w0))
 for c1 in range(len(w0)):
    c2 = c1+7
    frag =w0[c1:c2]
    if len(frag) == 7:
        n1 += 1
        label = 0
        if frag[3] == ':':
            label = 1
        vect = []
        for n in range(len(frag)):
            if n == 3:
                continue
            c = frag[n]
            vect.append(labelencoder.transform([c])[0])
        X.append(vect)
        y.append(label)
        #print(frag,label, vect)

In [10]:
n1

9894

In [11]:
Counter(y)

Counter({0: 8591, 1: 1303})

In [12]:
m = RandomForestClassifier()
m.fit(X,y)


RandomForestClassifier()

### Validate model with the scriptio

In [13]:
tq_ib = break_syl('teitataŕeseŕaśoankeibonatintaneśte')
#tq_ib = break_syl('sŕkoanetabebentosutanbirtebitu')


In [14]:
for w0 in [tq_ib]:
 if 1 == 1:
  print(w0)
  w0b = w0
  nw = []
  wclean= []
  for q in w0:
    if q == ':':
        continue
    wclean.append(q)
  for c1 in range(len(wclean)):
    c2 = c1+6
    frag =wclean[c1:c2]
    if len(frag) == 6:
        print(frag)
        vectt = labelencoder.transform(frag).tolist()
        t0 = ''.join(frag[0:3])
        t1 = ''.join(frag[3:])
        prob = m.predict_proba([vectt])[0][1]
        if prob > 0.5:
            print('-->',  t0,':',t1, prob)
            nw.append(c1+3)
            
  if len(nw) > 0:
    final_w = []
    for i in range(len(wclean)):
        if i in nw:
            final_w.append(':')
        final_w.append(wclean[i])
    print('==>',''.join(w0b))

    print('-->',''.join(final_w))
print('..> teitataŕes:eŕaśoan:keibon:atintaneśte')


['te', 'i', 'ta', 'ta', 'ŕ', 'e', 's', 'e', 'ŕ', 'a', 'ś', 'o', 'a', 'n', 'ke', 'i', 'bo', 'n', 'a', 'ti', 'n', 'ta', 'n', 'e', 'ś', 'te']
['te', 'i', 'ta', 'ta', 'ŕ', 'e']
['i', 'ta', 'ta', 'ŕ', 'e', 's']
['ta', 'ta', 'ŕ', 'e', 's', 'e']
['ta', 'ŕ', 'e', 's', 'e', 'ŕ']
['ŕ', 'e', 's', 'e', 'ŕ', 'a']
--> ŕes : eŕa 0.75
['e', 's', 'e', 'ŕ', 'a', 'ś']
['s', 'e', 'ŕ', 'a', 'ś', 'o']
--> seŕ : aśo 0.76
['e', 'ŕ', 'a', 'ś', 'o', 'a']
['ŕ', 'a', 'ś', 'o', 'a', 'n']
--> ŕaś : oan 0.52
['a', 'ś', 'o', 'a', 'n', 'ke']
['ś', 'o', 'a', 'n', 'ke', 'i']
['o', 'a', 'n', 'ke', 'i', 'bo']
['a', 'n', 'ke', 'i', 'bo', 'n']
['n', 'ke', 'i', 'bo', 'n', 'a']
['ke', 'i', 'bo', 'n', 'a', 'ti']
['i', 'bo', 'n', 'a', 'ti', 'n']
--> ibon : atin 0.59
['bo', 'n', 'a', 'ti', 'n', 'ta']
['n', 'a', 'ti', 'n', 'ta', 'n']
['a', 'ti', 'n', 'ta', 'n', 'e']
['ti', 'n', 'ta', 'n', 'e', 'ś']
['n', 'ta', 'n', 'e', 'ś', 'te']
--> ntan : eśte 0.59
==> teitataŕeseŕaśoankeibonatintaneśte
--> teitataŕes:eŕ:aś:oankeibon:atintan:e

### As it can be seen, the segmentation model allows overlapping of cleavage sites even when the resulting word is very unlikely (2 symbols). It also missed a site and removed the ergative ending of the personal name "Adintanes"