# Welcome to pyiunstir!

### This notebook shows how to build a Part-of-Speech tagger using pyiunstir.
### This POS tagger takes advantage of the combinatorial nature of some parts of the Iberian language (names, verbs, numbers) to tag the terms.

In [1]:
import json

from pyiunstir.encoding import *
from pyiunstir.pos_tagger import POS_tagger

# All corpus files are read as UTF-8 explicitly: the inscriptions contain
# non-ASCII characters (e.g. ŕ, ś, ḿ), so relying on the platform's default
# encoding can fail or mis-decode them. The `with` blocks guarantee each file
# is closed even if JSON parsing raises.

# Load the encodings
with open('corpus/iberian.json', 'r', encoding='utf-8') as f_json:
    symbols = json.load(f_json)

# Load a collection of texts written using the Northeastern script
with open('corpus/NE_database.json', 'r', encoding='utf-8') as f_json:
    corpus_NE = json.load(f_json)

# Load a collection of texts written using the Southeastern script
with open('corpus/SE_database.json', 'r', encoding='utf-8') as f_json:
    corpus_SE = json.load(f_json)

# Load a collection of texts written using the Greek Iberian script
with open('corpus/GreekIberian_database.json', 'r', encoding='utf-8') as f_json:
    corpus_GI = json.load(f_json)



In [2]:
# Create the POS tagger instance; the cells below call tagger.tag_word() on each word.
tagger = POS_tagger()

## Let's find documents with the "kutu" particle. It is thought to be related to letters (sending a letter), alphabets (kutu = abc) and writing in general.

In [3]:
# Print every Northeastern text that contains "kutu", followed by each of its
# words (after standarize_word) together with the tags returned by the tagger.
for scp in corpus_NE:
    # Inscriptions whose support is a coin are skipped.
    if scp['support'] == 'coin':
        continue
    for t in scp['text_simplified']:
        # Only keep texts that mention the particle.
        if 'kutu' not in t:
            continue
        print('===', t)
        # Words inside a text are separated by ':'.
        for w in t.split(':'):
            w = standarize_word(w)
            print(w, tagger.tag_word(w))
            


=== kutui
kutui []
=== kutui
kutui []
=== kutuka
kutuka []
=== iŕe:bototaś:bitebakiŕś:bane:baŕenḿliki:antinḿlitu:tuŕane:aŕikaŕ:sekeniusu:atilebeiu:lauŕiskeŕkate:banḿliŕbai:tuŕane:kaisanḿliŕbai:tuŕane:itailiniŕe:kutur:biteŕoketetine:eŕatiaŕe:kokor:tauebartiate:aŕikaŕ:binḿlikise:iunstirlaku:bototaśeai:selkeai:bartuneai:unibeikeai:aneŕai:unibeikeai:iunstirlaku:uskeike:bototiki:keietisiatense:uśtalar:ilune:ban:ḿiŕ:eśu_lu:bitiŕoke:betense:uśkeaneŕ:lati
ire []
bototas []
bitebakirs []
bane [['Verb', ['bane', '', 'ban', 'e']]]
marenaliki []
antinalitu []
turane []
arikar []
sekeniusu []
atilebeiu []
lauriskerkate [['Personal Name', ['lauriskerkate', 'lauriskerkate', 'laurisker', 'laur', 'isker', '?', '']]]
banalirbai []
turane []
kaisanalirbai []
turane []
itailinire []
kutur [['Name', ['kutur', 'kutur', '', ' letter/document']]]
biteroketetine [['Verb', ['biteroketetine', 'bite', 'rok', 'etetine']]]
eratiare []
kokor []
tauebartiate []
arikar []
binalikise []
iunstirlaku []
bototaseai []
sel

## Now apply the same tagger to Southeastern texts (different encoding)

In [4]:
# Print every Southeastern text, followed by each of its words (after
# standarize_word) together with the tags returned by the tagger.
for scp in corpus_SE:
    # Inscriptions whose support is a coin are skipped.
    if scp['support'] == 'coin':
        continue
    for t in scp['text_simplified']:
        print('===', t)
        # Words inside a text are separated by ':'.
        for w in t.split(':'):
            w = standarize_word(w)
            print(w, tagger.tag_word(w))


=== aitikeltunki:iunśtir:bekoŕ:śalbitas:teŕoketa:banotaŕan
aitikeltunki []
iunstir [['Verb', ['iunstir', 'i', 'unst', 'ir']]]
bekor []
salbitas []
teroketa [['Sim Verb', ['teroketan', 'te', 'rok', 'etan']]]
banotaran []
=== kobeśirekian
kobesirekian []
=== kaŕesi_
karesi_ []
=== iskeŕiar
iskeriar [['Sim Personal Name', ['iskerir', 'iskerir', 'iskerir', 'isker', 'ir', '', '']], ['Sim Personal Name', ['iskerirar', 'iskerirar', 'iskerir', 'isker', 'ir', 'for/from', '']], ['Sim Personal Name', ['iskerar', 'iskerar', 'isker', 'isker', '', 'for/from', '']], ['Sim Name', ['iseriar', 'eriar', 'is', 'this pottery']]]
=== koniltiŕar:bitiar
koniltirar [['Sim Personal Name', ['lakoniltirar', 'lakoniltirar', 'lakoniltir', 'lakon', 'iltir', 'for/from', '']], ['Sim Personal Name', ['bekoniltirar', 'bekoniltirar', 'bekoniltir', 'bekon', 'iltir', 'for/from', '']]]
bitiar [['Sim Verb', ['bitar', 'bi', 'tar', '']], ['Sim Verb', ['bititar', 'biti', 'tar', '']]]
=== aituŕkin:bitiar
aiturkin []
bitiar [['Si

### Finally, apply the tagger to texts written in Greek Iberian

In [3]:
# Print every Greek Iberian text, followed by each of its words (after
# standarize_word) together with the tags returned by the tagger.
for scp in corpus_GI:
    # Inscriptions whose support is a coin are skipped.
    if scp['support'] == 'coin':
        continue
    for t in scp['text_simplified']:
        print('===', t)
        # Words inside a text are separated by ':'.
        for w in t.split(':'):
            w = standarize_word(w)
            print(w, tagger.tag_word(w))


=== iŕike:orti:kaŕokan:tatula:baśk:buiśtiner:bakaŕok:sss:tuŕlbailuŕa:lekuśekik:baseŕokeiunbaita:uŕke:basbitiŕbartin:iŕike:baseŕokar:tebint:belakaśikauŕ:isbinai:askantis:takiskaŕok:binikebin:śalir:kitei:kaibikait
irike []
orti []
karokan []
tatula []
bask []
buistiner []
bakarok []
sss []
turlbailura []
lekusekik []
baserokeiunbaita []
urke []
basbitirbartin []
irike []
baserokar []
tebint []
belakasikaur []
isbinai []
askantis []
takiskarok []
binikebin [['Sim Quantity', ['binkebin', 'binkebin', 'binke', 'bin', '']]]
salir [['Name', ['salir', 'salir', '', ' money']]]
kitei [['Name', ['kitei', 'kitei', '', ' silver']]]
kaibikait []
=== sakaŕiskeŕar:nai
sakariskerar [['Personal Name', ['sakariskerar', 'sakariskerar', 'sakarisker', 'sakar', 'isker', 'for/from', '']]]
nai [['Verb', ['nai', 'n', 'ai', 'I am']]]
=== iunstir:śalirk:basiŕtiŕ:sabaŕitai:birinaŕ:kuŕś:boiśtinkiśtit:seśkeŕśtuŕan:sestiŕkatetin:seŕaikala:naltinke:bitutenin:iltuniŕaenai:bekoŕ:sebaketiŕan
iunstir [['Verb', ['iunstir', 