In [1]:
import pandas as pd
import pickle

morphemes = pd.read_pickle("morphemes_protoslavic.pkl")


In [532]:
morphemes.to_pickle("morphemes_protoslavic_v2.pkl")

In [315]:
import re

In [138]:
def set_base_verb(g, base_verb):
    morphemes.loc[
        g.index,
        "base_verb"
    ] = base_verb


In [2]:
AO = 'ꜵ'
YI = "⒥"
NASAL = "⒩"
BACKTICK = "’"

In [230]:
def split_carefully(x, true_stem):
    if " " in x:
        x = x.split(" ")[0]
    if any(weird in true_stem for weird in "[]()?"):
        splitted = re.split(true_stem, x)
        found_stem = x[len(splitted[0]): -len(splitted[1])]
        assert len(splitted) == 2
        return splitted[0], found_stem, splitted[1]
    x = x.replace('å', 'a')
    true_stem = true_stem.replace('å', 'a')

    if AO in true_stem:
        if true_stem.replace(AO, "a") in x:
            return list(x.partition(true_stem.replace(AO, "a")))
    return x.partition(true_stem.replace(AO, "o"))

def insert_YI(word, signature):

    if " " in word:
        word = word.split(" ")[0]

    if word[-3:] == "ęti":
        word = word[:-3] + "in" + NASAL + "ti"
    if "trim" not in word and "klimat" not in word:
        word = word.replace("idti", "jdti").replace("imati", "jmati")
    if signature == {'s', 't'}:
        if word[-3:] == "sti":
            # rasti -> rastti
            word = word[:-3] + "stti"
        
    base, end = word[:-5], word[-5:]
    
    # jehati
    if signature == {'đ', 'h'}:
        end = end.replace("žđati", "h" + YI + "ati")
    
    if signature == {'ć', 's'}:
        end = end.replace("šćati", "st" + YI + "ati")
    if signature == {'ć', 't'}:
        end = end.replace("šćati", "st" + YI + "ati")
        end = end.replace("ćati", "t" + YI + "ati")
    if signature == {'ć', 'č'}:
        end = end.replace("čiti", "ćiti")
        end = end.replace("čati", "ćati")

    if signature == {'đ', 'd'}:
        end = end.replace("žđati", "zd" + YI + "ati")
        end = end.replace("đati", "d" + YI + "ati")

    # puskati?
    if signature == {'ć', 'k'}:
        end = end.replace("šćati", "sk" + YI + "ati")
    if signature == {"c", "č"}:
        end = end.replace("čati", "c" + YI + "ati")
        end = end.replace("čiti", "c" + YI + "iti")

    if signature == {'k', 'č'}:
        base, end = word[:-6], word[-6:]
        end = end.replace("ščiti", "sk" + YI + "iti")
        end = end.replace("čiti", "k" + YI + "iti").replace("čivati", "k" + YI + "ivati")


    # pisati
    if signature == {'š', 's'}:  
        end = end.replace("šati", "s" + YI + "ati")
        end = end.replace("šiti", "s" + YI + "iti")
        
    if signature == {'š', 'h'}:
        end = end.replace("šiti", "h" + YI + "iti")
        end = end.replace("šati", "h" + YI + "ati")
    if signature == {'š'}:
        base, end = word[:-6], word[-6:]
        end = end.replace("šiti", "h" + YI + "iti")
        end = end.replace("šati", "h" + YI + "ati")
        end = end.replace("šivati", "h" + YI + "ivati")
        

    if signature == {'č', 'k'}:
        end = end.replace("čati", "k" + YI + "ati")
        
    if signature == {'ž', 'z'}:
        end = end.replace("žati", "z" + YI + "ati")
    if signature == {'ž', 'g'}:
        base, end = word[:-6], word[-6:]

        end = end.replace("žati", "g" + YI + "ati")
        end = end.replace("žiti", "g" + YI + "iti")
        end = end.replace("živati", "g" + YI + "ivati")

    return base + end

def manual_insert(base_verb, true_stem, last_cons=set(), dry_run=True, from_base_verb=True):
    if from_base_verb:
        g = morphemes.query("base_verb == @base_verb")
    else:
        g = morphemes.query("isv == @base_verb")
    manual_insert_g(g, true_stem, last_cons, dry_run)

def manual_insert_g(g, true_stem, last_cons=set(), dry_run=True):

    for i, row in g.isv.apply(lambda x: split_carefully(insert_YI(x, last_cons), true_stem)).items():
        if any(weird in true_stem for weird in "[]()?"):
            found_stem = row[1]
        else: 
            found_stem = true_stem

        if not dry_run:
            morphemes.loc[i, ['_prefix', '_stem', '_suffix']] = row[0], found_stem, row[2]
        raw_isv = g.loc[i, 'isv']
        #if also_insert_noun:
        #    morphemes.loc[[i], 'base_noun'] = true_stem
        #    print(raw_isv, " <- ", true_stem)

        print(raw_isv, "=", row[0], found_stem, row[2])
        



In [691]:
A = 65535 * (1-0.99)
trunc_B = 65535 
B = 76035

B/(A+B) - trunc_B/(A+trunc_B)

0.00135558640741118

In [4]:
import json

with open("wiktionary_extended_new.json", "r", encoding="utf8") as f:
    words_data = json.load(f)

In [5]:
morphemes.prefix.unique()

array(['?', '’', 'de’', 'dez’', 'na’', 'obez’', 'råz’', 'sȯ’', 'vȯ’',
       'vȯz’', 'v’', 'vy’', 'za’'], dtype=object)

In [680]:

possible_prefixes = {
    "ne", "bez", "naj",
    "do", "iz", "na", "nad", "ne", "o", "ob", "obez", "od", "po", "pod", 
    "prě", "prěd", "pri", "pro", 
    "råz",
    "de", 
    "s", "sȯ",
    "sų", "u", "v", 
    "vȯz", "vȯ",
    "vy", "za"
}

In [None]:
¬ne¬prě√měn^n|y

In [6]:
svc = morphemes._suffix.value_counts()
pvc = morphemes._prefix.value_counts()


sWTF = svc[svc < 6].index
pWTF = pvc[pvc < 6].index


morphemes.query("_prefix in @pWTF or _suffix in @sWTF or _suffix == ''")

Unnamed: 0,id,isv,addition,partOfSpeech,en,genesis,pos,prefix,verb_stem,base_verb,isv_orig,left_stem_cand,right_stem_cand,derived_nouns,base_noun,_prefix,_stem,_suffix,_is_irregular,reconstructed
130,24156.0,aklimatizovati,,v.intr. ipf.,acclimatize,I,verb,?,?,aklimatizovati,aklimatizovati,aklimatiz,aklimatizovati,|aklimatizacija,klimat,a,klimat,izovati,0,
144,24165.0,akompanovati,,v.tr. ipf.,accompany,I,verb,?,?,akompanovati,akompanovati,akompan,akompanovati,,,a,kompan,ovati,0,
338,37.0,anulovati,,v.tr. ipf./pf.,"cancel, nullify",I,verb,?,?,anulovati,anulovati,anul,anulovati,|anulacija,,a,nul,ovati,0,
417,6246.0,asimilovati,,v.tr. ipf.,assimilate,I,verb,?,?,asimilovati,asimilovati,asimil,asimilovati,|asimilacija,,a,simil,ovati,0,
784,36698.0,bezpokojiti,,v.refl. ipf.,"worry, be anxious, feel uneasy",,verb,?,?,†pokojiti,bezpokojiti sę,bezpoko,?kojiti?,,pokoj,¬bez,pokoj,iti,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17352,5474.0,zapakovyvati,,v.tr. ipf.,pack,I,verb,?,?,pakovati,zapakovyvati,zapak,?pakovyvati?,,,¬za,pak,ovyvati,0,
17397,21659.0,zapoznati,,v.tr. pf.,acquaint,,verb,?,?,znati,zapoznati,zapozn,znati,,,¬za¬po,zn,ati,0,(zapo)+znati
17407,21238.0,zapropastiti,,v.refl. pf.,disappear completely,,verb,?,?,pasti,zapropastiti sę,zapropast,?pastiti?,,,¬za¬pro,pas,titi,0,?pasti
17539,23977.0,zauvažati,,v.tr. ipf.,notice,,verb,?,?,važiti,zauvažati,zauvaž,uvažati,,,¬za¬u,vag,⒥ati,0,


In [633]:
PREV = set(ORPHANS) or set(morphemes.base_verb.unique())
PREV = PREV & set(morphemes.base_verb.unique())

ORPHANS = set()

for base in PREV:
    if check_if_orphan(base):
        print(base, check_if_orphan(base))
        ORPHANS |= {base}



råzmetati True
naklåti True


In [634]:
ORPHANS -= {'†ložiti', '†niziti', '†pokojiti', '†vysiti',
 '†gynųti',
 '†jati',
 '†kladati',
 '†lagati',
 '†městiti',
 '†nažiti',
 '†niknųti',
 '†pitati',
 '†plåšiti',
 '†spěti',
 '†strěti',
 '†trčati',
 '†vihnųti',
  'končati', 'jęti', 'slåditi', 'věděti', 'poriti', 'siliti', 'sȯmknųti', 'skråtiti', 'vladati', 'imati',
           "aklimatizovati", 'belěti', 'mråzosušati', 'bočiti', 'programovati', 'projektovati', 
            'obuzdati', 'opravdati', 'odšlupati', "ogråditi", "råzgråditi", "podobniti", 'prisposobiti',
            'dokonati', 'drěmnųti', 'nagråditi', 'zamknųti', 'utočniti', 'trojiti', 'niti', 'sȯvětovati', 'kati', 'obuti', 'odčajati',
            #?
            'vŕnųti', 'mrěti', 'sųsrědotočati', 'sȯniti', 'sloviti', 'občiti', 'pok' + YI + 'iti', #''
           }

morphemes.query("base_verb in @ORPHANS").shape, len(ORPHANS)
# # nakalati

((3, 19), 2)

## ANOMALY 1: missing prefixes

In [183]:
morphemes.query("_prefix == '' and prefix not in ['?', @BACKTICK]")#._prefix.value_counts()



Unnamed: 0,id,isv,addition,partOfSpeech,en,genesis,pos,prefix,verb_stem,base_verb,...,left_stem_cand,right_stem_cand,derived_nouns,base_noun,_prefix,_stem,_suffix,_is_irregular,reconstructed,aspect
14076,34119.0,sȯhnųti,,v.intr. ipf.,"dry (intr.), become dry, wither",,verb,sȯ’,hnųti,sȯhnųti,...,sȯh,?hnųti?,,,,sȯh,nųti,0,,ipf
14077,34121.0,sȯhnųti,,v.intr. ipf.,pine away,,verb,sȯ’,hnųti,sȯhnųti,...,sȯh,?hnųti?,,,,sȯh,nųti,0,,ipf
14119,4150.0,sȯsati,,v.tr. ipf.,suck,,verb,sȯ’,sati,sȯsati,...,sȯs,?ati?,|sȯska,,,sȯs,ati,0,,ipf


In [181]:
morphemes.query("isv.str.contains('stv')")#._prefix.value_counts()


Unnamed: 0,id,isv,addition,partOfSpeech,en,genesis,pos,prefix,verb_stem,base_verb,...,left_stem_cand,right_stem_cand,derived_nouns,base_noun,_prefix,_stem,_suffix,_is_irregular,reconstructed,aspect
1568,24981.0,črstvěti,,v.intr. ipf.,become stale,,verb,?,?,črstvěti,...,črstv,črstvěti,,,,črstv,ěti,0,,ipf
4592,14605.0,jestvovati,,v.intr. ipf.,exist,,verb,?,?,byti,...,jestv,jestvovati,,,,,,1,?byti,ipf
9061,36666.0,odsųtstvovati,,v.intr. ipf.,"be absent, be away",,verb,?,?,byti,...,odsųtstv,sųtstvovati,,,,,,1,?byti,ipf
11859,23860.0,prisųtstvovati pri,(+6),v.intr. ipf.,"attend, be present at",,verb,?,?,byti,...,prisųtstvovati pri,?tstvovati pri?,,,,,,1,?byti,ipf
14056,34940.0,sȯčuvstvovati,,v.intr. ipf.,"sympathize, commiserate, feel compassion",,verb,sȯ’,čuvstvovati,čuvati,...,sȯčuvstv,?čuvstvovati?,,,,,,1,?čuti,ipf
14589,6306.0,stvarjati,,v.tr. ipf.,"create, make, form",,verb,?,?,tvoriti,...,stvar,?tvarjati?,,,¬s,tvꜵr,jati,0,?tvoriti,ipf
14593,6307.0,stvoriti,,v.tr. pf.,"create, make, form",,verb,?,?,tvoriti,...,stvor,tvoriti,,,¬s,tvꜵr,iti,0,sъtvoriti,pf
14595,18738.0,stvŕditi,,v.tr. pf.,"state, claim, assert, allege, affirm, contend,...",,verb,?,?,tvŕditi,...,stvŕd,tvŕditi,,,¬s,tvŕd,iti,0,,pf
14707,36821.0,sųtstvovati,,v.intr. ipf.,exist,,verb,?,?,byti,...,sųtstv,?tstvovati?,,,,,,1,?byti,ipf
15578,2948.0,učęstvovati,,v.intr. ipf.,participate,,verb,?,?,,...,učęstv,?čęstvovati?,,čęsť,¬u,čęstv,ovati,0,?uměti,ipf


In [182]:

g = morphemes.query("isv == 'sȯvětovati'")
manual_insert_g(g, "vět", dry_run=0)
set_base_verb(g, 'sȯvětovati')

g = morphemes.query("isv == 'sȯčuvstvovati'")
manual_insert_g(g, "čuvstv", dry_run=0)

g = morphemes.query("isv == 'vȯzpitati'")
manual_insert_g(g, "pit", dry_run=0)

g = morphemes.query("base_verb == 'sȯzdati'")
manual_insert_g(g, "zd", dry_run=0)
set_base_verb(g, 'zdati')



sȯvětovati = sȯ vět ovati
sȯčuvstvovati = sȯ čuvstv ovati
vȯzpitati = vȯz pit ati


In [178]:
#morphemes.query("base_verb == 'sil'")
g = morphemes.query("base_verb.str.contains('sȯs')")
g

Unnamed: 0,id,isv,addition,partOfSpeech,en,genesis,pos,prefix,verb_stem,base_verb,...,left_stem_cand,right_stem_cand,derived_nouns,base_noun,_prefix,_stem,_suffix,_is_irregular,reconstructed,aspect
4274,4151.0,izsȯsati,,v.tr. pf.,suck out,,verb,?,?,sȯsati,...,izsȯs,sȯsati,,,¬iz,sȯs,ati,0,,pf
4297,5567.0,izsysati,,#v.tr. ipf.,suck out,,,?,?,sȯsati,...,izsys,?ysati?,,,¬iz,sys,ati,0,,ipf
14119,4150.0,sȯsati,,v.tr. ipf.,suck,,verb,sȯ’,sati,sȯsati,...,sȯs,?ati?,|sȯska,,,sȯs,ati,0,,ipf
16929,20273.0,vsosati,,v.tr. pf.,suck in,,verb,?,?,sȯsati,...,vsos,sosati,,,¬v,sos,ati,0,,pf


In [179]:
morphemes.query("en.str.contains('suck')")


Unnamed: 0,id,isv,addition,partOfSpeech,en,genesis,pos,prefix,verb_stem,base_verb,...,left_stem_cand,right_stem_cand,derived_nouns,base_noun,_prefix,_stem,_suffix,_is_irregular,reconstructed,aspect
4274,4151.0,izsȯsati,,v.tr. pf.,suck out,,verb,?,?,sȯsati,...,izsȯs,sȯsati,,,¬iz,sȯs,ati,0,,pf
4297,5567.0,izsysati,,#v.tr. ipf.,suck out,,,?,?,sȯsati,...,izsys,?ysati?,,,¬iz,sys,ati,0,,ipf
14119,4150.0,sȯsati,,v.tr. ipf.,suck,,verb,sȯ’,sati,sȯsati,...,sȯs,?ati?,|sȯska,,,sȯs,ati,0,,ipf
16929,20273.0,vsosati,,v.tr. pf.,suck in,,verb,?,?,sȯsati,...,vsos,sosati,,,¬v,sos,ati,0,,pf


In [140]:
g = morphemes.query("base_verb == 'deportovati'")
manual_insert_g(g, "port", dry_run=0)

g = morphemes.query("base_verb == 'dezinfikovati'")
manual_insert_g(g, "infik", dry_run=0)
set_base_verb(g, 'infikovati')

g = morphemes.query("base_verb == 'obezglåviti'")
manual_insert_g(g, "glåv", dry_run=0)



deportovati = de port ovati
dezinfikovati = dez infik ovati
obezglåviti = obez glåv iti
obezglåvjati = obez glåv jati


In [155]:
g = morphemes.query("base_verb == 'vȯznikati'")
manual_insert_g(g, "nik", dry_run=0)

g = morphemes.query("isv.str.contains('veli')")
manual_insert_g(g, "velik", dry_run=0, last_cons={"č", 'k'})
set_base_verb(g, 'veličati')

g = morphemes.query("base_verb.str.contains('bogatiti')")
manual_insert_g(g, "bogat", dry_run=0, last_cons={'t', 'ć'})
set_base_verb(g, 'bogatiti')




vȯznikati = vȯz nik ati
vȯznikati = vȯz nik ati
vȯzniknųti = vȯz nik nųti
vȯzniknųti = vȯz nik nųti
prěuveličati = prěu velik ⒥ati
prěuveličiti = prěu velik ⒥iti
uveličati = u velik ⒥ati
uveličiti = u velik ⒥iti
vȯzveličati = vȯz velik ⒥ati
vȯzveličivati = vȯz velik ⒥ivati
obogaćati = o bogat ⒥ati
obogatiti = o bogat iti
vȯzbogaćati = vȯz bogat ⒥ati
vȯzbogatiti = vȯz bogat iti


In [166]:
g = morphemes.query("base_verb.str.contains('plȯt')")
manual_insert_g(g, "plȯt", dry_run=0, last_cons={'t', 'ć'})

g = morphemes.query("base_verb.str.contains('vŕš[ia]ti')")
manual_insert_g(g, "vŕš", dry_run=0, last_cons={'t', 'ć'})
set_base_verb(g, 'vŕšiti')

g = morphemes.query("base_verb.str.contains('staviti')")
manual_insert_g(g, "stav", dry_run=0)#last_cons={'t', 'ć'})
set_base_verb(g, 'staviti')

g = morphemes.query("base_verb.str.contains('kres[ai]')")
manual_insert_g(g, "kres", dry_run=0, last_cons={'s', 'š'})
set_base_verb(g, 'kresati')


vȯplȯćati = vȯ plȯt ⒥ati
vȯplȯtiti = vȯ plȯt iti
dovŕšati = do vŕš ati
dovŕšiti = do vŕš iti
sȯvŕšati = sȯ vŕš ati
sȯvŕšati = sȯ vŕš ati
sȯvŕšiti = sȯ vŕš iti
sȯvŕšiti = sȯ vŕš iti
vȯzvŕšati = vȯz vŕš ati
vȯzvŕšiti = vȯz vŕš iti
zavŕšati = za vŕš ati
zavŕšiti = za vŕš iti
dostaviti = do stav iti
dostaviti = do stav iti
izstaviti = iz stav iti
izstavjati = iz stav jati
nastaviti = na stav iti
nastaviti = na stav iti
nastaviti = na stav iti
nastaviti = na stav iti
odstaviti = od stav iti
ostaviti = o stav iti
ostaviti = o stav iti
postaviti pytańje = po stav iti
postaviti = po stav iti
postaviti = po stav iti
postavjati = po stav jati
postavjati = po stav jati
pozastaviti = poza stav iti
prědstaviti = prěd stav iti
prědstaviti = prěd stav iti
prěstaviti = prě stav iti
prěstaviti = prě stav iti
råzstaviti = raz stav iti
råzstaviti = raz stav iti
sȯstaviti = sȯ stav iti
sȯstavjati = sȯ stav jati
staviti na něčto =  stav iti
staviti pytańje =  stav iti
staviti =  stav iti
sųpostaviti = sųpo

In [143]:
g = morphemes.query("isv.str.contains('sil')")
manual_insert_g(g, "sil", dry_run=0)
set_base_verb(g, 'siliti')


obezsiliti = obez sil iti
obezsiljati = obez sil jati
osiliti = o sil iti
osiljati = o sil jati
siliti =  sil iti
usiliti = u sil iti
usiljati = u sil jati
usilovati = u sil ovati
vȯzsiliti = vȯz sil iti
vȯzsilovati = vȯz sil ovati


In [221]:
g = morphemes[morphemes.base_verb.str.contains("viniti")]
manual_insert_g(g, "vin", dry_run=0)
set_base_verb(g, 'viniti')

g = morphemes[morphemes.base_verb.str.contains("věda")]
manual_insert_g(g, "věd", dry_run=0)
set_base_verb(g, 'věděti')


izviniti = iz vin iti
izviniti = iz vin iti
izvinjati = iz vin jati
izvinjati = iz vin jati
obviniti za = ob vin iti
obvinjati za = ob vin jati
uneviniti = une vin iti
unevinjati = une vin jati
viniti za =  vin iti
dovědati = do věd ati
dověděti = do věd ěti
izpovědati = izpo věd ati
izpověděti = izpo věd ěti
povědati = po věd ati
pověděti = po věd ěti
prědpovědati = prědpo věd ati
prědpověděti = prědpo věd ěti


In [186]:
vc = morphemes.query("_prefix == ''").isv.apply(lambda x: bite_all_prefixes_off(x, '')).value_counts()
vc[vc > 10]

NameError: name 'bite_all_prefixes_off' is not defined

In [390]:
vc = morphemes.query("_prefix == ''").isv.str[:3].value_counts()
vc.head(22)

obr    22
str    18
pro    17
sta    15
pod    12
ima    11
zas    10
kon    10
vla     9
pos     9
obn     8
odo     8
prě     8
tra     8
byt     8
upo     8
kri     7
par     7
dos     7
vit     7
pra     7
kom     7
Name: isv, dtype: int64

In [193]:
def move_prefix(g, prefix):
    assert all(g._stem.str.startswith(prefix))
    assert all(g._prefix == '')
    morphemes.loc[g.index, '_stem'] = g._stem.str[len(prefix):]
    morphemes.loc[g.index, '_prefix'] = prefix


g = morphemes.query("isv.str.contains('^iz') and not base_verb.str.contains('^iz') and _prefix == ''")

move_prefix(g, 'iz')

morphemes.loc[g.index]

Unnamed: 0,id,isv,addition,partOfSpeech,en,genesis,pos,prefix,verb_stem,base_verb,...,left_stem_cand,right_stem_cand,derived_nouns,base_noun,_prefix,_stem,_suffix,_is_irregular,reconstructed,aspect


In [241]:
g = morphemes.query("isv.str.contains('^iz') and base_verb.str.contains('^iz') and _prefix == ''")
g.base_verb.unique()

array(['izčezati', 'izčrkati', 'izjasniti', 'izključati', 'izkusiti',
       'iznuriti', 'izobličati', 'izolovati', 'izopačati', 'izpråzdniti',
       'iztěkati', 'izučati', 'izvěstiti'], dtype=object)

In [265]:

g1 = morphemes.query("base_verb.str.contains('věst') and not base_verb.str.contains('cv')")
g1 #šć

Unnamed: 0,id,isv,addition,partOfSpeech,en,genesis,pos,prefix,verb_stem,base_verb,...,left_stem_cand,right_stem_cand,derived_nouns,base_noun,_prefix,_stem,_suffix,_is_irregular,reconstructed,aspect
4341,3688.0,izvěstiti,,v.tr. pf.,"inform, report",,verb,?,?,izvěstiti,...,izvěst,?ěstiti?,|izvěsťje|izvěstnik,,,izvěst,iti,0,,pf
4348,3692.0,izvěšćati,,v.tr. ipf.,"inform, report",,verb,?,?,izvěstiti,...,izvěšć,?ěšćati?,,,,izvěst,⒥ati,0,,ipf
8633,22225.0,obvěstiti,,v.tr. pf.,announce,,verb,?,?,obvěstiti,...,obvěst,?ěstiti?,,,,obvěst,iti,0,,pf
8634,22224.0,obvěšćati,,v.tr. ipf.,announce,,verb,?,?,obvěstiti,...,obvěšć,?ěšćati?,,,,obvěst,⒥ati,0,,ipf


In [267]:
g1 = morphemes.query("isv.str.contains('črk')")
manual_insert_g(g1, "črk", dry_run=0)
set_base_verb(g1, 'črkati')

g1 = morphemes.query("isv.str.contains('jasn')")
manual_insert_g(g1, "jasn", dry_run=0)
set_base_verb(g1, 'jasněti')

g1 = morphemes.query("isv.str.contains('ključ')")
manual_insert_g(g1, "ključ", dry_run=0)
set_base_verb(g1, 'ključiti')

g1 = morphemes.query("base_verb.str.contains('nuri')")
manual_insert_g(g1, "nur", dry_run=0)
set_base_verb(g1, 'nuriti')

g1 = morphemes.query("base_verb.str.contains('kus[ia]')")
manual_insert_g(g1, "kus", dry_run=0, last_cons={"s", "š"})
set_base_verb(g1, 'kusati')

g1 = morphemes.query("base_verb.str.contains('izobličati')")
manual_insert_g(g1, "oblik", last_cons={'k', 'č'}, dry_run=0)
set_base_verb(g1, 'obličati')

g1 = morphemes.query("base_verb.str.contains('opač')")
manual_insert_g(g1, "opak", last_cons={'k', 'č'}, dry_run=0)
set_base_verb(g1, 'opačiti')

g1 = morphemes.query("base_verb.str.contains('pråzd')") #izpråzdniti
manual_insert_g(g1, "pråzd", dry_run=0)
set_base_verb(g1, 'pråzdniti')

g1 = morphemes.query("base_verb.str.contains('těk')")  # tekti
manual_insert_g(g1, "těk", dry_run=0)
set_base_verb(g1, 'tekti')

g1 = morphemes.query("base_verb.str.contains('uč[ai]ti') and not base_verb.str.contains('lj?u')").query("isv not in ['bučati', 'čučati', 'mučati']")
manual_insert_g(g1, "uk", dry_run=0, last_cons={'k', 'č'})
set_base_verb(g1, 'učiti')

g1 = morphemes.query("base_verb.str.contains('věst') and not base_verb.str.contains('cv')")
manual_insert_g(g1, "věst", dry_run=0, last_cons={"ć", 't'})
set_base_verb(g1, 'izvěstiti')



izčrkati = iz črk ati
izčrknųti = iz črk nųti
očrkati = o črk ati
očrknųti = o črk nųti
podčrkati = pod črk ati
podčrknųti = pod črk nųti
izjasniti = iz jasn iti
izjasnjati = iz jasn jati
jasněti =  jasn ěti
objasniti = ob jasn iti
objasnjati = ob jasn jati
råzjasniti = raz jasn iti
råzjasnjati = raz jasn jati
izključati = iz ključ ati
izključati = iz ključ ati
izključiti = iz ključ iti
izključiti = iz ključ iti
odključati = od ključ ati
odključati = od ključ ati
odključiti = od ključ iti
odključiti = od ključ iti
podključati = pod ključ ati
podključati = pod ključ ati
podključiti = pod ključ iti
podključiti = pod ključ iti
prěključati = prě ključ ati
prěključiti = prě ključ iti
vključati = v ključ ati
vključati = v ključ ati
vključiti = v ključ iti
vključiti = v ključ iti
zaključati = za ključ ati
zaključati = za ključ ati
zaključati = za ključ ati
zaključiti = za ključ iti
zaključiti = za ključ iti
zaključiti = za ključ iti
iznuriti = iz nur iti
iznurjati = iz nur jati
zanuriti = za 

In [271]:
set_base_verb(morphemes.query("isv == 'tknųti'"), 'tykati')

In [272]:
morphemes.query("_stem == 'tk'")

Unnamed: 0,id,isv,addition,partOfSpeech,en,genesis,pos,prefix,verb_stem,base_verb,...,left_stem_cand,right_stem_cand,derived_nouns,base_noun,_prefix,_stem,_suffix,_is_irregular,reconstructed,aspect
2253,22119.0,dotknųti,,v.refl. pf.,touch,,verb,?,?,tykati,...,dotk,tknųti,,,¬do,tk,nųti,0,?tykati,pf
2254,481.0,dotknųti,,v.tr. pf.,touch,,verb,?,?,tykati,...,dotk,tknųti,,,¬do,tk,nųti,0,?tykati,pf
4309,26247.0,iztkati,,v.tr. pf.,weave,,verb,?,?,tkati,...,iztk,tkati,,,¬iz,tk,ati,0,(iz)+tъkati,pf
14133,34857.0,sȯtkati,,v.tr. pf.,weave,,verb,sȯ’,tkati,tkati,...,sȯtk,tkati,,,¬sȯ,tk,ati,0,(sȯ)+tъkati,pf
14248,22123.0,spotknųti,,v.refl. pf.,stumble,,verb,?,?,tykati,...,spotk,tknųti,,,¬s¬po,tk,nųti,0,?tykati,pf
15231,34856.0,tkati,,v.tr. ipf.,weave,,verb,?,?,tkati,...,tk,tkati,,,,tk,ati,0,tъkati,ipf
15232,22108.0,tknųti,,v.refl. pf.,"relate to, concern",,verb,?,?,tykati,...,tk,tknųti,,,,tk,nųti,0,tъknǫti,pf
15233,22099.0,tknųti,,v.tr. pf.,touch,,verb,?,?,tykati,...,tk,tknųti,,,,tk,nųti,0,tъknǫti,pf
15234,22101.0,tknųti,,v.tr. pf.,"stick, jab, poke, prod",,verb,?,?,tykati,...,tk,tknųti,,,,tk,nųti,0,tъknǫti,pf
17505,33093.0,zatknųti,,v.refl. pf.,be stuck,,verb,?,?,tykati,...,zatk,tknųti,,,¬za,tk,nųti,0,?tykati,pf


In [236]:
g

Unnamed: 0,id,isv,addition,partOfSpeech,en,genesis,pos,prefix,verb_stem,base_verb,...,left_stem_cand,right_stem_cand,derived_nouns,base_noun,_prefix,_stem,_suffix,_is_irregular,reconstructed,aspect
3944,3550.0,izčezati,,v.intr. ipf.,"disappear, vanish",,verb,?,?,izčezati,...,izčez,?čezati?,,,,izčez,ati,0,,ipf
3946,3557.0,izčeznųti,,v.intr. pf.,"disappear, vanish",,verb,?,?,izčezati,...,izčez,?čeznųti?,,,,izčez,nųti,0,,pf
3952,3253.0,izčrkati,,v.tr. ipf.,"delete, cross out",,verb,?,?,izčrkati,...,izčrk,?črkati?,,,,izčrk,ati,0,,ipf
3953,3271.0,izčrknųti,,v.tr. pf.,"delete, cross out",,verb,?,?,izčrkati,...,izčrk,?črknųti?,,,,izčrk,nųti,0,,pf
4016,4145.0,izjasniti,,v.tr. pf.,clarify,,verb,?,?,izjasniti,...,izjasn,?jasniti?,,,,izjasn,iti,0,,pf
4017,4144.0,izjasnjati,,v.tr. ipf.,clarify,,verb,?,?,izjasniti,...,izjasn,?jasnjati?,,,,izjasn,jati,0,,ipf
4032,2504.0,izključati,,v.tr. ipf.,"exclude, except",,verb,?,?,izključati,...,izključ,?ključati?,,,,izključ,ati,0,,ipf
4033,3918.0,izključati,,v.tr. ipf.,"turn off, switch off",,verb,?,?,izključati,...,izključ,?ključati?,,,,izključ,ati,0,,ipf
4038,482.0,izključiti,,v.tr. pf.,"exclude, except",,verb,?,?,izključati,...,izključ,?ključiti?,,,,izključ,iti,0,,pf
4039,3920.0,izključiti,,v.tr. pf.,"turn off, switch off",,verb,?,?,izključati,...,izključ,?ključiti?,,,,izključ,iti,0,,pf


In [227]:


for v in g.base_verb.str[2:].unique():
    if any(morphemes.isv == v) or any(morphemes.base_verb == v):
        print(v)
        print(morphemes[morphemes.isv == v].base_verb)
        stem = morphemes[morphemes.isv == v]._stem.values[0]
        print(stem)
        print(morphemes[morphemes.base_verb == v].shape)
        
        g1 = morphemes[morphemes.base_verb == "iz" + v]
        manual_insert_g(g1, stem, dry_run=0, last_cons={"h", "š"})
        set_base_verb(g1, v)


probovati
11961    probovati
Name: base_verb, dtype: object
probo
(3, 21)
izprobovati = iz probo vati
izprobovyvati = iz probo vyvati
seliti
13526    seliti
Name: base_verb, dtype: object
sel
(2, 21)
izseliti = iz sel iti
izseliti = iz sel iti
izseljati = iz sel jati
izseljati = iz sel jati
slušati
13925    slušati
Name: base_verb, dtype: object
sluh
(5, 21)
izslušati = izslušati sluh 
izslušivati = izslušivati sluh 


In [234]:
#        g1 = morphemes[morphemes.base_verb == "iz" + v]
manual_insert_g(g1, stem, dry_run=0, last_cons={"š"})


izslušati = iz sluh ⒥ati
izslušivati = iz sluh ⒥ivati


In [233]:
morphemes.loc[g1.index]

Unnamed: 0,id,isv,addition,partOfSpeech,en,genesis,pos,prefix,verb_stem,base_verb,...,left_stem_cand,right_stem_cand,derived_nouns,base_noun,_prefix,_stem,_suffix,_is_irregular,reconstructed,aspect
4265,36533.0,izslušati,,v.tr. pf.,"hear out, listen to",,verb,?,?,slušati,...,izsluš,slušati,,,izslušati,sluh,,0,,pf
4266,36532.0,izslušivati,,v.tr. ipf.,"hear out, listen to",,verb,?,?,slušati,...,izsluš,?lušivati?,,,izslušivati,sluh,,0,,ipf


In [400]:
morphemes.query("isv.str.contains('prěd')")

Unnamed: 0,id,isv,addition,partOfSpeech,en,genesis,pos,prefix,verb_stem,base_verb,...,left_stem_cand,right_stem_cand,derived_nouns,base_noun,_prefix,_stem,_suffix,_is_irregular,reconstructed,aspect
3638,13823.0,idti vprěd,#(ide; šel),v.ipf.,advance,,verb,?,?,idti,...,idti vprěd,?idti vprěd?,,,,jd,ti,0,,ipf
9428,29544.0,oprěděliti,,v.tr. pf.,"define, determine, specify",,verb,?,?,oprěděliti,...,oprěděl,děliti,,,,oprěděl,iti,0,,pf
9429,32606.0,oprěděljati,,v.tr. ipf.,"define, determine, specify",,verb,?,?,oprěděliti,...,oprěděl,?ěljati?,,,,oprěděl,jati,0,,ipf
11252,1189.0,prědati,(prěda),v.tr. pf.,"hand over, deliver, convey",,verb,?,?,dati,...,prěd,dati,,,¬prě,da,ti,0,perdati,pf
11253,3247.0,prědati,(prěda),v.tr. pf.,broadcast,,verb,?,?,dati,...,prěd,dati,,,¬prě,da,ti,0,perdati,pf
11254,35613.0,prědati,(prěda),v.tr. pf.,betray,,verb,?,?,dati,...,prěd,dati,,,¬prě,da,ti,0,perdati,pf
11256,1825.0,prědavati,,v.tr. ipf.,"hand over, deliver, convey",,verb,?,?,davati,...,prědav,davati,|prědavnik|prědavnica|prědavstvo,,¬prě,dav,ati,-1,(prě)+davati,ipf
11257,3246.0,prědavati,,v.tr. ipf.,broadcast,,verb,?,?,davati,...,prědav,davati,|prědavnik|prědavnica|prědavstvo,,¬prě,dav,ati,-1,(prě)+davati,ipf
11258,35612.0,prědavati,,v.tr. ipf.,betray,,verb,?,?,davati,...,prědav,davati,|prědavnik|prědavnica|prědavstvo,,¬prě,dav,ati,-1,(prě)+davati,ipf
11266,23904.0,prědčuvati,,v.tr. ipf.,"have a premonition, have an inkling",,verb,?,?,čuvati,...,prědčuv,čuvati,,,¬prěd,ču,vati,0,?čuti,ipf


In [276]:
morphemes.query("isv.str.contains('uprě')")

Unnamed: 0,id,isv,addition,partOfSpeech,en,genesis,pos,prefix,verb_stem,base_verb,...,left_stem_cand,right_stem_cand,derived_nouns,base_noun,_prefix,_stem,_suffix,_is_irregular,reconstructed,aspect
11353,3128.0,prěduprěditi,,v.tr. pf.,"warn, caution, admonish",,verb,?,?,prěduprěditi,...,prěduprěd,?iti?,,,,prěduprěd,iti,0,,pf
11354,21047.0,prěduprěditi,,v.tr. pf.,notify in advance,,verb,?,?,prěduprěditi,...,prěduprěd,?iti?,,,,prěduprěd,iti,0,,pf
11355,21048.0,prěduprěditi,,v.tr. pf.,prevent,,verb,?,?,prěduprěditi,...,prěduprěd,?iti?,,,,prěduprěd,iti,0,,pf
11356,1884.0,prěduprěđati,,v.tr. ipf.,"warn, caution, admonish",,verb,?,?,prěduprěditi,...,prěduprěđ,?đati?,,,,prěduprěd,⒥ati,0,,ipf
11357,21045.0,prěduprěđati,,v.tr. ipf.,notify in advance,,verb,?,?,prěduprěditi,...,prěduprěđ,?đati?,,,,prěduprěd,⒥ati,0,,ipf
11358,21046.0,prěduprěđati,,v.tr. ipf.,prevent,,verb,?,?,prěduprěditi,...,prěduprěđ,?đati?,,,,prěduprěd,⒥ati,0,,ipf


In [280]:
morphemes.query("isv.str.contains('^pr[ěoi]') and not base_verb.str.contains('^pr[ěoi]') and _prefix == ''")


Unnamed: 0,id,isv,addition,partOfSpeech,en,genesis,pos,prefix,verb_stem,base_verb,...,left_stem_cand,right_stem_cand,derived_nouns,base_noun,_prefix,_stem,_suffix,_is_irregular,reconstructed,aspect
11405,35802.0,prěkladati,,v.tr. ipf.,translate,,verb,?,?,,...,prěklad,?kladati?,,prěklad,,prěklad,ati,0,perkladati,ipf
11577,21053.0,prěti,(pre),v.ipf.,"push, press",,verb,?,?,poriti,...,pr,ti,,,,pr,ěti,0,?poriti,ipf
11706,5563.0,prijati,,v.tr. ipf.,"please, be pleasant",,verb,?,?,†jati,...,pri,?jati?,,,,prija,ti,0,prijati,ipf
11707,5564.0,prijati,,v.tr. ipf.,"favour, favor, be beneficial for",,verb,?,?,†jati,...,pri,?jati?,,,,prija,ti,0,prijati,ipf
11734,20079.0,prikladati,,v.tr. ipf.,"attach, append, apply, affix",,verb,?,?,,...,priklad,?kladati?,,priklad,,priklad,ati,0,prikladati,ipf
11859,23860.0,prisųtstvovati pri,(+6),v.intr. ipf.,"attend, be present at",,verb,?,?,byti,...,prisųtstvovati pri,?tstvovati pri?,,,,,,1,?byti,ipf
11999,35326.0,profesionalizovati,,v.tr. ipf./pf.,"professionalise, professionalize",I,verb,?,?,,...,profesionaliz,?fesionalizovati?,|profesionalizacija,profesional,,profesional,izovati,0,?uměti,ipf/pf
12017,11564.0,programovati,,v.tr. ipf.,program,I,verb,?,?,,...,program,?gramovati?,,programa,,program,ovati,0,?uměti,ipf
12053,4361.0,projektovati,,v.tr. ipf.,design,I,verb,?,?,,...,projekt,?jektovati?,,projekt,,projekt,ovati,0,?uměti,ipf
12189,5479.0,protivdějati,,v.intr. ipf.,counteract,,verb,?,?,dějati,...,protivdě,?tivdějati?,,,,,,1,?dějati,ipf


In [388]:
g = morphemes.query("isv.str.contains('^pr[ěoi]') and _prefix == ''")
g

Unnamed: 0,id,isv,addition,partOfSpeech,en,genesis,pos,prefix,verb_stem,base_verb,...,left_stem_cand,right_stem_cand,derived_nouns,base_noun,_prefix,_stem,_suffix,_is_irregular,reconstructed,aspect
11240,20619.0,prěčiti,,v.tr. ipf.,"contradict, deny, negate",,verb,?,?,prěčiti,...,prěč,?čiti?,|prěčnik,,,prěč,iti,0,perčiti,ipf
11353,3128.0,prěduprěditi,,v.tr. pf.,"warn, caution, admonish",,verb,?,?,prěduprěditi,...,prěduprěd,?iti?,,,,prěduprěd,iti,0,,pf
11354,21047.0,prěduprěditi,,v.tr. pf.,notify in advance,,verb,?,?,prěduprěditi,...,prěduprěd,?iti?,,,,prěduprěd,iti,0,,pf
11355,21048.0,prěduprěditi,,v.tr. pf.,prevent,,verb,?,?,prěduprěditi,...,prěduprěd,?iti?,,,,prěduprěd,iti,0,,pf
11356,1884.0,prěduprěđati,,v.tr. ipf.,"warn, caution, admonish",,verb,?,?,prěduprěditi,...,prěduprěđ,?đati?,,,,prěduprěd,⒥ati,0,,ipf
11357,21045.0,prěduprěđati,,v.tr. ipf.,notify in advance,,verb,?,?,prěduprěditi,...,prěduprěđ,?đati?,,,,prěduprěd,⒥ati,0,,ipf
11358,21046.0,prěduprěđati,,v.tr. ipf.,prevent,,verb,?,?,prěduprěditi,...,prěduprěđ,?đati?,,,,prěduprěd,⒥ati,0,,ipf
11577,21053.0,prěti,(pre),v.ipf.,"push, press",,verb,?,?,poriti,...,pr,ti,,,,pr,ěti,0,?poriti,ipf
11698,11436.0,prijateliti,,v.refl. ipf.,make friends,,verb,?,?,prijateliti,...,prijatel,?jateliti?,,,,prijatel,iti,0,,ipf
11706,5563.0,prijati,,v.tr. ipf.,"please, be pleasant",,verb,?,?,†jati,...,pri,?jati?,,,,prija,ti,0,prijati,ipf


In [389]:
vc = g.base_verb.value_counts()
vc[vc > 1]

prěduprěditi    6
prostiti        4
                2
†jati           2
probovati       2
Name: base_verb, dtype: int64

In [399]:
g1 = morphemes.query("base_verb.str.contains('prašć')")
manual_insert_g(g1, "prast", last_cons={'ć', 't'}, dry_run=0)
set_base_verb(g1, 'prostiti')

oprašćati = o prast ⒥ati
prašćati =  prast ⒥ati
prašćati =  prast ⒥ati


In [387]:

g1 = morphemes.query("base_verb.str.contains('vŕ[tć]')")
manual_insert_g(g1, "vŕt", last_cons={'ć', 't'}, dry_run=0)
set_base_verb(g1, 'vŕtěti')

#morphemes.loc[g1.index, 'base_noun'] = 'inako'

#'vŕtěti'
g1 = morphemes.query("isv.str.contains('zemj')")
morphemes.loc[
    g1.index,
    'base_noun'
] = 'zemja'
manual_insert_g(g1, "zemj", dry_run=0)
set_base_verb(g1, '†zemjiti')


morphemes.loc[
    morphemes.query("isv == 'travmatizovati'").index,
    'base_noun'
] = 'travma'

g1 = morphemes.query("isv.str.contains('nik') and isv != 'komunikovati'")
manual_insert_g(g1, "nik", dry_run=0)
set_base_verb(g1, '†niknųti')

g1 = morphemes.query("isv.str.contains('hytr')")
manual_insert_g(g1, "hytr", dry_run=0)
set_base_verb(g1, '†hytriti')

g1 = morphemes.query("isv.str.contains('trav') and isv != 'travmatizovati'")
manual_insert_g(g1, "trav", dry_run=0)
set_base_verb(g1, '†traviti')


provŕćati = pro vŕt ⒥ati
provŕtiti = pro vŕt iti
vŕtěti =  vŕt ěti
vŕtěti =  vŕt ěti
vŕtěti =  vŕt ěti
zavŕtěti = za vŕt ěti
prizemjati = pri zemj ati
prizemjiti = pri zemj iti
izniknųti = iz nik nųti
pronikati = pro nik ati
proniknųti = pro nik nųti
vnikati = v nik ati
vniknųti = v nik nųti
vȯznikati = vȯz nik ati
vȯznikati = vȯz nik ati
vȯzniknųti = vȯz nik nųti
vȯzniknųti = vȯz nik nųti
prěhytriti = prě hytr iti
prěhytrjati = prě hytr jati
otraviti = o trav iti
otravjati = o trav jati
prětraviti = prě trav iti
prětravjati = prě trav jati


In [358]:

# morphemes.query("isv.str.contains('vyk(a|nų)ti') and not isv == 'vykati' ")




Unnamed: 0,id,isv,addition,partOfSpeech,en,genesis,pos,prefix,verb_stem,base_verb,...,left_stem_cand,right_stem_cand,derived_nouns,base_noun,_prefix,_stem,_suffix,_is_irregular,reconstructed,aspect
4131,26209.0,izniknųti,,v.intr. pf.,"disappear, vanish",,verb,?,?,†niknųti,...,iznik,?niknųti?,,,iz,nik,nųti,0,?niknǫti,pf
5168,1277.0,komunikovati,,v.intr. ipf.,communicate,I,verb,?,?,komunikovati,...,komunik,komunikovati,|komunikacija,,,komunik,ovati,0,,ipf
12085,4454.0,pronikati,,v.intr. ipf.,"pervade, penetrate",,verb,?,?,pronikati,...,pronik,?nikati?,,,,pronik,ati,0,,ipf
12087,4455.0,proniknųti,,v.intr. pf.,"pervade, penetrate",,verb,?,?,pronikati,...,pronik,?niknųti?,,,,pronik,nųti,0,,pf
16542,20795.0,vnikati,,v.intr. ipf.,"arise, emerge",,verb,?,?,vnikati,...,vnik,?nikati?,,,,vnik,ati,0,,ipf
16543,20798.0,vniknųti,,v.intr. pf.,"arise, emerge",,verb,?,?,vnikati,...,vnik,?niknųti?,,,,vnik,nųti,0,,pf
16732,15156.0,vȯznikati,,v.intr. ipf.,originate,,verb,vȯz’,nikati,vȯznikati,...,vȯznik,?nikati?,,,vȯz,nik,ati,0,,ipf
16733,20801.0,vȯznikati,,v.intr. ipf.,"arise, emerge",,verb,vȯz’,nikati,vȯznikati,...,vȯznik,?nikati?,,,vȯz,nik,ati,0,,ipf
16735,20803.0,vȯzniknųti,,v.intr. pf.,arise,,verb,vȯz’,niknųti,vȯznikati,...,vȯznik,?niknųti?,,,vȯz,nik,nųti,0,,pf
16736,20804.0,vȯzniknųti,,v.intr. pf.,emerge,,verb,vȯz’,niknųti,vȯznikati,...,vȯznik,?niknųti?,,,vȯz,nik,nųti,0,,pf


In [378]:

g1 = morphemes.query("base_verb.str.contains('mě[tć]iti')")
move_prefix(g1, 'pri')
morphemes.loc[g1.index, 'reconstructed'] = "(" + morphemes.loc[g1.index, '_prefix'] + ")+" + 'mětiti'


  morphemes.loc[g.index, '_stem'] = g._stem.str[len(prefix):]
  morphemes.loc[g1.index, 'reconstructed'] = "(" + morphemes.loc[g1.index, '_prefix'] + ")+" + 'mětiti'


In [386]:
g1 = morphemes.query("isv.str.contains('inač')")

manual_insert_g(g1, "inak", last_cons={'č', 'k'}, dry_run=0)
set_base_verb(g1, '†inačiti')
morphemes.loc[g1.index, 'base_noun'] = 'inako'
morphemes.loc[g1.index]

prěinačati = prě inak ⒥ati
prěinačiti = prě inak ⒥iti


Unnamed: 0,id,isv,addition,partOfSpeech,en,genesis,pos,prefix,verb_stem,base_verb,...,left_stem_cand,right_stem_cand,derived_nouns,base_noun,_prefix,_stem,_suffix,_is_irregular,reconstructed,aspect
11395,18420.0,prěinačati,,v.tr. ipf.,alter,,verb,?,?,†inačiti,...,prěinač,?inačati?,,inako,prě,inak,⒥ati,0,,ipf
11398,18423.0,prěinačiti,,v.tr. pf.,alter,,verb,?,?,†inačiti,...,prěinač,?inačiti?,,inako,prě,inak,⒥iti,0,,pf


In [353]:
for v1 in vc[vc > 1].index:
    v = v1[3:]
    if v == '': continue
    if any(morphemes.isv == v) or any(morphemes.base_verb == v):
        print(v, v1)
        stem_ = None
        for g_cand in [morphemes[morphemes.isv == v], morphemes[morphemes.base_verb == v]]:
            if len(g_cand):
                stem_ = g_cand._stem.unique()
                if len(stem_) == 1:
                    stem = stem_[0]
                # print(g_cand)
        
        g1 = morphemes[morphemes.base_verb == v1]
        manual_insert_g(g1, stem, dry_run=0, last_cons={"š"})
        set_base_verb(g1, v)


větriti provětriti
provětriti = pro větr iti
provětrjati = pro větr jati
vabiti privabiti
privabiti = pri vab iti
privabjati = pri vab jati
sęgati prisęgati
prisęgati = pri sęg ati
prisęgnųti = pri sęg nųti
vŕgati provŕgati
provŕgati = pro vŕg ati
provŕgnųti = pro vŕg nųti
stigati prěstigati
prěstigati = prě stig ati
prěstignųti = prě stig nųti
slušati prěslušati
prěslušati = prě sluh ⒥ati
prěslušivati = prě sluh ⒥ivati
variti prěvariti
prěvariti = prě var iti
prěvarjati = prě var jati


In [342]:
g1 = morphemes.query("base_verb.str.contains('brěgti')")
manual_insert_g(g1, "brěg", dry_run=0)
set_base_verb(g1, 'brěgti')
morphemes.loc[g1.index, 'reconstructed'] = "(" + morphemes.loc[g1.index, '_prefix'] + ")+" + 'berťi'


prěnebrěgati = prěne brěg ati
prěnebrěgti = prěne brěg ti


  morphemes.loc[g1.index, 'reconstructed'] = "(" + morphemes.loc[g1.index, '_prefix'] + ")+" + 'berťi'


In [295]:
BV = ['prisjediniti', 'prostiti', 'privykati', 'prěseliti', 'prilųčati']
morphemes.query("base_verb in @BV").sort_values(by='base_verb')


Unnamed: 0,id,isv,addition,partOfSpeech,en,genesis,pos,prefix,verb_stem,base_verb,...,left_stem_cand,right_stem_cand,derived_nouns,base_noun,_prefix,_stem,_suffix,_is_irregular,reconstructed,aspect
11747,36903.0,prilųčati,,v.refl. ipf.,"join, connect",,verb,?,?,prilųčati,...,prilųč,?lųčati?,,,,prilųč,ati,0,,ipf
11748,36897.0,prilųčati,,v.tr. ipf.,"connect, attach",,verb,?,?,prilųčati,...,prilųč,?lųčati?,,,,prilųč,ati,0,,ipf
11749,36900.0,prilųčiti,,v.refl. pf.,"join, connect",,verb,?,?,prilųčati,...,prilųč,?lųčiti?,,,,prilųč,iti,0,,pf
11750,36894.0,prilųčiti,,v.tr. pf.,"connect, attach",,verb,?,?,prilųčati,...,prilųč,?lųčiti?,,,,prilųč,iti,0,,pf
11822,36896.0,prisjediniti,,v.tr. pf.,"connect, attach",,verb,?,?,prisjediniti,...,prisjedin,sjediniti,,,,prisjedin,iti,0,,pf
11821,36902.0,prisjediniti,,v.refl. pf.,"join, connect",,verb,?,?,prisjediniti,...,prisjedin,sjediniti,,,,prisjedin,iti,0,,pf
11823,36905.0,prisjedinjati,,v.refl. ipf.,"join, connect",,verb,?,?,prisjediniti,...,prisjedin,sjedinjati,,,,prisjedin,jati,0,,ipf
11824,36899.0,prisjedinjati,,v.tr. ipf.,"connect, attach",,verb,?,?,prisjediniti,...,prisjedin,sjedinjati,,,,prisjedin,jati,0,,ipf
11929,351.0,privyknųti,,v.tr. pf.,"accustom, inure",,verb,?,?,privykati,...,privyk,?knųti?,,,,privyk,nųti,0,privyknǫti,pf
11928,11520.0,privyknųti,,v.refl. pf.,get used (to),,verb,?,?,privykati,...,privyk,?knųti?,,,,privyk,nųti,0,privyknǫti,pf


In [320]:
g1 = morphemes.query("isv.str.contains('klast') and not isv.str.contains(' ')")
g1

Unnamed: 0,id,isv,addition,partOfSpeech,en,genesis,pos,prefix,verb_stem,base_verb,...,left_stem_cand,right_stem_cand,derived_nouns,base_noun,_prefix,_stem,_suffix,_is_irregular,reconstructed,aspect
4925,901.0,klasti,(klade),v.tr. ipf.,"put, lay",,verb,?,?,klasti,...,klasti,klasti,,,,klas,ti,0,klasti,ipf


In [327]:
g1 = morphemes.query("base_verb.str.contains('lųč[ai]ti')").sort_values(by='base_verb')
manual_insert_g(g1, "lųč", dry_run=0)
set_base_verb(g1, 'lųčiti')
morphemes.loc[g1.index, 'reconstructed'] = "(" + morphemes.loc[g1.index, '_prefix'] + ")+" + 'lǫčiti'

prilųčati = pri lųč ati
prilųčati = pri lųč ati
prilųčiti = pri lųč iti
prilųčiti = pri lųč iti
råzlųčati = raz lųč ati
råzlųčiti = raz lųč iti
slųčati = s lųč ati
slųčiti = s lųč iti


  morphemes.loc[g1.index, 'reconstructed'] = "(" + g1._prefix + ")+" + 'lǫčiti'


In [332]:
morphemes.loc[g1.index]

Unnamed: 0,id,isv,addition,partOfSpeech,en,genesis,pos,prefix,verb_stem,base_verb,...,left_stem_cand,right_stem_cand,derived_nouns,base_noun,_prefix,_stem,_suffix,_is_irregular,reconstructed,aspect
11747,36903.0,prilųčati,,v.refl. ipf.,"join, connect",,verb,?,?,lųčiti,...,prilųč,?lųčati?,,,pri,lųč,ati,0,(pri)+lǫčiti,ipf
11748,36897.0,prilųčati,,v.tr. ipf.,"connect, attach",,verb,?,?,lųčiti,...,prilųč,?lųčati?,,,pri,lųč,ati,0,(pri)+lǫčiti,ipf
11749,36900.0,prilųčiti,,v.refl. pf.,"join, connect",,verb,?,?,lųčiti,...,prilųč,?lųčiti?,,,pri,lųč,iti,0,(pri)+lǫčiti,pf
11750,36894.0,prilųčiti,,v.tr. pf.,"connect, attach",,verb,?,?,lųčiti,...,prilųč,?lųčiti?,,,pri,lųč,iti,0,(pri)+lǫčiti,pf
12676,20108.0,råzlųčati,,v.tr. ipf.,separate,,verb,råz’,lųčati,lųčiti,...,råzlųč,?lųčati?,|råzlųčnik,,raz,lųč,ati,0,(raz)+lǫčiti,ipf
12678,20109.0,råzlųčiti,,v.tr. pf.,separate,,verb,råz’,lųčiti,lųčiti,...,råzlųč,?lųčiti?,|råzlųčnik,,raz,lųč,iti,0,(raz)+lǫčiti,pf
13916,20162.0,slųčati,,v.tr. ipf.,"join, combine, couple",,verb,?,?,lųčiti,...,slųč,?lųčati?,,,s,lųč,ati,0,(s)+lǫčiti,ipf
13918,20166.0,slųčiti,,v.tr. pf.,"join, combine, couple",,verb,?,?,lųčiti,...,slųč,?lųčiti?,,,s,lųč,iti,0,(s)+lǫčiti,pf


In [333]:


g1 = morphemes.query("isv.str.contains('vyk(a|nų)ti') and not isv == 'vykati' ")
manual_insert_g(g1, "vyk", dry_run=0)
set_base_verb(g1, 'vyknųti')

g1 = morphemes.query("isv.str.contains('jedin')")
manual_insert_g(g1, "jedin", dry_run=0)
set_base_verb(g1, 'jediniti')

g1 = morphemes.query("isv.str.contains('sel') and not isv.str.contains('vesel')")
manual_insert_g(g1, "sel", dry_run=0)
set_base_verb(g1, 'seliti')

g1 = morphemes.query("isv.str.contains('klad') and not isv.str.contains(' ')")
manual_insert_g(g1, "klad", dry_run=0)
set_base_verb(g1, 'klasti')

set_base_verb(
    morphemes.query("isv.str.contains('projekt') "), 
    'projektovati'
)

set_base_verb(
    morphemes.query("isv.str.contains('sel') and isv.str.contains('vesel')"),
    'veseliti'
)


odvykati = od vyk ati
odvykati = od vyk ati
odvyknųti = od vyk nųti
odvyknųti = od vyk nųti
privykati = pri vyk ati
privykati = pri vyk ati
privyknųti = pri vyk nųti
privyknųti = pri vyk nųti
objediniti = ob jedin iti
objedinjati = ob jedin jati
prisjediniti = pris jedin iti
prisjediniti = pris jedin iti
prisjedinjati = pris jedin jati
prisjedinjati = pris jedin jati
råzjediniti = raz jedin iti
råzjedinjati = raz jedin jati
sjediniti = s jedin iti
sjediniti = s jedin iti
sjediniti = s jedin iti
sjedinjati = s jedin jati
sjedinjati = s jedin jati
sjedinjati = s jedin jati
ujediniti = u jedin iti
ujedinjati = u jedin jati
izseliti = iz sel iti
izseliti = iz sel iti
izseljati = iz sel jati
izseljati = iz sel jati
naseliti = na sel iti
naseljati = na sel jati
oseliti = o sel iti
prěseliti = prě sel iti
prěseliti = prě sel iti
prěseljati = prě sel jati
prěseljati = prě sel jati
seliti =  sel iti
dokladati = do klad ati
izkladati = iz klad ati
nakladati = na klad ati
nakladati = na klad ati


  g1 = morphemes.query("isv.str.contains('vyk(a|nų)ti') and not isv == 'vykati' ")


In [324]:
morphemes.query("isv.str.contains('sel') and isv.str.contains('vesel')")

Unnamed: 0,id,isv,addition,partOfSpeech,en,genesis,pos,prefix,verb_stem,base_verb,...,left_stem_cand,right_stem_cand,derived_nouns,base_noun,_prefix,_stem,_suffix,_is_irregular,reconstructed,aspect
11030,11074.0,poveseliti,,v.refl. pf.,"enjoy, have fun",,verb,?,?,veseliti,...,povesel,veseliti,,,¬po,vesel,iti,0,,pf
12982,30580.0,råzveseliti,,v.tr. pf.,"amuse, cheer up",,verb,råz’,veseliti,råzveseliti,...,råzvesel,veseliti,,,¬råz,vesel,iti,0,,pf
12983,30581.0,råzveseljati,,v.tr. ipf.,"amuse, cheer up",,verb,råz’,veseljati,råzveseliti,...,råzvesel,?eseljati?,,,¬råz,vesel,jati,0,,ipf
16294,3583.0,veseliti,,v.refl. ipf.,"#enjoy, have fun",,verb,?,?,veseliti,...,vesel,?eseliti?,|veseľje|veselosť,,,vesel,iti,0,,ipf
16295,13581.0,veseliti,,v.tr. ipf.,#liven up,,verb,?,?,veseliti,...,vesel,?eseliti?,|veseľje|veselosť,,,vesel,iti,0,,ipf


In [323]:
#BV = morphemes.query("isv.str.contains('kla(d|st|d)')").base_verb.unique()

#morphemes.query("base_verb in @BV and base_verb != ''")

morphemes.query("isv.str.contains('projekt') ")

Unnamed: 0,id,isv,addition,partOfSpeech,en,genesis,pos,prefix,verb_stem,base_verb,...,left_stem_cand,right_stem_cand,derived_nouns,base_noun,_prefix,_stem,_suffix,_is_irregular,reconstructed,aspect
12053,4361.0,projektovati,,v.tr. ipf.,design,I,verb,?,?,,...,projekt,?jektovati?,,projekt,,projekt,ovati,0,?uměti,ipf
17406,20833.0,zaprojektovati,,v.tr. pf.,design,I,verb,?,?,projektovati,...,zaprojekt,projektovati,,,¬za,projekt,ovati,0,,pf


In [285]:
g[g.base_verb == '']

Unnamed: 0,id,isv,addition,partOfSpeech,en,genesis,pos,prefix,verb_stem,base_verb,...,left_stem_cand,right_stem_cand,derived_nouns,base_noun,_prefix,_stem,_suffix,_is_irregular,reconstructed,aspect
11405,35802.0,prěkladati,,v.tr. ipf.,translate,,verb,?,?,,...,prěklad,?kladati?,,prěklad,,prěklad,ati,0,perkladati,ipf
11734,20079.0,prikladati,,v.tr. ipf.,"attach, append, apply, affix",,verb,?,?,,...,priklad,?kladati?,,priklad,,priklad,ati,0,prikladati,ipf
11999,35326.0,profesionalizovati,,v.tr. ipf./pf.,"professionalise, professionalize",I,verb,?,?,,...,profesionaliz,?fesionalizovati?,|profesionalizacija,profesional,,profesional,izovati,0,?uměti,ipf/pf
12017,11564.0,programovati,,v.tr. ipf.,program,I,verb,?,?,,...,program,?gramovati?,,programa,,program,ovati,0,?uměti,ipf
12053,4361.0,projektovati,,v.tr. ipf.,design,I,verb,?,?,,...,projekt,?jektovati?,,projekt,,projekt,ovati,0,?uměti,ipf


In [279]:


for v1 in g.base_verb.unique():
    v = v1[3:]
    if any(morphemes.isv == v) or any(morphemes.base_verb == v):
        print(v, v1)
        print(morphemes[morphemes.isv == v].base_verb)
        #stem = morphemes[morphemes.isv == v]._stem.values[0]
        #print(stem)
        print(morphemes[morphemes.base_verb == v].shape)
        
        #g1 = morphemes[morphemes.base_verb == "iz" + v]
        #manual_insert_g(g1, stem, dry_run=0, last_cons={"h", "š"})
        #set_base_verb(g1, v)


 
Series([], Name: base_verb, dtype: object)
(123, 21)
seliti prěseliti
13526    seliti
Name: base_verb, dtype: object
(6, 21)
slušati prěslušati
13925    slušati
Name: base_verb, dtype: object
(7, 21)
stigati prěstigati
14443    stigati
14444    stigati
Name: base_verb, dtype: object
(4, 21)
variti prěvariti
16110    vrěti
Name: base_verb, dtype: object
(0, 21)
sęgati prisęgati
13481    sęgati
Name: base_verb, dtype: object
(2, 21)
sjediniti prisjediniti
13679    sjediniti
13680    sjediniti
13681    sjediniti
Name: base_verb, dtype: object
(6, 21)
vabiti privabiti
16060    vabiti
Name: base_verb, dtype: object
(2, 21)
vykati privykati
17010    kati
Name: base_verb, dtype: object
(0, 21)
větriti provětriti
16326    
Name: base_verb, dtype: object
(0, 21)
vŕgati provŕgati
16855    vŕgati
Name: base_verb, dtype: object
(2, 21)


In [585]:
morphemes.query("_prefix == '' and isv.str.contains('^(ob|pod|za|od|u|do)')").sort_values(by='base_verb')

  morphemes.query("_prefix == '' and isv.str.contains('^(ob|pod|za|od|u|do)')").sort_values(by='base_verb')


Unnamed: 0,id,isv,addition,partOfSpeech,en,genesis,pos,prefix,verb_stem,base_verb,...,left_stem_cand,right_stem_cand,derived_nouns,base_noun,_prefix,_stem,_suffix,_is_irregular,reconstructed,aspect
17544,31583.0,zavěćati,,v.tr. ipf./pf.,"bequeath, leave by will",,verb,?,?,,...,zavěć,?ěćati?,,zavět,,zavět,⒥ati,0,?uměti,ipf/pf
8415,5740.0,obmanųti,,v.tr. pf.,"con, cheat, delude, deceive, hoodwink",,verb,?,?,,...,obma,?manųti?,,obman,,obman,ųti,0,?uměti,pf
8419,18014.0,obmanyvati,,v.tr. ipf.,fool,,verb,?,?,,...,obman,?manyvati?,|obmannik,obman,,obman,yvati,0,?uměti,ipf
8510,22681.0,obråtiti,(+3),v.refl. pf.,"address, appeal to",,verb,?,?,,...,obråt,?råtiti?,,obråt,,obråt,iti,0,ob(v)ortiti,pf
8511,22675.0,obråtiti,,v.refl. pf.,"turn (intr.), turn around",,verb,?,?,,...,obråt,?råtiti?,,obråt,,obråt,iti,0,ob(v)ortiti,pf
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17110,14041.0,začinati,,v.tr. ipf.,"begin (tr.), start (tr.), embark",,verb,?,?,začinati,...,začin,?činati?,,,,začin,ati,0,?začęti,ipf
8443,28845.0,obnažiti,,v.refl. pf.,strip,,verb,?,?,†nažiti,...,obnaž,žiti,,,,obnaž,iti,0,,pf
8444,5320.0,obnažiti,,v.tr. pf.,"denude, expose",,verb,?,?,†nažiti,...,obnaž,žiti,,,,obnaž,iti,0,,pf
8441,5321.0,obnažati,,v.refl. ipf.,strip,,verb,?,?,†nažiti,...,obnaž,?žati?,,,,obnaž,ati,0,,ipf


In [None]:
#začinati

In [584]:
morphemes.query("_prefix == '' and isv.str.contains('^(pod|do)')").sort_values(by='base_verb')

  morphemes.query("_prefix == '' and isv.str.contains('^(pod|do)')").sort_values(by='base_verb')


Unnamed: 0,id,isv,addition,partOfSpeech,en,genesis,pos,prefix,verb_stem,base_verb,...,left_stem_cand,right_stem_cand,derived_nouns,base_noun,_prefix,_stem,_suffix,_is_irregular,reconstructed,aspect
2085,35297.0,dokumentovati,,v.tr. ipf./pf.,document,I,verb,?,?,,...,dokument,?kumentovati?,|dokumentacija,dokument,,dokument,ovati,0,?uměti,ipf/pf
1988,25273.0,dobrěti,,v.intr. ipf.,become good,,verb,?,?,dobrěti,...,dobr,?brěti?,|dobrosť,dobry,,dobr,ěti,0,?uměti,ipf
2062,1181.0,dojiti,,v.tr. ipf.,milk,,verb,?,?,dojiti,...,do,?jiti?,,,,do,jiti,0,dojiti,ipf
2187,7215.0,doråzuměvati,,v.refl. ipf.,communicate,,verb,?,?,doråzuměvati,...,doråzuměv,?měvati?,,,,doråzum,ěvati,0,,ipf
2207,4208.0,doskonaliti,,v.tr. ipf.,perfect,,verb,?,?,doskonaliti,...,doskonal,?konaliti?,|doskonalosť,,,doskonal,iti,0,,ipf
10421,18268.0,podobati,,v.refl. ipf.,"please, be liked by",,verb,?,?,podobati,...,podob,?ati?,,podoba,,podob,ati,0,,ipf
10473,5612.0,podstrěkati,(podstrěče),v.tr. ipf.,"incite, instigate, abet, stir up, rouse",,verb,?,?,podstrěkati,...,podstrěk,?trěkati?,,,,podstrěk,ati,0,,ipf
10474,4494.0,podstrěknųti,,v.tr. ipf.,"incite, instigate, abet, stir up, rouse",,verb,?,?,podstrěkati,...,podstrěk,?trěknųti?,,,,podstrěk,nųti,0,,ipf


In [404]:
g

Unnamed: 0,id,isv,addition,partOfSpeech,en,genesis,pos,prefix,verb_stem,base_verb,...,left_stem_cand,right_stem_cand,derived_nouns,base_noun,_prefix,_stem,_suffix,_is_irregular,reconstructed,aspect
10421,18268.0,podobati,,v.refl. ipf.,"please, be liked by",,verb,?,?,podobati,...,podob,?ati?,,podoba,,podob,ati,0,,ipf
10454,22358.0,podręditi,,v.tr. pf.,subordinate,,verb,?,?,podręditi,...,podręd,?ręditi?,,,,podręd,iti,0,podъręditi,pf
10456,22359.0,podręđati,,v.tr. ipf.,subordinate,,verb,?,?,podręditi,...,podręđ,?ręđati?,,,,podręd,⒥ati,0,?podъręditi,ipf
10463,10769.0,podslušati,,v.tr. pf.,"eavesdrop, wiretap",,verb,?,?,podslušati,...,podsluš,slušati,,,,podsluš,ati,0,,pf
10466,22778.0,podslušivati,,v.tr. ipf.,"eavesdrop, wiretap",,verb,?,?,podslušati,...,podsluš,?lušivati?,,,,podsluš,ivati,0,,ipf
10467,5389.0,podsměhati,,v.refl. ipf.,jeer,,verb,?,?,podsměhati,...,podsměh,?měhati?,,,,podsměh,ati,0,,ipf
10468,5390.0,podsměhnųti,,v.refl. pf.,jeer,,verb,?,?,podsměhati,...,podsměh,?měhnųti?,,,,podsměh,nųti,0,,pf
10473,5612.0,podstrěkati,(podstrěče),v.tr. ipf.,"incite, instigate, abet, stir up, rouse",,verb,?,?,podstrěkati,...,podstrěk,?trěkati?,,,,podstrěk,ati,0,,ipf
10474,4494.0,podstrěknųti,,v.tr. ipf.,"incite, instigate, abet, stir up, rouse",,verb,?,?,podstrěkati,...,podstrěk,?trěknųti?,,,,podstrěk,nųti,0,,ipf
10505,22584.0,podvŕgati,,v.tr. ipf.,"expose to, subject to",,verb,?,?,podvŕgati,...,podvŕg,vŕgati,,,,podvŕg,ati,0,,ipf


In [409]:
move_prefix(morphemes.query("_stem == 'podpor'"), "pod")


  morphemes.loc[g.index, '_stem'] = g._stem.str[len(prefix):]


In [619]:
CORRECT_VERBS = [
    'organizovati', 'okupovati', 'optimizovati', 'orientovati', 'uzurpovati',
    'uměti', 'ohati', 'učiti', 'užasati', 'ostriti', 'orųdovati', 'orųžiti', 'osnovati', 'oriti'
]

In [618]:
g = morphemes.query(
    "_prefix == '' and isv.str.contains('^(u|o)') and not isv.str.contains('^(ob|od)')"
).query(
    "(base_verb not in @CORRECT_VERBS and isv not in @CORRECT_VERBS)"
).sort_values(by='en')

g

  g = morphemes.query(


Unnamed: 0,id,isv,addition,partOfSpeech,en,genesis,pos,prefix,verb_stem,base_verb,...,left_stem_cand,right_stem_cand,derived_nouns,base_noun,_prefix,_stem,_suffix,_is_irregular,reconstructed,aspect
9155,29304.0,"ogluhnųti, oglušeti",,v.intr. pf.,become deaf,,verb,?,?,,...,"ogluhnųti, oglušeti","?gluhnųti, oglušeti?",,,,,,0,?uměti,pf
9507,33797.0,oriti,,v.tr. ipf.,drag down,,verb,?,?,oriti,...,or,riti,|orač,,,or,iti,0,oriti,ipf
9479,22890.0,orati,(oŕe),v.ipf.,"plough, plow",,verb,?,?,orati,...,or,?rati?,|orač,,,or,ati,0,orati,ipf


In [601]:
g.base_verb.unique()

array(['ovplyvniti', '', 'podobniti', 'ozdobiti', 'oprěděliti', 'oriti',
       'opravniti', 'upokarnjati', 'uspravědliviti', 'orati'],
      dtype=object)

In [611]:
morphemes.query('base_verb == "upokarnjati"')
# upokorniti
# koriti

# spravědliv

Unnamed: 0,id,isv,addition,partOfSpeech,en,genesis,pos,prefix,verb_stem,base_verb,...,left_stem_cand,right_stem_cand,derived_nouns,base_noun,_prefix,_stem,_suffix,_is_irregular,reconstructed,aspect
15790,5897.0,upokarnjati,,v.tr. ipf.,humble,,verb,?,?,upokarnjati,...,upokarn,?karnjati?,,,,upokꜵrn,jati,0,,ipf
15791,5896.0,upokorniti,,v.tr. pf.,humble,,verb,?,?,upokarnjati,...,upokorn,?korniti?,,,,upokꜵrn,iti,0,,pf


In [617]:
morphemes.query('base_verb.str.contains("dobn?[ia]") and base_verb != "baviti"')


Unnamed: 0,id,isv,addition,partOfSpeech,en,genesis,pos,prefix,verb_stem,base_verb,...,left_stem_cand,right_stem_cand,derived_nouns,base_noun,_prefix,_stem,_suffix,_is_irregular,reconstructed,aspect
9802,34715.0,ozdabjati,,v.tr. ipf.,"decorate, adorn",,verb,?,?,dobiti,...,ozdab,?zdabjati?,,,oz,dab,jati,0,,ipf
9804,29789.0,ozdobiti,,v.tr. pf.,"decorate, adorn",,verb,?,?,dobiti,...,ozdob,?zdobiti?,,ozdoba,oz,dob,iti,0,,pf
10421,18268.0,podobati,,v.refl. ipf.,"please, be liked by",,verb,?,?,dobiti,...,podob,?ati?,,podoba,po,dob,ati,0,,ipf
14211,36557.0,spodobati,,v.refl. pf.,"please, be liked by",,verb,?,?,dobiti,...,spodob,podobati,,,spo,dob,ati,0,,pf
15786,19064.0,upodabnjati,,v.refl. ipf.,"become similar, blend in",,verb,?,?,dobiti,...,upodabn,?abnjati?,,,upo,dab,njati,0,,ipf
15787,19062.0,upodabnjati,,v.tr. ipf.,"liken, make similar, assimilate",,verb,?,?,dobiti,...,upodabn,?abnjati?,,,upo,dab,njati,0,,ipf
15788,19068.0,upodobniti,,v.refl. pf.,"become similar, blend in",,verb,?,?,dobiti,...,upodobn,niti,,,upo,dob,niti,0,,pf
15789,19066.0,upodobniti,,v.tr. pf.,"liken, make similar, assimilate",,verb,?,?,dobiti,...,upodobn,niti,,,upo,dob,niti,0,,pf


In [616]:
g = morphemes.query('base_verb.str.contains("dobn?[ia]") and base_verb != "baviti"')
manual_insert_g(g, 'd[ao]b',  dry_run=0)
set_base_verb(g, 'dobiti')

g = morphemes.query('base_verb == "upokarnjati"')
manual_insert_g(g, 'k[ao]rn',  dry_run=0)
morphemes.loc[g.index, 'base_noun'] = 'pokorny'
set_base_verb(g, 'koriti')


g = morphemes.query('base_verb.str.contains("oprěděliti")')
manual_insert_g(g, 'děl',  dry_run=0)
set_base_verb(g, 'děliti')


g = morphemes.query('isv.str.contains("vplyv")')
manual_insert_g(g, 'vplyv',  dry_run=0)
set_base_verb(g, 'vplyvti')

#g = morphemes.query('isv.str.contains("spravědliv")')
#move_prefix(g, 'u')
#g = morphemes.query('base_verb.str.contains("opravniti")')
#move_prefix(g, 'o')



ozdabjati = oz dab jati
ozdobiti = oz dob iti
podobati = po dob ati
spodobati = spo dob ati
upodabnjati = upo dab njati
upodabnjati = upo dab njati
upodobniti = upo dob niti
upodobniti = upo dob niti
ovplyvniti = o vplyv niti
ovplyvnjati = o vplyv njati
vplyvati =  vplyv ati
vplyvti =  vplyv ti


In [582]:
# last_cons={'c', 'č'},
g = morphemes.query('base_verb.str.contains("možn")')
manual_insert_g(g, 'možn',  dry_run=0)
morphemes.loc[g.index, 'base_noun'] = 'možny'
set_base_verb(g, 'možniti')

g = morphemes.query('base_verb.str.contains("črv")')
manual_insert_g(g, 'črv',  dry_run=0)
set_base_verb(g, 'črveněti')

g = morphemes.query('base_verb.str.contains("ulagoditi")')
move_prefix(g, 'u')


g = morphemes.query('base_verb.str.contains("lučš")')
move_prefix(g, 'u')
morphemes.loc[g.index, 'reconstructed'] = '?lučьjь?'

g = morphemes.query('base_verb.str.contains("menš")')
manual_insert_g(g, 'menš', dry_run=0)
morphemes.loc[g.index, 'reconstructed'] = '?mьňьjь?'
set_base_verb(g, 'menšiti')


g = morphemes.query('base_verb.str.contains("tęg") and not base_verb.str.contains("tęgč")')
set_base_verb(g, 'tęgati')

g = morphemes.query('base_verb.str.contains("tęgč")')
manual_insert_g(g, 'tęgk', last_cons={"č", "k"}, dry_run=0)
morphemes.loc[g.index, 'base_noun'] = 'tęžky'
set_base_verb(g, 'tęgčiti')




umožniti = u možn iti
umožnjati = u možn jati
unemožniti = une možn iti
unemožnjati = une možn jati
črveněti =  črv eněti
počrveněti = po črv eněti
začrveniti = za črv eniti
začrvenjati = za črv enjati
smenšati = s menš ati
smenšiti = s menš iti
umenšati = u menš ati
umenšiti = u menš iti
otęgčati = o tęgk ⒥ati
otęgčati = o tęgk ⒥ati
otęgčiti = o tęgk ⒥iti
otęgčiti = o tęgk ⒥iti


  morphemes.loc[g.index, '_stem'] = g._stem.str[len(prefix):]


In [598]:
g = morphemes.query('base_verb.str.contains("osȯvrěmenniti")')
manual_insert_g(g, 'vrěmen', last_cons={"č", "k"}, dry_run=0)
morphemes.loc[g.index, 'base_noun'] = 'vrěmę'
set_base_verb(g, 'osȯvrěmenniti')


osȯvrěmenniti = osȯ vrěmen niti
osȯvrěmennjati = osȯ vrěmen njati


In [597]:
morphemes.loc[
    morphemes.query('base_verb.str.contains("orų[ž]")').index,
    'base_verb'
] = 'orųžiti'


In [None]:
morphemes.query('base_verb.str.contains("menš")')


In [580]:
g = morphemes.query('base_verb.str.contains("menš")')

g

Unnamed: 0,id,isv,addition,partOfSpeech,en,genesis,pos,prefix,verb_stem,base_verb,...,left_stem_cand,right_stem_cand,derived_nouns,base_noun,_prefix,_stem,_suffix,_is_irregular,reconstructed,aspect
13946,2062.0,smenšati,,v.tr. ipf.,reduce,,verb,?,?,smenšati,...,smenš,?menšati?,,,,smenš,ati,0,,ipf
13948,1297.0,smenšiti,,v.tr. pf.,reduce,,verb,?,?,smenšati,...,smenš,?menšiti?,,,,smenš,iti,0,,pf
15710,4101.0,umenšati,,v.tr. ipf.,"diminish, reduce, make smaller, abate",,verb,?,?,umenšati,...,umenš,?menšati?,,,,umenš,ati,0,,ipf
15711,4099.0,umenšiti,,v.tr. pf.,"diminish, reduce, make smaller, abate",,verb,?,?,umenšati,...,umenš,?menšiti?,,,,umenš,iti,0,,pf


In [575]:
g = morphemes.query('base_verb.str.contains("tęgč")')
g


Unnamed: 0,id,isv,addition,partOfSpeech,en,genesis,pos,prefix,verb_stem,base_verb,...,left_stem_cand,right_stem_cand,derived_nouns,base_noun,_prefix,_stem,_suffix,_is_irregular,reconstructed,aspect
9725,6137.0,otęgčati,,v.tr. ipf.,complicate,,verb,?,?,otęgčati,...,otęgč,?tęgčati?,,,,otęgč,ati,0,otęgъčati,ipf
9726,6138.0,otęgčati,,v.tr. ipf.,"hamper, impede",,verb,?,?,otęgčati,...,otęgč,?tęgčati?,,,,otęgč,ati,0,otęgъčati,ipf
9727,6140.0,otęgčiti,,v.tr. pf.,complicate,,verb,?,?,otęgčati,...,otęgč,?tęgčiti?,,,,otęgč,iti,0,otęgъčiti,pf
9728,6141.0,otęgčiti,,v.tr. pf.,"hamper, impede",,verb,?,?,otęgčati,...,otęgč,?tęgčiti?,,,,otęgč,iti,0,otęgъčiti,pf


In [561]:
g = morphemes.query('base_verb.str.contains("slož")')
manual_insert_g(g, 'slože?n', last_cons={'d', 'đ'}, dry_run=0)
set_base_verb(g, 'složniti')

osloženiti = o složen iti
osloženjati = o složen jati
usložniti = u složn iti
usložnjati = u složn jati


In [564]:
move_prefix(morphemes.query('base_verb.str.contains("uběd")'), 'u')
move_prefix(morphemes.query('base_verb.str.contains("poběd")'), 'po')


  morphemes.loc[g.index, '_stem'] = g._stem.str[len(prefix):]


In [567]:
move_prefix(morphemes.query('base_verb.str.contains("ustal")'), 'u')


  morphemes.loc[g.index, '_stem'] = g._stem.str[len(prefix):]


In [571]:
g = morphemes.query('base_verb.str.contains("věn")')
manual_insert_g(g, 'věnc', last_cons={'c', 'č'}, dry_run=0)
morphemes.loc[g.index, 'base_noun'] = 'věnec'

set_base_verb(g, 'věnčati')

ověnčati = o věnc ⒥ati
ověnčiti = o věnc ⒥iti


In [555]:
g = morphemes.query('base_verb.str.contains("rę[dđ][ia]")')
manual_insert_g(g, 'ręd', last_cons={'d', 'đ'}, dry_run=0)
set_base_verb(g, 'ręditi')

oporędčati = opo ręd čati
oporędčiti = opo ręd čiti
podręditi = pod ręd iti
podręđati = pod ręd ⒥ati
råzporęditi = razpo ręd iti
råzporęđati = razpo ręd ⒥ati
uręditi = u ręd iti
uręđati = u ręd ⒥ati


In [554]:
g = morphemes.query('base_verb.str.contains("rę[dđ]č")')
# porędȯk
morphemes.loc[g.index, 'base_noun'] = 'porędȯk'
manual_insert_g(g, 'porędk', last_cons={'k', 'č'}, dry_run=0)
set_base_verb(g, 'rędčati')

In [531]:
g.base_verb.value_counts().index

Index(['podobniti', 'otęgčati', 'uběditi', 'osnovati', 'ovplyvniti',
       'opravniti', 'ulagoditi', 'unemožniti', 'oporędčati', 'osȯvrěmenniti',
       'uspravědliviti', 'upokarnjati', 'ustaliti', 'orųdovati', 'ulučšati',
       'umožniti', 'umenšati', 'oprěděliti', 'ozdobiti', 'usložniti',
       'osloženiti', 'uręditi', 'ověnčati', 'orųžiti', 'oriti', 'orati', '',
       'ostriti'],
      dtype='object')

In [529]:
#'otęgčati', 'osnovati', 'ustaliti',  'uběditi' 'podobniti', 'upokarnjati',
# 'uspravědliviti'
# 'osȯvrěmenniti'  , 'oporędčati', 'orųdovati', 'unemožniti', 'ulagoditi', 
# 'ovplyvniti', 'opravniti', 'ozdobiti', 'uręditi', 'osloženiti', 'usložniti', 
# 'oprěděliti', 'umenšati', 'umožniti', 'ulučšati', 
for verb in ['orositi', 'osvoboditi', 'udaliti', 'opozdniti', 
       'ozeleniti', 
       'ulěpšati', 'umŕtviti',
       'unarodniti',  'osvěžati', 'oživiti',
       'otrězviti', 'upotrěbiti', 'ulegšati', 
        'upȯlnomoćiti', 'otųpiti',
       'otěniti', 'oznaniti', 'oskvŕniti']:
    g = morphemes.query('base_verb == @verb')
    move_prefix(g, verb[0])


  morphemes.loc[g.index, '_stem'] = g._stem.str[len(prefix):]


In [514]:
g = morphemes.query('base_verb.str.contains("slab")')
g

manual_insert_g(g, 'slab', last_cons={'š', 'h'}, dry_run=0)
set_base_verb(g, 'slaběti')


oslaběti = o slab ěti
oslabiti = o slab iti
oslabiti = o slab iti
oslabjati = o slab jati
oslabjati = o slab jati
slaběti =  slab ěti


In [508]:
g = morphemes.query('isv.str.contains("smě[hš]") and en.str.contains("(jeer|laugh|smile)")')
g

manual_insert_g(g, 'směh', last_cons={'š', 'h'}, dry_run=0)
set_base_verb(g, 'směhati')

podsměhati = pod směh ati
podsměhnųti = pod směh nųti
råzsměšati = raz směh ⒥ati
råzsměšiti = raz směh ⒥iti
usměhati = u směh ati
usměhnųti = u směh nųti


  g = morphemes.query('isv.str.contains("smě[hš]") and en.str.contains("(jeer|laugh|smile)")')


In [510]:
morphemes.loc[g.index, ['reconstructed']] = "?směx?"


In [501]:
g = morphemes.query('isv.str.contains("[uo]per") and en.str.contains("feather")')
morphemes.loc[g.index, ['derived_nouns', 'base_noun', 'base_verb']] = ['', 'pero', 'pero']
manual_insert_g(g, 'per', last_cons={'z', 'ž'}, dry_run=0)

operiti = o per iti
operjati = o per jati
uperiti = u per iti


In [497]:
g = morphemes.query('base_verb.str.contains("smŕ")')
move_prefix(g, 'u')

  morphemes.loc[g.index, '_stem'] = g._stem.str[len(prefix):]


In [488]:

g = morphemes.query("base_verb.str.contains('mŕz')")
manual_insert_g(g, 'mŕz', last_cons={'z', 'ž'}, dry_run=0)


omŕziti = o mŕz iti
omŕžati = o mŕz ⒥ati


In [523]:
# g = morphemes.query("en.str.contains('coura') and base_verb.str.contains('smě')")
g = morphemes.query(" base_verb.str.contains('směl')")


manual_insert_g(g, 'směl', last_cons={'š', 'h'}, dry_run=0)
set_base_verb(g, 'smělěti')

onesměliti = one směl iti
onesměljati = one směl jati
osměliti = o směl iti
osměliti = o směl iti
osměljati = o směl jati
osměljati = o směl jati
råzsměliti = raz směl iti
råzsměljati = raz směl jati


In [522]:
morphemes.query(" base_verb.str.contains('smel')")


Unnamed: 0,id,isv,addition,partOfSpeech,en,genesis,pos,prefix,verb_stem,base_verb,...,left_stem_cand,right_stem_cand,derived_nouns,base_noun,_prefix,_stem,_suffix,_is_irregular,reconstructed,aspect


In [480]:
g = morphemes.query('isv.str.contains("krųg")')
g
manual_insert_g(g, 'krųgl', dry_run=0)

okrųgliti = o krųgl iti
okrųgljati = o krųgl jati


In [477]:
g = morphemes.query('isv.str.contains("t[ě]l")')
g
manual_insert_g(g, 'těles', dry_run=0)

utělesniti = u těles niti
utělesnjati = u těles njati


In [473]:
g = morphemes.query('isv.str.contains("tekų")')
g
manual_insert_g(g, 'tekųći', last_cons={'š', 'h'}, dry_run=0)
set_base_verb(g, 'tekti')


utekųćiniti = u tekųći niti
utekųćinjati = u tekųći njati


In [468]:
g = morphemes.query('isv.str.contains("umyvati")')
g
manual_insert_g(g, 'my', last_cons={'š', 'h'}, dry_run=0)
set_base_verb(g, 'myti')


umyvati = u my vati


In [466]:
g = morphemes.query('isv.str.contains("opekati")')
manual_insert_g(g, 'pek', last_cons={'š', 'h'}, dry_run=0)
set_base_verb(g, 'pekti')


opekati = o pek ati


In [462]:
g = morphemes.query('isv.str.contains("ti[šh]")')
manual_insert_g(g, 'tih', last_cons={'š', 'h'}, dry_run=0)
set_base_verb(g, 'tišati')


otišati = o tih ⒥ati
otišiti = o tih ⒥iti
utišati = u tih ⒥ati
utišiti = u tih ⒥iti


In [458]:
g = morphemes.query('isv.str.contains("trudn")')
manual_insert_g(g, 'trudn', dry_run=0)
set_base_verb(g, 'trudniti')


otrudniti = o trudn iti
otrudnjati = o trudn jati
utrudniti = u trudn iti
utrudnjati = u trudn jati


In [455]:
g = morphemes.query('isv.str.contains("tęž.t")')
manual_insert_g(g, 'tęž', dry_run=0)
set_base_verb(g, 'tęžiti')
#g

obtęžati = ob tęž ati
obtęžiti = ob tęž iti
utęžati = u tęž ati
utęžiti = u tęž iti


In [675]:
# and base_noun == '' and not isv.str.contains(' ')
g = morphemes.query("_prefix == '' and isv.str.contains('^(ob|za|od|do)') ").sort_values(by='base_verb')
g#.sort_values(by='_suffix')

  g = morphemes.query("_prefix == '' and isv.str.contains('^(ob|za|od|do)') ").sort_values(by='base_verb')


Unnamed: 0,id,isv,addition,partOfSpeech,en,genesis,pos,prefix,verb_stem,base_verb,...,left_stem_cand,right_stem_cand,derived_nouns,base_noun,_prefix,_stem,_suffix,_is_irregular,reconstructed,aspect
17544,31583.0,zavěćati,,v.tr. ipf./pf.,"bequeath, leave by will",,verb,?,?,,...,zavěć,?ěćati?,,zavět,,zavět,⒥ati,0,?uměti,ipf/pf
2085,35297.0,dokumentovati,,v.tr. ipf./pf.,document,I,verb,?,?,,...,dokument,?kumentovati?,|dokumentacija,dokument,,dokument,ovati,0,?uměti,ipf/pf
17537,14193.0,zautrakati,,v.intr. ipf.,"eat breakfast, have breakfrast",,verb,?,?,,...,zautrak,?trakati?,,zautraka,,zautrak,ati,0,?uměti,ipf
8419,18014.0,obmanyvati,,v.tr. ipf.,fool,,verb,?,?,,...,obman,?manyvati?,|obmannik,obman,,obman,yvati,0,?uměti,ipf
8418,4216.0,obmanyvati,,v.tr. ipf.,"con, cheat, delude, deceive, hoodwink",,verb,?,?,,...,obman,?manyvati?,|obmannik,obman,,obman,yvati,0,?uměti,ipf
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17450,31555.0,zasloniti,,v.tr. pf.,"shelter, cover, shade, shield (from light etc.)",,verb,?,?,zasloniti,...,zaslon,?loniti?,,zaslon,,zaslꜵn,iti,0,,pf
17502,1170.0,zatemniti,,v.tr. pf.,"obscure, darken, make dark",,verb,?,?,zatemniti,...,zatemn,?temniti?,,,,zatemn,iti,0,,pf
17503,3090.0,zatemnjati,,v.tr. ipf.,"obscure, darken, make dark",,verb,?,?,zatemniti,...,zatemn,?temnjati?,,,,zatemn,jati,0,,ipf
17507,33692.0,zatměvati,,v.tr. ipf.,"eclipse, obscure",,verb,?,?,zatmiti,...,zatměv,?tměvati?,,,,zatm,ěvati,0,,ipf


In [676]:
g.base_verb.value_counts()

                     9
oddaliti             4
obnoviti             4
zatemniti            2
zasloniti            2
zaplěniti            2
zabezpečati          2
zabezpamętiti        2
odščepiti            2
odčuđati             2
odsloniti            2
odročiti             2
odosobniti           2
odomašniti           2
odolěti              2
odličati             2
odkųsiti             2
obmeđati             2
obobćati             2
obiděti              2
obkoliti             2
oblegčati            2
oběćati              2
zatmiti              2
obognjeodparnjati    2
obrěmeniti           2
občiti               2
observovati          1
vlěkti               1
odobriti             1
doskonaliti          1
dojiti               1
dobrěti              1
byti                 1
Name: base_verb, dtype: int64

In [677]:
g.base_verb.value_counts().index

Index(['', 'oddaliti', 'obnoviti', 'zatemniti', 'zasloniti', 'zaplěniti',
       'zabezpečati', 'zabezpamętiti', 'odščepiti', 'odčuđati', 'odsloniti',
       'odročiti', 'odosobniti', 'odomašniti', 'odolěti', 'odličati',
       'odkųsiti', 'obmeđati', 'obobćati', 'obiděti', 'obkoliti', 'oblegčati',
       'oběćati', 'zatmiti', 'obognjeodparnjati', 'obrěmeniti', 'občiti',
       'observovati', 'vlěkti', 'odobriti', 'doskonaliti', 'dojiti', 'dobrěti',
       'byti'],
      dtype='object')

In [None]:
'oddaliti', 'obnoviti', 
'zatemniti', 'zasloniti', 'odsloniti', 'zaplěniti',
#'zabezpečati', 
# 'zabezpamętiti',
#'odščepiti', 'odčuđati', 
'odročiti', 
'odosobniti', 
# 'odomašniti', 
'o/dolěti', 
       'odkųsiti', 
    'obmeđati', 
    # 'obobćati',  'obiděti', 
'obkoliti', 
'oblegčati'

In [678]:
g1 = morphemes.query("isv.str.contains('li[č]') and not isv.str.contains('veli')")
manual_insert_g(g1, 'lik', dry_run=0, last_cons={'k', 'č'})
set_base_verb(g1, 'ličiti')

# 

izobličati = izob lik ⒥ati
izobličiti = izob lik ⒥iti
odličati = od lik ⒥ati
odličiti = od lik ⒥iti
råzličati = raz lik ⒥ati
råzličati = raz lik ⒥ati
råzličiti = raz lik ⒥iti


In [666]:
g1 = morphemes.query("isv.str.contains('zastar') and isv != 'abstrahovati'")
manual_insert_g(g1, 'star', dry_run=0, last_cons={'h', 'š'})
set_base_verb(g1, 'starěti')



zastariti = za star iti
zastarjati = za star jati


In [660]:
g1 = morphemes.query("isv.str.contains('stra[hš]') and isv != 'abstrahovati'")
manual_insert_g(g1, 'strah', dry_run=0, last_cons={'h', 'š'})
set_base_verb(g1, 'strašiti')


nastrašiti = na strah ⒥iti
nastrašiti = na strah ⒥iti
odstrašati = od strah ⒥ati
odstrašiti = od strah ⒥iti
prěstrašiti = prě strah ⒥iti
strahovati =  strah ovati
strašiti =  strah ⒥iti
zastrašati = za strah ⒥ati
zastrašiti = za strah ⒥iti


In [655]:

g1 = morphemes.query("isv.str.contains('obr[aå][tć]') or isv.str.contains('vr[aå][tć]')")
manual_insert_g(g1, 'v?r[aå]t', dry_run=0, last_cons={'t', 'ć'})
set_base_verb(g1, 'vråtiti')

g1 = morphemes.query("base_verb.str.contains('†nažiti')")
manual_insert_g(g1, 'nag', dry_run=0, last_cons={'ž', 'g'})
morphemes.loc[g1.index, 'base_noun'] = 'nagy'

g1 = morphemes.query("base_verb.str.contains('hvat')")
manual_insert_g(g1, 'hvat', dry_run=0, last_cons={'t', 'ć'})
set_base_verb(g1, 'hvatati')

g1 = morphemes.query("base_verb.str.contains('bočiti')")
manual_insert_g(g1, 'b[ao]k', dry_run=0, last_cons={'k', 'č'})

g1 = morphemes.query("base_verb.str.contains('doråzuměvati')")
manual_insert_g(g1, 'umě', dry_run=0, last_cons={'t', 'ć'})
set_base_verb(g1, 'uměti')


izvraćati = iz vrat ⒥ati
izvråtiti = iz vråt iti
obraćati = ob rat ⒥ati
obraćati = ob rat ⒥ati
obraćati = ob rat ⒥ati
obraćati = ob rat ⒥ati
obraćati = ob rat ⒥ati
obraćati = ob rat ⒥ati
obraćati = ob rat ⒥ati
obraćati = ob rat ⒥ati
obraćati = ob rat ⒥ati
obraćati = ob rat ⒥ati
obråtiti = ob råt iti
obråtiti = ob råt iti
obråtiti = ob råt iti
obråtiti = ob råt iti
obråtiti = ob råt iti
obråtiti = ob råt iti
obråtiti = ob råt iti
obråtiti = ob råt iti
obråtiti = ob råt iti
obråtiti = ob råt iti
odvraćati = od vrat ⒥ati
odvraćati = od vrat ⒥ati
odvraćati = od vrat ⒥ati
odvråtiti = od vråt iti
odvråtiti = od vråt iti
odvråtiti = od vråt iti
povraćati = po vrat ⒥ati
povråtiti = po vråt iti
povråtiti = po vråt iti
prěobraćati = prěob rat ⒥ati
prěobraćati = prěob rat ⒥ati
prěobråtiti = prěob råt iti
prěobråtiti = prěob råt iti
vȯzvraćati = vȯz vrat ⒥ati
vȯzvråtiti = vȯz vråt iti
vraćati =  vrat ⒥ati
vraćati v otčinų =  vrat ⒥ati
vraćati =  vrat ⒥ati
vråtiti =  vråt iti
vråtiti v otčinų =  vr

In [633]:
tmp_df = morphemes.query("base_verb.str.contains('hvat')")
tmp_df

Unnamed: 0,id,isv,addition,partOfSpeech,en,genesis,pos,prefix,verb_stem,base_verb,...,left_stem_cand,right_stem_cand,derived_nouns,base_noun,_prefix,_stem,_suffix,_is_irregular,reconstructed,aspect
3577,2591.0,hvatati,,v.tr. ipf.,"grab, grasp, seize",,verb,?,?,hvatati,...,hvat,hvatati,,hvat,,hvat,ati,0,xvatati,ipf
7171,28002.0,nahvatati,,v.tr. pf.,"pick up, snatch",,verb,?,?,hvatati,...,nahvat,hvatati,,,¬na,hvat,ati,0,(na)+xvatati,pf
8321,17990.0,obhvaćati,,v.tr. ipf.,"embrace, clasp",,verb,?,?,obhvatiti,...,obhvać,?hvaćati?,,,,obhvat,⒥ati,0,,ipf
8322,17992.0,obhvatiti,,v.tr. pf.,"embrace, clasp",,verb,?,?,obhvatiti,...,obhvat,?hvatiti?,,,,obhvat,iti,0,,pf
17178,2292.0,zahvaćati,,v.tr. ipf.,capture,,verb,?,?,zahvatiti,...,zahvać,?hvaćati?,,,,zahvat,⒥ati,0,,ipf
17182,571.0,zahvatiti,,v.tr. pf.,capture,,verb,?,?,zahvatiti,...,zahvat,?hvatiti?,,zahvat,,zahvat,iti,0,,pf


In [631]:
tmp_df = morphemes.query("isv.str.contains('(čina|čę)ti') and not en.str.contains('rest')")
set_base_verb(
    tmp_df,
    'čęti'
)

for stem_var in ['počin']:
    move_prefix(tmp_df.query('_stem == @stem_var'), stem_var[:2])

morphemes.loc[
    morphemes.query("isv.str.contains('započ')").index,
    ['_prefix', '_stem']
] = ['¬za¬po', 'čin']

morphemes.query("isv.str.contains('(čina|čę)ti') and not en.str.contains('rest')")


  tmp_df = morphemes.query("isv.str.contains('(čina|čę)ti') and not en.str.contains('rest')")
  morphemes.loc[g.index, '_stem'] = g._stem.str[len(prefix):]
  morphemes.query("isv.str.contains('(čina|čę)ti') and not en.str.contains('rest')")


Unnamed: 0,id,isv,addition,partOfSpeech,en,genesis,pos,prefix,verb_stem,base_verb,...,left_stem_cand,right_stem_cand,derived_nouns,base_noun,_prefix,_stem,_suffix,_is_irregular,reconstructed,aspect
7080,1502.0,načęti,(načne),v.refl. pf.,"begin (intr.), start (intr.)",,verb,?,?,čęti,...,načęti,?čęti?,,,na,čin,⒩ti,0,načęti,pf
7081,22780.0,načęti,(načne),v.tr. pf.,"begin (tr.), start (tr.)",,verb,?,?,čęti,...,načęti,?čęti?,,,na,čin,⒩ti,0,načęti,pf
7084,5516.0,načinati,,v.refl. ipf.,"begin (intr.), start (intr.)",,verb,?,?,čęti,...,način,?činati?,,,na,čin,ati,0,načinati,ipf
7085,22779.0,načinati,,v.tr. ipf.,"begin (tr.), start (tr.)",,verb,?,?,čęti,...,način,?činati?,,,na,čin,ati,0,načinati,ipf
10306,22788.0,počęti sę,(počne),v.tr. pf.,"begin (intr.), start (intr.)",,verb,?,?,čęti,...,počęti,?čęti sę?,,,po,čin,⒩ti,-1,?počęti,pf
10307,1078.0,počęti,(počne),v.tr. pf.,"begin (tr.), start (tr.), commence",,verb,?,?,čęti,...,počęti,?čęti?,,,po,čin,⒩ti,-1,počęti,pf
10315,10675.0,počinati,,v.refl. ipf.,"begin (intr.), start (intr.)",,verb,?,?,čęti,...,počin,?činati?,,počin,po,čin,ati,0,?počęti,ipf
10316,2835.0,počinati,,v.tr. ipf.,"begin (tr.), start (tr.), commence",,verb,?,?,čęti,...,počin,?činati?,,počin,po,čin,ati,0,?počęti,ipf
17107,14040.0,začęti,(začne),v.tr. pf.,"begin (tr.), start (tr.), embark",,verb,?,?,čęti,...,začęti,?čęti?,,,za,čin,⒩ti,0,začęti,pf
17110,14041.0,začinati,,v.tr. ipf.,"begin (tr.), start (tr.), embark",,verb,?,?,čęti,...,začin,?činati?,,,za,čin,ati,0,?začęti,ipf


In [621]:
g.base_verb.value_counts()

                     19
obraćati             10
obnoviti              4
bočiti                4
oddaliti              4
†nažiti               4
odomašniti            2
obhvatiti             2
odročiti              2
odsloniti             2
odstrašati            2
odčuđati              2
odščepiti             2
zabezpamętiti         2
zabezpečati           2
zahvatiti             2
zaplěniti             2
zasloniti             2
zastariti             2
zastrašati            2
zatemniti             2
zatmiti               2
začinati              2
odosobniti            2
odličati              2
odolěti               2
obognjeodparnjati     2
obiděti               2
obkoliti              2
oblegčati             2
obmeđati              2
obobćati              2
obrěmeniti            2
občiti                2
oběćati               2
odkųsiti              2
observovati           1
odobriti              1
dojiti                1
vlěkti                1
doråzuměvati          1
dobrěti         

In [429]:
g = morphemes.query("_prefix == '' and isv.str.contains('^(u|o)')").sort_values(by='base_verb')
#g = morphemes.query("_prefix == '' and isv.str.contains('^(ob|za|od|do)')").sort_values(by='base_verb')
#g = morphemes.query("_prefix == '' and isv.str.contains('^pod')").sort_values(by='base_verb')


for v1 in g.base_verb.unique():
    v = v1[1:]
    if v == '': continue
    if any(morphemes.isv == v) or any(morphemes.base_verb == v):
        print(v, v1)
        stem_ = None
        for g_cand in [morphemes[morphemes.isv == v], morphemes[morphemes.base_verb == v]]:
            if len(g_cand):
                stem_ = g_cand._stem.unique()
                if len(stem_) == 1:
                    stem = stem_[0]
                print(g_cand.base_verb.unique())
        print(stem)
        if v1 in ['odobriti', 'okupovati', 'osnovati']:
            continue
        g1 = morphemes[morphemes.base_verb == v1]
        LC = None if "slu" not in v1 else {"g", "ž"}
        manual_insert_g(g1, stem, dry_run=0, last_cons=LC)
        set_base_verb(g1, v)


kupovati okupovati
['kupiti']
kupo
provŕgati oprovŕgati
['vŕgati']
vŕg
oprovŕgati = opro vŕg ati
oprovŕgnųti = opro vŕg nųti
snovati osnovati
['']
sn
vějati ovějati
['vějati']
['vějati']
věj
ovějati = o věj ati
ovějivati = o věj ivati


  g = morphemes.query("_prefix == '' and isv.str.contains('^(u|o)')").sort_values(by='base_verb')


In [423]:

manual_insert_g(g.query("isv.str.contains('slu')"), 'slug', dry_run=0, last_cons={"g", "ž"})


zasluživati = za slug ⒥ivati
zaslužiti = za slug ⒥iti


In [428]:
g1 = morphemes.query("isv.str.contains('dobr.ti')")
manual_insert_g(g1, 'dobr', dry_run=0, last_cons=None)
set_base_verb(g1, 'dobrěti')


dobrěti =  dobr ěti
odobriti = o dobr iti


In [419]:
g.head(44).tail(22)

Unnamed: 0,id,isv,addition,partOfSpeech,en,genesis,pos,prefix,verb_stem,base_verb,...,left_stem_cand,right_stem_cand,derived_nouns,base_noun,_prefix,_stem,_suffix,_is_irregular,reconstructed,aspect
8707,36711.0,odbačati vpravo/vdesno,,v.intr. ipf.,turn right,,verb,?,?,bočiti,...,odbačati vpravo/vdesno,?bačati vpravo/vdesno?,,,,odbꜵč,ati,0,,ipf
8724,36712.0,odbočiti vpravo/vdesno,,v.intr. pf.,turn right,,verb,?,?,bočiti,...,odbočiti vpravo/vdesno,?bočiti vpravo/vdesno?,,,,odbꜵč,iti,0,,pf
9061,36666.0,odsųtstvovati,,v.intr. ipf.,"be absent, be away",,verb,?,?,byti,...,odsųtstv,sųtstvovati,,,,,,1,?byti,ipf
2062,1181.0,dojiti,,v.tr. ipf.,milk,,verb,?,?,dojiti,...,do,?jiti?,,,,do,jiti,0,dojiti,ipf
2187,7215.0,doråzuměvati,,v.refl. ipf.,communicate,,verb,?,?,doråzuměvati,...,doråzuměv,?měvati?,,,,doråzum,ěvati,0,,ipf
2207,4208.0,doskonaliti,,v.tr. ipf.,perfect,,verb,?,?,doskonaliti,...,doskonal,?konaliti?,|doskonalosť,,,doskonal,iti,0,,ipf
2235,21853.0,dostigati,,v.tr. ipf.,catch up,,verb,?,?,dostigati,...,dostig,stigati,,,,dostig,ati,0,,ipf
2236,21855.0,dostigati,,v.tr. ipf.,"reach, achieve, attain",,verb,?,?,dostigati,...,dostig,stigati,,,,dostig,ati,0,,ipf
2237,21854.0,dostignųti,,v.tr. pf.,catch up,,verb,?,?,dostigati,...,dostig,stignųti,,,,dostig,nųti,0,,pf
2238,21856.0,dostignųti,,v.tr. pf.,"reach, achieve, attain",,verb,?,?,dostigati,...,dostig,stignųti,,,,dostig,nųti,0,,pf


In [420]:
morphemes.loc[2236]

id                                21855.0
isv                             dostigati
addition                              NaN
partOfSpeech                   v.tr. ipf.
en                 reach, achieve, attain
genesis                               NaN
pos                                  verb
prefix                                  ?
verb_stem                               ?
base_verb                         stigati
isv_orig                        dostigati
left_stem_cand                     dostig
right_stem_cand                   stigati
derived_nouns                            
base_noun                                
_prefix                                do
_stem                                stig
_suffix                               ati
_is_irregular                           0
reconstructed                         NaN
aspect                                ipf
Name: 2236, dtype: object

In [415]:
g1

Unnamed: 0,id,isv,addition,partOfSpeech,en,genesis,pos,prefix,verb_stem,base_verb,...,left_stem_cand,right_stem_cand,derived_nouns,base_noun,_prefix,_stem,_suffix,_is_irregular,reconstructed,aspect


## ANOMALY 2: rare verbs

In [201]:
morphemes.query("_stem.str.len() > 5 and genesis != genesis")

Unnamed: 0,id,isv,addition,partOfSpeech,en,genesis,pos,prefix,verb_stem,base_verb,...,left_stem_cand,right_stem_cand,derived_nouns,base_noun,_prefix,_stem,_suffix,_is_irregular,reconstructed,aspect
712,2880.0,besědovati,,v.intr. ipf.,"converse, chat",,verb,?,?,besědovati,...,besěd,besědovati,,besěda,,besědo,vati,0,besědovati,ipf
921,24689.0,blaznovati,,v.intr. ipf.,behave like a fool,,verb,?,?,blaznovati,...,blazn,blaznovati,,blazn,,blazno,vati,0,,ipf
1002,4510.0,bogohuliti,,v.intr. ipf.,blaspheme,,verb,?,?,bogohuliti,...,bogohul,bogohuliti,,,,bogohul,iti,0,,ipf
1450,6842.0,čestitati,,v.tr. ipf./pf.,congratulate,,verb,?,?,čestitati,...,čestit,čestitati,,,,čestit,ati,0,,ipf/pf
2091,19486.0,dȯlgočasiti,,v.refl. ipf.,be bored,,verb,?,?,dȯlgočasiti,...,dȯlgočas,dȯlgočasiti,|dȯlgočaśje|dȯlgočasnik|dȯlgočasnica,,,dȯlgočas,iti,0,,ipf
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17673,1892.0,zloupotrěbjati,,v.tr. ipf.,"abuse, misuse",,verb,?,?,zloupotrěbiti,...,zloupotrěb,zloupotrěbjati,,,,zloupotrěb,jati,0,,ipf
17712,22850.0,znamenovati,,v.tr. ipf.,"mark, put a mark on",,verb,?,?,znamenovati,...,znamen,znamenovati,|znameńje|znameńje,,,znameno,vati,0,,ipf
17713,22851.0,znamenovati,,v.tr. ipf.,"signify, mean, be a sign of",,verb,?,?,znamenovati,...,znamen,znamenovati,|znameńje|znameńje,,,znameno,vati,0,,ipf
17914,-37006.0,råzpečętyvati,,v.tr. ipf.,unseal,,verb,råz’,pečętyvati,pečętati,...,råzpečęt,?pečętyvati?,,,¬råz,pečęty,vati,0,,ipf


In [198]:
vc = morphemes.base_verb.value_counts()

rare_verbs = vc[vc == 1].index

morphemes.query("base_verb in @rare_verbs and genesis != genesis and base_noun == ''")#._prefix.value_counts()

Unnamed: 0,id,isv,addition,partOfSpeech,en,genesis,pos,prefix,verb_stem,base_verb,...,left_stem_cand,right_stem_cand,derived_nouns,base_noun,_prefix,_stem,_suffix,_is_irregular,reconstructed,aspect
854,34987.0,bičevati,,v.tr. ipf.,"whip, scourge, flog, flagellate",,verb,?,?,bičevati,...,bičev,bičevati,,,,bič,evati,0,bičevati,ipf
907,22297.0,blågovolěti,(blågovoli),v.intr. ipf.,"show benevolence, show goodwill",,verb,?,?,volěti,...,blågovol,blågovolěti,,,blågo,vol,ěti,0,,ipf
922,24690.0,blědněti,,v.intr. ipf.,turn pale,,verb,?,?,blědněti,...,blědn,blědněti,,,,blědn,ěti,0,,ipf
925,24693.0,blejati,,v.intr. ipf.,bleat,,verb,?,?,blejati,...,ble,blejati,,,,ble,jati,0,blejati,ipf
934,24697.0,blěstěti,,v.intr. ipf.,shine,,verb,?,?,blěstěti,...,blěst,blěstěti,,,,blěst,ěti,0,,ipf
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17665,35008.0,zloradovati,,v.refl. ipf.,gloat,,verb,?,?,zloradovati,...,zlorad,zloradovati,|zloradosť,,,zlorado,vati,0,,ipf
17771,1890.0,zvųčati,(zvųči),v.intr. ipf.,sound,,verb,?,?,zvųčati,...,zvųč,zvųčati,|zvųčnik,,,zvųč,ati,0,,ipf
17789,1923.0,žalovati,,v.refl. ipf.,complain,,verb,?,?,žalovati,...,žal,žalovati,|žalosť,,,žalo,vati,0,žalovati,ipf
17868,31713.0,žmuriti oči,,v.tr. ipf.,"squint, scare off, screw up one’s eyes",,verb,?,?,žmuriti,...,žmuriti oči,?žmuriti oči?,žmurky,,,žmur,iti,0,,ipf


In [22]:
vc[vc == 2].index

Index(['pracovati', 'męti', 'prěčiti', 'luščiti', 'prědpovědati', 'lupiti',
       'lokati', 'postigati', 'logovati', 'prěhytriti',
       ...
       'hvatati', 'izkusiti', 'iznuriti', 'izpovědati', 'izopačati',
       'izobličati', 'rydati', 'hudnųti', 'izčrkati', 'hvaliti'],
      dtype='object', length=409)

## ANOMALY 2: rare endings

In [539]:
vc = morphemes._stem.str[-1].value_counts()

vc[vc < 120]

m    117
o    117
h    116
ž    116
b    107
š     99
i     92
j     81
ć     59
u     45
a     31
e     29
ě     25
c     17
å     11
ŕ      9
ų      6
đ      4
f      3
Name: _stem, dtype: int64

In [540]:
vc = morphemes._stem.str[-1].value_counts()
#vc[vc < 120]
#rare_ends = dict(zip(vc[vc < 120].index, vc[vc < 120].values))
rare_ends = vc[vc < 120].index


morphemes[morphemes._stem.str[-1].apply(lambda x: x in rare_ends)]

Unnamed: 0,id,isv,addition,partOfSpeech,en,genesis,pos,prefix,verb_stem,base_verb,...,left_stem_cand,right_stem_cand,derived_nouns,base_noun,_prefix,_stem,_suffix,_is_irregular,reconstructed,aspect
28,19625.0,absorbovati,,v.tr. ipf.,absorb,I,verb,?,?,absorbovati,...,absorb,absorbovati,|absorbcija,,,absorb,ovati,0,,ipf
29,24056.0,abstrahovati,,v.tr. ipf./pf.,abstract,I,verb,?,?,abstrahovati,...,abstrah,abstrahovati,,,,abstrah,ovati,0,,ipf/pf
363,36413.0,aranževati,,v.tr. ipf./pf.,arrange (music),F,verb,?,?,aranževati,...,aranžev,aranževati,,,,aranž,evati,0,,ipf/pf
549,24437.0,bajati,,v.intr. ipf.,"babble, tell fables, narrate",,verb,?,?,bajati,...,ba,bajati,,,,baj,ati,0,bajati,ipf
550,34833.0,bajati,,v.intr. ipf.,"practise sorcery, conjure",,verb,?,?,bajati,...,ba,bajati,,,,baj,ati,0,bajati,ipf
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17846,7191.0,žiti,(žive),v.intr. ipf.,"reside, dwell",,verb,?,?,žiti,...,ž,žiti,,,,ži,ti,-1,žiti,ipf
17881,34178.0,žrěbiti,,v.refl. ipf.,foal,,verb,?,?,žrěbiti,...,žrěb,žrěbiti,|žrěbec,,,žrěb,iti,0,,ipf
17894,1490.0,žrtvovati,,v.tr. ipf.,sacrifice,,verb,?,?,žrtvovati,...,žrtv,žrtvovati,,žrtva,,žrtvo,vati,0,,ipf
17904,2627.0,žuvati,(žuje),v.intr. ipf.,"chew, masticate",,verb,?,?,žuvati,...,žuv,žuvati,,,,žu,vati,0,,ipf


In [541]:
morphemes[morphemes._stem.str[-1].apply(lambda x: x in list('žšćěcåŕųđ'))].base_verb.value_counts()

dŕžati          29
pušćati         21
†ložiti         19
děti            18
měšati          13
                ..
mrěti            1
dyšati           1
dųnųti           1
distancevati     1
žužati           1
Name: base_verb, Length: 83, dtype: int64

In [542]:
morphemes[morphemes._stem.str[-1].apply(lambda x: x in list('ćåŕųđ'))].base_verb.value_counts()

pušćati         21
obraćati        12
vraćati         10
vŕnųti           9
klåti            7
dųti             5
občiti           5
strěćati         3
obråtiti         2
pråti            2
odčuđati         2
obmeđati         2
obobćati         2
oběćati          2
noćevati         2
upȯlnomoćiti     2
dųnųti           1
Name: base_verb, dtype: int64

## ANOMALY 3: english doubles

In [37]:
# g = morphemes.query("base_verb == 'očišćati'")
# manual_insert_g(g, "čist", last_cons={"t", "ć"}, dry_run=0)
# set_base_verb(g, 'čistiti')


In [645]:
BV = morphemes.query("_prefix == 'p'").base_verb.unique()

EN_BV = morphemes.query("base_verb in @BV").en.unique()


morphemes.query("en in @EN_BV").sort_values(by='en')


Unnamed: 0,id,isv,addition,partOfSpeech,en,genesis,pos,prefix,verb_stem,base_verb,isv_orig,left_stem_cand,right_stem_cand,derived_nouns,base_noun,_prefix,_stem,_suffix,_is_irregular
8388,22524.0,oblěkati,,v.tr. ipf.,"clothe, dress",,verb,?,?,vlěkti,oblěkati,oblěk,?lěkati?,,,ob,lěk,ati,0
8390,22520.0,oblěkti,,v.tr. pf.,"clothe, dress",,verb,?,?,vlěkti,oblěkti,oblěkti,?lěkti?,,,ob,lěk,ti,0
8768,16840.0,oděti,(oděne),v.tr. pf.,"clothe, dress",,verb,?,?,děti,oděti,od,děti,,,o,dě,ti,-1
8770,16838.0,oděvati,,v.tr. ipf.,"clothe, dress",,verb,?,?,oděvati,oděvati,oděv,?ěvati?,,,,od,ěvati,0
8777,5382.0,odgovarjati,,v.tr. ipf.,"answer, reply",,verb,?,?,odgovarjati,odgovarjati,odgovar,?govarjati?,,,,odgovar,jati,0
8778,34094.0,odgovarjati,,v.tr. ipf.,dissuade,,verb,?,?,odgovarjati,odgovarjati,odgovar,?govarjati?,,,,odgovar,jati,0
8780,3242.0,odgovoriti,,v.tr. pf.,"answer, reply",,verb,?,?,govoriti,odgovoriti,odgovor,govoriti,,,od,govꜵr,iti,0
8781,29087.0,odgovoriti,,v.tr. pf.,dissuade,,verb,?,?,govoriti,odgovoriti,odgovor,govoriti,,,od,govꜵr,iti,0
8816,2225.0,odkladati,,v.tr. ipf.,"suspend, postpone, delay",,verb,?,?,odkladati,odkladati,odklad,?kladati?,,,,odklad,ati,0
8817,5568.0,odkladati,,v.tr. ipf.,"put away, put off, put aside, put back",,verb,?,?,odkladati,odkladati,odklad,?kladati?,,,,odklad,ati,0


In [128]:
CONS = 'hńmzdjčåuđtkbćlsrcťšpŕžfnvg'


In [590]:
profiles = set()
L = 0

cnt1 = Counter()
for k, g in morphemes.groupby("en"):
    if k[0] != "#" and len(g) > 1 and len(g.base_verb.unique()) > 1:
         
        if "" in g.base_verb.unique():
            cur_profile = tuple(g.isv.unique())
        else:
            cur_profile = tuple(g.base_verb.unique())
        if cur_profile not in profiles:
            profiles.add(cur_profile)
            L += 1
            stem_cons = g._stem.apply(lambda x: ''.join(c for c in x if c in CONS))
            cnt1[(len(cur_profile), len(g._stem.unique()), len(stem_cons.unique()))] += 1
            if len(cur_profile) == 2 and len(g._stem.unique()) == 1:
                print(k, cur_profile)
            if len(cur_profile) == 2 and len(stem_cons.unique()) < 2:
                print("    ", k, cur_profile)
                if len(stem_cons) == 1:
                    pass
                    #print(g.isv.values)
                    #print(g.base_verb.values)


        #if L == 30:
        #    print(k, cur_profile)
# ['jęti' 'imati'] 'pustiti' 'pušćati'
# ['znati' 'poznavati']
# ['byti' 'byvati'] ['baviti' 'dati' 'davati' '†kladati' '†ložiti']
# ['dati' 'davati'] obraćati obråtiti
# ['staviti' 'stavjati'
# posȯvětovati sȯvětovati ['lětati' 'letěti'] ['postaviti' 'staviti']
 # ???  ['vnikati' 'vȯznikati']  ['prikladati' 'priložiti']
    # ['vezti' 'voziti'] ['pokladati' '†lagati' 'položiti']  

L


acquire ('byti', 'byvati')
     acquire ('byti', 'byvati')
appear (before a court) ('stati', 'stavati')
     appear (before a court) ('stati', 'stavati')
     approve, approbate, sanction ('dobrěti', 'odobriti')
     arrive (by flying) ('lětati', 'letěti')
avoid ('běgati', 'běgti')
     avoid ('běgati', 'běgti')
     be inclined, be prone to, yield to ('klanjati', 'kloniti')
become stronger ('krěpěti', 'krěpiti')
     become stronger ('krěpěti', 'krěpiti')
     beget, procreate ('raditi', 'roditi')
     behave like a fool ('blazniti', 'blaznovati')
     bestow ('dariti', 'darovati')
blow ('dųnųti', 'dųti')
     blow ('dųnųti', 'dųti')
     break ('lamati', 'lomiti')
     breathe in ('dȯhnųti', 'dyhati')
     bring away ('vezti', 'voziti')
chop up ('sěkati', 'sěkti')
     chop up ('sěkati', 'sěkti')
     climb ('laziti', 'lězti')
collapse, fall apart ('valiti', 'råzvaliti')
     collapse, fall apart ('valiti', 'råzvaliti')
crawl out ('pȯlzati', 'pȯlzti')
     crawl out ('pȯlzati', 'pȯlz

324

In [591]:
cnt1

Counter({(4, 4, 4): 9,
         (2, 3, 2): 23,
         (2, 2, 2): 180,
         (2, 1, 1): 24,
         (5, 5, 5): 3,
         (3, 3, 3): 31,
         (2, 2, 1): 21,
         (4, 3, 3): 2,
         (3, 4, 3): 3,
         (3, 4, 4): 2,
         (3, 2, 2): 9,
         (3, 3, 2): 9,
         (3, 4, 2): 1,
         (5, 7, 5): 1,
         (2, 3, 3): 4,
         (4, 4, 3): 1,
         (4, 5, 4): 1})

In [534]:
morphemes.query("base_verb == 'tkati'")

Unnamed: 0,id,isv,addition,partOfSpeech,en,genesis,pos,prefix,verb_stem,base_verb,...,left_stem_cand,right_stem_cand,derived_nouns,base_noun,_prefix,_stem,_suffix,_is_irregular,reconstructed,aspect
4309,26247.0,iztkati,,v.tr. pf.,weave,,verb,?,?,tkati,...,iztk,tkati,,,¬iz,tk,ati,0,(iz)+tъkati,pf
14133,34857.0,sȯtkati,,v.tr. pf.,weave,,verb,sȯ’,tkati,tkati,...,sȯtk,tkati,,,¬sȯ,tk,ati,0,(sȯ)+tъkati,pf
15231,34856.0,tkati,,v.tr. ipf.,weave,,verb,?,?,tkati,...,tk,tkati,,,,tk,ati,0,tъkati,ipf


In [115]:
len(morphemes.isv.unique()), len(morphemes.base_verb.unique())

(4163, 1308)

In [112]:
from collections import Counter

Counter(len(p) for p in profiles)

Counter({3: 63, 5: 3, 2: 262, 4: 13, 7: 1})

In [102]:
morphemes.query("en == 'advise, counsel'")

Unnamed: 0,id,isv,addition,partOfSpeech,en,genesis,pos,prefix,verb_stem,base_verb,...,left_stem_cand,right_stem_cand,derived_nouns,base_noun,_prefix,_stem,_suffix,_is_irregular,reconstructed,aspect
10905,6019.0,posȯvětovati,,v.tr. pf.,"advise, counsel",,verb,?,?,sȯvětovati,...,posȯvět,sȯvětovati,věće,,¬po¬sȯ,vět,ovati,0,,pf
14143,4475.0,sȯvětovati,,v.tr. ipf.,"advise, counsel",,verb,sȯ’,větovati,,...,sȯvět,?ětovati?,|sȯvětnik,sȯvět,,sȯvět,ovati,0,?uměti,ipf


In [None]:
['jęti' 'imati']

In [97]:
g

Unnamed: 0,id,isv,addition,partOfSpeech,en,genesis,pos,prefix,verb_stem,base_verb,...,left_stem_cand,right_stem_cand,derived_nouns,base_noun,_prefix,_stem,_suffix,_is_irregular,reconstructed,aspect
1983,21272.0,dobaviti,,v.tr. pf.,add,,verb,?,?,baviti,...,dobav,baviti,,,¬do,bav,iti,-1,(do)+baviti,pf
1984,21273.0,dobavjati,,v.tr. ipf.,add,,verb,?,?,baviti,...,dobav,?bavjati?,,,¬do,bav,jati,-1,?baviti,ipf
2027,311.0,dodati,(doda),v.tr. pf.,add,,verb,?,?,dati,...,dod,dati,,,¬do,da,ti,0,dodati,pf
2036,1704.0,dodavati,,v.tr. ipf.,add,,verb,?,?,davati,...,dodav,davati,,,¬do,dav,ati,-1,(do)+davati,ipf
2068,19676.0,dokladati,,v.tr. ipf.,add,,verb,?,?,†kladati,...,doklad,?kladati?,,,¬do,klad,ati,0,(do)+kladati,ipf
2117,20364.0,doložiti,,v.tr. pf.,add,,verb,?,?,†ložiti,...,dolož,?ložiti?,,,¬do,lož,iti,0,(do)+ložiti,pf


In [None]:
'pustiti' 'pušćati'


# g = morphemes.query("base_verb == 'očišćati'")
# manual_insert_g(g, "čist", last_cons={"t", "ć"}, dry_run=0)
# set_base_verb(g, 'čistiti')


In [87]:
EN_TMP = morphemes.en.value_counts().head(11).index

morphemes.query('en in @EN_TMP').sort_values(by="en")

Unnamed: 0,id,isv,addition,partOfSpeech,en,genesis,pos,prefix,verb_stem,base_verb,...,left_stem_cand,right_stem_cand,derived_nouns,base_noun,_prefix,_stem,_suffix,_is_irregular,reconstructed,aspect
1983,21272.0,dobaviti,,v.tr. pf.,add,,verb,?,?,baviti,...,dobav,baviti,,,¬do,bav,iti,-1,(do)+baviti,pf
1984,21273.0,dobavjati,,v.tr. ipf.,add,,verb,?,?,baviti,...,dobav,?bavjati?,,,¬do,bav,jati,-1,?baviti,ipf
2027,311.0,dodati,(doda),v.tr. pf.,add,,verb,?,?,dati,...,dod,dati,,,¬do,da,ti,0,dodati,pf
2036,1704.0,dodavati,,v.tr. ipf.,add,,verb,?,?,davati,...,dodav,davati,,,¬do,dav,ati,-1,(do)+davati,ipf
2068,19676.0,dokladati,,v.tr. ipf.,add,,verb,?,?,†kladati,...,doklad,?kladati?,,,¬do,klad,ati,0,(do)+kladati,ipf
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3454,17458.0,hlipnųti,,v.intr. pf.,sob,,verb,?,?,hlipati,...,hlip,hlipnųti,,,,hlip,nųti,0,,pf
3453,17457.0,hlipati,,v.intr. ipf.,sob,,verb,?,?,hlipati,...,hlip,hlipati,,,,hlip,ati,0,,ipf
3110,17418.0,gȯltati sȯlzy,,v.ipf.,sob,,verb,?,?,gȯltati,...,gȯltati sȯlzy,?gȯltati sȯlzy?,,,,gȯlt,ati,0,,ipf
13308,18676.0,rydati,,v.intr. ipf.,sob,,verb,?,?,rydati,...,ryd,rydati,,,,ryd,ati,0,rydati,ipf


## ANOMALY 4: empty


In [543]:
morphemes[morphemes.base_verb == ""]

Unnamed: 0,id,isv,addition,partOfSpeech,en,genesis,pos,prefix,verb_stem,base_verb,...,left_stem_cand,right_stem_cand,derived_nouns,base_noun,_prefix,_stem,_suffix,_is_irregular,reconstructed,aspect
1185,22979.0,bųde/bųdųt,,v.aux. ipf.,there will be,,verb,?,?,,...,bųde/bųdųt,?bųde/bųdųt?,,,,,,0,?uměti,ipf
1710,25144.0,debatovati,,v.intr. ipf.,debate,I,verb,’,debatovati,,...,debat,?batovati?,,debata,,debat,ovati,0,?uměti,ipf
1716,25152.0,debjutovati,,v.intr. ipf./pf.,make one’s debut,F,verb,’,debjutovati,,...,debjut,?bjutovati?,,debjut,,debjut,ovati,0,?uměti,ipf/pf
1735,15280.0,defisovati,,v.tr. ipf.,hyphenate,I,verb,’,defisovati,,...,defis,?fisovati?,,defis,,defis,ovati,0,?uměti,ipf
2085,35297.0,dokumentovati,,v.tr. ipf./pf.,document,I,verb,?,?,,...,dokument,?kumentovati?,|dokumentacija,dokument,,dokument,ovati,0,?uměti,ipf/pf
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17164,35915.0,zagospodariti,,v.tr. pf.,"gain authority over, begin to rule",,verb,?,?,,...,zagospodar,?gospodariti?,,gospodaŕ,¬za,gospodar,iti,0,?uměti,pf
17231,31509.0,zakliniti,,v.tr. pf.,wedge,,verb,?,?,,...,zaklin,?kliniti?,,klin,¬za,klin,iti,0,?uměti,pf
17232,33665.0,zaklinovati,,v.tr. ipf.,wedge,,verb,?,?,,...,zaklin,?klinovati?,,klin,¬za,klin,ovati,0,?uměti,ipf
17537,14193.0,zautrakati,,v.intr. ipf.,"eat breakfast, have breakfrast",,verb,?,?,,...,zautrak,?trakati?,,zautraka,,zautrak,ati,0,?uměti,ipf


## ANOMALY 5: pf/ipf counts


In [51]:
morphemes.partOfSpeech.value_counts()

v.tr. ipf.          1792
v.tr. pf.           1499
v.intr. ipf.         574
v.refl. ipf.         328
v.intr. pf.          319
v.refl. pf.          263
v.tr. ipf./pf.        89
v.ipf.                56
v.pf.                 23
v.intr. ipf./pf.      16
v.aux. ipf.           12
v.refl. ipf./pf.       3
#v.tr. ipf.            3
#v.intr. ipf.          2
v.aux. pf.             2
#v.tr.ipf              1
#v.intr. pf.           1
#v.tr. pf.             1
v.tr.ipf.              1
Name: partOfSpeech, dtype: int64

In [56]:
morphemes.query("not isv_orig.str.contains(' sę') and partOfSpeech.str.contains('refl')").isv_orig.unique()

array(['prědstaviti sobě', 'prědstavjati sobě'], dtype=object)

In [62]:
morphemes['aspect'] = morphemes.partOfSpeech.apply(
    lambda x: (
        "ipf/pf" if "ipf./pf." in x else (
            "ipf" if "ipf" in x else (
                "pf" if "pf" in x else "?"
            )
        )
    )
)


In [544]:
morphemes['aspect'].value_counts()

ipf       2769
pf        2108
ipf/pf     108
Name: aspect, dtype: int64

In [80]:
i = 0
for k, g in morphemes.groupby("base_verb"):
    if k == '' or len(g) < 3:
        continue
    for k1, g1 in g.groupby("_prefix"):
        aspects_dict = g1['aspect'].value_counts().to_dict()
        if not (aspects_dict in [{'pf': 1, 'ipf': 1}, {'pf': 1}]):
            if 'pf' not in aspects_dict and k1 == '' and aspects_dict['ipf'] == len(g1.en.unique()):
                continue
            if aspects_dict.get('pf') == aspects_dict.get('ipf') == len(g1.en.str.replace("#", "").unique()):
                continue
            if 'ipf' not in aspects_dict and aspects_dict.get('pf') == len(g1.en.unique()):
                continue
            if 'ipf' not in aspects_dict and aspects_dict.get('pf') == len(g1.partOfSpeech.unique()):
                print("!!!")
                print(aspects_dict)
                print(g1[['isv', 'partOfSpeech', 'en', '_prefix', '_stem',]])
                print("!!!")
                continue
            i += 1
            print(k1)
            print(aspects_dict)
            print(g1[['isv', 'partOfSpeech', 'en', '_prefix', '_stem',]])

    #print(k)
    #print(g['aspect'].value_counts())
    if i == 15: break
    

¬de
{'ipf/pf': 1}
              isv      partOfSpeech       en _prefix _stem
1717  deblokovati  v.intr. ipf./pf.  unblock     ¬de  blok
¬za
{'pf': 2, 'ipf': 1}
              isv  partOfSpeech                      en _prefix _stem
17092    zabolěti   v.intr. pf.      fall ill, get sick     ¬za   bol
17093    zabolěti   v.intr. pf.  hurt, ache, be painful     ¬za   bol
17096  zabolěvati  v.intr. ipf.      fall ill, get sick     ¬za   bol
¬s
{'ipf': 2}
           isv  partOfSpeech                                  en _prefix _stem
13413  sbirati  v.refl. ipf.                             prepare      ¬s   bir
13414  sbirati    v.tr. ipf.  gather, assemble, collect, pick up      ¬s   bir

{'ipf': 2}
        isv  partOfSpeech     en _prefix _stem
1143  briti  v.refl. ipf.  shave            br
1144  briti    v.tr. ipf.  shave            br
!!!
{'pf': 2}
         isv partOfSpeech     en _prefix _stem
8540  obriti  v.refl. pf.  shave     ¬ob     r
8541  obriti    v.tr. pf.  shave     ¬ob     r
!

KeyError: 'ipf'

## ANOMALY 6: rare prefixes


In [None]:

morphemes.query("isv.str.contains('blågo')").isv.str.partition('blågo')[2].apply(lambda x: x + str(x in morphemes.isv.unique() or x in morphemes.base_verb.unique()))


In [899]:
morphemes.query("isv.str.contains('blågo')").isv.str.partition('blågo')[2].apply(lambda x: x + str(x in morphemes.isv.unique() or x in morphemes.base_verb.unique()))


887         daritiTrue
899      slavjatiFalse
900        slovitiTrue
907         volětiTrue
912         želatiTrue
10292       daritiTrue
10293       želatiTrue
Name: 2, dtype: object

In [897]:
#morphemes.loc[12828]

## stuff

In [82]:

from isv_nlp_utils import constants
from isv_nlp_utils.slovnik import get_slovnik
# from isv_translate import translate_sentence, postprocess_translation_details, prepare_parsing

from ast import literal_eval
import os
import glob


In [None]:
from isv_nlp_utils.flavorization.tokenizer import extract_stem_prefix_suffix

slovnik = get_slovnik()
slovnik = slovnik['words']

morph = constants.create_etm_analyzer(r"C:\dev\ISV_data_gathering\\")

isv_dict = morph._units[0][0].dict


In [None]:
anomalies = []
i = 0

cnt = Counter()
cnt2 = Counter()
mbs = []

for k, g in partial_verb_prefixes.groupby('left_stem_cand'):
    if g.base_verb.apply(len).min() > 0:
        continue
    cnt[len(g.isv.unique())] += 1
    if g.base_verb.apply(len).max() > 0:
        cnt2[len(g.isv.unique())] += 1
        mbs.append((k, g.isv_orig.values, g.base_verb.values))

    if len(k) > 1 and len(g.isv.unique()) == 2:
        i += 1
        aspect_stats = {}
        for aspect in ['ipf.', 'pf.']:
            # print(aspect, len(g[g.partOfSpeech.str.contains(" " + aspect)]))
            aspect_stats[aspect] = list(g[g.partOfSpeech.str.contains("[ .]" + aspect)].isv.values)
        if any(len(x) == 0 for x in aspect_stats.values()):
            anomalies.append((aspect_stats, g.partOfSpeech.values.tolist(), g.en.values.tolist(), g.ru.values.tolist(), g.pl.values.tolist()))
            #print(aspect_stats)
            #print(g.en.values.tolist())
            #print(g.ru.values.tolist())
        else:
            bited, prefices = zip(
                *g.isv.apply(lambda x: bite_all_prefixes_off(x, "")).values.tolist()
            )
            if set(prefices) == {'’'}:
                base_verb = g.loc[g.isv.apply(len).idxmin()].isv
                print(g.isv.values.tolist(), "->", base_verb)
                print(g.en.unique())
                print()
                write_base_verb(g.index, base_verb)
            else:
                stems = g.isv.apply(
                    lambda x: bite_all_prefixes_off(x, "")[0]
                    # lambda x: x[len(bite_all_prefixes_off(x, "").replace('’', '')):]
                )
                for verb_cand in sorted(stems.values, key=len):
                    if morph.word_is_known(verb_cand):
                        if {verb_cand} == {v.normal_form for v in morph.parse(verb_cand)}:
                            base_verb_idx = partial_verb_prefixes.query("isv == @verb_cand").index[0]
                            add_index = g.index.tolist() + [base_verb_idx]
                            print(g.isv.values.tolist(), "+", verb_cand, "->", verb_cand)
                            print(g.en.unique())
                            print(partial_verb_prefixes.loc[base_verb_idx, 'en'])
                            print()
                            write_base_verb(add_index, verb_cand)
                            break
                else:
                    base_verb = g.loc[g.isv.apply(len).idxmin()].isv
                    print(g.isv.values.tolist(), "->", "???", base_verb)
                    write_base_verb(g.index, base_verb)
                
len(anomalies)

In [None]:
r, p = bite_all_prefixes_off(x, '')
p2 = p.replace('’', "")
print(x, p, r, morph.word_is_known(r), p2)
for len_cand in range(1, len(p2)):
    word_cand = p2[-len_cand:] + r
    print(len_cand, word_cand, morph.word_is_known(word_cand))
    if morph.word_is_known(word_cand):
        n += 1
        print(len_cand, word_cand, morph.word_is_known(word_cand))


In [None]:
from isv_nlp_utils.slovnik import infer_pos
KNOWN_NOUNS = slovnik[slovnik.partOfSpeech.apply(infer_pos) == 'noun'].isv.unique()

In [86]:
#partial_verb_prefixes.loc[1875, "base_verb"] = 'infikovati'
morphemes.loc[1875, "base_verb"]


'dezinfikovati'

In [None]:
def add_derived_nouns(ending, replacement=''):
    tmp_df = slovnik[slovnik.partOfSpeech.apply(infer_pos) == 'noun'].copy()
    tmp_df['repl'] = tmp_df.isv.str.replace(ending + "$", replacement)
    tmp_df = tmp_df.query(
        "isv not in @BEG and repl in @BEG"
    ).copy()
    for i, row in tmp_df.iterrows():
        repl = row.repl
        matches = partial_verb_prefixes.query("left_stem_cand == @repl")

        partial_verb_prefixes.loc[matches.index, "derived_nouns"] = partial_verb_prefixes.loc[matches.index, "derived_nouns"] + "|" + row.isv

    return tmp_df.isv.values.tolist()


In [None]:

result = []

result += add_derived_nouns("ťje")
result += add_derived_nouns("je")
result += add_derived_nouns("ńje")
result += add_derived_nouns("ťje")
result += add_derived_nouns("ŕje")


for letter, letter_repl in ['ďd', 'ľl', 'ńn', 'ŕr', 'śs', 'ťt', 'źz']:
    result += add_derived_nouns(letter + "je", letter_repl)
    print(letter, len(result))

result = []
for letter, letter_repl in ['čk', 'šh']:
    result += add_derived_nouns(letter + "je", letter_repl)
    print(letter, len(result))

result += add_derived_nouns('nik')
print(len(result))
result += add_derived_nouns('nica')
print(len(result))

result += add_derived_nouns('ec')
print(len(result))
result += add_derived_nouns('ica')
print(len(result))

result += add_derived_nouns('stvo')
print(len(result))
result += add_derived_nouns('išče')
print(len(result))
result += add_derived_nouns('osť')
print(len(result))
result += add_derived_nouns('ač')
print(len(result))
result += add_derived_nouns('aŕ')
print(len(result))


result

## end stuff

In [None]:
# is_known = partial_verb_prefixes.query("base_verb == '' and not isv.str.contains(' ') and left_stem_cand != isv").left_stem_cand.apply(
# is_known = partial_verb_prefixes.query("not isv.str.contains(' ') and left_stem_cand != isv and (base_verb == '' or base_verb == 'isv')").left_stem_cand.apply(

is_known = partial_verb_prefixes.query("not isv.str.contains(' ') and left_stem_cand != isv and (base_verb == '' or base_verb == isv)").left_stem_cand.apply(
    lambda stem: any(
        (morph.word_is_known(x) and any(v.tag.POS == "NOUN" for v in morph.parse(x)))
        for x in [stem, stem + "a"]
    )
)

tmp = is_known[is_known]

partial_verb_prefixes.loc[tmp.index]

In [None]:
for i, row in partial_verb_prefixes.loc[tmp.index].iterrows():
    stem = row.left_stem_cand
    base_noun_cands = []
    for x in [stem, stem + "a"]:
        variants = morph.parse(x)
        variants = [v for v in variants if v.tag.POS == "NOUN"]
        variants = [v for v in variants if v.normal_form == v.word and morph.word_is_known(v.normal_form)]
        if variants:
            variants = set([("!" if v.word != x else "") + v.word for v in variants])
            base_noun_cands += list(variants)
    if base_noun_cands:
        print(row.isv, row.base_verb, stem, base_noun_cands)
        partial_verb_prefixes.loc[i, "base_noun"] = "|".join(base_noun_cands)


## functions

In [None]:


def bite_all_prefixes_off(word, verb_nest):

    can_continue = True
    prefixes = []

    while can_continue:
        can_continue = False
        for pref in possible_prefixes:
            if word.startswith(pref):
                # print(pref, word, word[len(pref):])
                new_word = word[len(pref):]
                if new_word.endswith(verb_nest):
                    word = new_word
                    prefixes.append(pref)
                    can_continue = True
                    break
    return word, "’".join(prefixes) + "’"


def bite_all_suffixes_off(word, verb_nest):

    can_continue = True
    suffixes = []

    while can_continue:
        can_continue = False
        for pref in possible_suffixes:
            if word.endswith(pref):
                new_word = word[:-len(pref)]
                if new_word.startswith(verb_nest):
                    word = new_word
                    suffixes.append(pref)
                    can_continue = True
                    break
    return word, "+".join(reversed(suffixes)) + "+"



In [397]:
BASE = "slěditi"

def check_if_orphan(BASE):

    g = morphemes.query(" base_verb == @BASE")

    base_variants = set((g['_stem'] + g['_suffix']).unique())
    variants = set(base_variants)
    for (src, dst) in [
        ("t" + YI, "ć"), ("sk" + YI, 'šć'), ("st" + YI, 'šć'),
        ("d" + YI, "đ"), ("zd" + YI, "žđ"),
        ("k" + YI, "č"), ("c" + YI, "č"), ("sk" + YI, "šč"),
        ("g" + YI, "ž"), ("z" + YI, "ž"), 
        ("s" + YI, "š"), ("h" + YI, "š"), 
        (AO, "a"), (AO, "o"), (AO, "å"), 
        ("in" + NASAL, "ę"),
        ("tt", "t"),
        ("bv", "v"),
        ("jdti", "idti"), ("jmati", "imati")
        # žđ  "h" + YI ?
    ]:
        variants |= {v.replace(src, dst) for v in base_variants}
    return BASE not in variants


In [289]:
# morphemes.query(" isv.str.contains('slě')")

In [184]:
morphemes.query("(_suffix in @sWTF or _suffix == '')  and _is_irregular == 0 and not isv.str.contains('[/,]')")

Unnamed: 0,id,isv,addition,partOfSpeech,en,genesis,pos,prefix,verb_stem,base_verb,isv_orig,left_stem_cand,right_stem_cand,derived_nouns,base_noun,_prefix,_stem,_suffix,_is_irregular
4336,763.0,izvaljnjati,,v.tr. ipf.,"fire (terminate employment), sack, discharge, ...",,verb,?,?,voliti,izvaljnjati,izvaljn,?aljnjati?,,,iz,vꜵl,jnjati,0
4359,22512.0,izvlastniti,,v.tr. pf.,expropriate,,verb,?,?,vlastniti,izvlastniti,izvlastn,vlastniti,,,iz,vlast,niti,0
4360,22511.0,izvlastnjati,,v.tr. ipf.,expropriate,,verb,?,?,vlastniti,izvlastnjati,izvlastn,?lastnjati?,,,iz,vlast,njati,0
4377,2956.0,izvoljniti,,v.tr. pf.,"fire (terminate employment), sack, discharge, ...",,verb,?,?,voliti,izvoljniti,izvoljn,?ljniti?,,,iz,vꜵl,jniti,0
9102,22538.0,odvlåčivati,,v.tr. ipf.,drag away,,verb,?,?,vlěkti,odvlåčivati,odvlåč,?låčivati?,,,odv,låk,⒥ivati,0
9103,23884.0,odvlåčivati,,v.tr. ipf.,distract,,verb,?,?,vlěkti,odvlåčivati,odvlåč,?låčivati?,,,odv,låk,⒥ivati,0
10897,17010.0,posluživati,(+5),v.refl. ipf.,"use, make use of",,verb,?,?,služiti,posluživati sę,posluž,?luživati?,,,pos,lug,⒥ivati,0
11065,22659.0,povŕtati,,v.tr. ipf.,turn (tr.),,verb,?,?,vŕnųti,povŕtati,povŕt,?ŕtati?,,,pov,ŕ,tati,0
11910,23175.0,privlåčivati,,v.tr. ipf.,attract,,verb,?,?,vlěkti,privlåčivati,privlåč,?låčivati?,,,priv,låk,⒥ivati,0
11911,22501.0,privlastniti sobě,,v.tr. pf.,"appropriate, arrogate",,verb,?,?,vlastniti,privlastniti sobě,privlastniti sobě,?lastniti sobě?,,,pri,vlast,niti,0


In [67]:
morphemes.loc[
    morphemes.query("en.str.contains('testif')").index,
    "base_verb"
]

In [706]:
morphemes[morphemes.isv != morphemes._prefix + morphemes._stem + morphemes._suffix].query(
    "not _stem.str.contains('ꜵ') and not _suffix.str.contains('[⒥⒩]') and not isv.str.contains('[ \/]')"
)

Unnamed: 0,id,isv,addition,partOfSpeech,en,genesis,pos,prefix,verb_stem,base_verb,isv_orig,left_stem_cand,right_stem_cand,derived_nouns,base_noun,_prefix,_stem,_suffix,_is_irregular
2179,7211.0,dorastati,,v.intr. ipf.,"grow up, become an adult",,verb,?,?,råsti,dorastati,dorast,?rastati?,,,do,råst,ati,0
2180,16416.0,doråsti,(doråste),v.intr. pf.,"grow up, become an adult",,verb,?,?,råsti,doråsti,doråsti,råsti,,,do,råst,ti,0
3640,2104.0,idti,(ide; šel),v.intr. ipf.,go,,verb,?,?,idti,idti,idti,idti,,,,jd,ti,0
3685,3325.0,imati,,v.ipf.,deal with,,verb,?,?,imati,iměti dělo s,iměti dělo s,?iměti dělo s?,,,,jm,ati,0
3686,36890.0,imati,,v.tr. ipf.,"mean, have in mind",,verb,?,?,imati,iměti na myslji,iměti na myslji,?iměti na myslji?,,,,jm,ati,0
3687,36928.0,imati,,v.tr. ipf.,"mean, have in mind",,verb,?,?,imati,iměti na umu,iměti na umu,?iměti na umu?,,,,jm,ati,0
3688,17485.0,imati,,v.ipf.,hope,,verb,?,?,imati,iměti nadějų,iměti nadějų,?iměti nadějų?,,,,jm,ati,0
3690,876.0,imati,,v.ipf.,be right,,verb,?,?,imati,iměti pravdų,iměti pravdų,?iměti pravdų?,,,,jm,ati,0
3691,36402.0,imati,,v.ipf.,make sense,,verb,?,?,imati,iměti smysl,iměti smysl,?iměti smysl?,,,,jm,ati,0
3692,23339.0,imati,,v.intr. ipf.,matter,,verb,?,?,imati,iměti važnosť,iměti važnosť,?iměti važnosť?,,,,jm,ati,0


In [709]:
!ls -lh | grep wik

-rw-r--r-- 1 79165 197609  230K Mar 26 00:41 data gathering-wiktionary_new.ipynb
-rw-r--r-- 1 79165 197609  7.9M Nov 11 20:36 isv_wiki.pkl
-rw-r--r-- 1 79165 197609   33K Nov 11 23:16 isv_wiki_crawling.ipynb
-rw-r--r-- 1 79165 197609  412K May  6 00:00 morphemes_plus_wiktionary.ipynb
drwxr-xr-x 1 79165 197609     0 Nov 11 23:16 wiki
-rw-r--r-- 1 79165 197609  1.5M Nov  8 00:48 wiktionary.json
-rw-r--r-- 1 79165 197609  342K Nov  8 00:49 wiktionary.zip
-rw-r--r-- 1 79165 197609   16M Nov 30 04:20 wiktionary_extended.json
-rw-r--r-- 1 79165 197609  7.5M Mar 25 04:16 wiktionary_extended2.json
-rw-r--r-- 1 79165 197609   12M Mar 26 00:31 wiktionary_extended_new.json


In [725]:
from isv_nlp_utils.flavorization.replacer import VOWELS
from isv_nlp_utils.flavorization.parsing import parse_multireplacer_rules
from isv_nlp_utils.flavorization.tokenizer import compute_annotated_tokens, pretty_stringify
from isv_nlp_utils.flavorization.replacer import process_multireplacing, morphological_flavorise
from isv_nlp_utils.flavorization.selector import produce_string, filter_good_spellings, filter_lingua, init_detector, init_hunspell
from isv_nlp_utils.flavorization.tokenizer import tokens_to_exhaustive_string_list

In [728]:
from razdel import tokenize
from isv_nlp_utils.flavorization.tokenizer import ParseVariant, AnnotatedToken

In [726]:
rules_struct, declared_constants = parse_multireplacer_rules(
        r"C:\dev\razumlivost\src\flavorizers\slow\protoslavic.ts"
)

def f(word, rules_struct, declared_constants, pos_tag):
    cap = False
    space_after = ""
    
    slovnik_pos = ""
    isv_lemma = None
    variants = [ParseVariant(
                    [word],
                    pos_tag, slovnik_pos, isv_lemma,
                    None, "", 
                    False
                )]

    ann_token = AnnotatedToken(
        variants,
        cap, space_after,
    )
    
    tokens_base = [ann_token]
    # tokens = morphological_flavorise(tokens_base, morph, flavor_rules[LANG])
    tokens = process_multireplacing(tokens_base, rules_struct, declared_constants)
    return tokens

In [729]:
word = 'bolь'


pretty_stringify(
    f(word, rules_struct, declared_constants, {""})
)

'bolj'

In [None]:
all_reconstructions = set()
reconstructed_articles = {}

for k, v in words_data.items():
    if v['Related_Slavic']:
        all_reconstructions |= set([x[1][1:] for x in v['Related_Slavic']])
    all_reconstructions.add(v['*'])
    reconstructed_articles[v['*']] = v

len(all_reconstructions), len(reconstructed_articles)

In [834]:
pra_verbs = {}
pra_forms = {}

all_reconstructions = set()

for k, v in words_data.items():
    if v['Related_Slavic']:
        all_reconstructions |= set([x[1][1:] for x in v['Related_Slavic']])
    all_reconstructions.add(v['*'])

for word in all_reconstructions:
    if word.endswith('ti') or word.endswith('ťi'):

        evolved = pretty_stringify(
            f(word, rules_struct, declared_constants, {""})
        )
        if "|" in evolved:
            variants = evolved[1:-1].split("|")
            variants = [vr for vr in variants if vr in morphemes.isv.unique()]
            if len(variants) == 1:
                pra_verbs[word] = variants[0]
                pra_forms[variants[0]] = word
            else:
                print(word, evolved, variants)
        else:
            pra_verbs[word] = evolved
            pra_forms[evolved] = word

o(b)klasti [obklasti|oklasti] []
postiťi [postikti|postigti] []
pľuti [pljuti|pjuti] []
-ťi [-gti|-kti] []
bľukati [bljukati|bjukati] []
orzkošiti [razkošiti|råzkošiti|rozkošiti] []
sęťi [sękti|sęgti] []
orzьniti [roźniti|raźniti|råźniti] []
orzduxati [rozduhati|razduhati|råzduhati] []
povelťi [povlěgti|povlěkti] []
(s)tъrčati [strčati|trčati] []
orsěsti [råsěsti|rasěsti|rosěsti] []
orzžaliti [razžaliti|råzžaliti|rozžaliti] []
pręťi [prękti|pręgti] []
poteťi [potegti|potekti] []
ortiti [ratiti|råtiti|rotiti] []
žegťi [žeggti|žegkti] []
orsuti [råsuti|rosuti|rasuti] []
o(b)teťi [otekti|otegti|obtekti|obtegti] []
skobľati [skobljati|skobjati] []
dostiťi [dostigti|dostikti] []
napeťi [napegti|napekti] []
bel(e)niti [beleniti|blěniti] []
jьzreťi [izregti|izrekti] []
sъpeťi [spekti|spegti] []
ortati [råtati|rotati|ratati] []
orzuzdati [razuzdati|råzuzdati|rozuzdati] []
tęťi [tęgti|tękti] []
orzvětriti [rozvětriti|razvětriti|råzvětriti] []
stiťi [stikti|stigti] []
o(b)kladati [okladati|obkla

In [835]:
len(pra_forms)

2845

In [739]:
# pra_verbs

In [836]:
"" in pra_forms

False

In [907]:
morphemes['reconstructed'] = ""

In [908]:
from collections import Counter

cnt = Counter()

unk = []
dfs = []
for nest in morphemes.base_verb.unique():
    g = morphemes.query('base_verb == @nest')
    if nest.strip("†") in pra_forms and nest not in morphemes.isv.unique():
        print("†?", nest, pra_forms[nest.strip("†")])
        morphemes.loc[g.index, 'reconstructed'] = "?" + pra_forms[nest.strip("†")]
        continue

    g2 = g.isv.apply(lambda x: pra_forms.get(x, float("nan")))
    morphemes.loc[g.index, 'reconstructed'] = g2.values
    cnt[len(g2.dropna().unique())] += 1
    if len(g2.dropna().unique()) == 0:
        unk.append(nest)
        continue
    if len(g2.dropna()) != len(g2):
        if len(g2.dropna().unique()) > 1:
            dfs.append(g)
        selected = min(g2.dropna().unique(), key=len)
        g3 = morphemes.loc[g.index].query("isv not in @pra_forms")
        morphemes.loc[g3.index, 'reconstructed'] = "?" + selected


cnt

†? sloviti sloviti
†? †kladati kladati
†? †ložiti ložiti
†? voliti voliti
†? †gynųti gynǫti
†? †městiti městiti
†? mrěti merti
†? †niknųti niknǫti
†? obraziti obraziti
†? †plåšiti polšiti
†? pęti pęti
†? poriti poriti
†? světiti světiti
†? viniti viniti
†? †vysiti vysiti
†? †spěti spěti
†? †strěti sterti
†? †trčati tъrčati
†? †pitati pitati


Counter({0: 776, 1: 400, 3: 17, 2: 78, 6: 4, 27: 1, 4: 8, 5: 2, 8: 1, 7: 1})

In [854]:
g

Unnamed: 0,id,isv,addition,partOfSpeech,en,genesis,pos,prefix,verb_stem,base_verb,isv_orig,left_stem_cand,right_stem_cand,derived_nouns,base_noun,_prefix,_stem,_suffix,_is_irregular,reconstructed
17905,19473.0,žužati,,v.intr. ipf.,buzz,,verb,?,?,žužati,žužati,žuž,žužati,,,,žuž,ati,0,žužati


In [843]:
cnt

Counter({0: 779, 1: 401, 3: 17, 2: 78, 6: 4, 27: 1, 4: 8, 5: 2, 8: 1, 7: 1})

'None'

In [794]:
g2

17905    None
Name: isv, dtype: object

In [909]:
t1 = morphemes.dropna(subset=['reconstructed']).query('reconstructed.str.contains("\?")')

t1

Unnamed: 0,id,isv,addition,partOfSpeech,en,genesis,pos,prefix,verb_stem,base_verb,isv_orig,left_stem_cand,right_stem_cand,derived_nouns,base_noun,_prefix,_stem,_suffix,_is_irregular,reconstructed
887,4889.0,blågodariti,,v.tr. ipf.,thank,,verb,?,?,dariti,blågodariti,blågodar,blågodariti,,,blågo,dar,iti,0,?dariti
899,5574.0,blågoslavjati,,v.tr. ipf.,"bless, beatify",,verb,?,?,sloviti,blågoslavjati,blågoslav,blågoslavjati,,,blågo,slav,jati,0,?sloviti
900,5573.0,blågosloviti,,v.tr. pf.,"bless, beatify",,verb,?,?,sloviti,blågosloviti,blågosl,blågosloviti,,,blågo,slov,iti,0,?sloviti
912,6050.0,blågoželati,(+3),v.intr. ipf.,congratulate,,verb,?,?,želati,blågoželati,blågožel,blågoželati,,,blågo,žel,ati,0,?želati
988,36422.0,bodnųti,(bode),v.refl. pf.,lock horns,,verb,?,?,bodati,bodnųti sę,bod,bodnųti,|bodec,,,bod,nųti,0,?bodati
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17581,32633.0,zažigati,,v.tr. ipf.,"ignite, light, kindle, inflame",,verb,?,?,žegti,zažigati,zažig,?žigati?,,,za,žig,ati,0,?žeťi
17582,32636.0,zažigati,,v.tr. ipf.,"switch on (the lights), turn on (the light)",,verb,?,?,žegti,zažigati,zažig,?žigati?,,,za,žig,ati,0,?žeťi
17583,32638.0,zažigati,,v.tr. ipf.,set fire to,,verb,?,?,žegti,zažigati,zažig,?žigati?,,,za,žig,ati,0,?žeťi
17624,23718.0,zěvnųti,,v.intr. pf.,yawn,,verb,?,?,zěvati,zěvnųti,zěv,zěvnųti,,,,zěv,nųti,-1,?zěvati


In [910]:
morphemes.loc[
    t1[(t1._prefix + t1.reconstructed.str.replace("?", "").apply(pra_verbs.get) == t1.isv)].index,
    "reconstructed"
] = "(" + t1._prefix + ")+" + t1.reconstructed.str.replace("?", "")


  ] = "(" + t1._prefix + ")+" + t1.reconstructed.str.replace("?", "")
  t1[(t1._prefix + t1.reconstructed.str.replace("?", "").apply(pra_verbs.get) == t1.isv)].index,
  morphemes.loc[


In [911]:
morphemes.query("reconstructed != reconstructed and genesis != genesis").tail(55)

Unnamed: 0,id,isv,addition,partOfSpeech,en,genesis,pos,prefix,verb_stem,base_verb,isv_orig,left_stem_cand,right_stem_cand,derived_nouns,base_noun,_prefix,_stem,_suffix,_is_irregular,reconstructed
17464,23842.0,zasramjati,,v.tr. ipf.,embarrass,,verb,?,?,sråmiti,zasramjati,zasram,?ramjati?,,,za,sråm,jati,0,
17466,1932.0,zasrati,,v.intr. pf.,shit,,verb,?,?,srati,zasrati,zasr,srati,,,za,sr,ati,0,
17468,4112.0,zastariti,,v.intr. pf.,age,,verb,?,?,zastariti,zastariti,zastar,?tariti?,,,,zastar,iti,0,
17469,4111.0,zastarjati,,v.intr. ipf.,age,,verb,?,?,zastariti,zastarjati,zastar,?tarjati?,,,,zastar,jati,0,
17478,21834.0,zastavjati,,v.tr. ipf.,"block, obstruct, cram",,verb,?,?,stavjati,zastavjati,zastav,stavjati,,,za,st,avjati,0,
17479,21840.0,zastavjati,,v.tr. ipf.,"pawn, hock",,verb,?,?,stavjati,zastavjati,zastav,stavjati,,,za,st,avjati,0,
17483,19418.0,zastrašati,,v.tr. ipf.,intimidate,,verb,?,?,zastrašati,zastrašati,zastraš,?trašati?,,,,zastraš,ati,0,
17484,19419.0,zastrašiti,,v.tr. pf.,intimidate,,verb,?,?,zastrašati,zastrašiti,zastraš,strašiti,,,,zastraš,iti,0,
17485,3268.0,zastrěliti,,v.tr. pf.,"fire, shoot",,verb,?,?,zastrěliti,zastrěliti,zastrěl,strěliti,,,,zastrěl,iti,0,
17486,5113.0,zastrěljati,,v.tr. ipf.,"fire, shoot",,verb,?,?,zastrěliti,zastrěljati,zastrěl,strěljati,,,,zastrěl,jati,0,


In [859]:
t1[(t1._prefix + t1.reconstructed.str.replace("?", "").apply(pra_verbs.get) != t1.isv)].query('not _suffix.str.contains("nųti")')

  t1[(t1._prefix + t1.reconstructed.str.replace("?", "").apply(pra_verbs.get) != t1.isv)].query('not _suffix.str.contains("nųti")')


Unnamed: 0,id,isv,addition,partOfSpeech,en,genesis,pos,prefix,verb_stem,base_verb,isv_orig,left_stem_cand,right_stem_cand,derived_nouns,base_noun,_prefix,_stem,_suffix,_is_irregular,reconstructed
1185,22979.0,bųde/bųdųt,,v.aux. ipf.,there will be,,verb,?,?,,bųde/bųdųt,bųde/bųdųt,?bųde/bųdųt?,,,,,,0,?uměti
1189,34791.0,buditi sę,,v.tr. ipf.,wake up,,verb,?,?,buditi,buditi sę,bud,?buditi sę?,|budka,,,bud,iti,0,?buditi
1246,35085.0,byti dȯlžen,,v.aux. ipf.,"must, have to, ought to, should, be obliged to...",,verb,?,?,byti,byti dȯlžen,byti dȯlžen,?byti dȯlžen?,,,,by,ti,0,?byti
1248,10343.0,byti ostråžny,,v.ipf.,"look out, watch out",,verb,?,?,byti,byti ostråžny,byti ostråžny,?byti ostråžny?,,,,by,ti,0,?byti
1249,17182.0,byti podobny,,v.ipf.,"look alike, resemble",,verb,?,?,byti,byti podobny,byti podobny,?byti podobny?,,,,by,ti,0,?byti
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17551,36089.0,zaviděti,(zavidi),v.intr. ipf.,envy,,verb,?,?,viděti,zaviděti,zavid,viděti,,,za,vid,ěti,0,?nenaviděti
17553,20321.0,zavisěti od,(zavisi),v.intr. ipf.,depend on,,verb,?,?,visěti,zavisěti od,zavisěti od,?isěti od?,,,za,vis,ěti,0,?visěti
17581,32633.0,zažigati,,v.tr. ipf.,"ignite, light, kindle, inflame",,verb,?,?,žegti,zažigati,zažig,?žigati?,,,za,žig,ati,0,?žeťi
17582,32636.0,zažigati,,v.tr. ipf.,"switch on (the lights), turn on (the light)",,verb,?,?,žegti,zažigati,zažig,?žigati?,,,za,žig,ati,0,?žeťi


In [860]:
morphemes[morphemes.base_verb == ''].query('isv in @pra_forms')

Unnamed: 0,id,isv,addition,partOfSpeech,en,genesis,pos,prefix,verb_stem,base_verb,isv_orig,left_stem_cand,right_stem_cand,derived_nouns,base_noun,_prefix,_stem,_suffix,_is_irregular,reconstructed
7246,6087.0,nakladati,,v.tr. ipf.,impose,,verb,?,?,,nakladati,naklad,?kladati?,,naklad,,naklad,ati,0,nakladati
7247,28028.0,nakladati,,v.tr. ipf.,load up,,verb,?,?,,nakladati,naklad,?kladati?,,naklad,,naklad,ati,0,nakladati
8290,9775.0,obědati,,v.intr. ipf./pf.,"dine, eat lunch",,verb,?,?,,obědati,oběd,?ědati?,,oběd,,oběd,ati,0,obědati
8510,22681.0,obråtiti,(+3),v.refl. pf.,"address, appeal to",,verb,?,?,,obråtiti sę k,obråt,?råtiti?,,obråt,,obråt,iti,0,ob(v)ortiti
8511,22675.0,obråtiti,,v.refl. pf.,"turn (intr.), turn around",,verb,?,?,,obråtiti sę,obråt,?råtiti?,,obråt,,obråt,iti,0,ob(v)ortiti
8512,22677.0,obråtiti,,v.refl. pf.,turn (into sth.),,verb,?,?,,obråtiti sę,obråt,?råtiti?,,obråt,,obråt,iti,0,ob(v)ortiti
8513,22679.0,obråtiti,,v.refl. pf.,convert (to a religion),,verb,?,?,,obråtiti sę,obråt,?råtiti?,,obråt,,obråt,iti,0,ob(v)ortiti
8514,3209.0,obråtiti,,v.tr. pf.,turn around,,verb,?,?,,obråtiti,obråt,?råtiti?,,obråt,,obråt,iti,0,ob(v)ortiti
8515,22666.0,obråtiti,,v.tr. pf.,"rotate, revolve (tr.)",,verb,?,?,,obråtiti,obråt,?råtiti?,,obråt,,obråt,iti,0,ob(v)ortiti
8516,22668.0,obråtiti,,v.tr. pf.,convert (sth. into sth.),,verb,?,?,,obråtiti,obråt,?råtiti?,,obråt,,obråt,iti,0,ob(v)ortiti


In [861]:
morphemes.reconstructed.value_counts()

?uměti          80
?stǫpati        27
?voliti         19
?obelťi         18
?praviti        17
                ..
(od)+aviti       1
(od)+jьgrati     1
(od)+xoditi      1
(od)+gybati      1
žužati           1
Name: reconstructed, Length: 1936, dtype: int64

In [912]:
len(morphemes), len(morphemes.query('reconstructed == reconstructed or genesis == genesis'))

(4985, 3463)

In [874]:
morphemes.query('(reconstructed != reconstructed) and (genesis != genesis)').head(333).tail(77)

Unnamed: 0,id,isv,addition,partOfSpeech,en,genesis,pos,prefix,verb_stem,base_verb,isv_orig,left_stem_cand,right_stem_cand,derived_nouns,base_noun,_prefix,_stem,_suffix,_is_irregular,reconstructed
5859,20420.0,lęgati,,v.tr. ipf.,hatch,,verb,?,?,lęgati,lęgati,lęg,lęgati,,,,lęg,ati,0,
5894,27115.0,lěniti,,v.refl. ipf.,be lazy,,verb,?,?,lěniti,lěniti sę,lěn,lěniti,,,,lěn,iti,0,
5903,27127.0,lepetati,,v.intr. ipf.,"babble, prattle",,verb,?,?,lepetati,lepetati,lepet,lepetati,,,,lepet,ati,0,
5920,2655.0,leskati,,v.intr. ipf.,shine,,verb,?,?,leskati,leskati,lesk,leskati,,!lěska,,lesk,ati,0,
5921,2697.0,lesknųti,,v.intr. pf.,shine,,verb,?,?,leskati,lesknųti,lesk,lesknųti,,,,lesk,nųti,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7449,28189.0,naslåditi,,v.refl. pf.,"enjoy, gloat, take pleasure in",,verb,?,?,slåditi,naslåditi sę,naslåd,slåditi,,,na,slåd,iti,0,
7450,28187.0,naslåditi,,v.tr. pf.,give pleasure to,,verb,?,?,slåditi,naslåditi,naslåd,slåditi,,,na,slåd,iti,0,
7451,33275.0,naslađati,,v.refl. ipf.,"enjoy, gloat, take pleasure in",,verb,?,?,slåditi,naslađati sę,naslađ,?lađati?,,,na,slåd,⒥ati,0,
7452,33278.0,naslađati,,v.tr. ipf.,give pleasure to,,verb,?,?,slåditi,naslađati,naslađ,?lađati?,,,na,slåd,⒥ati,0,


In [873]:
morphemes.query("base_noun in @all_reconstructions")

Unnamed: 0,id,isv,addition,partOfSpeech,en,genesis,pos,prefix,verb_stem,base_verb,isv_orig,left_stem_cand,right_stem_cand,derived_nouns,base_noun,_prefix,_stem,_suffix,_is_irregular,reconstructed
712,2880.0,besědovati,,v.intr. ipf.,"converse, chat",,verb,?,?,besědovati,besědovati,besěd,besědovati,,besěda,,besědo,vati,0,besědovati
1292,24918.0,cěniti,,v.tr. ipf.,appreciate,,verb,?,?,cěniti,cěniti,cěn,cěniti,,cěna,,cěn,iti,0,cěniti
3138,401.0,gorěti,(gori),v.intr. ipf.,"burn, be on fire",,verb,?,?,gorěti,gorěti,gor,gorěti,|gorka|gornik,gora,,gꜵr,ěti,0,gorěti
3283,17443.0,gromaditi,,v.tr. ipf.,accumulate,,verb,?,?,gromaditi,gromaditi,gromad,gromaditi,,gromada,,gromad,iti,0,
3293,2568.0,groziti,,v.intr. ipf.,threaten,,verb,?,?,groziti,groziti,groz,groziti,,groza,,grꜵz,iti,0,groziti
3306,3950.0,grupovati,,v.tr. ipf.,group,I,verb,?,?,grupovati,grupovati,grup,grupovati,,grupa,,grupo,vati,0,
4487,1592.0,jedati,,v.intr. ipf.,eat,,verb,?,?,jedati,jedati,jed,jedati,,jeda,,jed,ati,0,
4804,2565.0,karati,,v.tr. ipf.,punish,,verb,?,?,karati,karati,kar,karati,,kara,,kar,ati,0,karati
4805,35408.0,karati,,v.tr. ipf.,"scold, rebuke, reprimand, reproach",,verb,?,?,karati,karati,kar,karati,,kara,,kar,ati,0,karati
4958,26555.0,klevetati,,v.intr. ipf.,slander,,verb,?,?,klevetati,klevetati,klevet,klevetati,|klevetnik,kleveta,,klevet,ati,0,klevetati


In [779]:
morphemes.head(1).values

array([[11.0, 'abdikovati', nan, 'v.intr. ipf.', 'abdicate', 'I', 'verb',
        '?', '?', 'abdikovati', 'abdikovati', 'abdik', 'abdikovati',
        '|abdikacija', '', '', 'abdiko', 'vati', 0, None]], dtype=object)

In [798]:
BV = morphemes.query('reconstructed == reconstructed').base_verb.unique()

In [799]:
len(BV)

386

In [801]:
morphemes.query('reconstructed != reconstructed and base_verb in @BV and not isv.str.contains(" ")')

Unnamed: 0,id,isv,addition,partOfSpeech,en,genesis,pos,prefix,verb_stem,base_verb,isv_orig,left_stem_cand,right_stem_cand,derived_nouns,base_noun,_prefix,_stem,_suffix,_is_irregular,reconstructed
1185,22979.0,bųde/bųdųt,,v.aux. ipf.,there will be,,verb,?,?,,bųde/bųdųt,bųde/bųdųt,?bųde/bųdųt?,,,,,,0,
1710,25144.0,debatovati,,v.intr. ipf.,debate,I,verb,’,debatovati,,debatovati,debat,?batovati?,,debata,,debat,ovati,0,
1716,25152.0,debjutovati,,v.intr. ipf./pf.,make one’s debut,F,verb,’,debjutovati,,debjutovati,debjut,?bjutovati?,,debjut,,debjut,ovati,0,
1735,15280.0,defisovati,,v.tr. ipf.,hyphenate,I,verb,’,defisovati,,defisovati,defis,?fisovati?,,defis,,defis,ovati,0,
1988,25273.0,dobrěti,,v.intr. ipf.,become good,,verb,?,?,,dobrěti,dobr,?brěti?,|dobrosť,dobry,,dobr,ěti,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17500,23255.0,zašiti,(zašije),v.tr. pf.,"sew, stitch",,verb,?,?,šiti,zašiti,zaš,šiti,,,za,š,iti,-1,
17521,5204.0,zatvarjati,,v.tr. ipf.,"close, shut",,verb,?,?,tvoriti,zatvarjati,zatvar,?tvarjati?,,,za,tvꜵr,jati,0,
17524,5205.0,zatvoriti,,v.tr. pf.,"close, shut",,verb,?,?,tvoriti,zatvoriti,zatvor,tvoriti,|zatvorka,,za,tvꜵr,iti,0,
17537,14193.0,zautrakati,,v.intr. ipf.,"eat breakfast, have breakfrast",,verb,?,?,,zautrakati,zautrak,?trakati?,,zautraka,,zautrak,ati,0,


In [758]:
pd.concat(dfs).sort_values(by='base_verb')

Unnamed: 0,id,isv,addition,partOfSpeech,en,genesis,pos,prefix,verb_stem,base_verb,isv_orig,left_stem_cand,right_stem_cand,derived_nouns,base_noun,_prefix,_stem,_suffix,_is_irregular,reconstructed
12589,21505.0,raziti,,v.tr. ipf.,"hit, strike",,verb,?,?,,raziti,raz,?iti?,,raz,,raz,iti,0,
13330,12075.0,saditi,,v.tr. ipf.,"plant, seat",,verb,?,?,,saditi,sad,?aditi?,,sad,,sad,iti,0,
14037,34003.0,snovati,,v.tr. ipf.,warp,,verb,?,?,,snovati,sn,?novati?,,sn,,sn,ovati,0,
14619,2097.0,sųditi,,v.tr. ipf.,"judge, try (in court)",,verb,?,?,,sųditi,sųd,?diti?,,sųd,,sųd,iti,0,
14620,22003.0,sųditi,,v.tr. ipf.,"predestine, foreordain",,verb,?,?,,sųditi,sųd,?diti?,,sųd,,sųd,iti,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14699,18925.0,sušiti,,v.tr. ipf.,dry,,verb,?,?,šiti,sušiti,suš,šiti,,,,suh,⒥iti,0,
17846,7191.0,žiti,(žive),v.intr. ipf.,"reside, dwell",,verb,?,?,žiti,žiti,ž,žiti,,,,ži,ti,-1,
17845,28.0,žiti,(žive),v.intr. ipf.,live,,verb,?,?,žiti,žiti,ž,žiti,,,,ži,ti,-1,
11645,11412.0,prěživati,,v.tr. ipf.,live through,,verb,?,?,žiti,prěživati,prěž,?živati?,,,prě,ži,vati,-1,


In [751]:
unk

['abdikovati',
 'abonovati',
 'absorbovati',
 'abstrahovati',
 'adaptovati',
 'administrovati',
 'adoptovati',
 'adresovati',
 'agitovati',
 'agonizovati',
 'akcentovati',
 'akceptovati',
 'aklimatizovati',
 'akompanovati',
 'aktivovati',
 'aktualizovati',
 'akumulovati',
 'amnestovati',
 'amortizovati',
 'amputovati',
 'analizovati',
 'aneksovati',
 'anulovati',
 'apelovati',
 'aplodovati',
 'aranževati',
 'areštovati',
 'argumentovati',
 'arhivovati',
 'asimilovati',
 'atakovati',
 'avansovati',
 'avtomatizovati',
 'avtorizovati',
 'avtostopovati',
 'bagatelizovati',
 'balansovati',
 'balotovati',
 'balzamovati',
 'barikadovati',
 'barviti',
 'bazovati',
 '†pokojiti',
 'blågodariti',
 'blågoslavjati',
 'blågosloviti',
 'blågovolěti',
 'blågoželati',
 'blaznovati',
 'blědněti',
 'blěskati',
 'blěstěti',
 'bližiti',
 'bljunųti',
 'bljuvati',
 'blokovati',
 'blųkati',
 'bogatěti',
 'bogohuliti',
 'bojkotovati',
 'bombardovati',
 'brahtati',
 'bratati',
 'bråzditi',
 'breknųti',
 'brenča

In [748]:
morphemes.query('not ((isv in @pra_forms) or (base_verb.str.replace("†", "") in @pra_forms))')

Unnamed: 0,id,isv,addition,partOfSpeech,en,genesis,pos,prefix,verb_stem,base_verb,isv_orig,left_stem_cand,right_stem_cand,derived_nouns,base_noun,_prefix,_stem,_suffix,_is_irregular,reconstructed
5,11.0,abdikovati,,v.intr. ipf.,abdicate,I,verb,?,?,abdikovati,abdikovati,abdik,abdikovati,|abdikacija,,,abdiko,vati,0,
16,6119.0,abonovati,,v.tr. ipf.,"subscribe, engage",F,verb,?,?,abonovati,abonovati,abon,abonovati,,,,abono,vati,0,
28,19625.0,absorbovati,,v.tr. ipf.,absorb,I,verb,?,?,absorbovati,absorbovati,absorb,absorbovati,|absorbcija,,,absorbo,vati,0,
29,24056.0,abstrahovati,,v.tr. ipf./pf.,abstract,I,verb,?,?,abstrahovati,abstrahovati,abstrah,abstrahovati,,,,abstraho,vati,0,
43,24070.0,adaptovati,,v.tr. ipf./pf.,adapt,I,verb,?,?,adaptovati,adaptovati,adapt,adaptovati,|adaptacija,,,adapto,vati,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17904,2627.0,žuvati,(žuje),v.intr. ipf.,"chew, masticate",,verb,?,?,žuvati,žuvati,žuv,žuvati,,,,žu,vati,0,
17905,19473.0,žužati,,v.intr. ipf.,buzz,,verb,?,?,žužati,žužati,žuž,žužati,,,,žuž,ati,0,
17913,-37005.0,råzpečętati,,v.tr. pf.,unseal,,verb,råz’,pečętati,pečętati,råzpečętati,råzpečęt,pečętati,,,råz,pečęt,ati,0,
17914,-37006.0,råzpečętyvati,,v.tr. ipf.,unseal,,verb,råz’,pečętyvati,pečętati,råzpečętyvati,råzpečęt,?pečętyvati?,,,råz,pečęty,vati,0,


In [720]:
morphemes.loc[
    morphemes.query('isv in @VALS').index,
    'reconstructed'
] = morphemes.query('isv in @VALS').isv

  morphemes.loc[


In [721]:
morphemes

Unnamed: 0,id,isv,addition,partOfSpeech,en,genesis,pos,prefix,verb_stem,base_verb,isv_orig,left_stem_cand,right_stem_cand,derived_nouns,base_noun,_prefix,_stem,_suffix,_is_irregular,reconstructed
5,11.0,abdikovati,,v.intr. ipf.,abdicate,I,verb,?,?,abdikovati,abdikovati,abdik,abdikovati,|abdikacija,,,abdiko,vati,0,
16,6119.0,abonovati,,v.tr. ipf.,"subscribe, engage",F,verb,?,?,abonovati,abonovati,abon,abonovati,,,,abono,vati,0,
28,19625.0,absorbovati,,v.tr. ipf.,absorb,I,verb,?,?,absorbovati,absorbovati,absorb,absorbovati,|absorbcija,,,absorbo,vati,0,
29,24056.0,abstrahovati,,v.tr. ipf./pf.,abstract,I,verb,?,?,abstrahovati,abstrahovati,abstrah,abstrahovati,,,,abstraho,vati,0,
43,24070.0,adaptovati,,v.tr. ipf./pf.,adapt,I,verb,?,?,adaptovati,adaptovati,adapt,adaptovati,|adaptacija,,,adapto,vati,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17905,19473.0,žužati,,v.intr. ipf.,buzz,,verb,?,?,žužati,žužati,žuž,žužati,,,,žuž,ati,0,
17913,-37005.0,råzpečętati,,v.tr. pf.,unseal,,verb,råz’,pečętati,pečętati,råzpečętati,råzpečęt,pečętati,,,råz,pečęt,ati,0,
17914,-37006.0,råzpečętyvati,,v.tr. ipf.,unseal,,verb,råz’,pečętyvati,pečętati,råzpečętyvati,råzpečęt,?pečętyvati?,,,råz,pečęty,vati,0,
17917,-30731.0,smogti,,v.tr. pf.,"manage to, be able to, cope, make it",,verb,?,?,mogti,smogti,smogti,mogti,,,s,mꜵg,ti,0,


In [935]:
prefix_map = {
    "nedoråz": "¬ne¬do¬råz",
    "nezado": "¬ne¬za¬do",
    # "poblågo":
    "prěvȯzȯ": "¬prě¬vȯz¬ȯ" 
}
weird_pref = set()


for p in morphemes._prefix.unique():
    if not p: continue
    if p in prefix_map: continue
    if p in possible_prefixes:
        prefix_map[p] = "¬" + p
    else:
        for length in range(1, len(p)):
            if p[:length] in possible_prefixes and p[length:] in possible_prefixes | {'ȯ', 'z'}:
                prefix_map[p] = "¬" + p[:length] + "¬" + p[length:]
                break
        else:
            print(p)
            weird_pref.add(p)
                
prefix_map


a
blågo
mråzo
poblågo
protiv
rųko
samou
sebe
spolu


{'nedoråz': '¬ne¬do¬råz',
 'nezado': '¬ne¬za¬do',
 'prěvȯzȯ': '¬prě¬vȯz¬ȯ',
 'bez': '¬bez',
 'de': '¬de',
 'do': '¬do',
 'doråz': '¬do¬råz',
 'doz': '¬do¬z',
 'iz': '¬iz',
 'izna': '¬iz¬na',
 'izne': '¬iz¬ne',
 'izȯ': '¬iz¬ȯ',
 'izu': '¬iz¬u',
 'na': '¬na',
 'nad': '¬nad',
 'nadu': '¬nad¬u',
 'napo': '¬na¬po',
 'nas': '¬na¬s',
 'po': '¬po',
 'sȯ': '¬sȯ',
 'ne': '¬ne',
 'nedo': '¬ne¬do',
 'nena': '¬ne¬na',
 'o': '¬o',
 'ob': '¬ob',
 'obez': '¬obez',
 'obȯ': '¬ob¬ȯ',
 'od': '¬od',
 'odȯ': '¬od¬ȯ',
 'odpo': '¬od¬po',
 'odza': '¬od¬za',
 'opo': '¬o¬po',
 'os': '¬o¬s',
 'pod': '¬pod',
 'podȯ': '¬pod¬ȯ',
 'podråz': '¬pod¬råz',
 'poråz': '¬po¬råz',
 'posȯ': '¬po¬sȯ',
 'pos': '¬po¬s',
 'pov': '¬po¬v',
 'poza': '¬po¬za',
 'poz': '¬po¬z',
 'prě': '¬prě',
 'prěd': '¬prěd',
 'prědȯ': '¬prěd¬ȯ',
 'prědpo': '¬prěd¬po',
 'prědpri': '¬prěd¬pri',
 'prěds': '¬prěd¬s',
 'prěna': '¬prě¬na',
 'prěo': '¬prě¬o',
 'prěpo': '¬prě¬po',
 'prěråz': '¬prě¬råz',
 'prěs': '¬prě¬s',
 'prěvȯz': '¬prě¬vȯz',
 'pri': '¬p

In [936]:
weird_pref


{'a', 'blågo', 'mråzo', 'poblågo', 'protiv', 'rųko', 'samou', 'sebe', 'spolu'}

In [923]:
for i, row in morphemes.query("genesis == 'I' and isv.str.startswith('a')").iterrows():
    if row.isv in 'akompanovati anulovati asimilovati'.split(" "):
        morphemes.loc[i, '_prefix'] = 'a'
        morphemes.loc[i, '_stem'] = morphemes.loc[i, '_stem'][1:]
    if row._stem.endswith("o"):
        morphemes.loc[i, '_stem'] = morphemes.loc[i, '_stem'][:-1]
        morphemes.loc[i, '_suffix'] = "o" + morphemes.loc[i, '_suffix']

morphemes.query("genesis == 'I' and isv.str.startswith('a')")

Unnamed: 0,id,isv,addition,partOfSpeech,en,genesis,pos,prefix,verb_stem,base_verb,isv_orig,left_stem_cand,right_stem_cand,derived_nouns,base_noun,_prefix,_stem,_suffix,_is_irregular,reconstructed
5,11.0,abdikovati,,v.intr. ipf.,abdicate,I,verb,?,?,abdikovati,abdikovati,abdik,abdikovati,|abdikacija,,,abdik,ovati,0,
28,19625.0,absorbovati,,v.tr. ipf.,absorb,I,verb,?,?,absorbovati,absorbovati,absorb,absorbovati,|absorbcija,,,absorb,ovati,0,
29,24056.0,abstrahovati,,v.tr. ipf./pf.,abstract,I,verb,?,?,abstrahovati,abstrahovati,abstrah,abstrahovati,,,,abstrah,ovati,0,
43,24070.0,adaptovati,,v.tr. ipf./pf.,adapt,I,verb,?,?,adaptovati,adaptovati,adapt,adaptovati,|adaptacija,,,adapt,ovati,0,
50,24081.0,administrovati,,v.tr. ipf.,administrate,I,verb,?,?,administrovati,administrovati,administr,administrovati,|administracija,,,administr,ovati,0,
54,19628.0,adoptovati,,v.tr. ipf.,adopt,I,verb,?,?,adoptovati,adoptovati,adopt,adoptovati,,,,adopt,ovati,0,
59,4540.0,adresovati,,v.tr. ipf.,address,I,verb,?,?,adresovati,adresovati,adres,adresovati,,adres|adresa,,adres,ovati,0,
92,24114.0,agitovati,,v.intr. ipf.,agitate,I,verb,?,?,agitovati,agitovati,agit,agitovati,|agitacija|agitka,,,agit,ovati,0,
99,24121.0,agonizovati,,v.intr. ipf.,"agonise, agonize",I,verb,?,?,agonizovati,agonizovati,agoniz,agonizovati,,agonija,,agoniz,ovati,0,
120,24148.0,akcentovati,,v.tr. ipf.,"accentuate, stress (pronunciation)",I,verb,?,?,akcentovati,akcentovati,akcent,akcentovati,,akcent,,akcent,ovati,0,


In [925]:
morphemes.genesis.value_counts()

I    335
F     11
D      8
E      6
S      2
M      1
Name: genesis, dtype: int64

In [929]:
for i, row in morphemes.query("genesis in ['I', 'F', 'D', 'E', 'S', 'M'] and _stem.str.endswith('o')").iterrows():
    morphemes.loc[i, '_stem'] = morphemes.loc[i, '_stem'][:-1]
    morphemes.loc[i, '_suffix'] = "o" + morphemes.loc[i, '_suffix']


In [930]:
morphemes.query("genesis in ['I', 'F', 'D', 'E', 'S', 'M'] and _stem.str.endswith('ir')")

Unnamed: 0,id,isv,addition,partOfSpeech,en,genesis,pos,prefix,verb_stem,base_verb,isv_orig,left_stem_cand,right_stem_cand,derived_nouns,base_noun,_prefix,_stem,_suffix,_is_irregular,reconstructed
2916,25578.0,garnirovati,,v.tr. ipf.,garnish,F,verb,?,?,garnirovati,garnirovati,garnir,garnirovati,,,,garnir,ovati,0,
3229,2668.0,gravirovati,,v.tr. ipf.,engrave,I,verb,?,?,gravirovati,gravirovati,gravir,gravirovati,,,,gravir,ovati,0,
3790,35896.0,inspirovati,,v.tr. ipf./pf.,inspire,I,verb,?,?,inspirovati,inspirovati,inspir,inspirovati,|inspiracija,,,inspir,ovati,0,
6333,2869.0,marširovati,,v.intr. ipf.,march,D,verb,?,?,marširovati,marširovati,maršir,marširovati,,,,maršir,ovati,0,
6342,33371.0,masirovati,,v.tr. ipf.,massage,F,verb,?,?,masirovati,masirovati,masir,masirovati,,,,masir,ovati,0,
10165,10565.0,pirovati,,v.intr. ipf.,feast,I,verb,?,?,pirovati,pirovati,pir,pirovati,,pir,,pir,ovati,0,
10657,10857.0,polirati,,v.tr. ipf.,polish,I,verb,?,?,polirati,polirati,polir,?lirati?,,,,polir,ati,0,
12640,36411.0,råzkvartirovati,,v.tr. pf.,quarter (troops),I,verb,råz’,kvartirovati,råzkvartirovati,råzkvartirovati,råzkvartir,?kvartirovati?,,,,råzkvartir,ovati,0,
12641,24162.0,råzkvartirovyvati,,v.tr. ipf.,quarter (troops),I,verb,råz’,kvartirovyvati,råzkvartirovati,råzkvartirovyvati,råzkvartir,?kvartirovyvati?,,,,råzkvartir,ovyvati,0,
13219,18648.0,rokirovati,,v.intr. ipf./pf.,castle (chess),I,verb,?,?,rokirovati,rokirovati,rokir,rokirovati,,,,rokir,ovati,0,


In [934]:
for i, row in morphemes.query("_stem.str.startswith('råz')").iterrows():
    morphemes.loc[i, '_prefix'] = 'råz'
    morphemes.loc[i, '_stem'] = morphemes.loc[i, '_stem'][3:]


In [921]:

morphemes.loc[13627, ["base_verb", '_prefix', "_stem"]] = ['sikati', '', 'sik']

morphemes.loc[3661, ["reconstructed", '_prefix', "_stem"]] = ['jьkati', '', 'ik']
morphemes.loc[17210, ["reconstructed", '_prefix', "_stem"]] = ['zajьkati', 'za', 'jik']



In [920]:
morphemes.query("en.str.contains('pee')")

Unnamed: 0,id,isv,addition,partOfSpeech,en,genesis,pos,prefix,verb_stem,base_verb,isv_orig,left_stem_cand,right_stem_cand,derived_nouns,base_noun,_prefix,_stem,_suffix,_is_irregular,reconstructed
6206,27352.0,lupiti,,v.tr. ipf.,peel,,verb,?,?,lupiti,lupiti,lup,lupiti,,,,lup,iti,0,lupiti
6761,1829.0,močiti,,v.refl. ipf.,"urinate, pee",,verb,?,?,močiti,močiti sę,moč,močiti,,moč|!moć,,moč,iti,0,močiti
7751,28421.0,něměti,,v.intr. ipf.,"fall silent, become speechless, grow dumb, be ...",,verb,?,?,něměti,něměti,něm,něměti,|němcija|němka|němec|němosť,,,něm,ěti,0,něměti
8280,28724.0,obdirati,,v.tr. ipf.,"peel, skin, strip",,verb,?,?,dreti,obdirati,obdir,?dirati?,,,ob,dir,ati,0,
8284,34086.0,obdreti,(obdere),v.tr. pf.,"peel, skin, strip",,verb,?,?,dreti,obdreti,obdreti,dreti,,,ob,dre,ti,0,
8408,28822.0,oblupiti,,v.tr. pf.,peel,,verb,?,?,lupiti,oblupiti,oblup,lupiti,,,ob,lup,iti,0,(ob)+lupiti
9068,4304.0,odšlupati,,v.tr. pf.,peel,,verb,?,?,odšlupati,odšlupati,odšlup,?šlupati?,,,od,šlup,ati,0,
9069,5552.0,odšlupyvati,,v.tr. ipf.,peel,,verb,?,?,odšlupati,odšlupyvati,odšlup,?šlupyvati?,,,od,šlup,yvati,0,
9320,32831.0,oněměti,,v.intr. pf.,"be struck dumb, fall silent, become speechless...",,verb,?,?,něměti,oněměti,oněm,něměti,,,o,něm,ěti,0,(o)+něměti
10181,14838.0,pišati,,v.intr. ipf.,pee,,verb,?,?,pišati,pišati,piš,pišati,,,,piš,ati,0,


In [916]:
morphemes.query("_prefix in @weird_pref")

Unnamed: 0,id,isv,addition,partOfSpeech,en,genesis,pos,prefix,verb_stem,base_verb,isv_orig,left_stem_cand,right_stem_cand,derived_nouns,base_noun,_prefix,_stem,_suffix,_is_irregular,reconstructed
130,24156.0,aklimatizovati,,v.intr. ipf.,acclimatize,I,verb,?,?,aklimatizovati,aklimatizovati,aklimatiz,aklimatizovati,|aklimatizacija,klimat,a,klimat,izovati,0,
887,4889.0,blågodariti,,v.tr. ipf.,thank,,verb,?,?,dariti,blågodariti,blågodar,blågodariti,,,blågo,dar,iti,0,(blågo)+dariti
899,5574.0,blågoslavjati,,v.tr. ipf.,"bless, beatify",,verb,?,?,sloviti,blågoslavjati,blågoslav,blågoslavjati,,,blågo,slav,jati,0,?sloviti
900,5573.0,blågosloviti,,v.tr. pf.,"bless, beatify",,verb,?,?,sloviti,blågosloviti,blågosl,blågosloviti,,,blågo,slov,iti,0,(blågo)+sloviti
907,22297.0,blågovolěti,(blågovoli),v.intr. ipf.,"show benevolence, show goodwill",,verb,?,?,volěti,blågovolěti,blågovol,blågovolěti,,,blågo,vol,ěti,0,
912,6050.0,blågoželati,(+3),v.intr. ipf.,congratulate,,verb,?,?,želati,blågoželati,blågožel,blågoželati,,,blågo,žel,ati,0,(blågo)+želati
6915,17812.0,mråzosušati,,v.tr. ipf.,freeze-dry,,verb,?,?,mråzosušati,mråzosušati,mråzosuš,mråzosušati,,,mråzo,suh,⒥ati,0,
6918,9263.0,mråzosušiti,,v.tr. pf.,freeze-dry,,verb,?,?,mråzosušati,mråzosušiti,mråzosuš,mråzosušiti,,,mråzo,suh,⒥iti,0,
7668,20535.0,nedoråzuměti,,v.tr. pf.,misunderstand,,verb,?,?,uměti,nedoråzuměti,nedoråzum,uměti,,,nedoråz,um,ěti,0,?orzuměti
8064,17948.0,nezadovaljati,,v.tr. ipf.,"displease, dissatisfy",,verb,?,?,voliti,nezadovaljati,nezadoval,valjati,,,nezado,vꜵl,jati,0,?voliti


In [938]:
morphemes._prefix = morphemes._prefix.apply(lambda x: prefix_map.get(x, x))

In [939]:
morphemes.to_pickle("morphemes_protoslavic.pkl")
morphemes[['id', 'isv_orig', 'addition', 'partOfSpeech', 'en', 'genesis', 
          'isv', 'base_verb', 'derived_nouns', 'base_noun', '_prefix', '_stem',
       '_suffix', 'reconstructed']].to_csv("morphemes_protoslavic.csv")

In [948]:
print(
    morphemes.query("_prefix.str.contains('¬')")._prefix.value_counts().to_csv(sep="\t")
)

	_prefix
¬råz	349
¬iz	261
¬od	253
¬na	242
¬o	242
¬za	208
¬po	186
¬u	174
¬ob	152
¬s	149
¬prě	117
¬pri	114
¬pro	89
¬do	87
¬v	65
¬pod	52
¬vȯz	44
¬sȯ	25
¬prěd	22
¬sų	12
¬råz¬pro	9
¬obez	7
¬vy	7
¬s¬po	7
¬råz¬s	6
¬u¬s	5
¬nad	5
¬prě¬s	5
¬za¬ne	5
¬de	4
¬prě¬o	4
¬pro¬iz	4
¬po¬z	4
¬po¬v	4
¬pri¬po	4
¬o¬s	4
¬za¬do	4
¬ob¬ȯ	4
¬iz¬na	4
¬vȯ	4
¬prěd¬po	3
¬na¬s	3
¬po¬za	3
¬od¬po	3
¬ne	3
¬vȯz¬po	2
¬pro¬ne	2
¬s¬na	2
¬råz¬o	2
¬råz¬råz	2
¬råz¬po	2
¬sų¬pro	2
¬s¬ne	2
¬bez	2
¬prěd¬pri	2
¬za¬u	2
¬do¬z	2
¬ne¬za¬do	2
¬nad¬u	2
¬iz¬u	2
¬pod¬ȯ	2
¬pod¬råz	2
¬po¬råz	2
¬na¬po	2
¬prěd¬s	2
¬prě¬na	2
¬prě¬po	2
¬prě¬råz	2
¬do¬råz	1
¬ne¬do¬råz	1
¬za¬po	1
¬v¬z	1
¬iz¬ȯ	1
¬vȯz¬ȯ	1
¬za¬pro	1
¬iz¬ne	1
¬pri¬na	1
¬ne¬do	1
¬ne¬na	1
¬od¬ȯ	1
¬prě¬vȯz¬ȯ	1
¬od¬za	1
¬o¬po	1
¬s¬de	1
¬po¬sȯ	1
¬po¬s	1
¬prěd¬ȯ	1
¬råz¬ȯ	1
¬prě¬vȯz	1
¬s¬råz	1



In [949]:
morphemes.query("en.str.contains('dis')")

Unnamed: 0,id,isv,addition,partOfSpeech,en,genesis,pos,prefix,verb_stem,base_verb,isv_orig,left_stem_cand,right_stem_cand,derived_nouns,base_noun,_prefix,_stem,_suffix,_is_irregular,reconstructed
785,19654.0,bezpokojiti,,v.tr. ipf.,"disturb, trouble, upset, bother",,verb,?,?,†pokojiti,bezpokojiti,bezpoko,?kojiti?,,pokoj,¬bez,pokoj,iti,0,
1875,25241.0,dezinfikovati,,v.tr. ipf./pf.,disinfect,I,verb,dez’,infikovati,dezinfikovati,dezinfikovati,dezinfik,?zinfikovati?,,,,dezinfik,ovati,0,
1936,35510.0,diskreditovati,,v.tr. ipf./pf.,discredit,I,verb,?,?,diskreditovati,diskreditovati,diskredit,diskreditovati,,,,diskredit,ovati,0,
1939,32603.0,diskriminovati,,v.intr. ipf./pf.,discriminate,I,verb,?,?,diskriminovati,diskriminovati,diskrimin,diskriminovati,|diskriminacija,,,diskrimin,ovati,0,
1942,301.0,diskutovati,,v.intr. ipf.,discuss,I,verb,?,?,diskutovati,diskutovati,diskut,diskutovati,,,,diskut,ovati,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16006,3558.0,uvoljniti,,v.tr. pf.,"discharge, dismiss, fire (terminate employment)",,verb,?,?,voliti,uvoljniti,uvoljn,?ljniti?,,,¬u,vꜵl,jniti,0,?voliti
17176,5475.0,zahoditi,,v.intr. ipf.,set (disappear behind the horizon),,verb,?,?,hoditi,zahoditi,zahod,hoditi,,,¬za,hꜵd,iti,0,(za)+xoditi
17194,5476.0,zajdti,(zajde; zašel),v.intr. pf.,set (disappear behind the horizon),,verb,?,?,idti,zajdti,zajdti,?jdti?,,,¬za,jd,ti,0,
17407,21238.0,zapropastiti,,v.refl. pf.,disappear completely,,verb,?,?,pasti,zapropastiti sę,zapropast,?pastiti?,,,¬za¬pro,pas,titi,0,?pasti


In [951]:
morphemes.query("isv.str.contains('^d[ie][zs]')")

Unnamed: 0,id,isv,addition,partOfSpeech,en,genesis,pos,prefix,verb_stem,base_verb,isv_orig,left_stem_cand,right_stem_cand,derived_nouns,base_noun,_prefix,_stem,_suffix,_is_irregular,reconstructed
1826,35733.0,destabilizovati,,v.tr. ipf./pf.,destabilize,I,verb,de’,stabilizovati,stabilizovati,destabilizovati,destabiliz,stabilizovati,|destabilizacija,,¬de,stabiliz,ovati,0,
1873,25239.0,dezertovati,,v.intr. ipf./pf.,desert,I,verb,’,dezertovati,dezertovati,dezertovati,dezert,?zertovati?,,,,dezert,ovati,0,
1875,25241.0,dezinfikovati,,v.tr. ipf./pf.,disinfect,I,verb,dez’,infikovati,dezinfikovati,dezinfikovati,dezinfik,?zinfikovati?,,,,dezinfik,ovati,0,
1936,35510.0,diskreditovati,,v.tr. ipf./pf.,discredit,I,verb,?,?,diskreditovati,diskreditovati,diskredit,diskreditovati,,,,diskredit,ovati,0,
1939,32603.0,diskriminovati,,v.intr. ipf./pf.,discriminate,I,verb,?,?,diskriminovati,diskriminovati,diskrimin,diskriminovati,|diskriminacija,,,diskrimin,ovati,0,
1942,301.0,diskutovati,,v.intr. ipf.,discuss,I,verb,?,?,diskutovati,diskutovati,diskut,diskutovati,,,,diskut,ovati,0,
1944,35295.0,diskvalifikovati,,v.tr. ipf./pf.,disqualify,I,verb,?,?,diskvalifikovati,diskvalifikovati,diskvalifik,diskvalifikovati,|diskvalifikacija,,,diskvalifik,ovati,0,
1946,35944.0,distancevati,,v.refl. ipf./pf.,distance oneself,I,verb,?,?,distancevati,distancevati sę,distancev,distancevati,,,,distanc,evati,0,


In [952]:
morphemes.query("base_verb.str.contains('dti')")

Unnamed: 0,id,isv,addition,partOfSpeech,en,genesis,pos,prefix,verb_stem,base_verb,isv_orig,left_stem_cand,right_stem_cand,derived_nouns,base_noun,_prefix,_stem,_suffix,_is_irregular,reconstructed
2057,5345.0,dojdti,(dojde; došel),v.intr. pf.,arrive,,verb,?,?,idti,dojdti,dojdti,?jdti?,,,¬do,jd,ti,0,
3638,13823.0,idti vprěd,#(ide; šel),v.ipf.,advance,,verb,?,?,idti,idti vprěd,idti vprěd,?idti vprěd?,,,,jd,ti,0,
3639,15654.0,idti vzad,#(ide; šel),v.ipf.,back up,,verb,?,?,idti,idti vzad,idti vzad,?idti vzad?,,,,jd,ti,0,
3640,2104.0,idti,(ide; šel),v.intr. ipf.,go,,verb,?,?,idti,idti,idti,idti,,,,jd,ti,0,
4124,36803.0,iznajdti,(iznajde),v.tr. pf.,invent (create something new),,verb,?,?,idti,iznajdti,iznajdti,najdti,,,¬iz¬na,jd,ti,0,
4148,5354.0,izȯjdti,(izȯjde; izšel),v.intr. pf.,"exit, go out, quit",,verb,?,?,idti,izȯjdti,izȯjdti,?ȯjdti?,,,¬iz¬ȯ,jd,ti,0,
7187,781.0,najdti,(najde; našel),v.tr. pf.,find,,verb,?,?,idti,najdti,najdti,?dti?,,,¬na,jd,ti,0,
8469,18025.0,obȯjdti,(obȯjde; obšel),v.refl. pf.,do without,,verb,?,?,idti,obȯjdti sę bez,obȯjdti,?ȯjdti?,,,¬ob¬ȯ,jd,ti,0,
8470,18023.0,obȯjdti,(obȯjde; obšel),v.refl. pf.,"manage, make do with, get by",,verb,?,?,idti,obȯjdti sę,obȯjdti,?ȯjdti?,,,¬ob¬ȯ,jd,ti,0,
8471,5370.0,obȯjdti,(obȯjde; obšel),v.tr. pf.,"go around, circumvent",,verb,?,?,idti,obȯjdti,obȯjdti,?ȯjdti?,,,¬ob¬ȯ,jd,ti,0,


In [953]:
morphemes.query("isv.str.contains('odryvati')")

Unnamed: 0,id,isv,addition,partOfSpeech,en,genesis,pos,prefix,verb_stem,base_verb,isv_orig,left_stem_cand,right_stem_cand,derived_nouns,base_noun,_prefix,_stem,_suffix,_is_irregular,reconstructed
9006,29227.0,odryvati,,v.tr. ipf.,dig up,,verb,?,?,ryti,odryvati,odr,?ryvati?,,,¬od,ry,vati,0,?ryti
9007,34361.0,odryvati,,v.tr. ipf.,tear away,,verb,?,?,rvati,odryvati,odr,?ryvati?,,,¬od,ryv,ati,0,?rъvati


In [958]:
morphemes.query("en.str.contains('separate')")

Unnamed: 0,id,isv,addition,partOfSpeech,en,genesis,pos,prefix,verb_stem,base_verb,isv_orig,left_stem_cand,right_stem_cand,derived_nouns,base_noun,_prefix,_stem,_suffix,_is_irregular,reconstructed
1781,1754.0,děliti,,v.tr. ipf.,"share, divide, part, separate",,verb,?,?,děliti,děliti,děl,děliti,,děl,,děl,iti,0,děliti
8756,2951.0,odděliti,,v.tr. pf.,separate,,verb,?,?,děliti,odděliti,odděl,děliti,,,¬od,děl,iti,0,(od)+děliti
8757,3016.0,odděljati,,v.tr. ipf.,separate,,verb,?,?,děliti,odděljati,odděl,?děljati?,,,¬od,děl,jati,0,?děliti
12508,595.0,råzděliti,,v.tr. pf.,"separate, sever, divide",,verb,råz’,děliti,děliti,råzděliti,råzděl,děliti,,,¬råz,děl,iti,0,(råz)+děliti
12509,2249.0,råzděljati,,v.tr. ipf.,"separate, sever, divide",,verb,råz’,děljati,děliti,råzděljati,råzděl,?děljati?,,,¬råz,děl,jati,0,?děliti
12676,20108.0,råzlųčati,,v.tr. ipf.,separate,,verb,råz’,lųčati,råzlųčati,råzlųčati,råzlųč,?lųčati?,|råzlųčnik,,¬råz,lųč,ati,0,
12678,20109.0,råzlųčiti,,v.tr. pf.,separate,,verb,råz’,lųčiti,råzlųčati,råzlųčiti,råzlųč,?lųčiti?,|råzlųčnik,,¬råz,lųč,iti,0,


In [960]:
morphemes.query("_prefix == '' and prefix != '?'")

Unnamed: 0,id,isv,addition,partOfSpeech,en,genesis,pos,prefix,verb_stem,base_verb,isv_orig,left_stem_cand,right_stem_cand,derived_nouns,base_noun,_prefix,_stem,_suffix,_is_irregular,reconstructed
1710,25144.0,debatovati,,v.intr. ipf.,debate,I,verb,’,debatovati,,debatovati,debat,?batovati?,,debata,,debat,ovati,0,?uměti
1711,25146.0,debelěti,,v.intr. ipf.,grow fat,,verb,’,debelěti,belěti,debelěti,debel,belěti,,,,debel,ěti,0,
1716,25152.0,debjutovati,,v.intr. ipf./pf.,make one’s debut,F,verb,’,debjutovati,,debjutovati,debjut,?bjutovati?,,debjut,,debjut,ovati,0,?uměti
1729,4956.0,definiovati,,v.tr. ipf.,define,I,verb,’,definiovati,definiovati,definiovati,defini,?finiovati?,|definicija,,,defini,ovati,0,
1735,15280.0,defisovati,,v.tr. ipf.,hyphenate,I,verb,’,defisovati,,defisovati,defis,?fisovati?,,defis,,defis,ovati,0,?uměti
1742,25172.0,degradovati,,v.tr. ipf./pf.,"degrade, demote",I,verb,’,degradovati,degradovati,degradovati,degrad,?gradovati?,|degradacija,,,degrad,ovati,0,
1744,25175.0,degustovati,,v.tr. ipf./pf.,taste,I,verb,’,degustovati,degustovati,degustovati,degust,?gustovati?,|degustacija,,,degust,ovati,0,
1757,35941.0,deklamovati,,v.tr. ipf./pf.,"declaim, recite",I,verb,’,deklamovati,deklamovati,deklamovati,deklam,?klamovati?,,,,deklam,ovati,0,
1795,6266.0,demonstrovati,,v.tr. ipf.,demonstrate,I,verb,’,demonstrovati,demonstrovati,demonstrovati,demonstr,?monstrovati?,|demonstracija,,,demonstr,ovati,0,
1806,398.0,deportovati,,v.tr. ipf./pf.,deport,I,verb,de’,portovati,deportovati,deportovati,deport,?rtovati?,|deportacija,,,deport,ovati,0,


In [990]:

morphemes.query("_prefix == '' and prefix not in ['’', '?']")

Unnamed: 0,id,isv,addition,partOfSpeech,en,genesis,pos,prefix,verb_stem,base_verb,isv_orig,left_stem_cand,right_stem_cand,derived_nouns,base_noun,_prefix,_stem,_suffix,_is_irregular,reconstructed
1806,398.0,deportovati,,v.tr. ipf./pf.,deport,I,verb,de’,portovati,deportovati,deportovati,deport,?rtovati?,|deportacija,,,deport,ovati,0,
1875,25241.0,dezinfikovati,,v.tr. ipf./pf.,disinfect,I,verb,dez’,infikovati,dezinfikovati,dezinfikovati,dezinfik,?zinfikovati?,,,,dezinfik,ovati,0,
8298,773.0,obezglåviti,,v.tr. pf.,decapitate,,verb,obez’,glåviti,obezglåviti,obezglåviti,obezglåv,?glåviti?,,,,obezglåv,iti,0,
8299,1587.0,obezglåvjati,,v.tr. ipf.,decapitate,,verb,obez’,glåvjati,obezglåviti,obezglåvjati,obezglåv,?glåvjati?,,,,obezglåv,jati,0,
8303,5374.0,obezsiliti,,v.tr. pf.,disempower,,verb,obez’,siliti,obezsiliti,obezsiliti,obezsil,siliti,,,,obezsil,iti,0,
8304,5373.0,obezsiljati,,v.tr. ipf.,disempower,,verb,obez’,siljati,obezsiliti,obezsiljati,obezsil,?iljati?,,,,obezsil,jati,0,
14056,34940.0,sȯčuvstvovati,,v.intr. ipf.,"sympathize, commiserate, feel compassion",,verb,sȯ’,čuvstvovati,čuvati,sȯčuvstvovati,sȯčuvstv,?čuvstvovati?,,,,,,1,?čuti
14076,34119.0,sȯhnųti,,v.intr. ipf.,"dry (intr.), become dry, wither",,verb,sȯ’,hnųti,sȯhnųti,sȯhnųti,sȯh,?hnųti?,,,,sȯh,nųti,0,
14077,34121.0,sȯhnųti,,v.intr. ipf.,pine away,,verb,sȯ’,hnųti,sȯhnųti,sȯhnųti,sȯh,?hnųti?,,,,sȯh,nųti,0,
14119,4150.0,sȯsati,,v.tr. ipf.,suck,,verb,sȯ’,sati,sȯsati,sȯsati,sȯs,?ati?,|sȯska,,,sȯs,ati,0,


In [978]:
morphemes.query("_prefix != '' and reconstructed == reconstructed")

Unnamed: 0,id,isv,addition,partOfSpeech,en,genesis,pos,prefix,verb_stem,base_verb,isv_orig,left_stem_cand,right_stem_cand,derived_nouns,base_noun,_prefix,_stem,_suffix,_is_irregular,reconstructed
887,4889.0,blågodariti,,v.tr. ipf.,thank,,verb,?,?,dariti,blågodariti,blågodar,blågodariti,,,blågo,dar,iti,0,(blågo)+dariti
899,5574.0,blågoslavjati,,v.tr. ipf.,"bless, beatify",,verb,?,?,sloviti,blågoslavjati,blågoslav,blågoslavjati,,,blågo,slav,jati,0,?sloviti
900,5573.0,blågosloviti,,v.tr. pf.,"bless, beatify",,verb,?,?,sloviti,blågosloviti,blågosl,blågosloviti,,,blågo,slov,iti,0,(blågo)+sloviti
912,6050.0,blågoželati,(+3),v.intr. ipf.,congratulate,,verb,?,?,želati,blågoželati,blågožel,blågoželati,,,blågo,žel,ati,0,(blågo)+želati
1983,21272.0,dobaviti,,v.tr. pf.,add,,verb,?,?,baviti,dobaviti,dobav,baviti,,,¬do,bav,iti,-1,(do)+baviti
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17580,32644.0,zažegti,(zažže),v.tr. pf.,set fire to,,verb,?,?,žegti,zažegti,zažegti,žegti,,,¬za,žeg,ti,0,(za)+žeťi
17581,32633.0,zažigati,,v.tr. ipf.,"ignite, light, kindle, inflame",,verb,?,?,žegti,zažigati,zažig,?žigati?,,,¬za,žig,ati,0,?žeťi
17582,32636.0,zažigati,,v.tr. ipf.,"switch on (the lights), turn on (the light)",,verb,?,?,žegti,zažigati,zažig,?žigati?,,,¬za,žig,ati,0,?žeťi
17583,32638.0,zažigati,,v.tr. ipf.,set fire to,,verb,?,?,žegti,zažigati,zažig,?žigati?,,,¬za,žig,ati,0,?žeťi


In [987]:

def bite_all_prefixes_off(word, verb_nest):

    can_continue = True
    prefixes = []

    while can_continue:
        can_continue = False
        for pref in possible_prefixes:
            if word.startswith(pref):
                # print(pref, word, word[len(pref):])
                new_word = word[len(pref):]
                if new_word.endswith(verb_nest):
                    word = new_word
                    prefixes.append(pref)
                    can_continue = True
                    break
    return ("¬" if prefixes else "") + "¬".join(prefixes)

In [989]:
morphemes.query("_prefix == ''").isv.apply(lambda x: bite_all_prefixes_off(x, "")).value_counts().head(33)

          1054
¬s         137
¬v          99
¬o          58
¬ob         55
¬po         48
¬u          45
¬za         22
¬od         22
¬iz         20
¬o¬s        18
¬pro        14
¬pri        12
¬po¬s       12
¬s¬v        11
¬de         11
¬prě        11
¬za¬s       10
¬u¬s        10
¬prě¬s       8
¬iz¬s        8
¬do¬s        7
¬na          7
¬v¬o         7
¬iz¬v        7
¬pri¬s       6
¬pro¬v       6
¬o¬v         6
¬o¬po        6
¬u¬po        6
¬sȯ¬v        5
¬sȯ          5
¬s¬po        5
Name: isv, dtype: int64

In [982]:
morphemes[morphemes._stem.str.len() > 5]

Unnamed: 0,id,isv,addition,partOfSpeech,en,genesis,pos,prefix,verb_stem,base_verb,isv_orig,left_stem_cand,right_stem_cand,derived_nouns,base_noun,_prefix,_stem,_suffix,_is_irregular,reconstructed
28,19625.0,absorbovati,,v.tr. ipf.,absorb,I,verb,?,?,absorbovati,absorbovati,absorb,absorbovati,|absorbcija,,,absorb,ovati,0,
29,24056.0,abstrahovati,,v.tr. ipf./pf.,abstract,I,verb,?,?,abstrahovati,abstrahovati,abstrah,abstrahovati,,,,abstrah,ovati,0,
50,24081.0,administrovati,,v.tr. ipf.,administrate,I,verb,?,?,administrovati,administrovati,administr,administrovati,|administracija,,,administr,ovati,0,
99,24121.0,agonizovati,,v.intr. ipf.,"agonise, agonize",I,verb,?,?,agonizovati,agonizovati,agoniz,agonizovati,,agonija,,agoniz,ovati,0,
120,24148.0,akcentovati,,v.tr. ipf.,"accentuate, stress (pronunciation)",I,verb,?,?,akcentovati,akcentovati,akcent,akcentovati,,akcent,,akcent,ovati,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17673,1892.0,zloupotrěbjati,,v.tr. ipf.,"abuse, misuse",,verb,?,?,zloupotrěbiti,zloupotrěbjati,zloupotrěb,zloupotrěbjati,,,,zloupotrěb,jati,0,
17712,22850.0,znamenovati,,v.tr. ipf.,"mark, put a mark on",,verb,?,?,znamenovati,znamenovati,znamen,znamenovati,|znameńje|znameńje,,,znameno,vati,0,
17713,22851.0,znamenovati,,v.tr. ipf.,"signify, mean, be a sign of",,verb,?,?,znamenovati,znamenovati,znamen,znamenovati,|znameńje|znameńje,,,znameno,vati,0,
17914,-37006.0,råzpečętyvati,,v.tr. ipf.,unseal,,verb,råz’,pečętyvati,pečętati,råzpečętyvati,råzpečęt,?pečętyvati?,,,¬råz,pečęty,vati,0,


In [964]:

morphemes.query("_prefix == ''").isv.str[:3].value_counts()

prě    33
pri    29
pro    25
obr    22
str    18
       ..
let     1
lep     1
lěn     1
lat     1
žuž     1
Name: isv, Length: 869, dtype: int64

In [973]:
morphemes.query("_prefix == '' and right_stem_cand != isv and not isv.str.contains(' ') and not right_stem_cand.str.startswith('?')").base_verb.value_counts()

vvŕgati        4
dostigati      4
zaustaviti     4
postaviti      4
podključati    4
              ..
podslušati     1
poučati        1
povědati       1
prěslušati     1
zavojevati     1
Name: base_verb, Length: 77, dtype: int64