# Exploring sybword segmentation based on BPE, DPE, morphology and "transition freedom" 

### Test coprus based on https://arxiv.org/pdf/2005.06606.pdf (with numbers removed)



In [183]:
import os, sys
cwd = os.getcwd()
project_path = cwd[:cwd.find('pygents')+7]
if project_path not in sys.path: sys.path.append(project_path)
os.chdir(project_path) 

#from importlib import reload  # Python 3.4+

import pickle
import pandas as pd
import matplotlib.pyplot as plt

#force reimport
if 'pygents.util' in sys.modules:
    del sys.modules['pygents.util']
if 'pygents.text' in sys.modules:
    del sys.modules['pygents.text']
if 'pygents.plot' in sys.modules:
    del sys.modules['pygents.plot']
if 'pygents.token' in sys.modules:
    del sys.modules['pygents.token']
if 'pygents.token_plot' in sys.modules:
    del sys.modules['pygents.token_plot']


from pygents.token import *
from pygents.text import *
from pygents.util import *
from pygents.plot import plot_bars, plot_dict, matrix_plot
from pygents.token_plot import *


In [184]:
#get raw lexicon list
en_lex = list(pd.read_csv("https://raw.githubusercontent.com/aigents/aigents-java/master/lexicon_english.txt",sep='\t',header=None,na_filter=False).to_records(index=False))
print(len(en_lex))

#debug raw lexicon
print(max(en_lex,key=lambda item:item[1]))
en_lex_dict = weightedlist2dict(en_lex,lower=True) # no case-insensitive merge
print(len(en_lex_dict))


97565
('the', 53097401)
97565


In [185]:
lex_en_base10 = FreedomTokenizer(max_n=10,mode='chars',debug=False)
lex_en_base10.train(en_lex_dict)
lex_en_base10.store('data/models/lex_en_counted_10')
print(lex_en_base10.count_params())


1684498


In [186]:
en_ref_tokenizer = PrefixSuffixMorphoTokenizerCached(["./data/corpora/English/morphology/prefixes.txt"],
                                   ["./data/corpora/English/morphology/suffixes.txt"])


In [187]:
texts = ['interestingly', # ['inter', 'est', 'ing', 'ly']
         'universities',
         'antidisestablishmentarianism', # anti-dis-establish-ment-ar-i-an-ism
         'decentralisations',
         'cities',
         'ping']
for text in texts:
    print(en_ref_tokenizer.tokenize(text)) 


['inter', 'est', 'ing', 'ly']
['uni', 'v', 'er', 's', 'ities']
['anti', 'dis', 'establ', 'ish', 'ment', 'arian', 'ism']
['dec', 'ent', 'rali', 's', 'ations']
['c', 'ities']
['p', 'ing']


In [191]:
# test coprus based on https://arxiv.org/pdf/2005.06606.pdf
# columns: 1 manual, 2 BPE, 3 DPE 
tokenizations =[
[['re','cogn','ise','s'],['recognises'],['recognise','s']],
[['advocate','s'],['advocates'],['advocate','s']],
[['euro','zone'],['eurozone'],['euro','zone']],
[['under','line','s'],['underlines'],['underline','s']],
[['strength','en','s'],['strengthens'],['strengthen','s']],
[['entrepreneur','ship'],['entrepreneurship'],['entrepreneur','ship']],
[['ac','knowledge','s'],['acknowledges'],['acknowledge','s']],
[['wine','s'],['wines'],['wine','s']],
[['pre','sent','ly'],['pres','ently'],['present','ly']],
[['fill','ed'],['f','illed'],['fill','ed']],
[['en','dorse','ment'],['endors','ement'],['endorse','ment']],
[['bloc'],['blo','c'],['bl','oc']],
[['crucial','ly'],['cru','cially'],['crucial','ly']],
[['eval','u','ation','s'],['eval','uations'],['evaluation','s']],
[['tree','s'],['tre','es'],['tr','ees']],
[['ticket','s'],['tick','ets'],['tick','et','s']],
[['pre','dict','able'],['predic','table'],['predict','able']],
[['multi','lateral','ism'],['multilater','alism'],['multilateral','ism']],
[['rat','ing','s'],['rat','ings'],['rating','s']],
[['pre','dict','ed'],['predic','ted'],['predict','ed']],
[['motive','s'],['mo','tives'],['motiv','es']],
[['re','in','force','s'],['reinfor','ces'],['reinforce','s']],
[['proto','col','s'],['pro','tocols'],['protocol','s']],
[['progress','ive','ly'],['pro','gressively'],['progressive','ly']],
[['skill'],['sk','ill'],['ski','ll']],
[['prevail','s'],['preva','ils'],['prevail','s']],
[['de','cent','ral','isation'],['decent','ralisation'],['decent','ral','isation']],
[['stor','ed'],['sto','red'],['stor','ed']],
[['in','fluen','za'],['influ','enz','a'],['influen','za']],
[['margin','al','is','ed'],['margin','alised'],['marginal','ised']],
[['stay','ing'],['sta','ying'],['stay','ing']],
[['intensi','ty'],['intens','ity'],['intensi','ty']],
[['re','cast'],['rec','ast'],['re','cast']],
[['guide','line'],['guid','eline'],['guide','line']],
[['em','bark','ed'],['emb','arked'],['embark','ed']],
[['out','line','s'],['out','lines'],['outline','s']],
[['scenario','s'],['scen','ari','os'],['scenario','s']],
[['nati','ve'],['n','ative'],['na','tive']],
[['pre','vent','at','ive'],['preven','tative'],['prevent','ative']],
[['home','land'],['hom','eland'],['home','land']],
[['bath','ing'],['bat','hing'],['bath','ing']],
[['en','danger','ed'],['endang','ered'],['endanger','ed']],
[['continent','al'],['cont','inen','tal'],['continent','al']],
[['ten','th'],['t','enth'],['ten','th']],
[['vulner','abil','ity'],['vul','n','era','bility'],['vul','ner','ability']],
[['realis','ing'],['realis','ing'],['real','ising']],
[['tight','er'],['t','ighter'],['tight','er']]
]
tokenizations = [[''.join(i[0]),i[0],i[1],i[2]] for i in tokenizations]
for i in tokenizations:
    assert len(i)==4


In [195]:
base = FreedomTokenizer(name='data/models/lex_en_counted_10',max_n=10,mode='chars',debug=False)
tf_tokenizer = FreedomBasedTokenizer(base,'peak-','peak+')
n = [7]
t = 0.9
tf_tokenizer.set_options(nlist = n, threshold=t)
f1 = [0,0,0,0]
for t in tokenizations:    
    man = t[1] # manual
    ref = en_ref_tokenizer.tokenize(t[0])
    bpe = t[2]
    dpe = t[3]
    tf = tf_tokenizer.tokenize(t[0])
    ref_f1 = calc_f1(man,ref)
    bpe_f1 = calc_f1(man,bpe)
    dpe_f1 = calc_f1(man,dpe)
    tf_f1 = calc_f1(man,tf)
    if tf_f1 < 1.0:
        print(round(tf_f1,2),'\t',man,'\t',tf)
    f1[0] += ref_f1
    f1[1] += bpe_f1
    f1[2] += dpe_f1
    f1[3] += tf_f1

f1 = [round(f/len(tokenizations),2) for f in f1]
f1


0.57 	 ['re', 'cogn', 'ise', 's'] 	 ['re', 'cogn', 'ises']
0 	 ['advocate', 's'] 	 ['ad', 'voc', 'ates']
0.4 	 ['euro', 'zone'] 	 ['euro', 'z', 'one']
0.33 	 ['under', 'line', 's'] 	 ['under', 'l', 'ines']
0 	 ['strength', 'en', 's'] 	 ['stre', 'ngth', 'ens']
0 	 ['entrepreneur', 'ship'] 	 ['entre', 'preneur', 'sh', 'ip']
0 	 ['ac', 'knowledge', 's'] 	 ['ack', 'nowledg', 'es']
0 	 ['wine', 's'] 	 ['w', 'i', 'n', 'es']
0.33 	 ['pre', 'sent', 'ly'] 	 ['pre', 's', 'ently']
0 	 ['fill', 'ed'] 	 ['f', 'i', 'lled']
0 	 ['en', 'dorse', 'ment'] 	 ['endo', 'rsem', 'ent']
0 	 ['bloc'] 	 ['b', 'lo', 'c']
0.33 	 ['crucial', 'ly'] 	 ['cru', 'ci', 'al', 'ly']
0 	 ['eval', 'u', 'ation', 's'] 	 ['eva', 'lu', 'ations']
0 	 ['tree', 's'] 	 ['tre', 'es']
0 	 ['ticket', 's'] 	 ['ti', 'ck', 'ets']
0.57 	 ['multi', 'lateral', 'ism'] 	 ['multi', 'later', 'al', 'ism']
0 	 ['rat', 'ing', 's'] 	 ['ra', 't', 'i', 'ngs']
0 	 ['motive', 's'] 	 ['mo', 'tiv', 'es']
0.29 	 ['re', 'in', 'force', 's'] 	 ['re', 'inforc'

[0.46, 0.05, 0.55, 0.25]

In [211]:
print('| Reference | Morphology-based | BPE | DPE | Transtion-freedom-based |')
print('|---|---|---|---|---|')
base = FreedomTokenizer(name='data/models/lex_en_counted_10',max_n=10,mode='chars',debug=False)
tf_tokenizer = FreedomBasedTokenizer(base,'peak-','peak+')
n = [7]
t = 0.9
tf_tokenizer.set_options(nlist = n, threshold=t)
f1 = [0,0,0,0]
i = 0
for t in tokenizations:
    i +=1
    man = t[1] # manual
    ref = en_ref_tokenizer.tokenize(t[0])
    bpe = t[2]
    dpe = t[3]
    tf = tf_tokenizer.tokenize(t[0])
    ref_f1 = calc_f1(man,ref)
    bpe_f1 = calc_f1(man,bpe)
    dpe_f1 = calc_f1(man,dpe)
    tf_f1 = calc_f1(man,tf)
    f1[0] += ref_f1
    f1[1] += bpe_f1
    f1[2] += dpe_f1
    f1[3] += tf_f1
    if i % 3 == 0:
        print('|',man,'|',ref,'|',bpe,'|',dpe,'|',tf,'|')
f1 = [round(f/len(tokenizations),2) for f in f1]
#print('||**',f1[0],'**|**',f1[1],'**|**',f1[2],'**|**',f1[3],'**|**')
print('|**F1**|**{}**|**{}**|**{}**|**{}**|'.format(f1[0],f1[1],f1[2],f1[3]))


| Reference | Morphology-based | BPE | DPE | Transtion-freedom-based |
|---|---|---|---|---|
| ['euro', 'zone'] | ['eu', 'rozone'] | ['eurozone'] | ['euro', 'zone'] | ['euro', 'z', 'one'] |
| ['entrepreneur', 'ship'] | ['ent', 're', 'pre', 'neur', 'ship'] | ['entrepreneurship'] | ['entrepreneur', 'ship'] | ['entre', 'preneur', 'sh', 'ip'] |
| ['pre', 'sent', 'ly'] | ['pre', 's', 'ent', 'ly'] | ['pres', 'ently'] | ['present', 'ly'] | ['pre', 's', 'ently'] |
| ['bloc'] | ['bloc'] | ['blo', 'c'] | ['bl', 'oc'] | ['b', 'lo', 'c'] |
| ['tree', 's'] | ['tr', 'ee', 's'] | ['tre', 'es'] | ['tr', 'ees'] | ['tre', 'es'] |
| ['multi', 'lateral', 'ism'] | ['multi', 'lat', 'er', 'al', 'ism'] | ['multilater', 'alism'] | ['multilateral', 'ism'] | ['multi', 'later', 'al', 'ism'] |
| ['motive', 's'] | ['mot', 'ive', 's'] | ['mo', 'tives'] | ['motiv', 'es'] | ['mo', 'tiv', 'es'] |
| ['progress', 'ive', 'ly'] | ['pro', 'gr', 'ess', 'ive', 'ly'] | ['pro', 'gressively'] | ['progressive', 'ly'] | ['pro', 'g

| Reference | Morphology-based | BPE | DPE | Transtion-freedom-based |
|---|---|---|---|---|
| ['euro', 'zone'] | ['eu', 'rozone'] | ['eurozone'] | ['euro', 'zone'] | ['euro', 'z', 'one'] |
| ['entrepreneur', 'ship'] | ['ent', 're', 'pre', 'neur', 'ship'] | ['entrepreneurship'] | ['entrepreneur', 'ship'] | ['entre', 'preneur', 'sh', 'ip'] |
| ['pre', 'sent', 'ly'] | ['pre', 's', 'ent', 'ly'] | ['pres', 'ently'] | ['present', 'ly'] | ['pre', 's', 'ently'] |
| ['bloc'] | ['bloc'] | ['blo', 'c'] | ['bl', 'oc'] | ['b', 'lo', 'c'] |
| ['tree', 's'] | ['tr', 'ee', 's'] | ['tre', 'es'] | ['tr', 'ees'] | ['tre', 'es'] |
| ['multi', 'lateral', 'ism'] | ['multi', 'lat', 'er', 'al', 'ism'] | ['multilater', 'alism'] | ['multilateral', 'ism'] | ['multi', 'later', 'al', 'ism'] |
| ['motive', 's'] | ['mot', 'ive', 's'] | ['mo', 'tives'] | ['motiv', 'es'] | ['mo', 'tiv', 'es'] |
| ['progress', 'ive', 'ly'] | ['pro', 'gr', 'ess', 'ive', 'ly'] | ['pro', 'gressively'] | ['progressive', 'ly'] | ['pro', 'gressiv', 'ely'] |
| ['de', 'cent', 'ral', 'isation'] | ['dec', 'ent', 'r', 'al', 'isation'] | ['decent', 'ralisation'] | ['decent', 'ral', 'isation'] | ['de', 'centralis', 'ation'] |
| ['margin', 'al', 'is', 'ed'] | ['marginali', 's', 'ed'] | ['margin', 'alised'] | ['marginal', 'ised'] | ['mar', 'ginal', 'is', 'ed'] |
| ['re', 'cast'] | ['re', 'cast'] | ['rec', 'ast'] | ['re', 'cast'] | ['re', 'c', 'ast'] |
| ['out', 'line', 's'] | ['out', 'l', 'ine', 's'] | ['out', 'lines'] | ['outline', 's'] | ['out', 'l', 'ines'] |
| ['pre', 'vent', 'at', 'ive'] | ['pre', 'v', 'ent', 'ative'] | ['preven', 'tative'] | ['prevent', 'ative'] | ['pre', 'vent', 'ative'] |
| ['en', 'danger', 'ed'] | ['end', 'an', 'g', 'er', 'ed'] | ['endang', 'ered'] | ['endanger', 'ed'] | ['en', 'dang', 'ered'] |
| ['vulner', 'abil', 'ity'] | ['vulnerabil', 'ity'] | ['vul', 'n', 'era', 'bility'] | ['vul', 'ner', 'ability'] | ['vul', 'ner', 'ability'] |
|**F1**|**0.46**|**0.05**|**0.55**|**0.25**|

In [200]:
base = FreedomTokenizer(name='data/models/lex_en_counted_10',max_n=10,mode='chars',debug=False)
tf_tokenizer = FreedomBasedTokenizer(base,'peak-','peak+')
for n in [[1],[2],[3],[4],[5],[6],[7]]:
    for th in [0.5,0.7,0.9,0.95]:
        tf_tokenizer.set_options(nlist = n, threshold=th)
        f1 = [0,0,0,0]
        for t in tokenizations:    
            man = t[1] # manual
            ref = en_ref_tokenizer.tokenize(t[0])
            bpe = t[2]
            dpe = t[3]
            tf = tf_tokenizer.tokenize(t[0])
            ref_f1 = calc_f1(man,ref)
            bpe_f1 = calc_f1(man,bpe)
            dpe_f1 = calc_f1(man,dpe)
            tf_f1 = calc_f1(man,tf)
            #if tf_f1 < 1.0:
            #    print(round(tf_f1,2),'\t',man,'\t',tf)
            f1[0] += ref_f1
            f1[1] += bpe_f1
            f1[2] += dpe_f1
            f1[3] += tf_f1
        f1 = [round(f/len(tokenizations),2) for f in f1]
        print(n,th,f1)


[1] 0.5 [0.46, 0.05, 0.55, 0.09]
[1] 0.7 [0.46, 0.05, 0.55, 0.09]
[1] 0.9 [0.46, 0.05, 0.55, 0.09]
[1] 0.95 [0.46, 0.05, 0.55, 0.09]
[2] 0.5 [0.46, 0.05, 0.55, 0.12]
[2] 0.7 [0.46, 0.05, 0.55, 0.13]
[2] 0.9 [0.46, 0.05, 0.55, 0.14]
[2] 0.95 [0.46, 0.05, 0.55, 0.14]
[3] 0.5 [0.46, 0.05, 0.55, 0.18]
[3] 0.7 [0.46, 0.05, 0.55, 0.2]
[3] 0.9 [0.46, 0.05, 0.55, 0.21]
[3] 0.95 [0.46, 0.05, 0.55, 0.18]
[4] 0.5 [0.46, 0.05, 0.55, 0.26]
[4] 0.7 [0.46, 0.05, 0.55, 0.26]
[4] 0.9 [0.46, 0.05, 0.55, 0.26]
[4] 0.95 [0.46, 0.05, 0.55, 0.24]
[5] 0.5 [0.46, 0.05, 0.55, 0.24]
[5] 0.7 [0.46, 0.05, 0.55, 0.25]
[5] 0.9 [0.46, 0.05, 0.55, 0.27]
[5] 0.95 [0.46, 0.05, 0.55, 0.25]
[6] 0.5 [0.46, 0.05, 0.55, 0.25]
[6] 0.7 [0.46, 0.05, 0.55, 0.25]
[6] 0.9 [0.46, 0.05, 0.55, 0.25]
[6] 0.95 [0.46, 0.05, 0.55, 0.24]
[7] 0.5 [0.46, 0.05, 0.55, 0.26]
[7] 0.7 [0.46, 0.05, 0.55, 0.25]
[7] 0.9 [0.46, 0.05, 0.55, 0.25]
[7] 0.95 [0.46, 0.05, 0.55, 0.24]


### Check if limiting training set by word frequency helps to improve morho-parsing F1 (0.0005-0.001 is the best)

#### Training in https://github.com/aigents/pygents/blob/main/notebooks/nlp/morphology/morphology_lexicon_en_ru.ipynb


In [217]:
for model in ['data/models/lex_en_counted_10','data/models/lex_en_counted_10_0005','data/models/lex_en_counted_10_001',
              'data/models/lex_en_counted_10_005','data/models/lex_en_counted_10_01']:
    best_tf_f1 = 0
    best_f1 = None
    base = FreedomTokenizer(name=model,max_n=10,mode='chars',debug=False)
    tf_tokenizer = FreedomBasedTokenizer(base,'peak-','peak+')
    for n in [[1],[2],[3],[4],[5],[6],[7]]:
        for th in [0.5,0.7,0.9,0.95]:
            tf_tokenizer.set_options(nlist = n, threshold=th)
            f1 = [0,0,0,0]
            for t in tokenizations:    
                man = t[1] # manual
                ref = en_ref_tokenizer.tokenize(t[0])
                bpe = t[2]
                dpe = t[3]
                tf = tf_tokenizer.tokenize(t[0])
                ref_f1 = calc_f1(man,ref)
                bpe_f1 = calc_f1(man,bpe)
                dpe_f1 = calc_f1(man,dpe)
                tf_f1 = calc_f1(man,tf)
                #if tf_f1 < 1.0:
                #    print(round(tf_f1,2),'\t',man,'\t',tf)
                f1[0] += ref_f1
                f1[1] += bpe_f1
                f1[2] += dpe_f1
                f1[3] += tf_f1
            f1 = [round(f/len(tokenizations),2) for f in f1]
            #print(n,th,f1)
            if best_tf_f1 < f1[3]:
                best_tf_f1 = f1[3]
                best_f1 = f1
    print(model,f1)


data/models/lex_en_counted_10 [0.46, 0.05, 0.55, 0.24]
data/models/lex_en_counted_10_0005 [0.46, 0.05, 0.55, 0.28]
data/models/lex_en_counted_10_001 [0.46, 0.05, 0.55, 0.28]
data/models/lex_en_counted_10_005 [0.46, 0.05, 0.55, 0.21]
data/models/lex_en_counted_10_01 [0.46, 0.05, 0.55, 0.08]


In [219]:
base = FreedomTokenizer(name='data/models/lex_en_nocount_7',max_n=7,mode='chars',debug=False)
for model_threshold in [0,0.0001,0.0005,0.001,0.005,0.01,0.05,0.1,0.5]:
    if model_threshold > 0:
        model_compress_with_loss(base.model,model_threshold)
    best_tf_f1 = 0
    best_f1 = None
    tf_tokenizer = FreedomBasedTokenizer(base,'peak-','peak+')
    for n in [[1],[2],[3],[4],[5],[6],[7]]:
        for th in [0.5,0.7,0.9,0.95]:
            tf_tokenizer.set_options(nlist = n, threshold=th)
            f1 = [0,0,0,0]
            for t in tokenizations:    
                man = t[1] # manual
                ref = en_ref_tokenizer.tokenize(t[0])
                bpe = t[2]
                dpe = t[3]
                tf = tf_tokenizer.tokenize(t[0])
                ref_f1 = calc_f1(man,ref)
                bpe_f1 = calc_f1(man,bpe)
                dpe_f1 = calc_f1(man,dpe)
                tf_f1 = calc_f1(man,tf)
                #if tf_f1 < 1.0:
                #    print(round(tf_f1,2),'\t',man,'\t',tf)
                f1[0] += ref_f1
                f1[1] += bpe_f1
                f1[2] += dpe_f1
                f1[3] += tf_f1
            f1 = [round(f/len(tokenizations),2) for f in f1]
            #print(n,th,f1)
            if best_tf_f1 < f1[3]:
                best_tf_f1 = f1[3]
                best_f1 = f1
    print(model_threshold,base.count_params(),f1)

0 1257863 [0.46, 0.05, 0.55, 0.24]
0.0001 923736 [0.46, 0.05, 0.55, 0.24]
0.0005 899442 [0.46, 0.05, 0.55, 0.24]
0.001 896201 [0.46, 0.05, 0.55, 0.23]
0.005 891853 [0.46, 0.05, 0.55, 0.24]
0.01 888733 [0.46, 0.05, 0.55, 0.25]
0.05 863488 [0.46, 0.05, 0.55, 0.24]
0.1 834719 [0.46, 0.05, 0.55, 0.27]
0.5 689472 [0.46, 0.05, 0.55, 0.19]


## Experiment with MI 

In [60]:
base = FreedomTokenizer(name='data/models/brown_nolines_chars_7a',max_n=7,mode='chars',debug=False)


In [62]:
base.model[0]['a']

381728

In [180]:
from math import exp, log
def root_n(x,n):
    return exp(log(x)/n)

def eval_split_log_div_cnt(base,split,debug=False):
    f = base.model[0]
    p = 1
    for t in split:
        cnt = f[t] if t in f else 0
        cnt /= len(t)
        v = math.log2(cnt)
        p = p * v
    return p

def eval_split_root(base,split,debug=False):
    f = base.model[0]
    p = 1.0
    for t in split:
        cnt = f[t] if t in f else 0
        p = p * cnt * len(t)
    if debug:
        print(p)
    return p if p == 0 or len(split) == 1 else root_n(p,len(split))
    
def eval_avg_log(base,split,extra_len_discount=False,debug=False):
    f = base.model[0]
    p = 0.0
    for t in split:
        cnt = f[t] if t in f else 0
        if extra_len_discount:
            p = p + math.log(cnt*len(t)*len(t)+1)
        else:
            p = p + math.log(cnt*len(t)+1)
    if debug:
        print(p)
    return p / len(split)
    
def eval_splits(base,splits,extra_len_discount=False,debug=True):
    emax = 0
    best = None
    for split in splits:
        e = eval_avg_log(base,split,extra_len_discount=extra_len_discount)
        if emax < e:
            emax = e
            best = split
        if debug:
            print(round(e,2),split)
    return best


In [181]:
print(eval_splits(lex_en_base10,[['lighter'],
    ['l', 'ighter'],['li', 'ghter'],['lig', 'hter'],['ligh', 'ter'],['light', 'er'],['lighte', 'r'],
    ['l', 'i','ghter'],['l', 'ig','hter'],['l', 'igh','ter'],['l', 'ight','er'],['l', 'ighte','r'],
    ['li', 'g','hter'],['li', 'gh','ter'],['li', 'ght','er'],['li', 'ghte','r'],
    ['lig', 'h','ter'],['lig', 'ht','er'],['lig', 'hte','r'],['ligh', 't','er'],['ligh', 'te','r'],
    ['light', 'e','r']]))

11.04 ['lighter']
15.39 ['l', 'ighter']
15.4 ['li', 'ghter']
14.1 ['lig', 'hter']
15.94 ['ligh', 'ter']
16.71 ['light', 'er']
16.07 ['lighte', 'r']
17.21 ['l', 'i', 'ghter']
16.17 ['l', 'ig', 'hter']
17.41 ['l', 'igh', 'ter']
17.85 ['l', 'ight', 'er']
17.28 ['l', 'ighte', 'r']
16.21 ['li', 'g', 'hter']
17.0 ['li', 'gh', 'ter']
17.38 ['li', 'ght', 'er']
16.88 ['li', 'ghte', 'r']
17.08 ['lig', 'h', 'ter']
16.45 ['lig', 'ht', 'er']
16.0 ['lig', 'hte', 'r']
17.6 ['ligh', 't', 'er']
17.3 ['ligh', 'te', 'r']
18.0 ['light', 'e', 'r']
['light', 'e', 'r']


In [182]:
print(eval_splits(lex_en_base10,[['lighter'],
    ['l', 'ighter'],['li', 'ghter'],['lig', 'hter'],['ligh', 'ter'],['light', 'er'],['lighte', 'r'],
    ['l', 'i','ghter'],['l', 'ig','hter'],['l', 'igh','ter'],['l', 'ight','er'],['l', 'ighte','r'],
    ['li', 'g','hter'],['li', 'gh','ter'],['li', 'ght','er'],['li', 'ghte','r'],
    ['lig', 'h','ter'],['lig', 'ht','er'],['lig', 'hte','r'],['ligh', 't','er'],['ligh', 'te','r'],
    ['light', 'e','r']],extra_len_discount=True))

12.99 ['lighter']
16.28 ['l', 'ighter']
16.55 ['li', 'ghter']
15.35 ['lig', 'hter']
17.18 ['ligh', 'ter']
17.86 ['light', 'er']
16.97 ['lighte', 'r']
17.75 ['l', 'i', 'ghter']
16.86 ['l', 'ig', 'hter']
18.14 ['l', 'igh', 'ter']
18.54 ['l', 'ight', 'er']
17.82 ['l', 'ighte', 'r']
16.9 ['li', 'g', 'hter']
17.83 ['li', 'gh', 'ter']
18.21 ['li', 'ght', 'er']
17.57 ['li', 'ghte', 'r']
17.81 ['lig', 'h', 'ter']
17.28 ['lig', 'ht', 'er']
16.73 ['lig', 'hte', 'r']
18.3 ['ligh', 't', 'er']
17.99 ['ligh', 'te', 'r']
18.53 ['light', 'e', 'r']
['l', 'ight', 'er']


In [170]:
print(eval_splits(base,[['t', 'ighter'],['ti', 'ghter'],['tig', 'hter'],['tigh', 'ter'],['tight', 'er'],['tighte', 'r']]))
print(eval_splits(base,[['l', 'ighter'],['li', 'ghter'],['lig', 'hter'],['ligh', 'ter'],['light', 'er'],['lighte', 'r']]))


10.34 ['t', 'ighter']
10.28 ['ti', 'ghter']
7.99 ['tig', 'hter']
9.28 ['tigh', 'ter']
9.99 ['tight', 'er']
9.53 ['tighte', 'r']
['t', 'ighter']
9.93 ['l', 'ighter']
9.96 ['li', 'ghter']
8.8 ['lig', 'hter']
10.65 ['ligh', 'ter']
11.36 ['light', 'er']
10.38 ['lighte', 'r']
['light', 'er']


In [163]:
print(eval_splits(lex_en_base10,[['t', 'ighter'],['ti', 'ghter'],['tig', 'hter'],['tigh', 'ter'],['tight', 'er'],['tighte', 'r']]))
print(eval_splits(lex_en_base10,[['l', 'ighter'],['li', 'ghter'],['lig', 'hter'],['ligh', 'ter'],['light', 'er'],['lighte', 'r']]))


31.6 ['t', 'ighter']
31.57 ['ti', 'ghter']
26.76 ['tig', 'hter']
29.18 ['tigh', 'ter']
30.72 ['tight', 'er']
30.12 ['tighte', 'r']
['t', 'ighter']
30.78 ['l', 'ighter']
30.8 ['li', 'ghter']
28.21 ['lig', 'hter']
31.87 ['ligh', 'ter']
33.41 ['light', 'er']
32.14 ['lighte', 'r']
['light', 'er']


In [164]:
print(eval_splits(lex_en_base10,[
    ['lighter'],
    ['l', 'ighter'],['li', 'ghter'],['lig', 'hter'],['ligh', 'ter'],['light', 'er'],['lighte', 'r'],
    ['l', 'i','ghter'],['l', 'ig','hter'],['l', 'igh','ter'],['l', 'ight','er'],['l', 'ighte','r'],
    ['li', 'g','hter'],['li', 'gh','ter'],['li', 'ght','er'],['li', 'ghte','r'],
    ['lig', 'h','ter'],['lig', 'ht','er'],['lig', 'hte','r'],
    ['ligh', 't','er'],['ligh', 'te','r'],
    ['light', 'e','r'],
]))


22.08 ['lighter']
30.78 ['l', 'ighter']
30.8 ['li', 'ghter']
28.21 ['lig', 'hter']
31.87 ['ligh', 'ter']
33.41 ['light', 'er']
32.14 ['lighte', 'r']
34.42 ['l', 'i', 'ghter']
32.33 ['l', 'ig', 'hter']
34.81 ['l', 'igh', 'ter']
35.69 ['l', 'ight', 'er']
34.57 ['l', 'ighte', 'r']
32.41 ['li', 'g', 'hter']
33.99 ['li', 'gh', 'ter']
34.76 ['li', 'ght', 'er']
33.76 ['li', 'ghte', 'r']
34.16 ['lig', 'h', 'ter']
32.9 ['lig', 'ht', 'er']
31.99 ['lig', 'hte', 'r']
35.2 ['ligh', 't', 'er']
34.59 ['ligh', 'te', 'r']
35.99 ['light', 'e', 'r']
['light', 'e', 'r']


In [160]:
eval_split_root(lex_en_base10,['l', 'ig','hter'],True)

1.1587747904004283e+21


10503475.162330275