# References

- https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2655800/
- https://lena-voita.github.io/nlp_course/language_modeling.html
- https://en.wikipedia.org/wiki/Perplexity
- https://github.com/singnet/language-learning/issues/255
- https://medium.com/mlearning-ai/word-embeddings-wordpiece-and-language-agnostic-bert-labse-98c7626878c7


- https://github.com/natasha/razdel - razdel tries to mimic segmentation of these 4 datasets: SynTagRus, OpenCorpora, GICRYA and RNC. 
- https://www.kaggle.com/c/text-normalization-challenge-english-language
- https://www.kaggle.com/c/text-normalization-challenge-russian-language




In [1]:
import os
import sys

# Locate the project root: the working-directory path up to and including the
# first occurrence of 'pygents', so the notebook can be run from a subfolder.
cwd = os.getcwd()
if 'pygents' not in cwd:
    # Without this guard str.find() returns -1 and the slice below silently
    # yields a meaningless 6-character prefix of the working directory.
    raise RuntimeError("Cannot locate project root: 'pygents' not found in working directory " + cwd)
project_path = cwd[:cwd.find('pygents')+len('pygents')]
if project_path not in sys.path: sys.path.append(project_path)
os.chdir(project_path)

#from importlib import reload  # Python 3.4+

import pickle
import pandas as pd

# Force re-import: drop any cached pygents modules so that the imports below
# pick up the current source instead of a copy cached by an earlier cell run.
sys.modules.pop('pygents.util', None)
sys.modules.pop('pygents.text', None)
sys.modules.pop('pygents.plot', None)
sys.modules.pop('pygents.token', None)


from pygents.token import *
from pygents.text import *
from pygents.util import *
from pygents.plot import plot_bars, plot_dict


In [2]:
# Character-mode tokenizer model (max_n=7) under the given model name/path;
# presumably loads previously trained data from that path — TODO confirm in pygents.token.
brown_chars = FreedomTokenizer(name='data/models/brown_nolines_chars_7a',max_n=7,mode='chars',debug=False)
# Print the value reported by count_params() for this model.
print(brown_chars.count_params())

# Same setup in 'grams' mode.
brown_grams = FreedomTokenizer(name='data/models/brown_nolines_grams_7a',max_n=7,mode='grams',debug=False)
print(brown_grams.count_params())


10967135
33960499


In [3]:
# Obtain the Brown corpus text via the url_text_lines helper (requires network access)
# and print how many items it returned (presumably one per text line — TODO confirm).
brown_text_lines = url_text_lines("http://www.sls.hawaii.edu/bley-vroman/brown_nolines.txt")
print(len(brown_text_lines))


19810


In [4]:
# Build the evaluation set from the corpus lines with text_lines_sample.
# NOTE(review): presumably 10 is the sample size and [" ","#"] lists line content to skip —
# confirm against the helper in pygents.
test_texts = text_lines_sample(brown_text_lines,10,[" ","#"])


In [18]:
def tokenize_with_forward_metric(model,text,forw,nlist,threshold=0.5,debug=False):
    """Split text into tokens using a single ("forward") metric.

    The per-gram profile is obtained from
    profile_freedoms_avg_df(model, text, [forw], nlist); this function reads its
    'gram' column and the column named by forw. Grams are accumulated into the
    current token, and the token is closed right after every gram whose metric
    value is >= threshold.

    Args:
        model: model passed through to profile_freedoms_avg_df.
        text: text to tokenize.
        forw: name of the metric column used to detect token boundaries.
        nlist: list of N values passed through to profile_freedoms_avg_df.
        threshold: minimum metric value that triggers a token break.
        debug: when True, print one tab-separated row per gram: the gram,
            '+' if a break follows it, the rounded metric value and the token
            accumulated so far.

    Returns:
        List of token strings in text order.
    """
    tokens = []
    token = ''
    df = profile_freedoms_avg_df(model,text,[forw],nlist)
    for i in range(len(df)):
        # Label-based lookup done once per row; relies on the frame having
        # labels 0..len(df)-1, exactly as the original per-cell lookups did.
        row = df.loc[i]
        gram = row['gram']
        value = row[forw]
        brk_forw = bool(value >= threshold)
        token += gram
        if debug:
            # Only the forward metric exists in this function; the earlier
            # version also printed an undefined `back` column and raised NameError.
            print("{}\t{}\t{}\t{}".format(gram,'+' if brk_forw else '',round(value,2),token))
        if token and brk_forw:
            tokens.append(token)
            token = ''
    # Flush the trailing token that was not closed by a break.
    if token:
        tokens.append(token)
    return tokens

def evaluate_tokenizer(model,texts,forw,back,nlist,threshold,spaces=False,debug=False):
    """Tokenize every text, score it against the reference tokenization and print the mean F1.

    Each text is tokenized with tokenize_with_opposite_metrics when back is
    given, otherwise with tokenize_with_forward_metric. The reference comes from
    tokenize_split_with_delimiters_and_quotes, and the two are compared with calc_f1.

    Args:
        model: model passed through to the tokenizer function.
        texts: sized collection of texts to evaluate (len() is taken of it).
        forw: name of the forward metric.
        back: name of the backward metric, or None to use the forward metric only.
        nlist: list of N values passed through to the tokenizer function.
        threshold: break threshold passed through to the tokenizer function.
        spaces: when False, ' ' tokens are removed from both token lists before scoring.
        debug: when True, print per-text F1, the text, the token diff and both token lists.

    Returns:
        None. Prints one tab-separated row: nlist, threshold and the mean F1
        rounded to 2 digits (0.0 when texts is empty).
    """
    f1_total = 0
    for text in texts:
        if back is not None:
            tokens = tokenize_with_opposite_metrics(model,text,forw,back,nlist,threshold=threshold)
        else:
            tokens = tokenize_with_forward_metric(model,text,forw,nlist,threshold=threshold)
        tokens_ref = tokenize_split_with_delimiters_and_quotes(text)
        if not spaces:
            # remove_all's return value is unused, so it presumably edits the lists in place.
            remove_all(tokens,' ')
            remove_all(tokens_ref,' ')
        f1 = calc_f1(tokens_ref,tokens)
        f1_total += f1
        if debug:
            print(f1)
            print(text)
            print(calc_diff(tokens,tokens_ref))
            print(str(tokens_ref))
            print(str(tokens))
            print()
    # Guard against ZeroDivisionError when there is nothing to evaluate.
    count = len(texts)
    f1_avg = f1_total/count if count > 0 else 0.0
    print("{}\t{}\t{}".format(nlist,threshold,round(f1_avg,2)))


In [6]:
# Grid search on the character-mode model: every N-list is combined with every
# threshold, passing 'ddf-' as the forward and 'ddf+' as the backward metric.
# evaluate_tokenizer prints one row (N-list, threshold, mean F1) per combination.
print("N\tthres.\tF1")
for nlist in [[1],[1,2],[2,3],[1,2,3],[1,2,3,4]]:
    for threshold in [0.2,0.3,0.4,0.5,0.6,0.7,0.8]: 
        evaluate_tokenizer(brown_chars.model,test_texts,'ddf-','ddf+',nlist,threshold,spaces=False)


N	thres.	F1
[1]	0.2	0.59
[1]	0.3	0.62
[1]	0.4	0.57
[1]	0.5	0.56
[1]	0.6	0.56
[1]	0.7	0.78
[1]	0.8	0.78
[1, 2]	0.2	0.4
[1, 2]	0.3	0.61
[1, 2]	0.4	0.91
[1, 2]	0.5	0.9
[1, 2]	0.6	0.79
[1, 2]	0.7	0.72
[1, 2]	0.8	0.67
[2, 3]	0.2	0.49
[2, 3]	0.3	0.66
[2, 3]	0.4	0.78
[2, 3]	0.5	0.75
[2, 3]	0.6	0.65
[2, 3]	0.7	0.43
[2, 3]	0.8	0.2
[1, 2, 3]	0.2	0.56
[1, 2, 3]	0.3	0.79
[1, 2, 3]	0.4	0.84
[1, 2, 3]	0.5	0.85
[1, 2, 3]	0.6	0.7
[1, 2, 3]	0.7	0.55
[1, 2, 3]	0.8	0.32
[1, 2, 3, 4]	0.2	0.69
[1, 2, 3, 4]	0.3	0.82
[1, 2, 3, 4]	0.4	0.78
[1, 2, 3, 4]	0.5	0.7
[1, 2, 3, 4]	0.6	0.51
[1, 2, 3, 4]	0.7	0.25
[1, 2, 3, 4]	0.8	0.11


In [8]:
# Same grid search over N-lists and thresholds, but on the 'grams'-mode model;
# 'ddf-' is passed as the forward and 'ddf+' as the backward metric.
print("N\tthres.\tF1")
for nlist in [[1],[1,2],[2,3],[1,2,3],[1,2,3,4]]:
    for threshold in [0.2,0.3,0.4,0.5,0.6,0.7,0.8]: 
        evaluate_tokenizer(brown_grams.model,test_texts,'ddf-','ddf+',nlist,threshold,spaces=False)


N	thres.	F1
[1]	0.2	0.59
[1]	0.3	0.62
[1]	0.4	0.57
[1]	0.5	0.56
[1]	0.6	0.56
[1]	0.7	0.78
[1]	0.8	0.78
[1, 2]	0.2	0.38
[1, 2]	0.3	0.46
[1, 2]	0.4	0.46
[1, 2]	0.5	0.53
[1, 2]	0.6	0.48
[1, 2]	0.7	0.28
[1, 2]	0.8	0.05
[2, 3]	0.2	0.53
[2, 3]	0.3	0.38
[2, 3]	0.4	0.24
[2, 3]	0.5	0.11
[2, 3]	0.6	0.05
[2, 3]	0.7	0.03
[2, 3]	0.8	0.02
[1, 2, 3]	0.2	0.53
[1, 2, 3]	0.3	0.39
[1, 2, 3]	0.4	0.24
[1, 2, 3]	0.5	0.11
[1, 2, 3]	0.6	0.05
[1, 2, 3]	0.7	0.03
[1, 2, 3]	0.8	0.02
[1, 2, 3, 4]	0.2	0.29
[1, 2, 3, 4]	0.3	0.17
[1, 2, 3, 4]	0.4	0.09
[1, 2, 3, 4]	0.5	0.03
[1, 2, 3, 4]	0.6	0.02
[1, 2, 3, 4]	0.7	0.02
[1, 2, 3, 4]	0.8	0.01


In [19]:
# Use a single metric only: 'ddf+|ddf-' is passed as the forward metric and
# back is None, so evaluate_tokenizer takes its tokenize_with_forward_metric branch.
# Runs on the character-mode model over the same N-list / threshold grid.
print("N\tthres.\tF1")
for nlist in [[1],[1,2],[2,3],[1,2,3],[1,2,3,4]]:
    for threshold in [0.2,0.3,0.4,0.5,0.6,0.7,0.8]: 
        evaluate_tokenizer(brown_chars.model,test_texts,'ddf+|ddf-',None,nlist,threshold,spaces=False)


N	thres.	F1
[1]	0.2	0.61
[1]	0.3	0.57
[1]	0.4	0.7
[1]	0.5	0.37
[1]	0.6	0.15
[1]	0.7	0.07
[1]	0.8	0.0
[1, 2]	0.2	0.71
[1, 2]	0.3	0.85
[1, 2]	0.4	0.88
[1, 2]	0.5	0.66
[1, 2]	0.6	0.26
[1, 2]	0.7	0.06
[1, 2]	0.8	0.01
[2, 3]	0.2	0.7
[2, 3]	0.3	0.78
[2, 3]	0.4	0.66
[2, 3]	0.5	0.4
[2, 3]	0.6	0.13
[2, 3]	0.7	0.03
[2, 3]	0.8	0.01
[1, 2, 3]	0.2	0.75
[1, 2, 3]	0.3	0.87
[1, 2, 3]	0.4	0.75
[1, 2, 3]	0.5	0.43
[1, 2, 3]	0.6	0.16
[1, 2, 3]	0.7	0.05
[1, 2, 3]	0.8	0.0
[1, 2, 3, 4]	0.2	0.78
[1, 2, 3, 4]	0.3	0.82
[1, 2, 3, 4]	0.4	0.68
[1, 2, 3, 4]	0.5	0.42
[1, 2, 3, 4]	0.6	0.16
[1, 2, 3, 4]	0.7	0.04
[1, 2, 3, 4]	0.8	0.01
