In [1]:
import numpy as np
from collections import defaultdict
import pylelemmatize
import timeit
import numpy as np



## Benchmarking pylelemmatize mapping operations

In [2]:
# Alphabets handed to the mappers: a tiny four-letter one and the much larger
# MES-3A character set shipped with pylelemmatize.
small_alphabet = "abcd"
large_alphabet = pylelemmatize.charset.mes3a

# Benchmark inputs. The two "abcd" samples contain only characters of the
# small alphabet, the tab-only sample contains none of them, and the 99% sample
# has one tab in every run of 100 characters.
small_sample = "abcd" * 256
existing_sample = "abcd" * 2_560_000
non_existing_sample = "\t" * (4 * 2_560_000)
pcnt99_existing_sample = ("a" * 99 + "\t") * (1024 * 100)

### Running Benchmarks

In [None]:
results = {}
for sample_size ,sample_impurity , sample in [(len(small_sample),0.,small_sample),
                            (len(existing_sample),0.,existing_sample),
                            (len(pcnt99_existing_sample), .01, pcnt99_existing_sample),
                            (len(non_existing_sample),1., non_existing_sample),]:
    for alphabet_size, alphabet_str in [[len(small_alphabet), small_alphabet],[len(large_alphabet), large_alphabet]]:
        mapping_dict = {c:c for c in alphabet_str}
        m1 = lambda x:''.join([mapping_dict.get(c, pylelemmatize.default_unknown_chr) for c in sample])
        d = defaultdict(lambda: pylelemmatize.default_unknown_chr)
        m2 = lambda x:''.join([d[c] for c in sample])
        m3 = pylelemmatize.create_mapper(alphabet_str)
        m4 = pylelemmatize.create_mapper(alphabet_str, mapper_type = "generic")
        m1result = %timeit -o m1(sample)
        m2result = %timeit -o m2(sample)
        m3result = %timeit -o m3(sample)
        m4result = %timeit -o m4(sample)
        results[(sample_size, sample_impurity, alphabet_size)] = {"Generic Lemmatizer": m4result,"Fast Lemmatizer": m3result, "Dict":m1result, "Defaultdict": m2result}

### Rendering benchmark results to a LaTeX table

In [None]:
def format_outputs(sample_results, sample_length, scale_by):
    """Turn raw timings into LaTeX table cells of the form "mean +- std".

    Args:
        sample_results: Mapping of method name to a timing object exposing
            ``average`` and ``stdev`` attributes.
        sample_length: Length of the benchmarked sample; timings are divided
            by it to obtain a per-character cost.
        scale_by: Factor applied to the per-character cost before formatting.

    Returns:
        A dict with the same keys (and order) as ``sample_results`` whose
        values are math-mode strings with one decimal place. Every entry whose
        average equals the smallest average is wrapped in a boldmath group.

    Raises:
        ValueError: If ``sample_results`` is empty.
        ZeroDivisionError: If ``sample_length`` is zero.
    """
    fastest = min(timing.average for timing in sample_results.values())
    cells = {}
    for method, timing in sample_results.items():
        mean_scaled = timing.average / sample_length * scale_by
        spread_scaled = timing.stdev / sample_length * scale_by
        cell = f" $ {mean_scaled:.1f} \\pm {spread_scaled:.1f} $"
        # Highlight the winner(s); ties are all highlighted.
        cells[method] = "\\boldmath{" + cell + "}" if timing.average == fastest else cell
    return cells


def render_line(parameters, results):
    """Render one benchmark configuration as a row of the LaTeX results table.

    Args:
        parameters: Tuple ``(sample_length, sample_missrate, alphabet_sz)``
            where ``sample_missrate`` is the fraction (0..1) of characters in
            the sample that are not in the alphabet.
        results: Mapping of method name to timing object; must contain the keys
            'Dict', 'Defaultdict', 'Generic Lemmatizer' and 'Fast Lemmatizer'.

    Returns:
        The row as a string: sample size in KB, unknown characters in percent,
        alphabet size, then one timing cell per method, terminated by the
        LaTeX row separator.
    """
    sample_length, sample_missrate, alphabet_sz = parameters
    # 1024**2 * 1000 converts seconds per character to milliseconds per 1024**2 characters.
    rendered_results = format_outputs(results, sample_length, 1024**2*1000)
    cells = [
        str(sample_length // 1024),
        # The table column is labelled in percent, so convert the fraction.
        f"{sample_missrate * 100:g}",
        str(alphabet_sz),
    ]
    # Same method order as the table header.
    cells += [rendered_results[name] for name in ("Dict", "Defaultdict", "Generic Lemmatizer", "Fast Lemmatizer")]
    return " & ".join(cells) + " \\\\"

# One table row per benchmarked (sample, alphabet) configuration.
res_lines = [render_line(parameters, measurements) for parameters, measurements in results.items()]

# Raw strings keep every LaTeX backslash literal and avoid Python's
# invalid-escape-sequence warnings (e.g. for "\m" or "\%").
table_header = [
    r"\begin{tabular}{ccc||c|c|c|c}",
    r"\hline",
    # The second group spans all four benchmarked-method columns.
    r"\multicolumn{3}{c||}{Experiment Parameters} & \multicolumn{4}{c}{Benchmarked Methods (msec. \ MB)} \\",
    r"\hline",
    r"\makecell{Sample\\Size (KB)} & ",
    r"\makecell{Unknown\\Characters (\%)} & ",
    r"\makecell{Mapping\\Size} & ",
    r"\makecell{Python\\dict} & ",
    r"\makecell{Python\\defaultdict} & ",
    r"\makecell{Generic\\Lemmatizer} & ",
    r"\makecell{Fast\\Lemmatizer} \\",
    r"\hline",
]

print("\n".join(table_header))
print("\n".join(res_lines))
print("\\hline\n\\end{tabular}")


## Comparing Pylelemmatize to choco-mufin for alphabet extraction

In [4]:
#!pip install chocomufin
print("Running chomufin alphabet extraction")
!time chocomufin generate --parser txt  /tmp/koeningsfelden_expanded.csv ../../tmp/koeningsfelden/koenigsfelden_1308-1662_expanded/*.txt
!rm /tmp/koeningsfelden_expanded.csv # if the file is there, generation behaves very differently.
print("Running pylelemmatizer alphabet extraction")
!time ll_extract_corpus_alphabet -dont_show_histogram -corpus_files ../../tmp/koeningsfelden/koenigsfelden_1308-1662_expanded/*txt

Running chomufin alphabet extraction
100%|█████████████████████████████████████| 1124/1124 [00:00<00:00, 2592.42it/s]
{'D', '7', 'æ', ',', ')', 'C', 'X', '-', 'P', '¶', 'A', 'T', '\\', 'ì', 'g', 'k', 'ɉ', 'ꝑ', 'S', 'ꝓ', '5', 'r', '0', '̀', 'x', 'j', 'Q', ']', 'ē', 'h', 'o', '?', 'È', 'v', 'i', 'e', 'p', 'ÿ', 'ꝯ', '>', '6', '\uf2ea', 'ͦ', 'Ä', 'Ë', 'd', 'ꝙ', 'ͤ', 'B', 'Û', 'c', 'è', 'ï', 'ͥ', 'm', 'F', 'ꝝ', ';', '₰', '̄', 's', '✳', '.', 'R', 'ë', ':', 'ꝫ', 'l', 'ä', '/', '2', 'Ü', '̃', 'u', 'é', '4', 'ƺ', 'ö', 'â', 'L', 'ͧ', 'ü', '₎', 'ꝭ', 'a', 'û', 'n', 'ꝟ', 'î', 'K', 'O', 'N', '(', '_', 'z', 'Â', 'Ò', 'E', '!', 'H', '=', '[', '¬', '3', 'À', '|', '`', 'I', 'ß', 'q', '&', 'ê', 'ù', 't', 'ȝ', 'V', 'ˀ', 'ʼ', 'J', '\uf2e9', 'Y', 'U', 'Ù', 'ͨ', 'b', '\uf2f7', '1', 'ò', 'à', 'ͣ', 'G', 'ꝰ', 'Z', 'Ö', 'W', 'ô', 'w', '9', 'y', '8', 'f', '̂', 'á', 'M', 'ħ'}

real	0m0.547s
user	0m0.517s
sys	0m0.029s
Running pylelemmatizer alphabet extraction

 !&(),-./0123456789:;=>?ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]_