In [3]:
%load_ext autoreload
%autoreload 2

In [4]:
import os
from tqdm.notebook import tqdm

from spelling_correction import DATA_DIR, BENCHMARK_DIR

from gnn_lib.data import tokenization, utils
from gnn_lib.utils import io
from gnn_lib.api.utils import load_text_file

In [18]:
# BPE tokenizer configured from the pickle file stored under DATA_DIR/tokenizers/bpe.
tokenizer = tokenization.BPETokenizer(
    cfg=tokenization.TokenizerConfig(
        type=tokenization.Tokenizers.BPE,
        file_path=os.path.join(
            DATA_DIR,
            "tokenizers",
            "bpe",
            "wiki_bookcorpus_10k_no_prefix_space.pkl"
        )
    )
)
# Second tokenizer (CharTokenizer with its default arguments) used for comparison below.
char_tokenizer = tokenization.CharTokenizer()

# Callables derived from the two tokenizers; these are what get_avg_statistics receives.
tok_fn = tokenization.get_tokenization_fn(tokenizer)
char_tok_fn = tokenization.get_tokenization_fn(char_tokenizer)

In [6]:
# Collect every corrupt.txt found two directory levels below BENCHMARK_DIR/test/sec.
benchmarks = io.glob_safe(
    os.path.join(BENCHMARK_DIR, "test", "sec", "*", "*", "corrupt.txt")
)
benchmarks

['/home/sebastian/msc/masters_thesis/code/spelling_correction/benchmarks/test/sec/wikidump/artificial/corrupt.txt',
 '/home/sebastian/msc/masters_thesis/code/spelling_correction/benchmarks/test/sec/wikidump/realistic/corrupt.txt',
 '/home/sebastian/msc/masters_thesis/code/spelling_correction/benchmarks/test/sec/bookcorpus/artificial/corrupt.txt',
 '/home/sebastian/msc/masters_thesis/code/spelling_correction/benchmarks/test/sec/bookcorpus/realistic/corrupt.txt',
 '/home/sebastian/msc/masters_thesis/code/spelling_correction/benchmarks/test/sec/neuspell/bea60k/corrupt.txt']

In [26]:
def get_trans_edges(tokens: list) -> int:
    """Return the squared total number of tokens in a sequence.

    Args:
        tokens: one sized item per word (e.g. a list of that word's tokens).

    Returns:
        (sum of all item lengths) ** 2, i.e. one edge for every ordered pair
        of tokens in the sequence, self-pairs included.
    """
    num_tokens = 0
    for word_tokens in tokens:
        num_tokens += len(word_tokens)
    return num_tokens * num_tokens

def get_gnn_edges(tokens: list) -> int:
    """Return the edge count of the word/token graph for one sequence.

    Args:
        tokens: one sized item per word (e.g. a list of that word's tokens).

    Returns:
        The sum of
        * (number of words) ** 2 for the word-level part, and
        * k * (k + 1) for every word with k tokens, i.e. k ** 2 edges among
          the word's tokens plus k further edges per word.
    """
    word_level_edges = len(tokens) ** 2
    per_word_edges = 0
    for word_tokens in tokens:
        k = len(word_tokens)
        per_word_edges += k * k + k
    return word_level_edges + per_word_edges

In [27]:
def get_avg_statistics(benchmarks: list, tok_fn) -> tuple:
    """Compute average tokenization and edge statistics over benchmark files.

    Every file is loaded with `load_text_file`, split with
    `utils.tokenize_words_batch(..., return_docs=True)`, and each resulting
    doc is passed to `tok_fn`.

    Args:
        benchmarks: paths of the benchmark text files to process.
        tok_fn: callable applied to each doc; its result is treated as one
            sized item per word (presumably that word's tokens -- confirm
            against `tokenization.get_tokenization_fn`).

    Returns:
        A 4-tuple of floats:
        (average tokens per word, average words per sequence,
         average `get_trans_edges` per sequence,
         average `get_gnn_edges` per sequence).

    Raises:
        ZeroDivisionError: if no word or no sequence was seen at all.
    """
    num_tokens = num_words = num_sequences = 0
    sum_trans_edges = sum_gnn_edges = 0
    for benchmark_file in tqdm(benchmarks, desc="tokenizing benchmarks"):
        lines = load_text_file(benchmark_file)
        # Each batch entry is a pair; only its second element (the doc) is used.
        for _, doc in utils.tokenize_words_batch(lines, return_docs=True):
            words = tok_fn(doc)
            num_sequences += 1
            num_words += len(words)
            num_tokens += sum(len(word) for word in words)
            sum_trans_edges += get_trans_edges(words)
            sum_gnn_edges += get_gnn_edges(words)

    tokens_per_word = num_tokens / num_words
    words_per_sequence = num_words / num_sequences
    trans_edges_per_sequence = sum_trans_edges / num_sequences
    gnn_edges_per_sequence = sum_gnn_edges / num_sequences
    return (
        tokens_per_word,
        words_per_sequence,
        trans_edges_per_sequence,
        gnn_edges_per_sequence,
    )

In [28]:
def transformer_and_gnn_edges(avg_number_of_tokens_per_word: float, avg_number_of_words_per_sequence: float) -> tuple:
    """Estimate transformer and GNN edge counts from average statistics only.

    Unlike summing exact per-sequence counts, this plugs the averages directly
    into the edge formulas. Because the formulas are quadratic, the result is
    generally smaller than the true per-sequence average (the mean of squares
    is at least the square of the mean).

    Args:
        avg_number_of_tokens_per_word: average number of tokens per word.
        avg_number_of_words_per_sequence: average number of words per sequence.

    Returns:
        A 2-tuple `(transformer_num_edges, gnn_num_edges)`.
    """
    # Every token connected to every token of the sequence.
    transformer_num_edges = (avg_number_of_tokens_per_word * avg_number_of_words_per_sequence) ** 2
    gnn_num_edges = (
        avg_number_of_words_per_sequence ** 2 # word fully connected
        + avg_number_of_words_per_sequence * (avg_number_of_tokens_per_word ** 2) # token inside word fully connected
        + avg_number_of_words_per_sequence * avg_number_of_tokens_per_word # token to word
    )
    return transformer_num_edges, gnn_num_edges

#### BPE tokenizer

In [29]:
# Statistics over all benchmark files using the BPE tokenization function:
# (avg tokens per word, avg words per sequence,
#  avg transformer edges per sequence, avg GNN edges per sequence).
# NOTE(review): `g_egdes` is a typo for `g_edges`; the name is kept because later cells read it.
avg_number_of_tokens_per_word, avg_number_of_words_per_sequence, t_edges, g_egdes = get_avg_statistics(benchmarks, tok_fn)

tokenizing benchmarks:   0%|          | 0/5 [00:00<?, ?it/s]

In [31]:
# Display the four averages currently held in these variables
# (tokens per word, words per sequence, transformer edges, GNN edges).
avg_number_of_tokens_per_word, avg_number_of_words_per_sequence, t_edges, g_egdes

(1.4368753332523339, 25.080334614339506, 4045.239053219984, 1885.6976437250107)

In [32]:
# Edge counts estimated from the two averages alone, for comparison with the
# exact per-sequence averages in t_edges / g_egdes.
transformer_and_gnn_edges(avg_number_of_tokens_per_word, avg_number_of_words_per_sequence)

(1298.6880116545738, 716.8416263132389)

#### Char tokenizer

In [33]:
# Same statistics as above, now with the char tokenization function.
# NOTE(review): this rebinds the same four variables, so any values previously
# stored in them are overwritten from here on.
avg_number_of_tokens_per_word, avg_number_of_words_per_sequence, t_edges, g_egdes = get_avg_statistics(benchmarks, char_tok_fn)

tokenizing benchmarks:   0%|          | 0/5 [00:00<?, ?it/s]

In [34]:
# Display the four averages currently held in these variables
# (tokens per word, words per sequence, transformer edges, GNN edges).
avg_number_of_tokens_per_word, avg_number_of_words_per_sequence, t_edges, g_egdes

(4.861724175023932, 25.080334614339506, 45033.58500252319, 2693.3270641667636)

In [35]:
# Edge counts estimated from the two averages alone, for comparison with the
# exact per-sequence averages in t_edges / g_egdes.
transformer_and_gnn_edges(avg_number_of_tokens_per_word, avg_number_of_words_per_sequence)

(14867.819663169304, 1343.764720351728)