In [1]:
# Enable IPython's autoreload extension in mode 2, so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import time
import numpy as np
import torch
import json
from typing import Any
from tqdm.notebook import tqdm

from nsc import SpellingErrorDetector, SpellingErrorCorrector, TokenizationRepairer
from nsc.utils import io
from nsc.api import utils

from spell_checking import BENCHMARK_DIR, DATA_DIR, CONFIG_DIR, EXPERIMENT_DIR
from spell_checking.baselines import sec

### Run experiments on runtime benchmark and record some key stats

In [3]:
def run_experiment(exp: str, task: str, file_path: str, device: str, **kwargs) -> tuple:
    """Load the model of an experiment and time one pass over a benchmark file.

    Only the call that processes the file is timed. Loading the model and
    measuring the file size both happen before the clock is started.

    Args:
        exp: Experiment directory, handed to the `from_experiment` constructor.
        task: One of "sed", "sec" or "tokenization_repair".
        file_path: Path of the UTF-8 text file to process.
        device: Device identifier, handed to the `from_experiment` constructor.
        **kwargs: Forwarded unchanged to `detect_file`, `correct_file` or
            `repair_file`, depending on `task`.

    Returns:
        A tuple `(runtime, file_bytes)`: the elapsed time in seconds as measured
        with `time.perf_counter`, and the length in bytes of the file content
        after decoding it as text and re-encoding it as UTF-8.

    Raises:
        RuntimeError: If `task` is not one of the supported task names.
    """
    # Resolve the task once: load the model and pick the method that processes a file.
    if task == "sed":
        process_file = SpellingErrorDetector.from_experiment(exp, device).detect_file
    elif task == "sec":
        process_file = SpellingErrorCorrector.from_experiment(exp, device).correct_file
    elif task == "tokenization_repair":
        process_file = TokenizationRepairer.from_experiment(exp, device).repair_file
    else:
        raise RuntimeError(
            f"unknown task {task!r}, expected one of 'sed', 'sec' or 'tokenization_repair'"
        )

    with open(file_path, "r", encoding="utf8") as inf:
        file_bytes = len(inf.read().encode("utf8"))

    # The clock starts here on purpose, so model loading is not part of the runtime.
    start = time.perf_counter()
    process_file(file_path, **kwargs)
    end = time.perf_counter()
    return end - start, file_bytes


def save_to_json(obj: object, file_path: str):
    """Serialize `obj` as JSON into `file_path`.

    The parent directory of `file_path` is created first if the path has one
    and it does not exist yet. An existing file is overwritten.
    """
    parent_dir = os.path.dirname(file_path)
    # A bare file name has no directory part; there is nothing to create then.
    if parent_dir != "":
        os.makedirs(parent_dir, exist_ok=True)

    with open(file_path, "w") as json_file:
        json.dump(obj, fp=json_file)

In [4]:
# Global settings of this notebook. `device` is passed to every model that is
# loaded; `batch_size` and `batch_max_length_factor` are defined here but are not
# passed to any `run_experiment` call visible in this notebook.
device = "cuda"
batch_size = 16
batch_max_length_factor = 16

# Export the data and config directories through environment variables
# (presumably read by the nsc package - confirm against its documentation).
os.environ.update({"NSC_DATA_DIR": DATA_DIR, "NSC_CONFIG_DIR": CONFIG_DIR})

# Benchmark files: the plain runtime benchmark and its variant with whitespaces.
runtime_benchmark, runtime_ws_benchmark = (
    os.path.join(BENCHMARK_DIR, "test", file_name)
    for file_name in ("runtime.corrupt.txt", "runtime.whitespaces.corrupt.txt")
)

In [8]:
# Each dictionary maps an experiment directory (relative to EXPERIMENT_DIR, joined
# in the cells that run the experiments) to the display name under which its
# runtime is stored. Some display names contain LaTeX markup.

# Models loaded as spelling error detectors.
sed_words_experiments = {
    "SED_WORDS/gnn_no_feat_wikidump_paragraphs_sed_words_and_sec_bookcorpus_paragraphs_sed_words_and_sec_08_04_2022_19_07_38": "gnn",
    "SED_WORDS/transformer_no_feat_wikidump_paragraphs_sed_words_and_sec_bookcorpus_paragraphs_sed_words_and_sec_25_03_2022_14_56_24": "transformer",
    "SED_WORDS/transformer_wikidump_paragraphs_sed_words_and_sec_bookcorpus_paragraphs_sed_words_and_sec_25_03_2022_14_56_24": r"transformer\textsuperscript{+}",
    "SED_WORDS/gnn_cliques_wfc_wikidump_paragraphs_sed_words_and_sec_bookcorpus_paragraphs_sed_words_and_sec_04_04_2022_10_26_13": r"gnn\textsuperscript{+}",
    "TOKENIZATION_REPAIR_PLUS/tokenization_repair_plus_sed_wikidump_paragraphs_tokenization_repair_plus_bookcorpus_paragraphs_tokenization_repair_plus_11_05_2022_23_10_34": r"tokenization repair\textsuperscript{+}/tokenization repair\textsuperscript{++}"
}
# Models loaded as spelling error correctors and run on the plain runtime benchmark.
sec_nmt_experiments = {
    "SEC_NMT/transformer_sec_nmt_wikidump_paragraphs_sed_words_and_sec_bookcorpus_paragraphs_sed_words_and_sec_25_03_2022_17_41_53": "transformer",
    "SEC_WORDS_NMT/transformer_sec_words_nmt_wikidump_paragraphs_sed_words_and_sec_bookcorpus_paragraphs_sed_words_and_sec_25_03_2022_16_58_40": "transformer word"
}
# Spelling error corrector that is run on the whitespace runtime benchmark.
sec_ws_experiments = {
    "SEC_NMT/transformer_sec_with_tokenization_repair_nmt_wikidump_paragraphs_sec_with_tokenization_repair_bookcorpus_paragraphs_sec_with_tokenization_repair_26_04_2022_14_52_03": "transformer with tokenization repair"
}
# Tokenization repair++ model, loaded as a spelling error corrector and run on
# the whitespace runtime benchmark.
sec_tok_experiments = {
    "TOKENIZATION_REPAIR_PLUS/tokenization_repair_plus_sed_plus_sec_wikidump_paragraphs_tokenization_repair_plus_bookcorpus_paragraphs_tokenization_repair_plus_11_05_2022_10_47_44": r"tokenization repair\textsuperscript{++}"
}

In [21]:
# Runtime of every spelling error detection model on the plain runtime
# benchmark, keyed by the display name of the model.
sed_words_stats = {
    name: run_experiment(os.path.join(EXPERIMENT_DIR, exp), "sed", runtime_benchmark, device)
    for exp, name in sed_words_experiments.items()
}

save_to_json(sed_words_stats, "runtime_stats/sed_words.json")

2022-05-19 21:33:00,096 [SPELLING_ERROR_DETECTION] [INFO] [101038] running spelling error detection on device NVIDIA GeForce GTX 1080 Ti (11,178MiB memory, 6.1 compute capability, 28 multiprocessors)
2022-05-19 21:33:10,086 [SPELLING_ERROR_DETECTION] [INFO] [101038] running spelling error detection on device NVIDIA GeForce GTX 1080 Ti (11,178MiB memory, 6.1 compute capability, 28 multiprocessors)                        
2022-05-19 21:33:15,369 [SPELLING_ERROR_DETECTION] [INFO] [101038] running spelling error detection on device NVIDIA GeForce GTX 1080 Ti (11,178MiB memory, 6.1 compute capability, 28 multiprocessors)                        
2022-05-19 21:33:21,432 [SPELLING_ERROR_DETECTION] [INFO] [101038] running spelling error detection on device NVIDIA GeForce GTX 1080 Ti (11,178MiB memory, 6.1 compute capability, 28 multiprocessors)                        
2022-05-19 21:33:31,794 [SPELLING_ERROR_DETECTION] [INFO] [101038] running spelling error detection on device NVIDIA GeForce GTX

In [23]:
# Runtime of the NMT spelling error correction models on the plain runtime
# benchmark, keyed by the display name of the model.
# No extra keyword arguments are forwarded: this cell used to pass `**kwargs`,
# a name that no cell of the notebook defines, so it failed with a NameError
# on a fresh kernel. The call now matches the other experiment cells.
sec_nmt_stats = {}
for exp, name in sec_nmt_experiments.items():
    stats = run_experiment(os.path.join(EXPERIMENT_DIR, exp), "sec", runtime_benchmark, device)
    sec_nmt_stats[name] = stats

save_to_json(sec_nmt_stats, "runtime_stats/sec_nmt_stats.json")

2022-05-19 21:48:33,644 [SPELLING_ERROR_CORRECTION] [INFO] [101038] running spelling error correction on device NVIDIA GeForce GTX 1080 Ti (11,178MiB memory, 6.1 compute capability, 28 multiprocessors)
2022-05-19 21:49:48,047 [SPELLING_ERROR_CORRECTION] [INFO] [101038] running spelling error correction on device NVIDIA GeForce GTX 1080 Ti (11,178MiB memory, 6.1 compute capability, 28 multiprocessors)                      
                                                                                                                                                                                                                               

In [16]:
# Pipeline "spelling error detection -> spelling error correction" on the plain
# runtime benchmark: the detections of a SED model are handed to each SEC model,
# and the stored runtime is the sum of the detection and the correction runtime.
#
# The clock is started only after a SED model has been loaded, so that the SED
# runtime covers `detect_file` alone. This matches `run_experiment`, which also
# excludes model loading from the runtime it reports.
sed_experiment = "SED_WORDS/gnn_cliques_wfc_wikidump_paragraphs_sed_words_and_sec_bookcorpus_paragraphs_sed_words_and_sec_04_04_2022_10_26_13"
sed = SpellingErrorDetector.from_experiment(
    os.path.join(EXPERIMENT_DIR, sed_experiment),
    device
)
start = time.perf_counter()
gnn_detections, _ = sed.detect_file(runtime_benchmark)
gnn_sed_runtime = time.perf_counter() - start

sed_experiment = "SED_WORDS/transformer_wikidump_paragraphs_sed_words_and_sec_bookcorpus_paragraphs_sed_words_and_sec_25_03_2022_14_56_24"
sed = SpellingErrorDetector.from_experiment(
    os.path.join(EXPERIMENT_DIR, sed_experiment),
    device
)
start = time.perf_counter()
transformer_detections, _ = sed.detect_file(runtime_benchmark)
transformer_sed_runtime = time.perf_counter() - start

# (display name, detections, detection runtime) of the two SED models, in the
# order in which their pipelines are recorded for every SEC model.
sed_runs = [
    (r"gnn\textsuperscript{+}", gnn_detections, gnn_sed_runtime),
    (r"transformer\textsuperscript{+}", transformer_detections, transformer_sed_runtime),
]

sec_with_sed_stats = {}
for exp, name in sec_nmt_experiments.items():
    for sed_name, detections, sed_runtime in sed_runs:
        runtime, file_bytes = run_experiment(
            os.path.join(EXPERIMENT_DIR, exp), "sec", runtime_benchmark, device, detections=detections
        )
        sec_with_sed_stats[sed_name + r" $\rightarrow$ " + name] = (runtime + sed_runtime, file_bytes)

save_to_json(sec_with_sed_stats, "runtime_stats/sec_with_sed_stats.json")

2022-05-19 21:23:50,509 [SPELLING_ERROR_DETECTION] [INFO] [101038] running spelling error detection on device NVIDIA GeForce GTX 1080 Ti (11,178MiB memory, 6.1 compute capability, 28 multiprocessors)
2022-05-19 21:24:00,321 [SPELLING_ERROR_DETECTION] [INFO] [101038] running spelling error detection on device NVIDIA GeForce GTX 1080 Ti (11,178MiB memory, 6.1 compute capability, 28 multiprocessors)                        
2022-05-19 21:24:06,224 [SPELLING_ERROR_CORRECTION] [INFO] [101038] running spelling error correction on device NVIDIA GeForce GTX 1080 Ti (11,178MiB memory, 6.1 compute capability, 28 multiprocessors)                      
2022-05-19 21:24:52,731 [SPELLING_ERROR_CORRECTION] [INFO] [101038] running spelling error correction on device NVIDIA GeForce GTX 1080 Ti (11,178MiB memory, 6.1 compute capability, 28 multiprocessors)                      
2022-05-19 21:25:40,391 [SPELLING_ERROR_CORRECTION] [INFO] [101038] running spelling error correction on device NVIDIA GeForce G

In [17]:
# Runtime of the spelling error correction model listed in `sec_ws_experiments`
# on the whitespace runtime benchmark, keyed by the display name of the model.
sec_nmt_with_tr_stats = {
    name: run_experiment(os.path.join(EXPERIMENT_DIR, exp), "sec", runtime_ws_benchmark, device)
    for exp, name in sec_ws_experiments.items()
}

save_to_json(sec_nmt_with_tr_stats, "runtime_stats/sec_with_tr_stats.json")

2022-05-19 21:27:19,101 [SPELLING_ERROR_CORRECTION] [INFO] [101038] running spelling error correction on device NVIDIA GeForce GTX 1080 Ti (11,178MiB memory, 6.1 compute capability, 28 multiprocessors)
                                                                                                                                                                                                                               

In [30]:
# Runtime of the model listed in `sec_tok_experiments`, run as a spelling error
# corrector on the whitespace runtime benchmark, keyed by its display name.
sec_tok_plus_stats = {
    name: run_experiment(os.path.join(EXPERIMENT_DIR, exp), "sec", runtime_ws_benchmark, device)
    for exp, name in sec_tok_experiments.items()
}

save_to_json(sec_tok_plus_stats, "runtime_stats/sec_tok_plus_stats.json")

2022-05-19 22:06:44,989 [SPELLING_ERROR_CORRECTION] [INFO] [101038] running spelling error correction on device NVIDIA GeForce GTX 1080 Ti (11,178MiB memory, 6.1 compute capability, 28 multiprocessors)
                                                                                                                                                                                                                               

In [19]:
# Tokenization repair models, mapped from experiment directory (relative to
# EXPERIMENT_DIR) to display name.
tr_experiments = {
    "TOKENIZATION_REPAIR/eo_small_arxiv_with_errors_ported": "eo small",
    "TOKENIZATION_REPAIR/eo_medium_arxiv_with_errors_ported": "eo medium",
    "TOKENIZATION_REPAIR/eo_large_arxiv_with_errors_ported": "eo large"
}

# Runtime and benchmark size for each tokenization repair model on the
# whitespace runtime benchmark, keyed by the display name of the model.
tr_stats = {}
for exp, name in tr_experiments.items():
    runtime, file_bytes = run_experiment(
        os.path.join(EXPERIMENT_DIR, exp),
        "tokenization_repair",
        runtime_ws_benchmark,
        device
    )
    tr_stats[name] = (runtime, file_bytes)

save_to_json(tr_stats, "runtime_stats/tr_stats.json")

2022-05-19 21:29:16,703 [TOKENIZATION_REPAIR] [INFO] [101038] running spelling error detection on device NVIDIA GeForce GTX 1080 Ti (11,178MiB memory, 6.1 compute capability, 28 multiprocessors)
2022-05-19 21:29:22,734 [TOKENIZATION_REPAIR] [INFO] [101038] running spelling error detection on device NVIDIA GeForce GTX 1080 Ti (11,178MiB memory, 6.1 compute capability, 28 multiprocessors)                             
2022-05-19 21:29:29,068 [TOKENIZATION_REPAIR] [INFO] [101038] running spelling error detection on device NVIDIA GeForce GTX 1080 Ti (11,178MiB memory, 6.1 compute capability, 28 multiprocessors)                             
                                                                                                                                                                                                                               

In [11]:
# Pipelines on the whitespace runtime benchmark, keyed by the display names of
# their stages joined with a LaTeX arrow:
#   * "eo medium" tokenization repair -> SED model -> SEC model
#   * tokenization repair+ (run as SED model on the unrepaired lines) -> SEC model
# The stored runtime is the sum of the runtimes of the stages of a pipeline.
# Every stage is timed after its model has been loaded, so model loading is
# excluded for all stages alike.
tr_pipeline_stats = {}
tok_rep = TokenizationRepairer.from_experiment(
    os.path.join(EXPERIMENT_DIR, "TOKENIZATION_REPAIR/eo_medium_arxiv_with_errors_ported"),
    device
)
start = time.perf_counter()
repaired = tok_rep.repair_file(runtime_ws_benchmark)
tr_runtime = time.perf_counter() - start

runtime_ws_lines = utils.load_text_file(runtime_ws_benchmark)

with open(runtime_ws_benchmark, "r", encoding="utf8") as inf:
    file_bytes = len(inf.read().encode("utf8"))

for exp, name in sed_words_experiments.items():
    if name not in {r"gnn\textsuperscript{+}", r"transformer\textsuperscript{+}", r"tokenization repair\textsuperscript{+}/tokenization repair\textsuperscript{++}"}:
        continue
    # The tokenization repair+ model gets the unrepaired lines; its second
    # return value is then used as the input of the SEC models.
    is_tokenization_model = name.startswith("tokenization")
    sed = SpellingErrorDetector.from_experiment(os.path.join(EXPERIMENT_DIR, exp), device)
    start = time.perf_counter()
    detections, new_repaired = sed.detect_text(runtime_ws_lines if is_tokenization_model else repaired)
    sed_runtime = time.perf_counter() - start
    for sec_exp, sec_name in sec_nmt_experiments.items():
        # Not named `sec`: that would shadow the imported `sec` baselines module,
        # which the neuspell cell relies on.
        sec_model = SpellingErrorCorrector.from_experiment(os.path.join(EXPERIMENT_DIR, sec_exp), device)
        start = time.perf_counter()
        _ = sec_model.correct_text(new_repaired if is_tokenization_model else repaired, detections=detections)
        sec_runtime = time.perf_counter() - start
        runtime = sed_runtime + sec_runtime
        if is_tokenization_model:
            pipeline_names = [r"tokenization repair\textsuperscript{+}"]
        else:
            # The separate tokenization repair stage is part of these pipelines only.
            runtime += tr_runtime
            pipeline_names = ["eo medium", name]
        tr_pipeline_stats[r" $\rightarrow$ ".join(pipeline_names + [sec_name])] = (runtime, file_bytes)

save_to_json(tr_pipeline_stats, "runtime_stats/tr_pipeline_stats.json")

2022-05-19 23:15:05,761 [TOKENIZATION_REPAIR] [INFO] [115392] running tokenization repair on device NVIDIA GeForce GTX 1080 Ti (11,178MiB memory, 6.1 compute capability, 28 multiprocessors)
2022-05-19 23:15:12,373 [SPELLING_ERROR_DETECTION] [INFO] [115392] running spelling error detection on device NVIDIA GeForce GTX 1080 Ti (11,178MiB memory, 6.1 compute capability, 28 multiprocessors)                        
2022-05-19 23:15:17,631 [SPELLING_ERROR_CORRECTION] [INFO] [115392] running spelling error correction on device NVIDIA GeForce GTX 1080 Ti (11,178MiB memory, 6.1 compute capability, 28 multiprocessors)
2022-05-19 23:16:03,217 [SPELLING_ERROR_CORRECTION] [INFO] [115392] running spelling error correction on device NVIDIA GeForce GTX 1080 Ti (11,178MiB memory, 6.1 compute capability, 28 multiprocessors)
2022-05-19 23:16:16,305 [SPELLING_ERROR_DETECTION] [INFO] [115392] running spelling error detection on device NVIDIA GeForce GTX 1080 Ti (11,178MiB memory, 6.1 compute capability, 28

In [20]:
# Runtime of the neuspell BERT baseline on the plain runtime benchmark. The
# model is created before the clock starts; loading the benchmark lines,
# sorting them and running inference are all inside the timed region.
neuspell = sec.SECNeuspellBaseline("bert")

with open(runtime_benchmark, "r", encoding="utf8") as inf:
    file_bytes = len(inf.read().encode("utf8"))

start = time.perf_counter()
batch_size = 16
# Sort the lines by length, so that every batch holds lines of similar length.
inputs = sorted(utils.load_text_file(runtime_benchmark), key=len)
batch_starts = list(range(0, len(inputs), batch_size))
for i in tqdm(batch_starts, desc=f"running neuspell baseline {neuspell.name} on runtime benchmark"):
    _ = neuspell.inference(inputs[i:i + batch_size])
end = time.perf_counter()

neuspell_stats = {"neuspell bert": (end - start, file_bytes)}
save_to_json(neuspell_stats, "runtime_stats/sec_neuspell.json")

data folder is set to `/home/sebastian/anaconda3/envs/masters_thesis/lib/python3.8/site-packages/neuspell/../data` script
loading vocab from path:/home/sebastian/anaconda3/envs/masters_thesis/lib/python3.8/site-packages/neuspell/../data/checkpoints/subwordbert-probwordnoise/vocab.pkl
initializing model


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


SubwordBert(
  (bert_dropout): Dropout(p=0.2, inplace=False)
  (bert_model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): 

running neuspell baseline neuspell_bert on runtime benchmark:   0%|          | 0/100 [00:00<?, ?it/s]