In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import time
import numpy as np
import torch
import json
from typing import Any
from tqdm.notebook import tqdm

from nsc import SpellingErrorDetector, SpellingErrorCorrector, TokenizationRepairer
from nsc.utils import io
from nsc.api import utils

from spell_checking import BENCHMARK_DIR, DATA_DIR, CONFIG_DIR, EXPERIMENT_DIR
from spell_checking.baselines import sec

### Run experiments on the runtime benchmark and record key stats

In [3]:
def run_experiment(exp: str, task: str, file_path: str, device: str, **kwargs) -> tuple:
    """Time a single model's inference over one text file.

    The model is loaded and the input size is measured *before* the timer is
    started, so the reported runtime covers only the ``detect_file`` /
    ``correct_file`` / ``repair_file`` call.

    Args:
        exp: Experiment path handed to ``from_experiment``.
        task: One of ``"sed"``, ``"sec"`` or ``"tokenization_repair"``.
        file_path: Path of the UTF-8 text file to run on.
        device: Device handed to ``from_experiment``.
        **kwargs: Forwarded unchanged to the ``*_file`` inference call.

    Returns:
        ``(runtime, file_bytes)`` where ``runtime`` is the elapsed
        ``time.perf_counter()`` seconds of the inference call and
        ``file_bytes`` is the UTF-8 encoded size of the file's text.

    Raises:
        RuntimeError: If ``task`` is not one of the supported tasks.
    """
    supported_tasks = ("sed", "sec", "tokenization_repair")
    # Reject bad tasks up front, before any model is loaded.
    if task not in supported_tasks:
        raise RuntimeError(
            f"unknown task {task!r}, expected one of {', '.join(supported_tasks)}"
        )

    # Load the model and pick the matching inference method once, so the
    # timed section below contains nothing but the inference call itself.
    if task == "sed":
        model = SpellingErrorDetector.from_experiment(exp, device)
        run_on_file = model.detect_file
    elif task == "sec":
        model = SpellingErrorCorrector.from_experiment(exp, device)
        run_on_file = model.correct_file
    else:
        model = TokenizationRepairer.from_experiment(exp, device)
        run_on_file = model.repair_file

    # Size of the decoded text re-encoded as UTF-8 (not the on-disk size).
    with open(file_path, "r", encoding="utf8") as inf:
        file_bytes = len(inf.read().encode("utf8"))

    start = time.perf_counter()
    _ = run_on_file(file_path, **kwargs)
    end = time.perf_counter()
    return end - start, file_bytes


def save_to_json(obj: object, file_path: str):
    """Serialize ``obj`` as JSON into ``file_path``.

    Missing parent directories of ``file_path`` are created first; an
    existing file at that path is overwritten.
    """
    parent_dir, _ = os.path.split(file_path)
    # A bare file name has no directory component, so there is nothing to create.
    if parent_dir != "":
        os.makedirs(parent_dir, exist_ok=True)

    with open(file_path, "w") as out_file:
        json.dump(obj, out_file)

In [26]:
# Settings shared by the benchmark cells below.
device = "cuda"
# NOTE(review): neither value is passed to any run_experiment call in the
# visible cells; batch_size is re-assigned in the neuspell cell and
# batch_max_length_factor is not referenced again — confirm whether they
# were meant to be forwarded.
batch_size = 16
batch_max_length_factor = 16
# Publish the project's data/config directories as environment variables
# (presumably read by the nsc package — confirm).
os.environ["NSC_DATA_DIR"] = DATA_DIR
os.environ["NSC_CONFIG_DIR"] = CONFIG_DIR

# Input file that every timed run in this notebook operates on.
runtime_benchmark = os.path.join(BENCHMARK_DIR, "test", "runtime.corrupt.txt")

In [27]:
# sed_sequence_experiments = {
#     "SED_SEQUENCE/gnn_no_feat_wikidump_paragraphs_sed_sequence_bookcorpus_paragraphs_sed_sequence_14_04_2022_10_43_45": "gnn",
#     "SED_SEQUENCE/transformer_no_feat_wikidump_paragraphs_sed_sequence_bookcorpus_paragraphs_sed_sequence_14_04_2022_10_05_03": "transformer",
#     "SED_SEQUENCE/transformer_wikidump_paragraphs_sed_sequence_bookcorpus_paragraphs_sed_sequence_25_03_2022_14_57_17": r"transformer\textsuperscript{+}"
# }

# sed_sequence_stats = {}

# for exp, name in sed_sequence_experiments.items():
#     stats = run_experiment(os.path.join(EXPERIMENT_DIR, exp), "sed", runtime_benchmark, device)
#     sed_sequence_stats[name] = stats

# save_to_json(sed_sequence_stats, "runtime_stats/sed_sequence.json")

In [29]:
# Maps experiment directory (relative to EXPERIMENT_DIR) -> display name.
# Display names contain LaTeX markup (\textsuperscript), hence the raw strings.
sed_words_experiments = {
    "SED_WORDS/gnn_no_feat_wikidump_paragraphs_sed_words_and_sec_bookcorpus_paragraphs_sed_words_and_sec_08_04_2022_19_07_38": "gnn",
    "SED_WORDS/transformer_no_feat_wikidump_paragraphs_sed_words_and_sec_bookcorpus_paragraphs_sed_words_and_sec_25_03_2022_14_56_24": "transformer",
    "SED_WORDS/transformer_wikidump_paragraphs_sed_words_and_sec_bookcorpus_paragraphs_sed_words_and_sec_25_03_2022_14_56_24": r"transformer\textsuperscript{+}",
    "SED_WORDS/gnn_cliques_wfc_wikidump_paragraphs_sed_words_and_sec_bookcorpus_paragraphs_sed_words_and_sec_04_04_2022_10_26_13": r"gnn\textsuperscript{+}",
    # The trailing backslash continues the string literal onto the next line,
    # so this key is a single unbroken path.
    "TOKENIZATION_REPAIR_PLUS/tokenization_repair_plus_sed_wikidump_paragraphs_tokenization_\
repair_plus_bookcorpus_paragraphs_tokenization_repair_plus_26_03_2022_21_47_02": r"tokenization repair\textsuperscript{+}"
}
# display name -> (runtime, file_bytes) as returned by run_experiment
sed_words_stats = {}

# Time every experiment on the "sed" task against the runtime benchmark.
for exp, name in sed_words_experiments.items():
    stats = run_experiment(os.path.join(EXPERIMENT_DIR, exp), "sed", runtime_benchmark, device)
    sed_words_stats[name] = stats

# Tuples are written as JSON arrays; the path is relative to the working directory.
save_to_json(sed_words_stats, "runtime_stats/sed_words.json")

2022-05-08 22:16:15,937 [SPELLING_ERROR_DETECTION] [INFO] [28240] running spelling error detection on device NVIDIA GeForce GTX 1080 Ti (11,178MiB memory, 6.1 compute capability, 28 multiprocessors)
2022-05-08 22:16:23,192 [SPELLING_ERROR_DETECTION] [INFO] [28240] running spelling error detection on device NVIDIA GeForce GTX 1080 Ti (11,178MiB memory, 6.1 compute capability, 28 multiprocessors)                         
2022-05-08 22:16:27,107 [SPELLING_ERROR_DETECTION] [INFO] [28240] running spelling error detection on device NVIDIA GeForce GTX 1080 Ti (11,178MiB memory, 6.1 compute capability, 28 multiprocessors)                         
2022-05-08 22:16:31,259 [SPELLING_ERROR_DETECTION] [INFO] [28240] running spelling error detection on device NVIDIA GeForce GTX 1080 Ti (11,178MiB memory, 6.1 compute capability, 28 multiprocessors)                         
2022-05-08 22:16:38,627 [SPELLING_ERROR_DETECTION] [INFO] [28240] running spelling error detection on device NVIDIA GeForce GTX 1

In [21]:
# Detector whose output and runtime are reused by the "SEC with SED" cell.
sed_experiment = "SED_WORDS/gnn_cliques_wfc_wikidump_paragraphs_sed_words_and_sec_bookcorpus_paragraphs_sed_words_and_sec_04_04_2022_10_26_13"
sed = SpellingErrorDetector.from_experiment(
    os.path.join(EXPERIMENT_DIR, sed_experiment), 
    device
)
# Index 0 of a run_experiment result is the runtime. This lookup needs
# sed_words_stats to have been filled by the SED-words cell beforehand.
sed_runtime = sed_words_stats[sed_words_experiments[sed_experiment]][0]
# Only the first element of detect_file's return value is kept; it is later
# passed to run_experiment as the `detections` keyword argument.
detections, _ = sed.detect_file(runtime_benchmark)

# Each dict maps experiment directory (relative to EXPERIMENT_DIR) -> display name.
sec_nmt_experiments = {
    "SEC_NMT/transformer_sec_nmt_wikidump_paragraphs_sed_words_and_sec_bookcorpus_paragraphs_sed_words_and_sec_25_03_2022_17_41_53": "transformer",
    "SEC_WORDS_NMT/transformer_sec_words_nmt_wikidump_paragraphs_sed_words_and_sec_bookcorpus_paragraphs_sed_words_and_sec_25_03_2022_16_58_40": "transformer word"
}
sec_ws_experiments = {
    "SEC_NMT/transformer_sec_with_tokenization_repair_nmt_wikidump_paragraphs_sec_with_tokenization_repair_bookcorpus_paragraphs_sec_with_tokenization_repair_26_04_2022_14_52_03": "transformer with tokenization repair"
}
sec_tok_experiments = {
    # Two adjacent string literals are concatenated into one key.
    "TOKENIZATION_REPAIR_PLUS/tokenization_repair_plus_sed_plus_sec_"
"wikidump_paragraphs_tokenization_repair_plus_bookcorpus_paragraphs_tokenization_repair_plus_18_04_2022_17_04_20": r"tokenization repair\textsuperscript{++}"
}
# display name -> (runtime, file_bytes) as returned by run_experiment
sec_stats = {}

# Run all three groups on the "sec" task with the same keyword arguments.
sec_experiments = {**sec_nmt_experiments, **sec_ws_experiments, **sec_tok_experiments}
# Forwarded by run_experiment to correct_file for every experiment.
# NOTE(review): presumably only meaningful for the tokenization-repair-plus model — confirm.
kwargs = {"tokenization_repair_plus_no_detect": True}
for exp, name in sec_experiments.items():
    stats = run_experiment(os.path.join(EXPERIMENT_DIR, exp), "sec", runtime_benchmark, device, **kwargs)
    sec_stats[name] = stats

save_to_json(sec_stats, "runtime_stats/sec_stats.json")

2022-05-08 22:00:04,664 [SPELLING_ERROR_DETECTION] [INFO] [28240] running spelling error detection on device NVIDIA GeForce GTX 1080 Ti (11,178MiB memory, 6.1 compute capability, 28 multiprocessors)
2022-05-08 22:00:12,162 [SPELLING_ERROR_CORRECTION] [INFO] [28240] running spelling error correction on device NVIDIA GeForce GTX 1080 Ti (11,178MiB memory, 6.1 compute capability, 28 multiprocessors)                       
2022-05-08 22:01:18,093 [SPELLING_ERROR_CORRECTION] [INFO] [28240] running spelling error correction on device NVIDIA GeForce GTX 1080 Ti (11,178MiB memory, 6.1 compute capability, 28 multiprocessors)                       


transformer (65.08325776700076, 174880)


2022-05-08 22:01:46,979 [SPELLING_ERROR_CORRECTION] [INFO] [28240] running spelling error correction on device NVIDIA GeForce GTX 1080 Ti (11,178MiB memory, 6.1 compute capability, 28 multiprocessors)                       


transformer word (27.980680926999412, 174880)


2022-05-08 22:02:59,064 [SPELLING_ERROR_CORRECTION] [INFO] [28240] running spelling error correction on device NVIDIA GeForce GTX 1080 Ti (11,178MiB memory, 6.1 compute capability, 28 multiprocessors)                       


transformer with tokenization repair (71.14561433800009, 174880)


                                                                                                                                                                                                                               

tokenization repair\textsuperscript{++} (40.56263757399938, 174880)




In [19]:
# display name -> (runtime, file_bytes) for correction runs that include detection
sec_with_sed_stats = {}
# Bind a fresh dict instead of mutating the one created in the SEC cell, so
# this cell does not depend on (or silently alter) that earlier object.
kwargs = {"tokenization_repair_plus_no_detect": False}
# NMT correctors receive the precomputed detections; the detector's own
# runtime is added so the reported number covers detection plus correction.
for exp, name in sec_nmt_experiments.items():
    runtime, file_bytes = run_experiment(os.path.join(EXPERIMENT_DIR, exp), "sec", runtime_benchmark, device, detections=detections, **kwargs)
    sec_with_sed_stats[r"gnn\textsuperscript{+} $\rightarrow$ " + name] = (runtime + sed_runtime, file_bytes)
# The tokenization-repair-plus experiments are run without external detections.
for exp, name in sec_tok_experiments.items():
    stats = run_experiment(os.path.join(EXPERIMENT_DIR, exp), "sec", runtime_benchmark, device, **kwargs)
    sec_with_sed_stats[name] = stats

save_to_json(sec_with_sed_stats, "runtime_stats/sec_with_sed_stats.json")

2022-05-08 21:46:31,822 [SPELLING_ERROR_CORRECTION] [INFO] [28240] running spelling error correction on device NVIDIA GeForce GTX 1080 Ti (11,178MiB memory, 6.1 compute capability, 28 multiprocessors)
2022-05-08 21:47:13,641 [SPELLING_ERROR_CORRECTION] [INFO] [28240] running spelling error correction on device NVIDIA GeForce GTX 1080 Ti (11,178MiB memory, 6.1 compute capability, 28 multiprocessors)                       


transformer (19.298796318000313, 174880)


2022-05-08 21:47:24,584 [SPELLING_ERROR_CORRECTION] [INFO] [28240] running spelling error correction on device NVIDIA GeForce GTX 1080 Ti (11,178MiB memory, 6.1 compute capability, 28 multiprocessors)                       


transformer word (19.298796318000313, 174880)


                                                                                                                                                                                                                               

tokenization repair\textsuperscript{++} (19.9617370570013, 174880)


In [22]:
# Maps experiment directory (relative to EXPERIMENT_DIR) -> display name.
tr_experiments = {
    "TOKENIZATION_REPAIR/transformer_encoder_only_wikidump_paragraphs_tokenization_repair_plus_bookcorpus_paragraphs_tokenization_repair_plus_02_04_2022_21_32_06": "tokenization repair"
}
# display name -> (runtime, file_bytes) as returned by run_experiment
tr_stats = {}
# Time each experiment on the "tokenization_repair" task.
for exp, name in tr_experiments.items():
    runtime, file_bytes = run_experiment(os.path.join(EXPERIMENT_DIR, exp), "tokenization_repair", runtime_benchmark, device)
    tr_stats[name] = (runtime, file_bytes)

save_to_json(tr_stats, "runtime_stats/tr_stats.json")

2022-05-08 22:03:50,968 [TOKENIZATION_REPAIR] [INFO] [28240] running spelling error detection on device NVIDIA GeForce GTX 1080 Ti (11,178MiB memory, 6.1 compute capability, 28 multiprocessors)
                                                                                                                                                                                                                               

tokenization repair (40.56263757399938, 174880)


In [30]:
# The baseline is constructed before the timer starts, so its setup is not timed.
neuspell = sec.SECNeuspellBaseline("bert")
# display name -> (runtime, file_bytes), same layout as the run_experiment results
neuspell_stats = {}

# Size of the decoded text re-encoded as UTF-8, measured outside the timed window.
with open(runtime_benchmark, "r", encoding="utf8") as inf:
    file_bytes = len(inf.read().encode("utf8"))
# Everything from here to `end` is timed: loading and sorting the inputs as
# well as the batched inference loop.
start = time.perf_counter()

# Rebinds the module-level batch_size from the config cell (same value).
batch_size = 16
inputs = utils.load_text_file(runtime_benchmark)
# Order inputs by length before slicing them into consecutive batches.
inputs = sorted(inputs, key=lambda s: len(s))
for i in tqdm(list(range(0, len(inputs), batch_size)), desc=f"running neuspell baseline {neuspell.name} on runtime benchmark"):
    batch_inputs = inputs[i:i+batch_size]
    # Predictions are discarded; only the elapsed time matters here.
    _ = neuspell.inference(batch_inputs)

end = time.perf_counter()
neuspell_stats["neuspell bert"] = (end - start, file_bytes)
save_to_json(neuspell_stats, "runtime_stats/sec_neuspell.json")

data folder is set to `/home/sebastian/anaconda3/envs/masters_thesis/lib/python3.8/site-packages/neuspell/../data` script
loading vocab from path:/home/sebastian/anaconda3/envs/masters_thesis/lib/python3.8/site-packages/neuspell/../data/checkpoints/subwordbert-probwordnoise/vocab.pkl
initializing model


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


SubwordBert(
  (bert_dropout): Dropout(p=0.2, inplace=False)
  (bert_model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): 

running neuspell baseline neuspell_bert on runtime benchmark:   0%|          | 0/63 [00:00<?, ?it/s]