In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import time
import numpy as np
import torch
import json
from typing import Any
from tqdm.notebook import tqdm

from nsc import SpellingErrorDetector, SpellingErrorCorrector, TokenizationRepairer
from nsc.utils import io
from nsc.api import utils

from spell_checking import BENCHMARK_DIR, DATA_DIR, CONFIG_DIR, EXPERIMENT_DIR
from spell_checking.baselines import sec

### Run experiments on runtime benchmark and record some key stats

In [3]:
def run_experiment(exp: str, task: str, file_path: str, device: str, **kwargs) -> tuple:
    start = time.perf_counter()
    if task == "sed":
        t = SpellingErrorDetector.from_experiment(exp, device)
    elif task == "sec":
        t = SpellingErrorCorrector.from_experiment(exp, device)
    elif task == "tokenization_repair":
        t = TokenizationRepairer.from_experiment(exp, device)
    else:
        raise RuntimeError
    
    with open(file_path, "r", encoding="utf8") as inf:
        file_bytes = len(inf.read().encode("utf8"))
    start = time.perf_counter()
    
    if task == "sed":
        _ = t.detect_file(file_path, **kwargs)
    elif task == "sec":
        _ = t.correct_file(file_path, **kwargs)
    elif task == "tokenization_repair":
        _ = t.repair_file(file_path, **kwargs)
    else:
        raise RuntimeError
    
    end = time.perf_counter()
    return end - start, file_bytes


def save_to_json(obj: object, file_path: str):
    directory = os.path.dirname(file_path)
    if directory:
        os.makedirs(directory, exist_ok=True)
    
    with open(file_path, "w") as of:
        json.dump(obj, of)

In [5]:
benchmarks = ["wikidump/artificial", "wikidump/realistic", "bookcorpus/artificial", "bookcorpus/realistic", "neuspell/bea60k"]
# create runtime benchmark
rand = np.random.default_rng(22)
samples_per_benchmark = 200
correct_lines = []
corrupt_lines = []
for benchmark in sorted(benchmarks):
    benchmark = os.path.join(BENCHMARK_DIR, "test", "sec", benchmark, "corrupt.txt")
    corrupt = utils.load_text_file(benchmark)
    correct = utils.load_text_file(os.path.join(os.path.dirname(benchmark), "correct.txt"))
    indices = rand.permutation(len(corrupt))[:samples_per_benchmark]
    correct_lines.extend([correct[idx] for idx in indices])
    corrupt_lines.extend([corrupt[idx] for idx in indices])
utils.save_text_file(os.path.join(BENCHMARK_DIR, "test", "runtime.correct.txt"), correct_lines)
utils.save_text_file(os.path.join(BENCHMARK_DIR, "test", "runtime.corrupt.txt"), corrupt_lines)

print(len(correct_lines), len(corrupt_lines))

1000 1000


In [5]:
device = "cuda:1"
os.environ["NSC_DATA_DIR"] = DATA_DIR
os.environ["NSC_CONFIG_DIR"] = CONFIG_DIR

runtime_benchmark = os.path.join(BENCHMARK_DIR, "test", "runtime.corrupt.txt")

In [7]:
sed_sequence_experiments = {
    "SED_SEQUENCE/gnn_no_feat_wikidump_paragraphs_sed_sequence_bookcorpus_paragraphs_sed_sequence_14_04_2022_10_43_45": "gnn",
    "SED_SEQUENCE/transformer_no_feat_wikidump_paragraphs_sed_sequence_bookcorpus_paragraphs_sed_sequence_14_04_2022_10_05_03": "transformer",
    "SED_SEQUENCE/transformer_wikidump_paragraphs_sed_sequence_bookcorpus_paragraphs_sed_sequence_25_03_2022_14_57_17": "transformer+"
}

sed_sequence_stats = {}

for exp, name in sed_sequence_experiments.items():
    stats = run_experiment(os.path.join(EXPERIMENT_DIR, exp), "sed", runtime_benchmark, device)
    sed_sequence_stats[name] = stats

save_to_json(sed_sequence_stats, "runtime_stats/sed_sequence.json")

2022-04-19 13:43:28,744 [SPELLING_ERROR_DETECTION] [INFO] [39524] running spelling error detection on device NVIDIA GeForce GTX 1080 Ti (11,178MiB memory, 6.1 compute capability, 28 multiprocessors)
2022-04-19 13:43:35,298 [SPELLING_ERROR_DETECTION] [INFO] [39524] running spelling error detection on device NVIDIA GeForce GTX 1080 Ti (11,178MiB memory, 6.1 compute capability, 28 multiprocessors)                         
2022-04-19 13:43:38,701 [SPELLING_ERROR_DETECTION] [INFO] [39524] running spelling error detection on device NVIDIA GeForce GTX 1080 Ti (11,178MiB memory, 6.1 compute capability, 28 multiprocessors)                         
                                                                                                                                                                                                                               

In [8]:
sed_words_experiments = {
    "SED_WORDS/gnn_no_feat_wikidump_paragraphs_sed_words_and_sec_bookcorpus_paragraphs_sed_words_and_sec_08_04_2022_19_07_38": "gnn",
    "SED_WORDS/transformer_no_feat_wikidump_paragraphs_sed_words_and_sec_bookcorpus_paragraphs_sed_words_and_sec_25_03_2022_14_56_24": "transformer",
    "SED_WORDS/transformer_wikidump_paragraphs_sed_words_and_sec_bookcorpus_paragraphs_sed_words_and_sec_25_03_2022_14_56_24": "transformer+",
    "SED_WORDS/gnn_cliques_wfc_wikidump_paragraphs_sed_words_and_sec_bookcorpus_paragraphs_sed_words_and_sec_04_04_2022_10_26_13": "gnn+",
    "TOKENIZATION_REPAIR_PLUS/tokenization_repair_plus_sed_wikidump_paragraphs_tokenization_\
repair_plus_bookcorpus_paragraphs_tokenization_repair_plus_26_03_2022_21_47_02": "tokenization_repair+"
}
sed_words_stats = {}

for exp, name in sed_words_experiments.items():
    stats = run_experiment(os.path.join(EXPERIMENT_DIR, exp), "sed", runtime_benchmark, device)
    sed_words_stats[name] = stats
    print(name, stats)

save_to_json(sed_words_stats, "runtime_stats/sed_words.json")

2022-04-19 13:44:13,498 [SPELLING_ERROR_DETECTION] [INFO] [39524] running spelling error detection on device NVIDIA GeForce GTX 1080 Ti (11,178MiB memory, 6.1 compute capability, 28 multiprocessors)
2022-04-19 13:44:20,710 [SPELLING_ERROR_DETECTION] [INFO] [39524] running spelling error detection on device NVIDIA GeForce GTX 1080 Ti (11,178MiB memory, 6.1 compute capability, 28 multiprocessors)                         


gnn (6.032822203000251, 175867)


2022-04-19 13:44:24,301 [SPELLING_ERROR_DETECTION] [INFO] [39524] running spelling error detection on device NVIDIA GeForce GTX 1080 Ti (11,178MiB memory, 6.1 compute capability, 28 multiprocessors)                         


transformer (2.696371423999153, 175867)


2022-04-19 13:44:28,580 [SPELLING_ERROR_DETECTION] [INFO] [39524] running spelling error detection on device NVIDIA GeForce GTX 1080 Ti (11,178MiB memory, 6.1 compute capability, 28 multiprocessors)                         


transformer+ (3.274699260999114, 175867)


2022-04-19 13:44:36,036 [SPELLING_ERROR_DETECTION] [INFO] [39524] running spelling error detection on device NVIDIA GeForce GTX 1080 Ti (11,178MiB memory, 6.1 compute capability, 28 multiprocessors)                         


gnn+ (6.264162076000503, 175867)


                                                                                                                                                                                                                               

tokenization_repair+ (8.90006778300085, 175867)


In [10]:
sed_experiment = "SED_WORDS/gnn_cliques_wfc_wikidump_paragraphs_sed_words_and_sec_bookcorpus_paragraphs_sed_words_and_sec_04_04_2022_10_26_13"
sed = SpellingErrorDetector.from_experiment(
    os.path.join(EXPERIMENT_DIR, sed_experiment), 
    device
)
sed_runtime = sed_words_stats[sed_words_experiments[sed_experiment]][0]
detections = sed.detect_file(runtime_benchmark, threshold=0.2)

sec_nmt_experiments = {
    "SEC_NMT/transformer_sec_nmt_wikidump_paragraphs_sed_words_and_sec_bookcorpus_paragraphs_sed_words_and_sec_25_03_2022_17_41_53": "transformer_nmt",
    "SEC_WORDS_NMT/transformer_sec_words_nmt_wikidump_paragraphs_sed_words_and_sec_bookcorpus_paragraphs_sed_words_and_sec_25_03_2022_16_58_40": "transformer_words_nmt"
}
sec_tok_experiments = {
    # "": "tokenization_repair++"
}
sec_stats = {}
sec_with_sed_stats = {}

sec_experiments = {**sec_nmt_experiments, **sec_tok_experiments}
os.environ["NSC_TOKENIZATION_REPAIR_PLUS_NO_DETECT"] = "true"
for exp, name in sec_experiments.items():
    stats = run_experiment(os.path.join(EXPERIMENT_DIR, exp), "sec", runtime_benchmark, device)
    sec_stats[name] = stats
    print(name, stats)

save_to_json(sec_stats, "runtime_stats/sec_stats.json")

os.environ["NSC_TOKENIZATION_REPAIR_PLUS_NO_DETECT"] = "false"
for exp, name in sec_experiments.items():
    runtime, file_bytes = run_experiment(os.path.join(EXPERIMENT_DIR, exp), "sec", runtime_benchmark, device, detections=detections)
    sec_with_sed_stats[name] = (runtime + sed_runtime, file_bytes)
    print(name, stats)
for exp, name in sec_tok_experiments.items():
    stats = run_experiment(os.path.join(EXPERIMENT_DIR, exp), "sec", runtime_benchmark)
    sec_with_sed_stats[name] = stats
    print(name, stats)

save_to_json(sec_with_sed_stats, "runtime_stats/sec_with_sed_stats.json")

2022-04-19 13:58:54,802 [SPELLING_ERROR_DETECTION] [INFO] [39524] running spelling error detection on device NVIDIA GeForce GTX 1080 Ti (11,178MiB memory, 6.1 compute capability, 28 multiprocessors)
2022-04-19 13:59:02,145 [SPELLING_ERROR_CORRECTION] [INFO] [39524] running spelling error correction on device NVIDIA GeForce GTX 1080 Ti (11,178MiB memory, 6.1 compute capability, 28 multiprocessors)                       
2022-04-19 14:00:30,589 [SPELLING_ERROR_CORRECTION] [INFO] [39524] running spelling error correction on device NVIDIA GeForce GTX 1080 Ti (11,178MiB memory, 6.1 compute capability, 28 multiprocessors)                       


transformer_nmt (87.042771901999, 175867)


2022-04-19 14:01:37,531 [SPELLING_ERROR_CORRECTION] [INFO] [39524] running spelling error correction on device NVIDIA GeForce GTX 1080 Ti (11,178MiB memory, 6.1 compute capability, 28 multiprocessors)                       


transformer_words_nmt (65.56247996300044, 175867)


2022-04-19 14:02:33,148 [SPELLING_ERROR_CORRECTION] [INFO] [39524] running spelling error correction on device NVIDIA GeForce GTX 1080 Ti (11,178MiB memory, 6.1 compute capability, 28 multiprocessors)                       


transformer_nmt (65.56247996300044, 175867)


                                                                                                                                                                                                                               

transformer_words_nmt (65.56247996300044, 175867)


In [12]:
neuspell = sec.SECNeuspellBaseline("bert")
neuspell_stats = {}

with open(runtime_benchmark, "r", encoding="utf8") as inf:
    file_bytes = len(inf.read().encode("utf8"))
start = time.perf_counter()

batch_size = 16
inputs = utils.load_text_file(runtime_benchmark)
inputs = sorted(inputs, key=lambda s: len(s))
for i in tqdm(list(range(0, len(inputs), batch_size)), desc=f"running neuspell baseline {neuspell.name} on runtime benchmark"):
    batch_inputs = inputs[i:i+batch_size]
    _ = neuspell.inference(batch_inputs)

end = time.perf_counter()
neuspell_stats["neuspell"] = (end - start, file_bytes)
save_to_json(neuspell_stats, "runtime_stats/sec_neuspell.json")

loading vocab from path:/home/sebastian/anaconda3/envs/masters_thesis/lib/python3.8/site-packages/neuspell/../data/checkpoints/subwordbert-probwordnoise/vocab.pkl
initializing model


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


SubwordBert(
  (bert_dropout): Dropout(p=0.2, inplace=False)
  (bert_model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): 

running neuspell baseline neuspell_bert on runtime benchmark:   0%|          | 0/63 [00:00<?, ?it/s]