In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import numpy as np

from nsc.api import utils
from nsc.data import preprocessing

from spell_checking import BENCHMARK_DIR, MISSPELLINGS_DIR

In [3]:
# Benchmarks referred to as "our own" in this notebook: each corpus in both noise flavours.
own_benchmarks = [
    f"{corpus}/{flavour}"
    for corpus in ("wikidump", "bookcorpus")
    for flavour in ("realistic", "artificial")
]
# Benchmarks stored under the neuspell/ prefix.
neuspell_benchmarks = [
    f"neuspell/{name}"
    for name in ("bea322", "bea4660", "bea60k", "jfleg")
]

def generate_combined_benchmark(
    benchmarks,
    samples_per_benchmark,
    seed=22,
    lowercase_benchmarks=("neuspell/bea322", "neuspell/bea4660"),
):
    """Draw a random, aligned sample from several benchmarks and concatenate them.

    For every benchmark (visited in sorted name order) the files
    ``<BENCHMARK_DIR>/test/sec/<benchmark>/corrupt.txt`` and ``.../correct.txt``
    are loaded, the same randomly chosen line indices are taken from both, and
    the picks are appended to the combined result.

    Args:
        benchmarks: iterable of benchmark names relative to
            ``<BENCHMARK_DIR>/test/sec``.
        samples_per_benchmark: maximum number of line pairs taken per
            benchmark; a benchmark with fewer lines contributes all of them.
        seed: seed of the single random generator shared by all benchmarks.
            The default reproduces the sampling this notebook always used.
        lowercase_benchmarks: iterable of benchmark names (not a single
            string) whose samples get the flag ``"1"``; all others get ``"0"``.
            NOTE(review): presumably these are benchmarks that should be
            evaluated lowercased -- confirm with the consumer of lowercase.txt.

    Returns:
        Tuple ``(correct_lines, corrupt_lines, lowercase_only)`` of three
        equally long lists; entry ``k`` of each list belongs to the same
        sampled line.

    Raises:
        ValueError: if a benchmark's correct and corrupt files differ in
            length, because index-based pairing would then be meaningless.
    """
    rand = np.random.default_rng(seed)
    lowercase_benchmarks = frozenset(lowercase_benchmarks)
    correct_lines = []
    corrupt_lines = []
    lowercase_only = []
    # Sorting makes the result independent of the order the caller lists the
    # benchmarks in, since all benchmarks consume the same random stream.
    for benchmark in sorted(benchmarks):
        benchmark_dir = os.path.join(BENCHMARK_DIR, "test", "sec", benchmark)
        corrupt = utils.load_text_file(os.path.join(benchmark_dir, "corrupt.txt"))
        correct = utils.load_text_file(os.path.join(benchmark_dir, "correct.txt"))
        if len(corrupt) != len(correct):
            raise ValueError(
                f"benchmark {benchmark!r}: corrupt.txt has {len(corrupt)} lines "
                f"but correct.txt has {len(correct)}"
            )
        indices = rand.permutation(len(corrupt))[:samples_per_benchmark]
        correct_lines.extend([correct[idx] for idx in indices])
        corrupt_lines.extend([corrupt[idx] for idx in indices])
        flag = "1" if benchmark in lowercase_benchmarks else "0"
        lowercase_only.extend([flag] * len(indices))
    return correct_lines, corrupt_lines, lowercase_only

In [3]:
# Sample our own benchmarks into one combined benchmark and store it as "wikibook".
own_correct, own_corrupt, _ = generate_combined_benchmark(own_benchmarks, 200)
print(len(own_correct))
wikibook_dir = os.path.join(BENCHMARK_DIR, "test", "sec", "spelling_correction", "wikibook")
for file_name, lines in (("correct.txt", own_correct), ("corrupt.txt", own_corrupt)):
    utils.save_text_file(os.path.join(wikibook_dir, file_name), lines)

In [3]:
# Sample the neuspell benchmarks into one combined benchmark; the per-line
# lowercase flags are stored next to the sentence files.
neuspell_correct, neuspell_corrupt, neuspell_lowercase = generate_combined_benchmark(neuspell_benchmarks, 200)
print(len(neuspell_correct))
neuspell_dir = os.path.join(BENCHMARK_DIR, "test", "sec", "spelling_correction", "neuspell")
neuspell_outputs = (
    ("correct.txt", neuspell_correct),
    ("corrupt.txt", neuspell_corrupt),
    ("lowercase.txt", neuspell_lowercase),
)
for file_name, lines in neuspell_outputs:
    utils.save_text_file(os.path.join(neuspell_dir, file_name), lines)

In [12]:
# Runtime benchmark: samples from our own benchmarks and the neuspell ones combined.
runtime_correct, runtime_corrupt, _ = generate_combined_benchmark(own_benchmarks + neuspell_benchmarks, 200)
utils.save_text_file(os.path.join(BENCHMARK_DIR, "test", "runtime.correct.txt"), runtime_correct)
utils.save_text_file(os.path.join(BENCHMARK_DIR, "test", "runtime.corrupt.txt"), runtime_corrupt)

# Variant of the same benchmark with whitespace errors added to the corrupted side.
# generate_combined_benchmark reseeds its generator on every call, so sampling a
# second time would only reload the files and return the identical lines; the
# sample from above is reused instead.
cfg = preprocessing.WhitespaceNoiseConfig(no_whitespace_p=0, full_whitespace_p=0, insert_whitespace_p=0.025, delete_whitespace_p=0.1)
whitespace_noise = preprocessing.WhitespaceNoise(cfg=cfg, seed=22)

# The noised text gets its own name so that runtime_corrupt keeps meaning
# "corrupted, without extra whitespace noise" after this cell has run.
runtime_whitespace_corrupt, _, _ = whitespace_noise.apply(runtime_corrupt, runtime_corrupt, [{}] * len(runtime_corrupt))
utils.save_text_file(os.path.join(BENCHMARK_DIR, "test", "runtime.whitespaces.correct.txt"), runtime_correct)
utils.save_text_file(os.path.join(BENCHMARK_DIR, "test", "runtime.whitespaces.corrupt.txt"), runtime_whitespace_corrupt)

print(len(runtime_correct))

1600


In [4]:
# Root folder below which the whitespace benchmark variants are written.
output_dir = os.path.join(
    BENCHMARK_DIR,
    "test",
    "sec",
    "whitespace",
)

In [6]:
# Misspellings file that is handed to MixedNoiseConfig as word_misspellings_file.
word_misspellings_file = os.path.join(
    MISSPELLINGS_DIR,
    "test_misspellings.json",
)

In [8]:
# Build the whitespace benchmark grid: every spelling-error level is combined with
# every whitespace-error level and written to <output_dir>/<err level>-<ws level>/.
err_noise_levels = [0.05, 0.2]
err_level_names = ["low", "high"]
ws_ins_noise_levels = [0.025, 0.1]
ws_del_noise_levels = [0.1, 0.4]
ws_level_names = ["low", "high"]
for i, (err_noise, err_level_name) in enumerate(zip(err_noise_levels, err_level_names)):
    error_cfg = preprocessing.MixedNoiseConfig(
        edit_token_p=err_noise,
        artificial_p=0.5,
        artificial_num_edits_p=0.8,
        re_weight_edit_token_p=True,
        word_misspellings_file=word_misspellings_file,
    )
    error_noise = preprocessing.MixedNoise(cfg=error_cfg, seed=22 + i)

    # Spelling errors are injected once per error level, starting from the clean text.
    own_corrupt_noised, _, _ = error_noise.apply(
        own_correct,
        own_correct,
        [{}] * len(own_correct)
    )
    for j, (ws_ins_noise, ws_del_noise, ws_level_name) in enumerate(zip(ws_ins_noise_levels, ws_del_noise_levels, ws_level_names)):
        ws_cfg = preprocessing.WhitespaceNoiseConfig(
            no_whitespace_p=0,
            full_whitespace_p=0,
            insert_whitespace_p=ws_ins_noise,
            delete_whitespace_p=ws_del_noise,
        )
        whitespace_noise = preprocessing.WhitespaceNoise(cfg=ws_cfg, seed=22 + j)

        # Each whitespace level starts from the spelling-noised text and is bound to
        # its own name. Re-assigning own_corrupt_noised here (as the cell used to do)
        # made the "high" level run on text that already carried the "low" whitespace
        # noise, so the *-high files were cumulative instead of independent.
        # NOTE: regenerating therefore changes the contents of the *-high variants.
        own_corrupt_ws_noised, _, _ = whitespace_noise.apply(
            own_corrupt_noised,
            own_corrupt_noised,
            [{}] * len(own_corrupt_noised)
        )
        level_name = f"{err_level_name}-{ws_level_name}"
        corrupt_output_file = os.path.join(output_dir, level_name, "corrupt.txt")
        correct_output_file = os.path.join(output_dir, level_name, "correct.txt")

        utils.save_text_file(corrupt_output_file, own_corrupt_ws_noised)
        utils.save_text_file(correct_output_file, own_correct)