In [1]:
# Enable IPython's autoreload extension. Mode 2 reloads all imported modules
# before each cell execution, so edits to the project packages imported in the
# next cell take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
from tqdm.notebook import tqdm

from spell_checking.baselines import sec
from spell_checking import BENCHMARK_DIR

from nsc.api import utils
from nsc.data.utils import clean_sequence

In [3]:
def run_neuspell_with_detections(
    baseline: sec.SECNeuspellBaseline,
    input_file: str,
    detection_file: str,
    batch_size: int = 16,
) -> list:
    """Run a Neuspell baseline on an input file, passing along precomputed detections.

    Args:
        baseline: Object whose ``inference`` method is called once per batch and
            whose ``name`` attribute is shown in the progress bar description.
        input_file: Path handed to ``utils.load_text_file`` to obtain the input
            sequences. Its path relative to ``BENCHMARK_DIR`` is shown in the
            progress bar description.
        detection_file: Path handed to ``utils.load_text_file`` to obtain the
            detections; every entry is split on whitespace and each token is
            converted with ``int``.
        batch_size: Number of consecutive sequences passed to
            ``baseline.inference`` per call.

    Returns:
        A flat list containing the outputs of all ``baseline.inference`` calls,
        concatenated in batch order.
    """
    sequences = utils.load_text_file(input_file)
    detections = [
        [int(flag) for flag in line.split()]
        for line in utils.load_text_file(detection_file)
    ]

    batch_starts = list(range(0, len(sequences), batch_size))
    description = f"running neuspell baseline {baseline.name} on {os.path.relpath(input_file, BENCHMARK_DIR)}"

    all_outputs = []
    for start in tqdm(batch_starts, desc=description):
        end = start + batch_size
        batch_inputs, batch_detections = [], []
        # NOTE(review): zip stops at the shorter slice, so a detection file with
        # fewer entries than the input file silently drops inputs -- confirm both
        # files are always aligned.
        for sequence, detection in zip(sequences[start:end], detections[start:end]):
            cleaned = clean_sequence(sequence, fix_unicode_errors=True)
            word_count_changed = len(cleaned.split()) != len(sequence.split())
            if word_count_changed:
                print("found input containing unicode that will be removed by neuspell, adapting detections:", sequence, cleaned, detection)
                # NOTE(review): only the first detection is dropped, which presumes
                # exactly one word at the start of the sequence gets removed -- confirm.
                detection = detection[1:]
            # The original (uncleaned) sequence is what gets passed to the baseline.
            batch_inputs.append(sequence)
            batch_detections.append(detection)
        all_outputs.extend(baseline.inference(batch_inputs, detections=batch_detections))
    return all_outputs

In [4]:
# Instantiate the Neuspell baseline once with the "bert" model name; the
# benchmark loop further down passes this instance to
# run_neuspell_with_detections for every benchmark.
bert = sec.SECNeuspellBaseline("bert")

data folder is set to `/home/sebastian/anaconda3/envs/masters_thesis/lib/python3.8/site-packages/neuspell/../data` script
loading vocab from path:/home/sebastian/anaconda3/envs/masters_thesis/lib/python3.8/site-packages/neuspell/../data/checkpoints/subwordbert-probwordnoise/vocab.pkl
initializing model


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


SubwordBert(
  (bert_dropout): Dropout(p=0.2, inplace=False)
  (bert_model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): 

In [5]:
# Benchmarks to process, written as "<dataset>/<variant>". The mapping keeps the
# variants grouped per dataset; insertion order determines the processing order.
_benchmark_variants = {
    "wikidump": ("artificial", "realistic"),
    "bookcorpus": ("artificial", "realistic"),
    "neuspell": ("bea60k", "bea322", "bea4660", "jfleg"),
}
benchmarks = [
    f"{dataset}/{variant}"
    for dataset, variants in _benchmark_variants.items()
    for variant in variants
]

In [7]:
# File names are the same for every benchmark, so they are set once up front.
out_name = "gnn_cliques_wfc_plus_baseline_neuspell_bert.txt"
sed_words_file = "gnn_cliques_wfc.txt"

for benchmark in benchmarks:
    input_file = os.path.join(BENCHMARK_DIR, "test", "sec", benchmark, "corrupt.txt")

    # Detections for bea322 and bea4660 are read from the "sec" results tree;
    # every other benchmark reads them from the "sed_words" results tree.
    benchmark_name = benchmark.split("/")[1]
    if benchmark_name in {"bea322", "bea4660"}:
        detection_task = "sec"
    else:
        detection_task = "sed_words"
    detection_file = os.path.join(BENCHMARK_DIR, "test", detection_task, "results", benchmark, sed_words_file)
    out_file = os.path.join(BENCHMARK_DIR, "test", "sec", "results", benchmark, out_name)

    # An existing output file acts as a cache: the benchmark is not re-run.
    if os.path.exists(out_file):
        print(f"file for benchmark {benchmark} already exists, skipping")
    else:
        outputs = run_neuspell_with_detections(bert, input_file, detection_file)
        utils.save_text_file(out_file, outputs)

file for benchmark wikidump/artificial already exists, skipping
file for benchmark wikidump/realistic already exists, skipping
file for benchmark bookcorpus/artificial already exists, skipping
file for benchmark bookcorpus/realistic already exists, skipping
file for benchmark neuspell/bea60k already exists, skipping


running neuspell baseline neuspell_bert on test/sec/neuspell/bea322/corrupt.txt:   0%|          | 0/21 [00:00<…

running neuspell baseline neuspell_bert on test/sec/neuspell/bea4660/corrupt.txt:   0%|          | 0/292 [00:0…

running neuspell baseline neuspell_bert on test/sec/neuspell/jfleg/corrupt.txt:   0%|          | 0/101 [00:00<…

In [8]:
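# NOTE: alternative benchmark set, currently disabled. Uncommenting it
# replaces the `benchmarks` list defined in the earlier cell.
# benchmarks = [
#     "spelling_correction/neuspell",
#     "spelling_correction/wikibook"
# ]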

In [10]:
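# NOTE: disabled variant of the benchmark loop that reads detections from the
# "sec" results tree for every benchmark. The progress output still attached to
# this cell presumably stems from an earlier run when the code was active.
# for benchmark in benchmarks: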
#     input_file = os.path.join(BENCHMARK_DIR, "test", "sec", benchmark, "corrupt.txt")
    
#     out_name = "gnn_cliques_wfc_plus_baseline_neuspell_bert.txt"
#     sed_words_file = "gnn_cliques_wfc.txt"
        
#     detection_file = os.path.join(BENCHMARK_DIR, "test", "sec", "results", benchmark, sed_words_file)
#     out_file = os.path.join(BENCHMARK_DIR, "test", "sec", "results", benchmark, out_name)
    
#     if os.path.exists(out_file):
#         print(f"file for benchmark {benchmark} already exists, skipping")
#         continue
    
#     outputs = run_neuspell_with_detections(bert, input_file, detection_file)
#     utils.save_text_file(out_file, outputs)

running neuspell baseline neuspell_bert on test/sec/spelling_correction/neuspell/corrupt.txt:   0%|          |…

running neuspell baseline neuspell_bert on test/sec/spelling_correction/wikibook/corrupt.txt:   0%|          |…