# BEA-2019 

In [34]:
import re, os, pandas as pd

# Parse an annotation ("A ...") line from an .m2 file:
#   A <start> <end>|||<error type>|||<correction>|||...
# group(1)/group(2) are the token offsets and group(3) is the correction text.
# Fields are delimited by the three-character "|||", so the error type and the
# correction are matched lazily up to the next "|||"; this keeps edits whose
# text contains a single literal "|" instead of silently failing to match.
# Negative offsets (e.g. "A -1 -1|||noop|||...") deliberately do not match.
A_RE = re.compile(r"^A (\d+) (\d+)\|\|\|.*?\|\|\|(.*?)\|\|\|")

def apply_edits(src, edits=None):
    """Return ``src`` with a list of token-level edits applied.

    Args:
        src: Source sentence; it is tokenised with ``str.split()``.
        edits: Optional iterable of ``(start, end, replacement)`` tuples.
            ``start``/``end`` are token offsets into ``src`` (end exclusive)
            and ``replacement`` is a whitespace-separated token string; ``""``
            or ``"-NONE-"`` deletes the span. When omitted, the list stored on
            ``apply_edits.edits`` is used.

    Returns:
        The edited tokens joined by single spaces.
    """
    if edits is None:
        edits = apply_edits.edits
    toks = src.split()
    # Apply edits right-to-left so the offsets of edits further left stay
    # valid. Ordering by (start, end) and then walking backwards guarantees
    # that, at the same start offset, a replacement/deletion is applied before
    # an insertion (otherwise the replacement would overwrite the freshly
    # inserted tokens) and that several insertions keep their listed order.
    ordered = sorted(edits, key=lambda edit: (edit[0], edit[1]))
    for s, e, repl in reversed(ordered):
        repl_toks = [] if repl in ("", "-NONE-") else repl.split()
        toks[s:e] = repl_toks
    return " ".join(toks)


apply_edits.edits = []  # default edit list, used when ``edits`` is not passed

def _emit_pair(pairs, src, edits):
    """Append ``(src, corrected)`` to ``pairs`` for one sentence."""
    # apply_edits reads its edit list from its ``edits`` function attribute.
    apply_edits.edits = edits
    pairs.append((src, apply_edits(src)))
    # Do not leave this sentence's edits behind in the shared attribute.
    apply_edits.edits = []


def m2_to_pairs(path, annotator_id=0):
    """Convert an .m2 file into ``(source, corrected)`` sentence pairs.

    A line starting with ``"S "`` opens a sentence, lines starting with
    ``"A "`` that match ``A_RE`` contribute edits to it, and an empty line (or
    the next ``"S "`` line, or end of file) closes it.

    Args:
        path: Path of the UTF-8 encoded .m2 file.
        annotator_id: Only edits whose last ``|||``-separated field equals this
            id are applied, so alternative corrections from different
            annotators are not merged into one target. Pass ``None`` to apply
            every matching edit regardless of annotator.

    Returns:
        A list of ``(source_sentence, corrected_sentence)`` tuples in file
        order. Sentences without applicable edits are returned with the
        whitespace-normalised source as their target.
    """
    wanted = None if annotator_id is None else str(annotator_id)
    pairs = []
    src = None
    edits = []
    with open(path, encoding="utf8") as f:
        for line in f:
            line = line.rstrip("\n")
            if line.startswith("S "):
                # A new sentence without a preceding blank line: flush first.
                if src is not None:
                    _emit_pair(pairs, src, edits)
                src, edits = line[2:], []
            elif line.startswith("A "):
                m = A_RE.match(line)
                if m is None:
                    # Lines A_RE rejects (e.g. negative offsets) carry no edit.
                    continue
                if wanted is not None and line.rsplit("|||", 1)[-1].strip() != wanted:
                    continue
                edits.append((int(m.group(1)), int(m.group(2)), m.group(3).strip()))
            elif line == "":  # sentence boundary
                if src is not None:
                    _emit_pair(pairs, src, edits)
                    src, edits = None, []
    # The file may end without a trailing blank line.
    if src is not None:
        _emit_pair(pairs, src, edits)
    return pairs

# ---- collect train/dev across files ----
m2_dir = "data/wi_locness/m2"
train, dev = [], []
# os.listdir() returns names in arbitrary order; sort them so the row order of
# the exported CSVs is reproducible.
for fname in sorted(os.listdir(m2_dir)):
    if not fname.endswith(".m2"):
        continue
    path = os.path.join(m2_dir, fname)
    # Files are routed by the split name embedded in the file name.
    if "train" in fname:
        train += m2_to_pairs(path)
    elif "dev" in fname:
        dev += m2_to_pairs(path)

pd.DataFrame(train, columns=["input_text","target_text"]).to_csv("bea_train.csv", index=False)
pd.DataFrame(dev,   columns=["input_text","target_text"]).to_csv("bea_dev.csv",   index=False)

print("Train pairs:", len(train), "Dev pairs:", len(dev))


Train pairs: 68616 Dev pairs: 8768


In [None]:
import pandas as pd

# Re-export the BEA training pairs. The list built by the BEA cell is named
# `train`; no `train_pairs` is defined in this notebook.
df = pd.DataFrame(train, columns=["input_text", "target_text"])
df.to_csv("bea_train.csv", index=False)

# NUCLE

In [8]:
# waiting for access

# JFLEG (Similar to CoNLL, but CoNLL Is Part of NUCLE and We Don't Have Access Yet)

In [9]:
# Imported here so the cell also runs on its own; in notebook order the only
# other import of load_dataset appears in a later cell.
from datasets import load_dataset

# Load the "test" split of JFLEG.
jfleg = load_dataset("jfleg", split="test")
jfleg

Dataset({
    features: ['sentence', 'corrections'],
    num_rows: 748
})

# WikiAuto

In [19]:
from datasets import load_dataset

# Load the "default" config from the auto-converted parquet branch, which
# avoids running the dataset's loading script.
wiki_auto = load_dataset("chaojiang06/wiki_auto", name="default", revision="refs/convert/parquet")

wiki_auto

DatasetDict({
    train: Dataset({
        features: ['alignment_label', 'normal_sentence_id', 'simple_sentence_id', 'normal_sentence', 'simple_sentence', 'gleu_score'],
        num_rows: 373801
    })
    validation: Dataset({
        features: ['alignment_label', 'normal_sentence_id', 'simple_sentence_id', 'normal_sentence', 'simple_sentence', 'gleu_score'],
        num_rows: 73249
    })
    test: Dataset({
        features: ['alignment_label', 'normal_sentence_id', 'simple_sentence_id', 'normal_sentence', 'simple_sentence', 'gleu_score'],
        num_rows: 118074
    })
})

# ASSET

In [20]:
# Load every available split of ASSET and display the resulting DatasetDict.
asset = load_dataset(path="asset")
asset

DatasetDict({
    validation: Dataset({
        features: ['original', 'simplifications'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['original', 'simplifications'],
        num_rows: 359
    })
})