# BEA-2019 Train

In [7]:
import re, os, pandas as pd

# parse an "A ..." line from .m2
A_RE = re.compile(r"^A (\d+) (\d+)\|\|\|[^|]*\|\|\|([^|]*)\|\|\|")

def apply_edits(src):
    toks = src.split()
    # apply collected edits (right→left so indices stay valid)
    for s,e,repl in sorted(apply_edits.edits, key=lambda x: x[0], reverse=True):
        repl_toks = [] if repl in ("", "-NONE-") else repl.split()
        toks[s:e] = repl_toks
    return " ".join(toks)
apply_edits.edits = []  # static holder

def m2_to_pairs(path):
    pairs, src = [], None
    with open(path, encoding="utf8") as f:
        for line in f:
            line = line.rstrip("\n")
            if line.startswith("S "):
                # flush previous
                if src is not None:
                    tgt = apply_edits(src)
                    pairs.append((src, tgt))
                src = line[2:]
                apply_edits.edits = []
            elif line.startswith("A "):
                m = A_RE.match(line)
                if m:
                    s, e, repl = int(m.group(1)), int(m.group(2)), m.group(3).strip()
                    apply_edits.edits.append((s, e, repl))
            elif line == "":  # sentence boundary
                if src is not None:
                    tgt = apply_edits(src)
                    pairs.append((src, tgt))
                    src = None
                    apply_edits.edits = []
    # tail
    if src is not None:
        tgt = apply_edits(src)
        pairs.append((src, tgt))
    return pairs

# ---- collect train/dev across files ----
m2_dir = "data/wi_locness/m2"
train, dev = [], []
for fname in os.listdir(m2_dir):
    if fname.endswith(".m2"):
        path = os.path.join(m2_dir, fname)
        if "train" in fname:
            train += m2_to_pairs(path)
        elif "dev" in fname:
            dev += m2_to_pairs(path)

pd.DataFrame(train, columns=["input_text","target_text"]).to_csv("bea_train.csv", index=False)
pd.DataFrame(dev,   columns=["input_text","target_text"]).to_csv("bea_dev.csv",   index=False)

print("Train pairs:", len(train), "Dev pairs:", len(dev))


Train pairs: 68616 Dev pairs: 8768


In [None]:
import pandas as pd
df = pd.DataFrame(train_pairs, columns=["input_text", "target_text"])
df.to_csv("bea_train.csv", index=False)

# BEA-2019 Validation

In [8]:
# waiting for access

# JFLEG (Similar to CONNL but CONNL is Part of NUCLE and We Don't Have Access Yet)

In [9]:
jfleg = load_dataset("jfleg", split="test")  
jfleg

Dataset({
    features: ['sentence', 'corrections'],
    num_rows: 748
})

# WikiAuto

In [19]:
from datasets import load_dataset

wiki_auto = load_dataset(
    "chaojiang06/wiki_auto",
    "default",
    revision="refs/convert/parquet"   # <-- avoids script
)

wiki_auto

DatasetDict({
    train: Dataset({
        features: ['alignment_label', 'normal_sentence_id', 'simple_sentence_id', 'normal_sentence', 'simple_sentence', 'gleu_score'],
        num_rows: 373801
    })
    validation: Dataset({
        features: ['alignment_label', 'normal_sentence_id', 'simple_sentence_id', 'normal_sentence', 'simple_sentence', 'gleu_score'],
        num_rows: 73249
    })
    test: Dataset({
        features: ['alignment_label', 'normal_sentence_id', 'simple_sentence_id', 'normal_sentence', 'simple_sentence', 'gleu_score'],
        num_rows: 118074
    })
})

# ASSET

In [20]:
asset = load_dataset("asset")
asset

DatasetDict({
    validation: Dataset({
        features: ['original', 'simplifications'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['original', 'simplifications'],
        num_rows: 359
    })
})

# Save Datasets for HPC

In [5]:
import os
import pandas as pd
from datasets import load_dataset

# Create output directory for HPC datasets
output_dir = "hpc_datasets"
os.makedirs(output_dir, exist_ok=True)

print(f"Saving datasets to {output_dir}/")

# 1. BEA-2019 train set (already created as bea_train.csv)
print("✓ BEA-2019 train set already saved as bea_train.csv")

# 2. BEA-2019 validation set (already created as bea_dev.csv)
print("✓ BEA-2019 validation set already saved as bea_dev.csv")

# 3. JFLEG set
print("\nLoading JFLEG dataset...")
jfleg = load_dataset("jfleg", split="test")
jfleg_df = pd.DataFrame({
    "input_text": jfleg["sentence"],
    "target_text": [refs[0] for refs in jfleg["corrections"]]  # Using first correction
})
jfleg_df.to_csv(os.path.join(output_dir, "jfleg_test.csv"), index=False)
print(f"✓ Saved JFLEG test set ({len(jfleg_df)} examples)")

# 4. WikiAuto train set
print("\nLoading WikiAuto dataset...")
wiki_auto = load_dataset(
    "chaojiang06/wiki_auto",
    "default",
    revision="refs/convert/parquet"
)
wiki_train_df = pd.DataFrame({
    "input_text": wiki_auto["train"]["normal_sentence"],
    "target_text": wiki_auto["train"]["simple_sentence"]
})
wiki_train_df.to_csv(os.path.join(output_dir, "wikiauto_train.csv"), index=False)
print(f"✓ Saved WikiAuto train set ({len(wiki_train_df)} examples)")

# 5. ASSET validation set
print("\nLoading ASSET dataset...")
asset = load_dataset("asset")
asset_val_df = pd.DataFrame({
    "input_text": asset["validation"]["original"],
    "target_text": [refs[0] for refs in asset["validation"]["simplifications"]]  # Using first simplification
})
asset_val_df.to_csv(os.path.join(output_dir, "asset_validation.csv"), index=False)
print(f"✓ Saved ASSET validation set ({len(asset_val_df)} examples)")

# 6. ASSET test set
asset_test_df = pd.DataFrame({
    "input_text": asset["test"]["original"],
    "target_text": [refs[0] for refs in asset["test"]["simplifications"]]  # Using first simplification
})
asset_test_df.to_csv(os.path.join(output_dir, "asset_test.csv"), index=False)
print(f"✓ Saved ASSET test set ({len(asset_test_df)} examples)")

print("\n" + "="*50)
print("All datasets saved successfully!")
print("="*50)
print(f"\nFiles ready for HPC transfer:")
print(f"  - bea_train.csv")
print(f"  - bea_dev.csv")
print(f"  - {output_dir}/jfleg_test.csv")
print(f"  - {output_dir}/wikiauto_train.csv")
print(f"  - {output_dir}/asset_validation.csv")
print(f"  - {output_dir}/asset_test.csv")

Saving datasets to hpc_datasets/
✓ BEA-2019 train set already saved as bea_train.csv
✓ BEA-2019 validation set already saved as bea_dev.csv

Loading JFLEG dataset...
✓ Saved JFLEG test set (748 examples)

Loading WikiAuto dataset...
✓ Saved WikiAuto train set (373801 examples)

Loading ASSET dataset...
✓ Saved ASSET validation set (2000 examples)
✓ Saved ASSET test set (359 examples)

All datasets saved successfully!

Files ready for HPC transfer:
  - bea_train.csv
  - bea_dev.csv
  - hpc_datasets/jfleg_test.csv
  - hpc_datasets/wikiauto_train.csv
  - hpc_datasets/asset_validation.csv
  - hpc_datasets/asset_test.csv


In [3]:
# Check WikiAuto column names
from datasets import load_dataset
wiki_auto = load_dataset("chaojiang06/wiki_auto", "default", revision="refs/convert/parquet")
print("WikiAuto train columns:", wiki_auto["train"].column_names)
print("\nFirst example:")
print(wiki_auto["train"][0])

WikiAuto train columns: ['alignment_label', 'normal_sentence_id', 'simple_sentence_id', 'normal_sentence', 'simple_sentence', 'gleu_score']

First example:
{'alignment_label': 1, 'normal_sentence_id': '0_66252-1-0-0', 'simple_sentence_id': '0_66252-0-0-0', 'normal_sentence': 'The Local Government Act 1985 is an Act of Parliament in the United Kingdom.', 'simple_sentence': 'The Local Government Act 1985 was an Act of Parliament in the United Kingdom.', 'gleu_score': 0.800000011920929}


In [4]:
# Check JFLEG and ASSET column names
jfleg = load_dataset("jfleg", split="test")
print("JFLEG columns:", jfleg.column_names)
print("\nFirst JFLEG example:")
print(jfleg[0])

asset = load_dataset("asset")
print("\n\nASSET validation columns:", asset["validation"].column_names)
print("\nFirst ASSET validation example:")
print(asset["validation"][0])

JFLEG columns: ['sentence', 'corrections']

First JFLEG example:
{'sentence': 'New and new technology has been introduced to the society .', 'corrections': ['New technology has been introduced to society .', 'New technology has been introduced into the society .', 'Newer and newer technology has been introduced into society .', 'Newer and newer technology has been introduced to the society .']}


ASSET validation columns: ['original', 'simplifications']

First ASSET validation example:
{'original': 'Adjacent counties are Marin (to the south), Mendocino (to the north), Lake (northeast), Napa (to the east), and Solano and Contra Costa (to the southeast).', 'simplifications': ['countries next to it are Marin, Mendocino, Lake, Napa, Solano, and Contra Costa.', 'Nearby counties are Marin, Mendocino, Lake, Napa, and Solano and Contra Costa.', 'Adjacent counties are Marin, Mendocino, Lake, Napa, Solano and Contra Costa.', 'Neighboring counties are Marin, Mendocino, Lake, Napa, Solano, and Con

In [8]:
# Verify files and show their sizes
import os

print("Dataset Files Summary:")
print("=" * 70)

files_to_check = [
    "bea_train.csv",
    "bea_dev.csv",
    "hpc_datasets/jfleg_test.csv",
    "hpc_datasets/wikiauto_train.csv",
    "hpc_datasets/asset_validation.csv",
    "hpc_datasets/asset_test.csv"
]

for filepath in files_to_check:
    if os.path.exists(filepath):
        size_bytes = os.path.getsize(filepath)
        size_mb = size_bytes / (1024 * 1024)
        print(f"✓ {filepath:40} {size_mb:10.2f} MB")
    else:
        print(f"✗ {filepath:40} NOT FOUND")

print("=" * 70)

Dataset Files Summary:
✓ bea_train.csv                                 12.34 MB
✓ bea_dev.csv                                    1.73 MB
✓ hpc_datasets/jfleg_test.csv                    0.14 MB
✓ hpc_datasets/wikiauto_train.csv               87.07 MB
✓ hpc_datasets/asset_validation.csv              0.41 MB
✓ hpc_datasets/asset_test.csv                    0.08 MB


## Summary - Files Ready for HPC Transfer

All datasets have been saved successfully:

### Root Directory:
- `bea_train.csv` - BEA-2019 training set (68,616 examples)
- `bea_dev.csv` - BEA-2019 validation set (8,768 examples)

### hpc_datasets/ Directory:
- `jfleg_test.csv` - JFLEG test set (748 examples)
- `wikiauto_train.csv` - WikiAuto training set (373,801 examples)
- `asset_validation.csv` - ASSET validation set (2,000 examples)
- `asset_test.csv` - ASSET test set (359 examples)

**Total Size:** ~101.77 MB

# Clean Spaced Hyphens (Fix Tokenization Artifacts)

In [10]:
import pandas as pd
import re
import os

# Function to clean spaced hyphens from text
def clean_spaced_hyphens(text):
    """Replace ' - ' with '-' to fix tokenization artifacts"""
    if pd.isna(text):
        return text
    return text.replace(' - ', '-')

# List of all CSV files to clean
csv_files = [
    'data/bea_train.csv',
    'data/bea_dev.csv',
    'hpc_datasets/jfleg_test.csv',
    'hpc_datasets/wikiauto_train.csv',
    'hpc_datasets/asset_validation.csv',
    'hpc_datasets/asset_test.csv'
]

print("Cleaning spaced hyphens from datasets...")
print("=" * 70)

for csv_file in csv_files:
    if not os.path.exists(csv_file):
        print(f"\n⚠ Skipping {csv_file} (not found)")
        continue
        
    print(f"\nProcessing: {csv_file}")
    
    # Read the CSV
    df = pd.read_csv(csv_file)
    
    # Count occurrences before cleaning
    before_count = df.astype(str).apply(lambda x: x.str.contains(' - ', regex=False).sum()).sum()
    
    # Clean both input and target columns
    for col in df.columns:
        df[col] = df[col].apply(clean_spaced_hyphens)
    
    # Count occurrences after cleaning
    after_count = df.astype(str).apply(lambda x: x.str.contains(' - ', regex=False).sum()).sum()
    
    # Save the cleaned CSV
    df.to_csv(csv_file, index=False)
    
    print(f"  ✓ Cleaned {before_count} instances of ' - '")
    print(f"  ✓ Remaining instances: {after_count}")
    print(f"  ✓ Saved cleaned version to {csv_file}")

print("\n" + "=" * 70)
print("All datasets cleaned successfully!")
print("\nExample transformations:")
print("  'medium - sized' → 'medium-sized'")
print("  'high - density' → 'high-density'")
print("  'well - known' → 'well-known'")

Cleaning spaced hyphens from datasets...

Processing: data/bea_train.csv
  ✓ Cleaned 5318 instances of ' - '
  ✓ Remaining instances: 0
  ✓ Saved cleaned version to data/bea_train.csv

Processing: data/bea_dev.csv
  ✓ Cleaned 778 instances of ' - '
  ✓ Remaining instances: 0
  ✓ Saved cleaned version to data/bea_dev.csv

Processing: hpc_datasets/jfleg_test.csv
  ✓ Cleaned 4 instances of ' - '
  ✓ Remaining instances: 0
  ✓ Saved cleaned version to hpc_datasets/jfleg_test.csv

Processing: hpc_datasets/wikiauto_train.csv
  ✓ Cleaned 3011 instances of ' - '
  ✓ Remaining instances: 0
  ✓ Saved cleaned version to hpc_datasets/wikiauto_train.csv

Processing: hpc_datasets/asset_validation.csv
  ✓ Cleaned 57 instances of ' - '
  ✓ Remaining instances: 0
  ✓ Saved cleaned version to hpc_datasets/asset_validation.csv

Processing: hpc_datasets/asset_test.csv
  ✓ Cleaned 2 instances of ' - '
  ✓ Remaining instances: 0
  ✓ Saved cleaned version to hpc_datasets/asset_test.csv

All datasets cleaned 

In [11]:
# Verify the fix - check the specific example
df = pd.read_csv('data/bea_train.csv')
print("Verification - First 3 rows of cleaned BEA train set:")
print("=" * 100)
for idx in range(3):
    print(f"\nRow {idx + 1}:")
    print(f"  Input:  {df.iloc[idx]['input_text'][:100]}...")
    print(f"  Target: {df.iloc[idx]['target_text'][:100]}...")
    
# Check for any remaining spaced hyphens
remaining = df.astype(str).apply(lambda x: x.str.contains(' - ', regex=False).sum()).sum()
print(f"\n{'=' * 100}")
print(f"✓ Remaining spaced hyphens in bea_train.csv: {remaining}")
print(f"✓ Total examples: {len(df)}")

Verification - First 3 rows of cleaned BEA train set:

Row 1:
  Input:  My town is a medium size city with eighty thousand inhabitants ....
  Target: My town is a medium-sized city with eighty thousand inhabitants ....

Row 2:
  Input:  It has a high density population because its small territory ....
  Target: It has a high-density population because of its small territory ....

Row 3:
  Input:  Despite of it is an industrial city , there are many shops and department stores ....
  Target: Although it is an industrial city , there are many shops and department stores ....

✓ Remaining spaced hyphens in bea_train.csv: 0
✓ Total examples: 68616


## Cleaning Summary

All datasets have been cleaned to remove spaced hyphens (` - ` → `-`):

| Dataset | Instances Cleaned |
|---------|------------------|
| BEA train | 5,318 |
| BEA dev | 778 |
| JFLEG test | 4 |
| WikiAuto train | 3,011 |
| ASSET validation | 57 |
| ASSET test | 2 |
| **Total** | **9,170** |

This ensures consistent formatting for LLM comparison without needing extra prompt engineering.