# Dataset Fuzzer Builder

This notebook augments `synth_dataset_v2.json` with additional obfuscated variants while keeping span annotations aligned.


## Setup
Load the source dataset and define shared constants.


In [7]:
from __future__ import annotations

import json
from copy import deepcopy
from dataclasses import asdict
from pathlib import Path
from typing import Dict, Iterable, List, Tuple

import pandas as pd

DATA_DIR = Path('..', 'data').resolve()
SOURCE_DATASET = DATA_DIR / 'synth_dataset_v2.json'
BACKUP_DATASET = DATA_DIR / 'synth_dataset_v2.original.json'
OBFUSCATION_TYPES = ['None', '1-space', '5-space', 'textualization']

print(f'Source dataset: {SOURCE_DATASET}')
print(f'Exists: {SOURCE_DATASET.exists()}')


Source dataset: E:\UCLA\UCLA\263\cs263-final-project\data\synth_dataset_v2.json
Exists: True


## Inspect base dataset
Check the first few records and entity spans to understand the schema.


In [8]:
with SOURCE_DATASET.open(encoding='utf-8') as f:
    base_records = json.load(f)

print(f'Original record count: {len(base_records)}')
example = base_records[0]
print('Example keys:', list(example.keys()))
example


Original record count: 1500
Example keys: ['full_text', 'masked', 'spans', 'template_id', 'metadata']


{'full_text': 'The address of Persint is 6750 Koskikatu 25 Apt. 864\nArtilleros\n, CO\n Uruguay 64677',
 'masked': 'The address of {{ORGANIZATION}} is {{STREET_ADDRESS}}',
 'spans': [{'entity_type': 'STREET_ADDRESS',
   'entity_value': '6750 Koskikatu 25 Apt. 864\nArtilleros\n, CO\n Uruguay 64677',
   'start_position': 26,
   'end_position': 83},
  {'entity_type': 'ORGANIZATION',
   'entity_value': 'Persint',
   'start_position': 15,
   'end_position': 22}],
 'template_id': 87,
 'metadata': None}

## Obfuscation helpers
Implement text transformations and span remapping. Each transformer returns the new text and a mapping from original character offsets to new offsets.


In [9]:
DIGIT_MAP = {
    '0': 'zero', '1': 'one', '2': 'two', '3': 'three', '4': 'four',
    '5': 'five', '6': 'six', '7': 'seven', '8': 'eight', '9': 'nine',
}
SYMBOL_MAP = {
    '@': ['at'], '.': ['dot'], '-': ['dash'], '_': ['underscore'],
    '+': ['plus'], '#': ['hash'], '$': ['dollar'], '/': ['slash'],
}


def transform_text(text: str, mode: str) -> Tuple[str, List[int]]:
    if mode not in OBFUSCATION_TYPES:
        raise ValueError(f'Unsupported mode: {mode}')

    if mode == 'None':
        return text, list(range(len(text) + 1))

    result: List[str] = []
    index_map: List[int] = [0]
    curr = 0
    text_length = len(text)

    for i, ch in enumerate(text):
        segment = ch
        if mode == '1-space':
            segment = ch
            result.append(segment)
            curr += len(segment)
            index_map.append(curr)
            if i != text_length - 1:
                result.append(' ')
                curr += 1
            continue

        if mode == '5-space':
            segment = ch
            result.append(segment)
            curr += len(segment)
            index_map.append(curr)
            if i != text_length - 1 and (i + 1) % 5 == 0:
                result.append(' ')
                curr += 1
            continue

        if mode == 'textualization':
            next_char = text[i + 1] if i + 1 < text_length else ''
            if ch.isdigit():
                term = DIGIT_MAP[ch]
                prefix = '' if not result or result[-1].endswith(' ') else ' '
                suffix = '' if (not next_char or next_char.isspace()) else ' '
                segment = f'{prefix}{term}{suffix}'
            elif ch in SYMBOL_MAP:
                term = ' '.join(SYMBOL_MAP[ch])
                prefix = '' if not result or result[-1].endswith(' ') else ' '
                suffix = '' if (not next_char or next_char.isspace()) else ' '
                segment = f'{prefix}{term}{suffix}'
            else:
                segment = ch
            result.append(segment)
            curr += len(segment)
            index_map.append(curr)
            continue

        raise RuntimeError('Unhandled mode control flow')

    transformed = ''.join(result)
    return transformed, index_map


def remap_spans(spans: List[Dict[str, object]], index_map: List[int], new_text: str) -> List[Dict[str, object]]:
    remapped = []
    for span in spans:
        start = span['start_position']
        end = span['end_position']
        new_start = index_map[start]
        new_end = index_map[end]
        remapped.append({
            **span,
            'start_position': new_start,
            'end_position': new_end,
            'entity_value': new_text[new_start:new_end],
        })
    return remapped


# Quick smoke tests for the mappings
for mode in OBFUSCATION_TYPES:
    sample_text = 'Email me at A@example.com or call 5551234.'
    new_text, mapping = transform_text(sample_text, mode)
    spans = [{'start_position': 12, 'end_position': 23}]  # 'A@example.com'
    remapped = remap_spans(spans, mapping, new_text)
    print(mode, '->', new_text[remapped[0]['start_position']:remapped[0]['end_position']])


None -> A@example.c
1-space ->  A @ e x a m p l e . c
5-space -> A@e xampl e.c
textualization -> A at example dot c


## Build augmented dataset
Duplicate each record for every obfuscation type and persist the result. Metadata gains an `obfuscation_type` flag.


In [10]:
augmented_records: List[Dict[str, object]] = []
for record in base_records:
    for mode in OBFUSCATION_TYPES:
        new_record = deepcopy(record)
        new_text, mapping = transform_text(record['full_text'], mode)
        new_record['full_text'] = new_text
        new_record['spans'] = remap_spans(record.get('spans', []), mapping, new_text)

        metadata = new_record.get('metadata') or {}
        if not isinstance(metadata, dict):
            metadata = {'original_metadata': metadata}
        metadata['obfuscation_type'] = mode
        new_record['metadata'] = metadata

        augmented_records.append(new_record)

print(f'Augmented record count: {len(augmented_records)}')


Augmented record count: 6000


In [11]:
# Preview a few transformed spans to ensure alignment
preview_df = pd.DataFrame([
    {
        'obfuscation_type': rec['metadata'].get('obfuscation_type'),
        'full_text': rec['full_text'][:80] + ('...' if len(rec['full_text']) > 80 else ''),
        'spans': rec['spans']
    }
    for rec in augmented_records[:5]
])
preview_df


Unnamed: 0,obfuscation_type,full_text,spans
0,,The address of Persint is 6750 Koskikatu 25 Ap...,"[{'entity_type': 'STREET_ADDRESS', 'entity_val..."
1,1-space,T h e a d d r e s s o f P e r s i n t ...,"[{'entity_type': 'STREET_ADDRESS', 'entity_val..."
2,5-space,The a ddres s of Persi nt is 6750 Kosk ikat...,"[{'entity_type': 'STREET_ADDRESS', 'entity_val..."
3,textualization,The address of Persint is six seven five zero ...,"[{'entity_type': 'STREET_ADDRESS', 'entity_val..."
4,,What are my options?,[]


In [14]:
# # Persist results with a safety backup of the original file
if not BACKUP_DATASET.exists():
    SOURCE_DATASET.rename(BACKUP_DATASET)
    print(f'Backed up original dataset to {BACKUP_DATASET}')
else:
    print('Backup already exists:', BACKUP_DATASET)

with SOURCE_DATASET.open('w', encoding='utf-8') as f:
    json.dump(augmented_records, f, ensure_ascii=False, indent=2)

print('Wrote augmented dataset to', SOURCE_DATASET)


Backup already exists: E:\UCLA\UCLA\263\cs263-final-project\data\synth_dataset_v2.original.json
Wrote augmented dataset to E:\UCLA\UCLA\263\cs263-final-project\data\synth_dataset_v2.json


## Usage
1. Run all cells in this notebook to rebuild `synth_dataset_v2.json` with obfuscation variants.
2. Confirm the counts in the preview table and rerun evaluation notebooks. `dataset_name` remains `"synth_dataset_v2.json"`.
3. If needed, restore the original dataset from `synth_dataset_v2.original.json`.
