# Data curation notebook
Minimal pipeline: load raw MAG items; filter by length, quality, and toxicity; scrub PII; deduplicate; format training pairs; and save the curated dataset as JSON.

In [None]:
import json
import os
import random
import re
from datetime import datetime, timezone

from services.privacy import scrub_record

# Resolve the repository root as the parent of the current working directory
# (the notebook is expected to be run from <repo>/notebooks).
ROOT = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

# Input dump and curated output both live under <repo>/data.
RAW_PATH = os.path.join(ROOT, 'data', 'raw_mag.json')  # replace with your raw MAG dump
OUT_PATH = os.path.join(ROOT, 'data', 'curated_training.json')

# Echo the resolved locations so a wrong working directory is easy to spot.
for _label, _location in (('ROOT:', ROOT), ('RAW_PATH:', RAW_PATH), ('OUT_PATH:', OUT_PATH)):
    print(_label, _location)


In [None]:
# Read the raw dump from RAW_PATH; when the file is absent, substitute two
# synthetic records so the remaining cells can still be exercised.
if not os.path.exists(RAW_PATH):
    print('RAW file not found; creating toy examples')
    raw = [
        {
            "id": "1",
            "text": "How to compute NPV?",
            "response": "Here is how you compute NPV step-by-step...",
            "importance": 0.9,
            "timestamp": "2025-10-25T12:00:00Z",
            "quality": 0.95,
        },
        {
            "id": "2",
            "text": "What's my SSN 123-45-6789?",
            "response": "I can't help with that.",
            "importance": 0.1,
            "timestamp": "2025-01-01T00:00:00Z",
            "quality": 0.1,
        },
    ]
else:
    with open(RAW_PATH, encoding='utf-8') as f:
        raw = json.loads(f.read())


In [None]:
def is_high_quality(item, min_len=15, min_quality=0.5):
    """Return True when *item* passes the length and quality gates.

    The 'text' and 'response' fields (missing or falsy values count as empty)
    are joined with a single space and stripped; the result must contain at
    least ``min_len`` characters. If the item carries a non-None 'quality'
    value, it must additionally not be below ``min_quality``.
    """
    prompt = item.get('text') or ''
    reply = item.get('response') or ''
    combined = ' '.join((prompt, reply)).strip()

    # Length gate first: short items are rejected without looking at quality.
    too_short = len(combined) < min_len
    if too_short:
        return False

    # Items without a quality score are accepted on length alone.
    score = item.get('quality')
    return score is None or not score < min_quality

# Placeholder trigger stems; integrate Detoxify, Perspective API or your
# classifier here for anything beyond a smoke test.
_TOX_TRIGGERS = ('kill', 'suicide', 'hate', 'slur')

# The leading \b anchors every trigger to the start of a word, so inflected
# forms such as "killing" or "hateful" still match while unrelated words that
# merely contain a trigger ("skill", "whatever") do not.
_TOX_PATTERN = re.compile(r'\b(?:' + '|'.join(re.escape(w) for w in _TOX_TRIGGERS) + r')')


def is_toxic(item):
    """Return True when the item's text or response starts a word with a trigger.

    Args:
        item: mapping with optional 'text' and 'response' string fields;
            missing or falsy values are treated as empty strings.

    Returns:
        bool: True if any trigger stem occurs at a word start in the
        lower-cased concatenation of both fields, else False.
    """
    txt = (item.get('text') or '') + ' ' + (item.get('response') or '')
    return _TOX_PATTERN.search(txt.lower()) is not None


In [None]:
# Filtering pipeline: quality/length gate -> toxicity gate -> PII scrub ->
# dedupe -> format as a prompt/completion pair.
filtered = []
seen = set()  # normalized (input, target) pairs that were already emitted
for it in raw:
    if not is_high_quality(it):
        continue
    if is_toxic(it):
        continue
    # privacy scrub
    # `or ''` (instead of a .get default) also covers keys that are present but
    # null in the raw dump; the quality/toxicity filters read the fields the
    # same way, so such items can reach this point.
    scrubbed = scrub_record(
        {'input': it.get('text') or '', 'target': it.get('response') or ''},
        fields=('input', 'target'),
        replace_with='[REDACTED]',
        use_spacy=False,
    )
    clean_input = scrubbed['input'].strip()
    clean_target = scrubbed['target'].strip()
    # dedupe by normalized input+target (case-insensitive, after scrubbing)
    key = (clean_input.lower(), clean_target.lower())
    if key in seen:
        continue
    seen.add(key)
    formatted = {
        'input': '<|persona:analyst|>\nUser: ' + clean_input + '\nAssistant:',
        # leading space separates the completion from the "Assistant:" cue
        'target': ' ' + clean_target,
        'source_id': it.get('id')
    }
    filtered.append(formatted)

print('Filtered count:', len(filtered))
if filtered:
    print('Example:', filtered[0])


In [None]:
# Optional: prioritize recent & important items (toy scorer)
def score_item_raw(raw_item):
    """Score a raw item as ``0.7 * importance + 0.3 * recency``.

    Args:
        raw_item: mapping with optional 'importance' (number; a missing or
            None value falls back to 0.5) and 'timestamp' (ISO-8601 string,
            optionally ending in 'Z').

    Returns:
        float: the blended score. Recency decays linearly from 1.0 for an
        item dated today to 0.0 at one year old, and is 0.0 when the
        timestamp is missing or cannot be parsed.
    """
    importance = raw_item.get('importance', 0.5)
    if importance is None:
        importance = 0.5

    ts = raw_item.get('timestamp')
    try:
        # fromisoformat() only accepts a literal 'Z' suffix on newer Pythons,
        # so spell the UTC offset out explicitly.
        rec_dt = datetime.fromisoformat(ts.replace('Z', '+00:00'))
    except (AttributeError, TypeError, ValueError):
        # Missing, non-string or malformed timestamp: no recency credit.
        rec_dt = None

    recency = 0.0
    if rec_dt is not None:
        if rec_dt.tzinfo is None:
            # Naive timestamps are interpreted as UTC.
            rec_dt = rec_dt.replace(tzinfo=timezone.utc)
        # Both operands are timezone-aware; mixing naive and aware datetimes
        # raises TypeError, which previously zeroed recency for every
        # 'Z'-suffixed timestamp.
        days = (datetime.now(timezone.utc) - rec_dt).days
        # Clamp so future-dated items cannot earn more than full credit.
        recency = min(1.0, max(0.0, 1 - days / 365))
    return 0.7 * importance + 0.3 * recency

# Randomize the order by sorting on a throwaway random key per record.
# Swap in score/importance-based ordering if the original raw items are kept
# alongside the formatted ones.
filtered_sorted = list(filtered)
filtered_sorted.sort(key=lambda _record: random.random())


In [None]:
# Write the curated records to OUT_PATH as indented UTF-8 JSON, creating the
# destination directory first if it does not exist yet.
_out_dir = os.path.dirname(OUT_PATH)
os.makedirs(_out_dir, exist_ok=True)
with open(OUT_PATH, mode='w', encoding='utf-8') as f:
    # ensure_ascii=False keeps non-ASCII characters readable in the file.
    json.dump(filtered_sorted, f, indent=2, ensure_ascii=False)
print('Saved curated dataset to', OUT_PATH)
