In [None]:
import sys, subprocess
if "google.colab" in sys.modules:
    # Only Colab needs a runtime install; local environments are expected to
    # have their dependencies provisioned already.
    # Invoke pip through the running interpreter so the packages land in the
    # kernel's own environment rather than whichever `pip` is first on PATH.
    install_cmd = [
        sys.executable, "-m", "pip", "install", "-q",
        "pandas", "numpy", "scikit-learn", "requests", "pydantic", "jsonschema",
    ]
    install_result = subprocess.run(install_cmd)
    # Surface a failed install instead of ignoring it; do not raise, because
    # the packages may already be available in the runtime.
    if install_result.returncode != 0:
        print(
            f"pip install exited with status {install_result.returncode}; "
            "later imports may fail.",
            file=sys.stderr,
        )


# Ingest and Clean Synthetic Articles

**What this notebook does:** Loads synthetic article metadata and abstracts, applies lightweight normalization, and prepares text for downstream tasks.

**Why it matters:** Reproducible preprocessing is the first step in any NLP workflow. By standardizing casing and removing punctuation on synthetic data, you practice habits that transfer to governed, production data.

**How to use it:**
1. Generate data with `scripts/generate_synthetic_data.py`.
2. Run locally or in Colab (the first cell installs dependencies only when it detects Colab).
3. Execute cells top-to-bottom to inspect raw vs. cleaned text and adapt `clean_text` for your domain.

**Expected outcome:** A DataFrame with original and cleaned abstracts you can export or reuse in clustering, retrieval, or labeling workflows.

In [None]:
from pathlib import Path


def find_data_dir() -> Path:
    """Locate the project's ``data`` directory relative to the working directory.

    Checks ``<cwd>/data``, then ``<cwd>/../data``, then ``<cwd>/../../data``
    and returns the first one that contains ``sample_texts/articles_sample.csv``.

    Returns:
        Path to the first matching ``data`` directory.

    Raises:
        FileNotFoundError: If none of the three candidates contains the sample CSV.
    """
    marker = Path("sample_texts") / "articles_sample.csv"
    here = Path.cwd()
    # Nearest location first, so a notebook-local data dir wins over a parent's.
    for base in (here, here.parent, here.parent.parent):
        data_dir = base / "data"
        if (data_dir / marker).exists():
            return data_dir
    raise FileNotFoundError("data directory not found. Run scripts/generate_synthetic_data.py.")

# Resolve the data directory once so later cells can build paths from it.
# find_data_dir() raises FileNotFoundError here if the sample CSV is missing.
DATA_DIR = find_data_dir()


In [None]:
import pandas as pd

# Path to the sample CSV inside the resolved data directory.
articles_path = DATA_DIR.joinpath("sample_texts", "articles_sample.csv")

# Read the CSV into a DataFrame and report how many rows were loaded.
articles = pd.read_csv(articles_path)
print(f"Loaded {len(articles)} articles from {articles_path}")

# Last expression of the cell: show the first rows as rich output.
articles.head()


## Basic cleaning

In [None]:
import re

def clean_text(text: str) -> str:
    """Normalize text to lowercase ASCII alphanumeric tokens joined by single spaces.

    Steps: lowercase the input, replace every character that is not ``a-z``,
    ``0-9`` or whitespace with a space, then collapse whitespace runs and trim
    the ends. Note that non-ASCII letters (e.g. accented characters) are
    replaced by spaces as well; adapt the pattern if your domain needs them.

    Args:
        text: Raw text. ``None`` and float NaN (how pandas represents an empty
            CSV cell) are treated as missing.

    Returns:
        The cleaned string, or ``""`` when the input is missing.
    """
    # Missing abstracts arrive as float NaN from pd.read_csv; without this guard
    # `.lower()` raises AttributeError. `x != x` is true only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    lowered = text.lower()
    alnum_only = re.sub(r"[^a-z0-9\s]", " ", lowered)
    # split() with no arguments drops leading/trailing whitespace and collapses runs.
    return " ".join(alnum_only.split())

# Add a normalized copy of each abstract alongside the original column.
articles["cleaned"] = articles["abstract"].map(clean_text)

# Last expression of the cell: preview titles next to their cleaned abstracts.
articles.loc[:, ["title", "cleaned"]].head()
