In [None]:
import os, json, re, time, pathlib, textwrap, itertools, contextlib
import csv, itertools
from datetime import datetime
from datetime import timezone
from typing import Optional

import requests
from bs4 import BeautifulSoup
from tqdm.auto import tqdm

In [None]:
# ─── EDIT HERE if you ever move the folders ────────────────────────────
JSON_ROOT   = pathlib.Path("Legal_doc_test")   # ← your master folder
OUT_DIR     = pathlib.Path("scraped_laws")     # snippets will be saved here
CHARS_AROUND = 1000                             # max characters kept per scraped snippet
# Per-request timeout in seconds, read by fetch_snippet and fetch_snippet_with_retry.
# It must be defined: without it every fetch fails with a NameError that the
# fetchers' error handling reports as an ordinary request failure.
REQUEST_TIMEOUT = 15
# ────────────────────────────────────────────────────────────────────────

# Make sure the output folder exists before any cell tries to write into it.
OUT_DIR.mkdir(parents=True, exist_ok=True)
print("Searching for JSONs under:", JSON_ROOT.resolve())


In [None]:
def fetch_snippet(url: str, max_chars: int = 10000) -> Optional[str]:
    """Download `url` and return the first `max_chars` characters of its visible text.

    Args:
        url: Address of the page to fetch.
        max_chars: Upper bound on the length of the returned text.

    Returns:
        The whitespace-collapsed page text truncated to `max_chars`, or None when
        the request fails (network error or HTTP error status) or the page
        yields no text at all.

    Only `requests.RequestException` counts as a failed request. Anything else
    (for example a NameError caused by a missing REQUEST_TIMEOUT) propagates, so
    configuration mistakes are not silently reported as "request failed".
    """
    try:
        r = requests.get(
            url,
            timeout=REQUEST_TIMEOUT,
            headers={"User-Agent": "Mozilla/5.0"}
        )
        r.raise_for_status()
    except requests.RequestException as exc:   # network or HTTP error
        print(f"  ⚠️  request failed: {exc}")
        return None

    soup = BeautifulSoup(r.text, "html.parser")

    # strip non‑content tags so only the page's readable text remains
    for tag in soup(["script", "style", "nav", "footer", "header"]):
        tag.decompose()

    text = soup.get_text(" ", strip=True)
    text = re.sub(r"\s+", " ", text)        # collapse whitespace
    return text[:max_chars] or None         # empty string -> None


In [None]:
# Smoke test: scrape the first law URL of the first JSON we can find.
test_json = next(JSON_ROOT.rglob("*.json"), None)
if test_json is None:
    # A bare next() would raise a message-less StopIteration here.
    raise FileNotFoundError(f"No *.json files found under {JSON_ROOT.resolve()}")
print("Testing on:", test_json.relative_to(JSON_ROOT))

with open(test_json, encoding="utf-8") as fh:
    sample = json.load(fh)[0]               # JSONs are wrapped in a list
first_law = sample["perturbation"][0]

# Same tolerant URL handling as the main scraping loop: a missing, empty or
# "n/a" law_url means there is nothing to fetch.
test_url = (first_law.get("law_url") or "").strip()
if test_url and not test_url.lower().startswith("n/a"):
    snippet = fetch_snippet(test_url, CHARS_AROUND)
else:
    snippet = None
print("\n— Scraped snippet —\n")
print(textwrap.fill(snippet or "⟨nothing scraped⟩", width=100))


In [None]:
def all_json_files(root: pathlib.Path):
    """Lazily yield every path below `root` (searched recursively) matching `*.json`."""
    for json_path in root.rglob("*.json"):
        yield json_path

def target_path(json_path: pathlib.Path) -> pathlib.Path:
    """Return where the scraped copy of `json_path` is stored.

    The location of `json_path` relative to JSON_ROOT is mirrored under OUT_DIR,
    with the file's last suffix replaced by `.snippet.json`.
    """
    relative = json_path.relative_to(JSON_ROOT)
    renamed = relative.with_suffix(".snippet.json")
    return OUT_DIR / renamed

# Failures collected during the run as (json_path, url, message) tuples.
errors = []

# One append-mode handle for the whole run, so successive runs extend the same TSV log.
log_path = OUT_DIR / "scrape_log.tsv"
log_fh   = log_path.open("a", newline="", encoding="utf-8")
log      = csv.writer(log_fh, delimiter="\t")

# Position 0 right after opening means the file is new or empty: write the header row.
if not log_fh.tell():
    log.writerow(["timestamp", "json_file", "law_url", "status"])

def note(json_file, url, status):
    """Append one row to the TSV scrape log and echo the outcome to the notebook.

    Args:
        json_file: The JSON file being processed (callers pass its path relative to JSON_ROOT).
        url: The law URL that was attempted, or a placeholder when there was none.
        status: Outcome text such as "OK", "EMPTY" or an "ERROR: ..." message.
    """
    # Naive UTC timestamp: same text format as the rows already in the append-only
    # log, obtained without the deprecated datetime.utcnow().
    stamp = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()
    log.writerow([stamp, json_file, url, status])
    log_fh.flush()   # keep the on-disk log current in case the run is interrupted
    print(f"{json_file}  ->  {status}")

# Shared HTTP session: fetch_snippet_with_retry sends every request through it.
session = requests.Session()    # reuse TCP connection

def fetch_snippet_with_retry(url, max_chars=800):
    """Fetch `url` through the shared session and return up to `max_chars` of visible text.

    The request is attempted once with a browser-like User-Agent and, if anything
    in that attempt raises, once more with a curl-like one. Returns the
    whitespace-collapsed text (None when it is empty); if both attempts fail, the
    exception from the second attempt is raised.
    """
    user_agents = (
        "Mozilla/5.0",    # normal browser UA
        "curl/8.5.0",     # plain CLI UA
    )
    for agent in user_agents:
        try:
            response = session.get(
                url,
                timeout=REQUEST_TIMEOUT,
                headers={"User-Agent": agent},
            )
            response.raise_for_status()

            # Got a page: drop non-content tags, then flatten what is left to text.
            page = BeautifulSoup(response.text, "html.parser")
            for node in page(["script", "style", "nav", "footer", "header"]):
                node.decompose()
            visible = page.get_text(" ", strip=True)
            snippet = re.sub(r"\s+", " ", visible)[:max_chars]
            return snippet if snippet else None
        except Exception as err:
            failure = err   # remember it and move on to the next User-Agent
    raise failure           # every attempt failed

# ------------------------------------------------------------------------
# Main pass: for every input JSON that has no output file yet, scrape the
# law_url of each perturbation and save an enriched copy under OUT_DIR.
#
# NOTE: the output file is written even when some URLs failed (their
# scraped_snippet is None), and existing outputs are skipped on later runs,
# so failed URLs are only retried after deleting the corresponding output.
try:
    for jpath in tqdm(list(all_json_files(JSON_ROOT)), desc="scraping"):
        out_path = target_path(jpath)
        out_path.parent.mkdir(parents=True, exist_ok=True)
        if out_path.exists():
            continue     # already scraped

        rel_path = jpath.relative_to(JSON_ROOT)   # how this file is named in the log

        try:
            data = json.loads(jpath.read_text(encoding="utf-8"))
            for pert in data[0]["perturbation"]:
                url = (pert.get("law_url") or "").strip()
                if not url or url.lower().startswith("n/a"):
                    pert["scraped_snippet"] = None
                    note(rel_path, url or "∅", "SKIPPED (no url)")
                    continue
                try:
                    pert["scraped_snippet"] = fetch_snippet_with_retry(url, CHARS_AROUND)
                    status = "OK" if pert["scraped_snippet"] else "EMPTY"
                    note(rel_path, url, status)
                except Exception as exc:
                    # One bad URL must not abort the file: record it and carry on.
                    pert["scraped_snippet"] = None
                    status = f"ERROR: {type(exc).__name__} – {exc}"
                    note(rel_path, url, status)
                    errors.append((jpath, url, str(exc)))
                time.sleep(0.5)  # be polite

            out_path.write_text(json.dumps(data, indent=2, ensure_ascii=False),
                                encoding="utf-8")

        except Exception as exc:
            # Unreadable or unexpectedly shaped JSON (or a failed write): log and move on.
            errors.append((jpath, "⟨parsing⟩", str(exc)))
            note(rel_path, "⟨parsing⟩",
                 f"ERROR: {type(exc).__name__} – {exc}")
finally:
    # Close (and thereby flush) the log even when the run is interrupted.
    log_fh.close()

print(f"\nFinished.  {len(errors)} issues logged to {log_path.relative_to(OUT_DIR)}")
