In [None]:
# -------------------- standard libs --------------------
import os, re, json, csv, time, pathlib, hashlib, textwrap, contextlib
from datetime import datetime
from typing import Optional

# -------------------- third‑party ----------------------
import requests
from bs4 import BeautifulSoup
from tqdm.auto import tqdm

# -------------------- paths & constants ----------------
# Input tree holding the source JSON documents.
JSON_ROOT = pathlib.Path("Legal_doc_test")
# Snippet files and the scrape log are written below this directory.
OUT_DIR = pathlib.Path("scraped_laws")
# Downloaded HTML is cached here (optional).
CACHE_DIR = OUT_DIR.joinpath(".cache")

# Characters of context kept on each side of a § hit.
CHARS_AROUND = 600
# Per-request timeout, in seconds.
REQUEST_TIMEOUT = 15
# Polite pause between consecutive requests, in seconds.
SLEEP_BETWEEN = 1.0

# Make sure both output locations exist before anything tries to write to them.
os.makedirs(OUT_DIR, exist_ok=True)
os.makedirs(CACHE_DIR, exist_ok=True)

print("JSON root :", JSON_ROOT.resolve())
print("Output dir:", OUT_DIR.resolve())


In [None]:
# User-Agent header values, in the order fetch_snippet_with_retry tries them;
# the first one that produces a usable page is used and the rest are skipped.
UA_STRINGS = [
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)",
    "curl/8.5.0",
    "python-requests/2.31"
]

# Single Session shared by every request made through resilient_get.
session = requests.Session()          # TCP reuse


def resilient_get(url: str, ua: str) -> str:
    """
    Download `url` with the given User‑Agent and return the response body.

    On HTTP 404/410 the request is retried once against Google's web‑cache
    endpoint, with `url` passed as the `q` query parameter.

    Parameters
    ----------
    url : address of the page to download.
    ua  : value sent in the ``User-Agent`` header.

    Returns
    -------
    str
        Decoded text of the first successful response.

    Raises
    ------
    requests.HTTPError
        For a 4xx/5xx status other than 404/410, or when the web‑cache
        fallback itself answers with an error status.
    requests.RequestException
        Any other failure raised by ``session.get`` (e.g. timeout).
    """
    hdr = {"User-Agent": ua, "Referer": "https://google.com"}
    try:
        r = session.get(url, timeout=REQUEST_TIMEOUT, headers=hdr)
        r.raise_for_status()
        return r.text
    except requests.HTTPError as e:
        # An HTTPError built without a response object carries no status code.
        status = e.response.status_code if e.response is not None else None
        if status not in (404, 410):
            raise

        # Let requests percent-encode the target: interpolating it raw into the
        # query string breaks as soon as `url` itself contains '?', '&' or '#'.
        # NOTE(review): Google's web cache may no longer be served, and cache
        # lookups conventionally used a `cache:` prefix — confirm this fallback
        # still returns anything useful.
        r = session.get(
            "https://webcache.googleusercontent.com/search",
            params={"q": url},
            timeout=REQUEST_TIMEOUT,
            headers=hdr,
        )
        r.raise_for_status()
        return r.text


def extract_relevant_section(text: str, citation: str,
                             context: Optional[int] = None) -> str:
    """
    Return ~`context` chars before/after the first place in `text` where the
    section number found in `citation` occurs.

    Recognised section ids are runs of digits (optionally followed by
    letters) joined by '.' or a hyphen, e.g. '151.002', '2‑306',
    '240.10b-5', '80a-35'.  Any common hyphen variant (ASCII '-', U+2010 to
    U+2013) in the citation matches any of those variants in `text`.  If the
    extended id is not found, the plain decimal id ('151.002' style) is
    tried as well.

    Parameters
    ----------
    text     : plain text to search.
    citation : free‑form legal citation containing the section number.
    context  : characters kept on each side of the hit; ``None`` (default)
               means the notebook‑level ``CHARS_AROUND``, read at call time
               so edits to the config cell take effect without re‑defining
               this function.

    Returns
    -------
    str
        The slice around the first hit, or the first ``2 * context``
        characters of `text` when no section id is found or matched.
    """
    if context is None:
        context = CHARS_AROUND

    hyphens = r"\-\u2010\u2011\u2012\u2013"      # body of a regex character class

    # Candidate ids, most specific first.
    candidates = []
    m_full = re.search(rf'\d+[A-Za-z]*(?:[.{hyphens}]\d+[A-Za-z]*)+', citation)
    if m_full:
        candidates.append(m_full.group(0))
    m_dec = re.search(r'(\d+\.\d+)', citation)    # decimal-only form, kept as second try
    if m_dec and m_dec.group(1) not in candidates:
        candidates.append(m_dec.group(1))

    for section_id in candidates:
        # Escape the literal pieces and let any hyphen variant stand in for another.
        pieces = re.split(f"[{hyphens}]", section_id)
        pattern = f"[{hyphens}]".join(re.escape(piece) for piece in pieces)
        m = re.search(rf'\b{pattern}\b', text, flags=re.IGNORECASE)
        if m:
            start = max(0, m.start() - context)
            end   = m.end() + context
            return text[start:end]

    # fallback – nothing matched
    return text[:context * 2]


def _html_to_snippet(html: str, citation: str) -> str:
    """Drop boilerplate tags from `html`, flatten it to text, slice around `citation`."""
    soup = BeautifulSoup(html, "html.parser")
    for tag in soup(["script", "style", "nav", "footer", "header"]):
        tag.decompose()
    text = re.sub(r"\s+", " ", soup.get_text(" ", strip=True))
    text = text.replace("\ufeff", "")            # BOM
    return extract_relevant_section(text, citation)


def fetch_snippet_with_retry(url: str, citation: str) -> Optional[str]:
    """
    Fetch `url`, clean HTML → plain text → isolate the section named in
    `citation`, and return that snippet.

    Lookup order:
      1. any cached copy of the page (one cache file per UA/url pair under
         CACHE_DIR) — checked for *all* UAs first, so a page that was only
         obtainable with a later UA does not re‑send the failing requests
         for the earlier ones;
      2. a live download, trying each entry of UA_STRINGS in order.

    Parameters
    ----------
    url      : page to download.
    citation : citation string handed to `extract_relevant_section`.

    Returns
    -------
    Optional[str]
        The snippet, or ``None`` when every cached copy and every download
        attempt failed.  Failures are deliberately swallowed (best effort);
        this function does not raise for network or parsing errors.
    """
    cache_files = [
        CACHE_DIR / hashlib.sha1(f"{ua}|{url}".encode()).hexdigest()
        for ua in UA_STRINGS
    ]

    # ---------- pass 1: cache only, no network ----------
    for cache_file in cache_files:
        if not cache_file.exists():
            continue
        try:
            return _html_to_snippet(cache_file.read_text(encoding="utf-8"), citation)
        except Exception:
            continue    # unreadable / unparsable cache entry – try the next one

    # ---------- pass 2: live download, one UA at a time ----------
    for ua, cache_file in zip(UA_STRINGS, cache_files):
        try:
            html = resilient_get(url, ua)
        except Exception:
            continue    # try next UA

        # A failed cache write must not throw away a page we already have.
        try:
            cache_file.write_text(html, encoding="utf-8")
        except OSError:
            pass

        try:
            return _html_to_snippet(html, citation)
        except Exception:
            continue    # try next UA

    return None          # all attempts failed


Searching for JSONs under: /Users/mannanxanand/Legal-Document-Discrepancy-Benchmark-Dataset/Legal_doc_test


In [None]:
def all_json_files(root: pathlib.Path):
    """Recursively yield each *.json path below `root` that is not inside a
    `.ipynb_checkpoints` directory."""
    checkpoint_dir = ".ipynb_checkpoints"
    yield from (
        candidate
        for candidate in root.rglob("*.json")
        if checkpoint_dir not in candidate.parts
    )


def target_path(json_path: pathlib.Path) -> pathlib.Path:
    """Map a source JSON under JSON_ROOT to its counterpart under OUT_DIR,
    keeping the sub-directory layout and swapping the final suffix for
    `.snippet.json`."""
    relative = json_path.relative_to(JSON_ROOT)
    mirrored = relative.with_suffix(".snippet.json")
    return OUT_DIR / mirrored


# ------------- TSV log -------------
# The log is opened in append mode so repeated runs add to the same file.
log_path = OUT_DIR.joinpath("scrape_log.tsv")
log_fh = open(log_path, mode="a", encoding="utf-8", newline="")
log = csv.writer(log_fh, delimiter="\t")

# A position of 0 right after opening means the file is empty: emit the header once.
if not log_fh.tell():
    log.writerow(["timestamp", "json_file", "law_url", "status", "chars"])

def note(jfile, url, status, chars=0):
    """
    Append one row to the TSV scrape log and echo the outcome to stdout.

    Parameters
    ----------
    jfile  : str – name of the JSON file being processed (formatted with a
             width‑80 string spec, so it must be a ``str``).
    url    : law URL the row refers to (or a placeholder).
    status : short outcome label, e.g. "OK", "EMPTY", "ERROR: …".
    chars  : length of the scraped snippet; 0 when nothing was scraped.

    The timestamp column is the current UTC time as a naive ISO‑8601 string.
    """
    # Local import: the notebook's import cell only brings in `datetime`.
    from datetime import timezone

    # datetime.utcnow() is deprecated; take an aware "now" and drop tzinfo so
    # the column keeps the same naive-UTC format as rows already in the log.
    stamp = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()
    log.writerow([stamp, jfile, url, status, chars])
    log_fh.flush()      # keep rows on disk even if a long run is interrupted
    print(f"{jfile:80s}  ->  {status}")


In [None]:
# Main scraping pass: for every source JSON, scrape a snippet for each
# perturbation's law URL and write an annotated copy under OUT_DIR.
# `errors` collects (path, message) tuples for files that raised.
errors = []

for jpath in tqdm(list(all_json_files(JSON_ROOT)), desc="scraping"):
    out_path = target_path(jpath)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    if out_path.exists():
        continue                               # already processed on an earlier run

    try:
        data = json.loads(jpath.read_text(encoding="utf-8"))

        # assumes the top-level JSON is a list whose first element holds a
        # "perturbation" list of dicts — TODO confirm against the dataset schema
        for pert in data[0]["perturbation"]:
            url  = (pert.get("law_url") or "").strip()
            cite = (pert.get("law_citation") or "").strip()

            # missing URL or an "n/a…" placeholder: record None and move on (no request, no sleep)
            if not url or url.lower().startswith("n/a"):
                pert["scraped_snippet"] = None
                note(jpath.name, url or "∅", "SKIPPED (no url)")
                continue

            snippet = fetch_snippet_with_retry(url, cite)
            pert["scraped_snippet"] = snippet
            # "EMPTY" covers both a failed fetch (None) and an empty string
            status = "OK" if snippet else "EMPTY"
            note(jpath.name, url, status, len(snippet or ""))

            time.sleep(SLEEP_BETWEEN)

        # written only after every perturbation was handled, so a file that
        # raised part-way has no output and is picked up again on the next run
        out_path.write_text(json.dumps(data, indent=2, ensure_ascii=False),
                            encoding="utf-8")

    except Exception as exc:
        # per-file failure: remember it, log it, continue with the next file
        errors.append((jpath, str(exc)))
        note(jpath.name, "⟨parsing⟩", f"ERROR: {exc}")

# note() writes through this handle, so the cell that opens the log must be
# re-run before this cell can be executed again
log_fh.close()
print(f"\nFinished.  {len(errors)} errors logged → {log_path.name}")


Testing on: ambiguity_legal/perturbed_DOMINIADVISORTRUST_02_18_2005-EX-99.(H)(2)-SPONSORSHIPAGREEMENT.txt.json
  ⚠️  request failed: 403 Client Error: Forbidden for url: https://www.sec.gov/divisions/investment/whistleblower/ia-section-36a

— Scraped snippet —

⟨nothing scraped⟩


In [None]:
# display one example snippet nicely
# sorted() makes the pick reproducible across runs; the None default keeps the
# cell from raising StopIteration when nothing has been scraped yet.
example_json = next(iter(sorted(OUT_DIR.rglob("*.snippet.json"))), None)

if example_json is None:
    print(f"No *.snippet.json files under {OUT_DIR} – run the scraping cell first.")
else:
    example = json.loads(example_json.read_text(encoding="utf-8"))[0]
    perturbations = example.get("perturbation") or []

    print("→", example_json.relative_to(OUT_DIR))
    if not perturbations:
        print("⟨no perturbations in this file⟩")
    else:
        first = perturbations[0]
        # .get(): the scraping loop treats both keys as optional as well
        print("Law URL   :", first.get("law_url"))
        print("Citation  :", first.get("law_citation"))
        print("\nSnippet   :\n")
        print(textwrap.fill(first.get("scraped_snippet") or "⟨nothing⟩", width=90))


scraping:   0%|          | 0/101 [00:00<?, ?it/s]

ambiguity_legal/perturbed_DOMINIADVISORTRUST_02_18_2005-EX-99.(H)(2)-SPONSORSHIPAGREEMENT.txt.json  ->  ERROR: HTTPError – 403 Client Error: Forbidden for url: https://www.sec.gov/divisions/investment/whistleblower/ia-section-36a
ambiguity_legal/perturbed_DOMINIADVISORTRUST_02_18_2005-EX-99.(H)(2)-SPONSORSHIPAGREEMENT.txt.json  ->  ERROR: HTTPError – 404 Client Error: Not Found for url: https://www.law.cornell.edu/rules/cfr/17/240.10b-5
ambiguity_legal/perturbed_DOMINIADVISORTRUST_02_18_2005-EX-99.(H)(2)-SPONSORSHIPAGREEMENT.txt.json  ->  OK
ambiguity_legal/perturbed_IntegrityFunds_20200121_485BPOS_EX-99.EUNDRCONTR_11948727_EX-99.EUNDRCONTR_ServiceAgreement.txt.json  ->  OK
ambiguity_legal/perturbed_IntegrityFunds_20200121_485BPOS_EX-99.EUNDRCONTR_11948727_EX-99.EUNDRCONTR_ServiceAgreement.txt.json  ->  OK
ambiguity_legal/perturbed_IntegrityFunds_20200121_485BPOS_EX-99.EUNDRCONTR_11948727_EX-99.EUNDRCONTR_ServiceAgreement.txt.json  ->  OK
ambiguity_legal/perturbed_VEONEER,INC_02_21_202