In [8]:
!pip install google-generativeai
!pip install pymupdf

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


In [9]:
import os
import re
import json
import csv
import time
import pathlib
import hashlib
import textwrap
import contextlib
from datetime import datetime
from typing import Optional

import requests
from bs4 import BeautifulSoup
from tqdm.auto import tqdm

import google.generativeai as genai

# ------------------ Configure Paths -------------------
JSON_ROOT     = pathlib.Path("Legal_doc_test")   # source JSONs
OUT_DIR       = pathlib.Path("scraped_laws")     # where snippets + log go
CACHE_DIR     = OUT_DIR / ".cache"               # html cache (optional)

CHARS_AROUND    = 2000    # context around § hit
REQUEST_TIMEOUT  = 15      # seconds
SLEEP_BETWEEN    = 1.0     # polite pause
SLEEP_BETWEEN_AI = 1.0     # polite pause after Gemini calls

# ------------- Gemini / API Key Setup ---------------
genai.configure(api_key=api_key)

# ------------- Ensure Output Dirs Exist -------------
OUT_DIR.mkdir(parents=True, exist_ok=True)
CACHE_DIR.mkdir(parents=True, exist_ok=True)

print("JSON root :", JSON_ROOT.resolve())
print("Output dir:", OUT_DIR.resolve())

# User-Agent strings to try
UA_STRINGS = [
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)",
    "curl/8.5.0",
    "python-requests/2.31"
]

session = requests.Session()  # for TCP reuse


JSON root : /Users/mannanxanand/Legal-Document-Discrepancy-Benchmark-Dataset/Legal_doc_test
Output dir: /Users/mannanxanand/Legal-Document-Discrepancy-Benchmark-Dataset/scraped_laws


In [10]:
# -------------------------------------------------------
# Logging Setup:
#  1) scrape_log.tsv: overall scraping logs
#  2) broken_links.tsv: logs unreachable/broken links
# -------------------------------------------------------

log_path         = OUT_DIR / "scrape_log.tsv"
broken_links_path = OUT_DIR / "broken_links.tsv"

log_fh   = open(log_path, "a", newline="", encoding="utf-8")
log      = csv.writer(log_fh, delimiter="\t")

broken_fh = open(broken_links_path, "a", newline="", encoding="utf-8")
broken_log = csv.writer(broken_fh, delimiter="\t")

# Write headers if empty
if log_fh.tell() == 0:
    log.writerow(["timestamp", "json_file", "law_url", "status", "chars"])

if broken_fh.tell() == 0:
    broken_log.writerow(["timestamp", "json_file", "law_url", "reason"])


def note_scrape(jfile, url, status, chars=0):
    """Write a line in scrape_log.tsv."""
    log.writerow([datetime.utcnow().isoformat(), jfile, url, status, chars])
    print(f"{jfile:80s}  ->  {status}")


def note_broken(jfile, url, reason):
    """Write a line in broken_links.tsv."""
    broken_log.writerow([datetime.utcnow().isoformat(), jfile, url, reason])
    print(f"*** BROKEN LINK for {jfile} -> {url} :: {reason}")


In [11]:
def resilient_get(url: str, ua: str) -> str:
    """
    Download the URL with the given User-Agent.
    Falls back to Google web-cache if 404/410.
    Raises requests.HTTPError if still not successful.
    """
    hdr = {"User-Agent": ua, "Referer": "https://google.com"}
    try:
        r = session.get(url, timeout=REQUEST_TIMEOUT, headers=hdr)
        r.raise_for_status()
        return r.text
    except requests.HTTPError as e:
        # If 404/410, try Google WebCache
        if e.response.status_code in (404, 410):
            cache_url = f"https://webcache.googleusercontent.com/search?q={url}"
            r = session.get(cache_url, timeout=REQUEST_TIMEOUT, headers=hdr)
            r.raise_for_status()
            return r.text
        raise


def extract_relevant_section(text: str, citation: str,
                             context: int = CHARS_AROUND) -> str:
    """
    Return ~context chars before/after the first match
    for the section number found in citation (e.g. '151.002').
    Fallback: first 2*context chars if nothing matched.
    """
    # pull the numeric part (151.002, 2‑306, etc.)
    m_sec = re.search(r'(\d+\.\d+)', citation)
    section_id = m_sec.group(1) if m_sec else None

    if section_id:
        # match on the exact numeric portion
        m = re.search(rf'\b{re.escape(section_id)}\b', text)
        if m:
            start = max(0, m.start() - context)
            end   = m.end() + context
            return text[start:end]

    # fallback – nothing matched
    return text[:context * 2]


def fetch_snippet_with_retry(url: str, citation: str) -> Optional[str]:
    """
    Try each UA in UA_STRINGS. Clean HTML → plain text
    → isolate relevant section → return snippet (or None).
    """
    for ua in UA_STRINGS:
        try:
            # ---------- caching key -------------
            key = CACHE_DIR / hashlib.sha1(f"{ua}|{url}".encode()).hexdigest()
            if key.exists():
                html = key.read_text(encoding="utf-8")
            else:
                html = resilient_get(url, ua)
                key.write_text(html, encoding="utf-8")

            # ---------- clean HTML -------------
            soup = BeautifulSoup(html, "html.parser")
            for tag in soup(["script", "style", "nav", "footer", "header"]):
                tag.decompose()
            text = re.sub(r"\s+", " ", soup.get_text(" ", strip=True))
            text = text.replace("\ufeff", "")  # remove BOM

            # ---------- slice -------------
            snippet = extract_relevant_section(text, citation)
            return snippet

        except Exception:
            # try next UA
            continue

    return None  # all attempts failed


In [12]:
def evaluate_snippet_with_gemini(snippet: str, law_explanation: str) -> dict:
    """
    Use the Gemini model to judge snippet's 'accuracy_relevance' and explanation.
    The model will return JSON with fields:
        - accuracy_relevance: Low/Medium/High
        - model_explanation: some text
    If there's any error or parsing failure, degrade gracefully.
    """
    prompt = f"""You are given a 'scraped_snippet' from a legal code website and a 'law_explanation' that describes how it applies to a contract change.

Scraped snippet:
{snippet}
Law explanation:
Please assess how accurately or directly the snippet supports the law explanation. Output ONLY valid JSON with two fields:
{{
  "accuracy_relevance": "...",
  "model_explanation": "..."
}}

Where "accuracy_relevance" must be exactly one of "Low", "Medium", or "High". The "model_explanation" is a brief sentence or two explaining why.
Do not include any additional keys or text.
"""

    try:
        model = genai.GenerativeModel(model_name="gemini-2.0-flash")
        response = model.generate_content(prompt)
        content = response.text.strip()

        # Attempt to parse JSON
        try:
            parsed = json.loads(content)
        except json.JSONDecodeError:
            # If parsing fails, degrade
            parsed = {
                "accuracy_relevance": "Low",
                "model_explanation": (
                    "Failed to parse structured JSON from model. Defaulting to Low relevance."
                )
            }

        # Validate presence of required keys
        if "accuracy_relevance" not in parsed or "model_explanation" not in parsed:
            parsed = {
                "accuracy_relevance": "Low",
                "model_explanation": (
                    "Model response was missing required fields. Defaulting to Low."
                )
            }

        # Validate allowed values
        if parsed["accuracy_relevance"] not in ["Low", "Medium", "High"]:
            parsed["accuracy_relevance"] = "Low"
            parsed["model_explanation"] += (
                " [Note: Provided relevance was invalid; forced to 'Low']"
            )

        return parsed

    except Exception as e:
        # If there's a bigger error in the API call, degrade gracefully
        return {
            "accuracy_relevance": "Low",
            "model_explanation": f"Gemini call failed. {str(e)}"
        }

In [13]:
def all_json_files(root: pathlib.Path):
    """Yield every *.json under root, skipping .ipynb_checkpoints."""
    for p in root.rglob("*.json"):
        if ".ipynb_checkpoints" in p.parts:
            continue
        yield p


def target_path(json_path: pathlib.Path) -> pathlib.Path:
    """Mirror directory tree under OUT_DIR with *.snippet.json suffix."""
    return OUT_DIR / json_path.relative_to(JSON_ROOT).with_suffix(".snippet.json")

# We'll keep track of some errors
errors = []

# -------------------- MAIN WORKFLOW --------------------
for jpath in tqdm(list(all_json_files(JSON_ROOT)), desc="scraping"):
    out_path = target_path(jpath)
    out_path.parent.mkdir(parents=True, exist_ok=True)

    # If we've already processed this JSON, skip
    if out_path.exists():
        continue

    try:
        data = json.loads(jpath.read_text(encoding="utf-8"))
        # We'll store a flag if ANY link is broken => skip entire file
        skip_this_file = False

        # Each data file has a top-level list, and we assume data[0]["perturbation"]
        # is the relevant array of modifications.
        for pert in data[0]["perturbation"]:
            url  = (pert.get("law_url") or "").strip()
            cite = (pert.get("law_citation") or "").strip()

            # If no URL, skip snippet but do not skip entire file
            if not url or url.lower().startswith("n/a"):
                pert["scraped_snippet"]       = None
                pert["accuracy_relevance"]    = "Low"
                pert["model_explanation"]     = "No law_url provided."
                note_scrape(jpath.name, url or "∅", "SKIPPED (no url)")
                continue

            # Try fetching snippet
            snippet = fetch_snippet_with_retry(url, cite)
            if not snippet:
                # Mark as broken -> skip entire file
                skip_this_file = True
                note_broken(jpath.name, url, "Snippet was empty or fetch failed.")
                break  # break out of the loop of perturbations

            # If snippet is non-empty, proceed
            pert["scraped_snippet"] = snippet
            note_scrape(jpath.name, url, "OK", len(snippet))

            # Evaluate snippet with Gemini
            explanation = pert.get("law_explanation", "")
            gemini_result = evaluate_snippet_with_gemini(snippet, explanation)
            pert["accuracy_relevance"] = gemini_result["accuracy_relevance"]
            pert["model_explanation"]  = gemini_result["model_explanation"]

            # Polite pause
            time.sleep(SLEEP_BETWEEN_AI)

        if skip_this_file:
            # Do not write out any snippet JSON
            continue

        # If everything is good, write out the updated JSON
        out_path.write_text(json.dumps(data, indent=2, ensure_ascii=False),
                            encoding="utf-8")

    except Exception as exc:
        errors.append((jpath, str(exc)))
        note_scrape(jpath.name, "⟨parsing⟩", f"ERROR: {exc}")

# Close logs
log_fh.close()
broken_fh.close()

print(f"\nFinished. {len(errors)} errors logged → {log_path.name}")
if errors:
    print("Some errors occurred in parsing or processing:")
    for e in errors:
        print(" -", e)


scraping:   0%|          | 0/97 [00:00<?, ?it/s]

*** BROKEN LINK for perturbed_DOMINIADVISORTRUST_02_18_2005-EX-99.(H)(2)-SPONSORSHIPAGREEMENT.txt.json -> https://www.sec.gov/divisions/investment/whistleblower/ia-section-36a :: Snippet was empty or fetch failed.
perturbed_IntegrityFunds_20200121_485BPOS_EX-99.EUNDRCONTR_11948727_EX-99.EUNDRCONTR_ServiceAgreement.txt.json  ->  OK
perturbed_IntegrityFunds_20200121_485BPOS_EX-99.EUNDRCONTR_11948727_EX-99.EUNDRCONTR_ServiceAgreement.txt.json  ->  OK
perturbed_IntegrityFunds_20200121_485BPOS_EX-99.EUNDRCONTR_11948727_EX-99.EUNDRCONTR_ServiceAgreement.txt.json  ->  OK
perturbed_VEONEER,INC_02_21_2020-EX-10.11-JOINTVENTUREAGREEMENT.txt.json          ->  OK
perturbed_VEONEER,INC_02_21_2020-EX-10.11-JOINTVENTUREAGREEMENT.txt.json          ->  OK
*** BROKEN LINK for perturbed_ADAMSGOLFINC_03_21_2005-EX-10.17-ENDORSEMENTAGREEMENT.txt.json -> https://law.justia.com/codes/kansas/2022/chapter-84/article-1/section-84-1-103/ :: Snippet was empty or fetch failed.
perturbed_PfHospitalityGroupInc_20150

In [14]:
# Demonstrate one example snippet from out_dir
snippet_files = list(OUT_DIR.rglob("*.snippet.json"))
if snippet_files:
    example_json = snippet_files[0]
    example_data = json.loads(example_json.read_text(encoding="utf-8"))
    
    # Safeguard if structure is different
    if example_data and "perturbation" in example_data[0]:
        first = example_data[0]["perturbation"][0]
        print("\n=== EXAMPLE OUTPUT ===")
        print("→", example_json.relative_to(OUT_DIR))
        print("Law URL        :", first.get("law_url"))
        print("Citation       :", first.get("law_citation"))
        print("Accuracy       :", first.get("accuracy_relevance"))
        print("Model Expl.    :", first.get("model_explanation"))
        print("\nSnippet:\n")
        print(textwrap.fill(first.get("scraped_snippet") or "⟨nothing⟩", width=90))
    else:
        print("No valid 'perturbation' structure in", example_json.name)
else:
    print("No snippet.json files found in", OUT_DIR.name)



=== EXAMPLE OUTPUT ===
→ ambiguity_legal/perturbed_PfHospitalityGroupInc_20150923_10-12G_EX-10.1_9266710_EX-10.1_FranchiseAgreement3.txt.snippet.json
Law URL        : https://leginfo.legislature.ca.gov/faces/codes_displaySection.xhtml?lawCode=CIV&sectionNum=3426
Citation       : Cal. Civ. Code § 3426
Accuracy       : Low
Model Expl.    : Failed to parse structured JSON from model. Defaulting to Low relevance.

Snippet:

California Code, CIV 3426 skip to content home accessibility FAQ feedback sitemap login x
Quick Search: Bill Number Bill Keyword Home Bill Information California Law Publications
Other Resources My Subscriptions My Favorites California Law >> >> Code Section Code
Section Code: Select Code CONS BPC CIV CCP COM CORP EDC ELEC EVID FAM FIN FGC FAC GOV HNC
HSC INS LAB MVC PEN PROB PCC PRC PUC RTC SHC UIC VEH WAT WIC Article: Section: Code:
Select Code All BPC CIV CCP COM CORP EDC ELEC EVID FAM FIN FGC FAC GOV HNC HSC INS LAB MVC
PEN PROB PCC PRC PUC RTC SHC UIC VEH WAT WIC 