# POC — International Agreements Database Mining

This notebook implements a **hybrid extraction pipeline** for noisy OCR legal agreements for the following project tasks:
- **6: Terms of validity** (start/end/duration)
- **8: Conditions for extending** (automatic vs mutual decision vs optional)
- **11: Evaluation of implementation** (review/audit/reporting) + basic attributes

POC Pipeline:

1. **Candidate retrieval** (regex triggers + neighbor expansion for recall under OCR noise)  
2. **Neural clause detection** (finetuned legal model if available, else MNLI zero-shot baseline)  
3. **NLI verification** (ContractNLI-style hypothesis test)  
4. **Normalization & structured output** (dates/durations, renewal type + notice, evaluation attributes)  



## 1) Setup & Utilities

### 1.1 Configuration and memory‑safe model loading
- Set device/CPU defaults, thresholds, and model names.
- Define shared helpers (random seed, caps, lightweight data structures).

In [1]:
# =========================
# 0) CONFIG + MEMORY-SAFE MODEL LOADING 
# =========================
import platform

# Device: GPU if available, else CPU.
# DEVICE is later handed to make_zsc(); 0 is reported as "GPU" and -1 as "CPU"
# by the status print at the end of this cell.
try:
    import torch
    DEVICE = 0 if torch.cuda.is_available() else -1
except Exception:
    # Any failure while importing or probing torch falls back to CPU.
    DEVICE = -1

# --- Switches (set these as needed) ---
# NOTE(review): the two FINETUNED_SEQCLS settings are not read anywhere in the
# visible part of this notebook; they look like placeholders - confirm.
USE_FINETUNED_SEQCLS = False
FINETUNED_SEQCLS_MODEL = None  # e.g. "nlpaueb/legal-bert-base-uncased" or your finetuned checkpoint

# NOTE(review): USE_ZEROSHOT_FALLBACK is not read in the visible code - confirm.
USE_ZEROSHOT_FALLBACK = True
# Default to SMALL on Windows/CPU to avoid paging-file OSError 1455
ZEROSHOT_MODEL = "typeform/distilbert-base-uncased-mnli"

# When True, a second zero-shot pipeline is loaded and used by nli_verify().
USE_NLI_VERIFIER = True
NLI_MODEL = "typeform/distilbert-base-uncased-mnli"

# Thresholds (tune on a dev set).
# Each extractor passes its threshold to build_evidence(), where it is the
# minimum verification score for an evidence item to count as verified.
THRESH_TEMPORAL = 0.65
THRESH_RENEWAL = 0.60
THRESH_EVAL = 0.60

# Retrieval
# NEIGHBOR_K: number of sentences on each side of a regex hit that
# retrieve_candidates_with_meta() adds as context.
NEIGHBOR_K = 2
# NOTE(review): MAX_CANDIDATES is not referenced in the visible code - confirm.
MAX_CANDIDATES = 50

# Optional HeidelTime hook (off by default)
# NOTE(review): no code in the visible notebook reads these three values.
USE_HEIDELTIME = False
HEIDELTIME_JAR = None
HEIDELTIME_CONFIG = None

# Keep backward-compatible names used later in the notebook
CLAUSE_MODEL = ZEROSHOT_MODEL

# -------------------------
# Memory-safe pipeline loader
# -------------------------
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification

def make_zsc(model_name: str, device: int):
    """
    Create a zero-shot-classification pipeline with safe fallbacks.

    Tries ``model_name`` first and then smaller MNLI models, so that a load
    failure (e.g. the Windows paging-file OSError) does not abort the notebook.

    Args:
        model_name: Preferred Hugging Face model id or local checkpoint path.
        device: Value forwarded as the pipeline ``device`` argument.

    Returns:
        The first pipeline that loads successfully.

    Raises:
        RuntimeError: If none of the candidate models can be loaded; the last
            underlying error is attached as the cause.
    """
    # De-duplicate while keeping priority order: with the notebook defaults the
    # requested model IS the first fallback, and retrying a model that has just
    # failed only wastes time and prints the same error twice.
    candidates = list(dict.fromkeys([
        model_name,
        "typeform/distilbert-base-uncased-mnli",
        "valhalla/distilbart-mnli-12-1",
    ]))
    last_err = None
    for name in candidates:
        try:
            print(f"Loading ZSC model: {name}")
            return pipeline(
                "zero-shot-classification",
                model=name,
                device=device,
                model_kwargs={"low_cpu_mem_usage": True},
            )
        except Exception as e:
            # Deliberately broad: any load failure should move on to the next model.
            last_err = e
            print(f"⚠️ Failed loading {name}: {e}")
    raise RuntimeError(f"Failed to load any ZSC model. Last error: {last_err}") from last_err

# Report which device the config above selected (DEVICE == 0 means GPU).
print("✅ Config loaded | Device:", "GPU" if DEVICE==0 else "CPU")


import os
os.environ["TRANSFORMERS_NO_TF"] = "1"
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"

import re
import json
import random
from dataclasses import dataclass, asdict
from typing import List, Dict, Optional, Tuple, Any

import pandas as pd
from tqdm import tqdm
import dateparser
from datetime import datetime
from dateutil.relativedelta import relativedelta

from transformers import pipeline


  from .autonotebook import tqdm as notebook_tqdm


✅ Config loaded | Device: CPU


In [2]:
# -----------------------------
# CONFIG (CPU)
# -----------------------------
# NOTE: this cell re-assigns several names already set in the first config cell
# (DEVICE, NEIGHBOR_K, the THRESH_* values, CLAUSE_MODEL, USE_NLI_VERIFIER,
# NLI_MODEL). When the cells run in order, the values below are the ones in effect.
DEVICE = -1  # forces CPU regardless of the CUDA probe in the first cell
RANDOM_SEED = 42  # default seed for run_poc(strategy="random")

# Retrieval caps
MAX_BLOCKS = 24   # max regex-matching blocks kept per retrieval call
MAX_SENTS  = 80   # max regex-matching sentences kept (before neighbor expansion)
NEIGHBOR_K = 2    # sentences of context added on each side of a hit

# Thresholds
THRESH_TEMPORAL = 0.65   # for validity (temporal clause)
THRESH_RENEWAL  = 0.60   # renewal type
THRESH_EVAL     = 0.60   # eval present

# When True, extractors may compute end_date from effective_date + duration.
DERIVE_END_DATE = True

# ------------- MODEL CHOICES -------------
# 1) Clause classifier (SOTA: LegalBERT fine-tuned token/sequence classification)
#    For CPU POC, use a smaller sequence classifier OR keep ZSC as fallback.
#
# Recommended: swap this to a LegalBERT-like finetune when you have one.
CLAUSE_MODEL = "typeform/distilbert-base-uncased-mnli"  # fallback verifier/classifier

# Optional NLI verifier (ContractNLI-like verification layer)
USE_NLI_VERIFIER = True
NLI_MODEL = "typeform/distilbert-base-uncased-mnli"  # swap to stronger MNLI later (DeBERTa MNLI)

# If you later have a real finetuned model for "temporal/renewal/eval", plug it here:
# SEQCLS_MODEL = "your-finetuned-legalbert-clause-classifier"
# NOTE(review): USE_SEQCLS / SEQCLS_MODEL are not read in the visible code - confirm.
USE_SEQCLS = False
SEQCLS_MODEL = None


### 1.2 Data I/O
- Load OCR text (plain text or JSON) into a single normalized string.
- Preserve any page markers used later for evidence and page‑level outputs.

In [3]:
def load_txt(path: str) -> str:
    """Return the full contents of a UTF-8 text file, dropping undecodable bytes."""
    with open(path, mode="r", encoding="utf-8", errors="ignore") as handle:
        contents = handle.read()
    return contents

def load_ocr_json_with_pages(path: str) -> str:
    """
    Flatten an OCR JSON export into plain text, one OCR line per text line.

    Every page is introduced by a ``[[PAGE=n]]`` marker (n counts from 1 in
    file order) and followed by an empty line, so that page provenance
    survives later processing. Words with a missing or empty ``value`` are
    skipped, and lines left with no words are omitted.
    """
    with open(path, "r", encoding="utf-8") as handle:
        document = json.load(handle)

    rendered = []
    for number, page in enumerate(document.get("pages", []), start=1):
        rendered.append(f"[[PAGE={number}]]")
        for block in page.get("blocks", []):
            for line in block.get("lines", []):
                values = (word.get("value") for word in line.get("words", []))
                tokens = [value for value in values if value]
                if not tokens:
                    continue
                rendered.append(" ".join(tokens))
        # blank separator line after each page
        rendered.append("")

    return "\n".join(rendered)

def normalize_text(text: str) -> str:
    """
    Light whitespace clean-up that leaves ``[[PAGE=n]]`` markers untouched.

    Form feeds become newlines, runs of spaces/tabs collapse to one space,
    three or more consecutive newlines collapse to two, and the result is
    stripped at both ends.
    """
    cleaned = text.replace("\x0c", "\n")
    substitutions = (
        (r"[ \t]+", " "),
        (r"\n{3,}", "\n\n"),
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()


### 1.3 Page‑aware segmentation
- Split OCR text into pages, then into blocks and sentences.
- Keep `(page, block_id, sent_id)` so every extracted field can cite evidence.

In [4]:
# Matches the "[[PAGE=n]]" provenance markers and captures the page number.
PAGE_MARKER = re.compile(r"\[\[PAGE=(\d+)\]\]")

def split_into_pages(text: str) -> List[Tuple[int, str]]:
    """
    Split marker-annotated text into ``(page_num, page_text)`` pairs.

    Each page's text runs from the end of its marker to the start of the next
    marker (or the end of the input) and is stripped. Anything before the
    first marker is not returned. Text without any marker is returned
    unchanged as a single page numbered 1.
    """
    markers = list(PAGE_MARKER.finditer(text))
    if not markers:
        return [(1, text)]

    # Each page stops where the following marker starts; the last one runs to the end.
    stops = [following.start() for following in markers[1:]]
    stops.append(len(text))

    return [
        (int(marker.group(1)), text[marker.end():stop].strip())
        for marker, stop in zip(markers, stops)
    ]

# OCR-friendly sentence split: don't rely on capitalization.
# Splits on whitespace that follows '.', '?' or '!', and on any run of newlines.
_SENT_SPLIT = re.compile(r"(?<=[\.\?\!])\s+|\n+")

@dataclass
class SentItem:
    """A sentence-level text span with its provenance."""
    sid: int    # running sentence id, assigned in document order starting at 0
    page: int   # page number the sentence was found on
    text: str   # sentence text (possibly several merged OCR fragments)

@dataclass
class BlockItem:
    """A paragraph-level text span with its provenance."""
    bid: int    # running block id, assigned in document order starting at 0
    page: int   # page number the block was found on
    text: str   # block text with whitespace collapsed

def split_sentences_with_meta(text: str, max_len: int = 1200) -> List[SentItem]:
    """
    Split text into page-tagged sentences with document-wide running ids.

    Within each page the text is cut with ``_SENT_SPLIT``; a fragment is glued
    onto the pending one while the pending text is shorter than 120 characters
    and the incoming fragment is shorter than 260, which compensates for
    OCR-induced over-splitting. Each resulting sentence is truncated to
    ``max_len`` characters.
    """
    sentences: List[SentItem] = []
    for page_num, page_text in split_into_pages(text):
        stripped = (piece.strip() for piece in _SENT_SPLIT.split(page_text) if piece)
        fragments = [piece for piece in stripped if piece]

        merged = []
        pending = ""
        for fragment in fragments:
            if not pending:
                pending = fragment
            elif len(pending) >= 120 or len(fragment) >= 260:
                # Pending text is long enough (or the newcomer is long): flush it.
                merged.append(pending.strip())
                pending = fragment
            else:
                pending = pending + " " + fragment
        tail = pending.strip()
        if tail:
            merged.append(tail)

        for sentence in merged:
            # len(sentences) is the next free id, so ids double as list indices.
            sentences.append(
                SentItem(sid=len(sentences), page=page_num, text=sentence[:max_len])
            )
    return sentences

def split_blocks_with_meta(text: str, max_len: int = 2500) -> List[BlockItem]:
    """
    Split text into page-tagged paragraph blocks with document-wide running ids.

    Paragraphs are separated by blank lines within each page and have their
    internal whitespace collapsed to single spaces. A paragraph longer than
    ``max_len`` is cut into consecutive fixed-size chunks.

    Args:
        text: Normalized document text, optionally containing page markers.
        max_len: Longest paragraph kept as a single block.

    Returns:
        Blocks in document order; ``bid`` counts from 0.
    """
    # Chunks used to be a hard-coded 2000 characters, which exceeded max_len
    # whenever a caller asked for a smaller limit. Capping at max_len keeps the
    # default behavior (2000-char chunks for max_len=2500) and honors smaller
    # limits. A non-positive max_len keeps the legacy chunk size.
    chunk_len = min(2000, max_len) if max_len > 0 else 2000

    blocks: List[BlockItem] = []
    bid = 0
    for page_num, page_text in split_into_pages(text):
        paras = [b.strip() for b in re.split(r"\n\s*\n+", page_text) if b and b.strip()]
        for para in paras:
            para = re.sub(r"\s+", " ", para).strip()
            if len(para) <= max_len:
                pieces = [para]
            else:
                pieces = [para[i:i + chunk_len] for i in range(0, len(para), chunk_len)]
            for piece in pieces:
                blocks.append(BlockItem(bid=bid, page=page_num, text=piece))
                bid += 1
    return blocks


## 2) Candidate retrieval (high‑recall)
- Use regex patterns to over‑generate candidate clauses for **validity**, **renewal**, and **evaluation**.
- Retrieve the top candidate sentences/blocks + neighboring context for downstream scoring.

In [6]:
# High-recall trigger patterns, one list per task.
# They are applied by any_match(), which lower-cases the text before searching,
# so every pattern is written in lower case.

# Task 6 - terms of validity (start / end / duration cues).
# Several entries (e.g. "until", "period of", "termination") are intentionally
# broad; precision comes from the later classification/verification stages.
VALIDITY_PATTERNS = [
    r"\bterm\s+of\s+validity\b",
    r"\bterm\s+of\s+(this\s+)?agreement\b",
    r"\beffective\s+date\b",
    r"\beffective\s+upon\b",
    r"\benter\s+into\s+force\b",
    r"\benter\s+into\s+effect\b",
    r"\bshall\s+remain\s+in\s+(force|effect)\b",
    r"\bremain\s+in\s+(force|effect)\b",
    r"\bfor\s+a\s+period\s+of\b",
    r"\bperiod\s+of\b",
    r"\buntil\b",
    r"\bexpires?\b",
    r"\bexpiration\b",
    r"\btermination\b",
    r"\bupon\s+signature\b",
    r"\bdate\s+of\s+signature\b",
]

# Task 8 - renewal / extension cues (including notice and mutual-agreement wording).
RENEWAL_PATTERNS = [
    r"\brenew(al|ed|s|ing)?\b",
    r"\bextend(ed|s|ing)?\b",
    r"\bextension\b",
    r"\bautomatic(ally)?\s+renew\b",
    r"\bshall\s+be\s+renewed\b",
    r"\bmay\s+be\s+renewed\b",
    r"\bnon-?renewal\b",
    r"\bunless\s+terminated\b",
    r"\bnotice\b",
    r"\botherwise\s+agreed\s+upon\b",
    r"\bby\s+mutual\s+agreement\b",
    r"\bmutually\s+agreed\b",
]

# Task 11 - evaluation / monitoring / reporting cues.
# The last pattern requires "implementation" followed later on the same line
# by one of the listed review words ('.' does not cross newlines here).
EVAL_PATTERNS = [
    r"\bevaluat(e|ion|ing)\b",
    r"\breview(s|ed|ing)?\b",
    r"\bassess(ment|es|ed|ing)?\b",
    r"\bmonitor(ing|ed|s)?\b",
    r"\baudit(s|ed|ing)?\b",
    r"\breport(s|ed|ing)?\b",
    r"\bprogress\s+report\b",
    r"\bimplementation\b.*\b(review|evaluation|assessment|monitor|audit|report)\b",
]

def any_match(text: str, patterns: List[str]) -> bool:
    """
    Return True if at least one regex in ``patterns`` is found in ``text``.

    The text is lower-cased before searching; the patterns are applied as
    given, so they are expected to be written in lower case.
    """
    lowered = text.lower()
    for pattern in patterns:
        if re.search(pattern, lowered):
            return True
    return False

# HeidelTime-inspired temporal extraction (rules baseline)

# "<month> <day>[,] <year>", month written in full or abbreviated,
# e.g. "January 5, 2020".
MONTHY_DATE = re.compile(
    r"\b(?:jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|jun(?:e)?|"
    r"jul(?:y)?|aug(?:ust)?|sep(?:tember)?|oct(?:ober)?|nov(?:ember)?|dec(?:ember)?)"
    r"\s+\d{1,2},?\s+\d{4}\b", re.IGNORECASE
)
# Same shape, but additionally allows an ordinal suffix on the day and loose
# spacing around the comma, e.g. "March 3rd , 2021". It overlaps with
# MONTHY_DATE; duplicates disappear in parse_dates_from_text(), which
# collects results in a set.
ORDINAL_MONTHY_DATE = re.compile(
    r"\b(?:jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|jun(?:e)?|"
    r"jul(?:y)?|aug(?:ust)?|sep(?:tember)?|oct(?:ober)?|nov(?:ember)?|dec(?:ember)?)"
    r"\s+\d{1,2}(?:st|nd|rd|th)?\s*,?\s*\d{4}\b", re.IGNORECASE
)
# "<day>[suffix] of <month> [in the year of] <year>", e.g. "4th of October 2019".
DAY_OF_MONTH_DATE = re.compile(
    r"\b\d{1,2}(?:st|nd|rd|th)?\s+of\s+"
    r"(?:jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|jun(?:e)?|"
    r"jul(?:y)?|aug(?:ust)?|sep(?:tember)?|oct(?:ober)?|nov(?:ember)?|dec(?:ember)?)"
    r"(?:\s+in\s+the\s+year\s+of)?\s+\d{4}\b", re.IGNORECASE
)
# Purely numeric dates with "/" or "-" separators and a 2- to 4-digit year.
# Day/month order is ambiguous here; parse_dates_from_text() tries MDY first.
NUM_DATE = re.compile(r"\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b")

# "<digits> [(<digits>)] <unit>": group 1 = count, group 2 = unit.
DURATION_NUMERIC = re.compile(r"\b(\d+)\s*(?:\(\s*\d+\s*\)\s*)?(years?|months?|days?)\b", re.IGNORECASE)
# Same as DURATION_NUMERIC but must be followed by "prior" or "before".
NOTICE_PERIOD = re.compile(r"\b(\d+)\s*(?:\(\s*\d+\s*\)\s*)?(days?|months?|years?)\s+(?:prior|before)\b", re.IGNORECASE)

# Spelled-out counts recognized in durations (one to twelve only).
NUMBER_WORDS = {
    "one": 1, "two": 2, "three": 3, "four": 4, "five": 5,
    "six": 6, "seven": 7, "eight": 8, "nine": 9, "ten": 10,
    "eleven": 11, "twelve": 12
}
# "<number word> <unit>": group 1 = word, group 2 = unit.
WORD_DURATION = re.compile(r"\b(" + "|".join(NUMBER_WORDS.keys()) + r")\s+(years?|months?|days?)\b", re.IGNORECASE)
# "<number word> (<digits>) <unit>": group 1 = word, group 2 = digits, group 3 = unit.
WORD_PAREN_DURATION = re.compile(r"\b(" + "|".join(NUMBER_WORDS.keys()) + r")\s*\(\s*(\d+)\s*\)\s*(years?|months?|days?)\b", re.IGNORECASE)

# Anchors for safe derivation
# ANCHOR_START: wording that signals when the agreement starts.
# ANCHOR_TERM: wording that signals its term or end.
# derive_end_date_safe() requires both to be present in the evidence text.
ANCHOR_START = re.compile(r"\beffective\s+date\b|\beffective\s+upon\b|\benter\s+into\s+(force|effect)\b|\bupon\s+signature\b", re.IGNORECASE)
ANCHOR_TERM  = re.compile(r"\bterm\b|\bfor\s+a\s+period\s+of\b|\bshall\s+remain\s+in\s+(force|effect)\b|\buntil\b|\bexpires?\b|\bexpiration\b", re.IGNORECASE)

def parse_dates_from_text(text: str) -> List[str]:
    """
    Return every date found in ``text`` as a unique ISO string, sorted ascending.

    Candidate spans come from the module's date regexes; each span is parsed
    with ``dateparser``, month-first, then day-first if that fails.
    Because the result is sorted, callers can treat the first element as the
    earliest date and the last as the latest.
    """
    iso_dates = set()
    for pattern in (MONTHY_DATE, ORDINAL_MONTHY_DATE, DAY_OF_MONTH_DATE, NUM_DATE):
        for span in pattern.findall(text):
            # MDY first, DMY as a second attempt (OCR/intl ambiguity)
            parsed = dateparser.parse(span, settings={"DATE_ORDER": "MDY"})
            if not parsed:
                parsed = dateparser.parse(span, settings={"DATE_ORDER": "DMY"})
            if not parsed:
                continue
            iso_dates.add(parsed.date().isoformat())
    # unique values in chronological order (ISO strings sort chronologically)
    return sorted(iso_dates)

def parse_duration(text: str) -> Optional[str]:
    """
    Extract one duration from ``text`` as ``"<count> <unit>"`` (unit lower-cased).

    Patterns are tried in a fixed priority order, and the first pattern that
    matches anywhere wins: numeric ("5 years"), then word plus parenthesized
    digits ("five (5) years", using the digits), then word only ("five years").
    Returns None when nothing matches.
    """
    numeric = DURATION_NUMERIC.search(text)
    if numeric:
        count, unit = numeric.group(1), numeric.group(2)
        return f"{count} {unit.lower()}"

    word_paren = WORD_PAREN_DURATION.search(text)
    if word_paren:
        digits, unit = word_paren.group(2), word_paren.group(3)
        return f"{int(digits)} {unit.lower()}"

    word_only = WORD_DURATION.search(text)
    if word_only:
        word, unit = word_only.group(1), word_only.group(2)
        return f"{NUMBER_WORDS[word.lower()]} {unit.lower()}"

    return None

def parse_notice_period(text: str) -> Optional[str]:
    """
    Return the first "<count> <unit> prior/before" expression in ``text``
    as ``"<count> <unit>"`` (unit lower-cased), or None if there is none.
    """
    hit = NOTICE_PERIOD.search(text)
    if hit is None:
        return None
    count, unit = hit.group(1), hit.group(2)
    return f"{count} {unit.lower()}"

def derive_end_date_safe(effective_date: Optional[str], duration: Optional[str], evidence_text: str) -> Optional[str]:
    """
    Derive an end date as ``effective_date + duration``, or return None.

    Derivation only happens if:
    - both ``effective_date`` and ``duration`` are given,
    - the evidence contains a start anchor AND a term anchor (avoids mixing in
      notice/training periods),
    - ``duration`` has the form "<int> <year(s)|month(s)|day(s)>",
    - ``effective_date`` is a valid ISO date, and
    - the resulting date is representable.

    Args:
        effective_date: Start date as an ISO string.
        duration: Normalized duration such as "5 years".
        evidence_text: Text the two values were taken from; checked for anchors.

    Returns:
        The derived end date as an ISO string, or None if any check fails.
    """
    if not effective_date or not duration:
        return None
    if not (ANCHOR_START.search(evidence_text) and ANCHOR_TERM.search(evidence_text)):
        return None

    m = re.match(r"^\s*(\d+)\s+(years?|months?|days?)\s*$", duration.strip(), re.IGNORECASE)
    if not m:
        return None
    n = int(m.group(1))
    unit = m.group(2).lower()

    try:
        start = datetime.fromisoformat(effective_date).date()
    except (TypeError, ValueError):
        return None

    if unit.startswith("year"):
        offset = {"years": n}
    elif unit.startswith("month"):
        offset = {"months": n}
    else:
        offset = {"days": n}

    # OCR noise can yield absurd counts (e.g. "99999 years"). The addition then
    # leaves the supported date range and raises; treat that as "cannot derive"
    # instead of aborting the whole batch run.
    try:
        end = start + relativedelta(**offset)
    except (OverflowError, ValueError):
        return None

    return end.isoformat()


In [7]:
def retrieve_candidates_with_meta(
    text: str,
    patterns: List[str],
    max_blocks: int = MAX_BLOCKS,
    max_sents: int = MAX_SENTS,
    neighbor_k: int = NEIGHBOR_K
) -> Dict[str, Any]:
    """
    High-recall candidate retrieval driven by regex triggers.

    Returns a dict with:
    - "blocks": blocks matching any pattern, in document order; scanning stops
      once ``max_blocks`` have been collected,
    - "sentences": matching sentences (scanning stops once ``max_sents`` have
      matched) plus up to ``neighbor_k`` sentences of context on each side,
      in document order without duplicates,
    - "all_sentences": every sentence of the document.
    """
    all_blocks = split_blocks_with_meta(text)
    all_sents = split_sentences_with_meta(text)

    matched_blocks: List[BlockItem] = []
    for block in all_blocks:
        if not any_match(block.text, patterns):
            continue
        matched_blocks.append(block)
        if len(matched_blocks) >= max_blocks:
            break

    matched_ids = []
    for sent in all_sents:
        if not any_match(sent.text, patterns):
            continue
        matched_ids.append(sent.sid)
        if len(matched_ids) >= max_sents:
            break

    # Sentence ids equal list positions, so a window around an id is an index range.
    total = len(all_sents)
    wanted = set()
    for sid in matched_ids:
        low = max(0, sid - neighbor_k)
        high = min(total, sid + neighbor_k + 1)
        wanted.update(range(low, high))

    context_sents = [all_sents[idx] for idx in sorted(wanted)]
    return {"blocks": matched_blocks, "sentences": context_sents, "all_sentences": all_sents}


### 2.2 Semantic filtering (SOTA proxy)
- Load a zero‑shot **NLI/MNLI** model as a proxy for fine‑tuned **LegalBERT/Longformer** clause classifiers.
- Optionally run a second NLI verification pass (ContractNLI‑style hypothesis check).

In [8]:
# POC fallback: an MNLI zero-shot pipeline stands in for the clause classifier.
zsc = make_zsc(CLAUSE_MODEL, DEVICE)

# The verifier is a second zero-shot pipeline, loaded only when enabled.
nli = make_zsc(NLI_MODEL, DEVICE) if USE_NLI_VERIFIER else None

def zsc_best(texts: List[str], labels: List[str]) -> List[Tuple[str, float]]:
    """
    Classify each text against ``labels`` and keep only the top prediction.

    Args:
        texts: Texts to classify; an empty list short-circuits to ``[]``.
        labels: Candidate labels for single-label zero-shot classification.

    Returns:
        One ``(best_label, score)`` pair per input text, in input order.
    """
    if not texts:
        return []
    res = zsc(texts, candidate_labels=labels, multi_label=False)
    # Some transformers releases return a bare dict instead of a one-element
    # list when given a single sequence; normalize so the loop below never
    # iterates over dict keys.
    if isinstance(res, dict):
        res = [res]
    return [(r["labels"][0], float(r["scores"][0])) for r in res]

def nli_verify(texts: List[str], hypothesis_pos: str, hypothesis_neg: str) -> List[Tuple[bool, float]]:
    """
    ContractNLI-like verification: "entailed or not".

    Uses the zero-shot MNLI pipeline as a proxy: both hypotheses are offered
    as candidate labels, and a text counts as verified when the positive
    hypothesis is ranked first.

    Args:
        texts: Texts to verify; an empty list short-circuits to ``[]``.
        hypothesis_pos: Hypothesis stating that the clause is present.
        hypothesis_neg: Competing hypothesis stating that it is not.

    Returns:
        One ``(positive_won, top_score)`` pair per text. ``top_score`` is the
        score of the winning hypothesis, whichever one won. If the verifier is
        disabled, every text is reported as ``(True, 1.0)``.
    """
    # Same guard as zsc_best: never call the pipeline with an empty batch
    # (some pipeline versions reject it).
    if not texts:
        return []

    if not USE_NLI_VERIFIER or nli is None:
        return [(True, 1.0) for _ in texts]  # no-op verifier for ablation

    res = nli(texts, candidate_labels=[hypothesis_pos, hypothesis_neg], multi_label=False)
    # A single input may come back as a bare dict in some transformers releases.
    if isinstance(res, dict):
        res = [res]

    out = []
    for r in res:
        lab = r["labels"][0]
        sc = float(r["scores"][0])
        out.append((lab == hypothesis_pos, sc))
    return out


Loading ZSC model: typeform/distilbert-base-uncased-mnli


Device set to use cpu


Loading ZSC model: typeform/distilbert-base-uncased-mnli


Device set to use cpu


### 2.3 Clause labels and scoring
- Define label sets for each task (validity / renewal / evaluation).
- Pick the best label per candidate and keep its confidence score (the raw zero-shot model score; no calibration is applied).

In [9]:
# Stage 1: clause type classification (cheap)
# Candidate labels given to the zero-shot classifier; "Other" is the catch-all.
VALIDITY_LABELS = [
    "Validity/term clause (effective date, duration, expiration, remain in force)",
    "Other"
]
# NOTE: the order matters - extract_renewal() refers to these labels by index
# (0 = automatic, 1 = mutual agreement, 2 = unilateral option).
RENEWAL_LABELS = [
    "Automatic renewal unless terminated or notice is given",
    "Renewal/extension requires mutual agreement",
    "Unilateral renewal/extension option",
    "Other"
]
EVAL_LABELS = [
    "Evaluation/monitoring/reporting/audit/review obligation",
    "Other"
]

# Stage 2: verification (ContractNLI-like)
# Each task has a positive and a negative hypothesis; nli_verify() treats a
# text as verified when the positive one is ranked first.
H_VALID_POS = "This text states the start date, end date, expiration, or duration of the agreement."
H_VALID_NEG = "This text is not about the agreement's validity period."

H_EVAL_POS  = "This text requires evaluation, monitoring, reporting, auditing, or review of implementation."
H_EVAL_NEG  = "This text is not about evaluation, monitoring, or reporting obligations."

H_REN_POS   = "This text describes how the agreement is renewed or extended (automatic, mutual, or unilateral)."
H_REN_NEG   = "This text is not about renewal or extension."


### 2.4 Evidence schema
- Standardize evidence items: text span, page number, source (sentence/block), label, score.
- Store whether the span was NLI‑verified and the verification score.

In [10]:
@dataclass
class EvidenceItem:
    """One scored candidate span, carrying provenance and both model verdicts."""
    text: str              # the candidate text that was scored
    page: int              # page number the span came from
    sid: Optional[int]     # sentence id if sentence evidence
    bid: Optional[int]     # block id if block evidence
    source: str            # "sentence" | "block"
    label: str             # top Stage-1 label chosen by the clause classifier
    score: float           # score of that top label
    verified: bool         # True if the positive hypothesis won AND verify_score >= threshold
    verify_score: float    # score of the winning hypothesis (positive or negative)

def build_evidence(
    sent_items: List[SentItem],
    block_items: List[BlockItem],
    labels: List[str],
    hyp_pos: str,
    hyp_neg: str,
    threshold: float
) -> List[EvidenceItem]:
    """
    Score sentence and block candidates and return them ranked as evidence.

    Stage 1 assigns each candidate its best label from ``labels``; Stage 2
    checks it against the positive/negative hypotheses. A candidate is marked
    verified when the positive hypothesis wins with a score of at least
    ``threshold``. The result is ordered by (verified, verify_score, score),
    best first.
    """
    sentence_texts = [item.text for item in sent_items]
    block_texts = [item.text for item in block_items]

    # Stage 1: clause type (sentences first, then blocks)
    sentence_labels = zsc_best(sentence_texts, labels)
    block_labels = zsc_best(block_texts, labels)

    # Stage 2: hypothesis verification
    sentence_checks = nli_verify(sentence_texts, hyp_pos, hyp_neg)
    block_checks = nli_verify(block_texts, hyp_pos, hyp_neg)

    from_sentences = [
        EvidenceItem(
            text=item.text, page=item.page, sid=item.sid, bid=None, source="sentence",
            label=label, score=score,
            verified=passed and confidence >= threshold, verify_score=confidence,
        )
        for item, (label, score), (passed, confidence)
        in zip(sent_items, sentence_labels, sentence_checks)
    ]
    from_blocks = [
        EvidenceItem(
            text=item.text, page=item.page, sid=None, bid=item.bid, source="block",
            label=label, score=score,
            verified=passed and confidence >= threshold, verify_score=confidence,
        )
        for item, (label, score), (passed, confidence)
        in zip(block_items, block_labels, block_checks)
    ]

    def rank(item: EvidenceItem):
        return (item.verified, item.verify_score, item.score)

    return sorted(from_sentences + from_blocks, key=rank, reverse=True)


## 3) Extraction — Validity / Renewal / Evaluation
- Run task-specific extractors that combine: candidate retrieval → semantic filtering → normalization.
- Each extractor returns structured fields **plus evidence** (page + snippet) for auditability.

In [11]:
# Wording that marks a date as an end/expiry date (a subset of ANCHOR_TERM).
_END_DATE_CUE = re.compile(r"\buntil\b|\bexpires?\b|\bexpiration\b", re.IGNORECASE)


def extract_validity(text: str) -> Dict[str, Any]:
    """
    Extract the agreement's validity terms (start date, end date, duration).

    Pipeline: regex candidate retrieval -> classification/verification via
    build_evidence() -> rule-based normalization of dates and durations.

    Args:
        text: Normalized document text.

    Returns:
        Dict with ``effective_date``, ``end_date`` and ``duration`` (or None),
        ``end_date_source`` ("explicit", "derived" or None),
        ``validity_status`` ("found", "uncertain" or "absent") and
        ``validity_evidence`` (the evidence items inspected, as dicts).
    """
    seg = retrieve_candidates_with_meta(text, VALIDITY_PATTERNS)
    sents = seg["sentences"]
    blocks = seg["blocks"]

    # No regex hit at all: report "absent" without invoking the models, in
    # line with extract_renewal / extract_evaluation.
    if not (sents or blocks):
        return {
            "effective_date": None,
            "end_date": None,
            "duration": None,
            "end_date_source": None,
            "validity_status": "absent",
            "validity_evidence": []
        }

    evidence = build_evidence(
        sent_items=sents,
        block_items=blocks,
        labels=VALIDITY_LABELS,
        hyp_pos=H_VALID_POS,
        hyp_neg=H_VALID_NEG,
        threshold=THRESH_TEMPORAL
    )

    verified = [e for e in evidence if e.verified]
    top = (verified[:8] if verified else evidence[:6])

    effective_date = None
    end_date = None
    duration = None
    end_date_source = None

    # Extract from the best-ranked evidence first
    for e in top:
        # pull duration early
        if duration is None:
            duration = parse_duration(e.text)
        dates = parse_dates_from_text(e.text)

        # Effective date candidates: earliest date in a span with a start anchor
        if effective_date is None and ANCHOR_START.search(e.text) and dates:
            effective_date = dates[0]

        # End date candidates: latest date in a span with an end cue.
        # NOTE(review): a span holding a single date plus both a start anchor
        # and an end cue assigns that one date to BOTH fields (the sample run
        # shows a row with identical effective_date and end_date). Which role
        # the date really has cannot be decided here - needs a positional rule.
        if end_date is None and dates and _END_DATE_CUE.search(e.text):
            end_date = dates[-1]
            end_date_source = "explicit"

    # Fallback (only with verified evidence): earliest date as start, latest as
    # end. A guessed value is only accepted if it keeps start strictly before
    # end; otherwise the same date would be reported as both.
    # ISO date strings compare chronologically.
    if verified:
        all_dates = []
        for e in verified[:10]:
            all_dates += parse_dates_from_text(e.text)
        all_dates = sorted(set(all_dates))
        if effective_date is None and all_dates and (end_date is None or all_dates[0] < end_date):
            effective_date = all_dates[0]
        if end_date is None and len(all_dates) >= 2 and (effective_date is None or all_dates[-1] > effective_date):
            end_date = all_dates[-1]
            end_date_source = "explicit"

    # Safe derivation (only if evidence supports it)
    if DERIVE_END_DATE and end_date is None and effective_date and duration and verified:
        derived = derive_end_date_safe(effective_date, duration, verified[0].text)
        if derived:
            end_date = derived
            end_date_source = "derived"

    # Candidates existed (checked above), so nothing extracted means "uncertain".
    status = "found" if (effective_date or end_date or duration) else "uncertain"

    return {
        "effective_date": effective_date,
        "end_date": end_date,
        "duration": duration,
        "end_date_source": end_date_source,
        "validity_status": status,
        "validity_evidence": [asdict(e) for e in top]
    }


In [12]:
def extract_renewal(text: str) -> Dict[str, Any]:
    """
    Determine how the agreement is renewed/extended, plus any notice period.

    Returns a dict with ``renewal_type`` ("automatic", "by_mutual_agreement",
    "unilateral_option", "uncertain" or "absent"), ``notice_period``,
    ``renewal_status`` ("found", "uncertain" or "absent") and
    ``renewal_evidence`` (the evidence items inspected, as dicts).
    """
    candidates = retrieve_candidates_with_meta(text, RENEWAL_PATTERNS)
    sents, blocks = candidates["sentences"], candidates["blocks"]

    if not sents and not blocks:
        return {
            "renewal_type": "absent",
            "notice_period": None,
            "renewal_status": "absent",
            "renewal_evidence": []
        }

    evidence = build_evidence(
        sent_items=sents,
        block_items=blocks,
        labels=RENEWAL_LABELS,
        hyp_pos=H_REN_POS,
        hyp_neg=H_REN_NEG,
        threshold=THRESH_RENEWAL
    )
    verified = [item for item in evidence if item.verified]
    top = verified[:10] if verified else evidence[:6]

    # Document-level decision: one verified span with a label is enough.
    # Precedence: automatic > mutual > unilateral.
    precedence = (
        (RENEWAL_LABELS[0], "automatic"),
        (RENEWAL_LABELS[1], "by_mutual_agreement"),
        (RENEWAL_LABELS[2], "unilateral_option"),
    )
    verified_labels = {item.label for item in verified}
    renewal_type = "uncertain"
    for label, type_name in precedence:
        if label in verified_labels:
            renewal_type = type_name
            break

    # Notice period: verified evidence first, then the top-ranked items.
    notice = None
    for item in verified[:12] + top[:8]:
        notice = parse_notice_period(item.text)
        if notice:
            break

    # Regex hits without a verified renewal label stay "uncertain".
    status = "uncertain" if renewal_type == "uncertain" else "found"

    return {
        "renewal_type": renewal_type,
        "notice_period": notice,
        "renewal_status": status,
        "renewal_evidence": [asdict(item) for item in top]
    }


In [13]:
def extract_evaluation(text: str) -> Dict[str, Any]:
    """
    Decide whether the agreement contains an evaluation/monitoring obligation.

    Returns a dict with ``evaluation`` ("present", "uncertain" or "absent"),
    ``evaluation_status`` ("found", "uncertain" or "absent") and
    ``evaluation_evidence`` (the evidence items inspected, as dicts).
    """
    candidates = retrieve_candidates_with_meta(text, EVAL_PATTERNS)
    sents, blocks = candidates["sentences"], candidates["blocks"]

    if not sents and not blocks:
        return {
            "evaluation": "absent",
            "evaluation_status": "absent",
            "evaluation_evidence": []
        }

    evidence = build_evidence(
        sent_items=sents,
        block_items=blocks,
        labels=EVAL_LABELS,
        hyp_pos=H_EVAL_POS,
        hyp_neg=H_EVAL_NEG,
        threshold=THRESH_EVAL
    )
    verified = [item for item in evidence if item.verified]
    top = verified[:8] if verified else evidence[:6]

    # Candidates that the verifier did not confirm mean "uncertain", not "absent".
    if verified:
        outcome, status = "present", "found"
    else:
        outcome, status = "uncertain", "uncertain"

    return {
        "evaluation": outcome,
        "evaluation_status": status,
        "evaluation_evidence": [asdict(item) for item in top]
    }


## 4) Baselines (for ablations)
- **Keyword baseline:** simple presence/absence using the same pattern lists.
- **Rules-only temporal baseline:** HeidelTime-inspired date/duration parsing without ML, used as a comparison point.

In [15]:
def baseline_keyword(text: str, patterns: List[str]) -> bool:
    """
    Keyword baseline: True if any trigger pattern occurs anywhere in ``text``.

    Thin wrapper around any_match() applied to the whole document, with no
    segmentation and no model involved.
    """
    return any_match(text, patterns)

def baseline_temporal_rules(text: str) -> Dict[str, Any]:
    """
    Rules-only temporal baseline (HeidelTime-inspired, no ML).

    Dates and durations are parsed from the first 30 candidate sentences and
    the first 12 candidate blocks retrieved with the validity patterns. The
    earliest date becomes the start; the latest becomes the end if at least
    two distinct dates exist. Otherwise an end date may be derived from
    start + duration, subject to the anchor check in derive_end_date_safe().
    """
    retrieved = retrieve_candidates_with_meta(text, VALIDITY_PATTERNS)
    cands = [s.text for s in retrieved["sentences"][:30]]
    cands.extend(b.text for b in retrieved["blocks"][:12])

    all_dates = sorted({d for cand in cands for d in parse_dates_from_text(cand)})

    # First candidate that yields a duration wins.
    duration = next((found for found in map(parse_duration, cands) if found), None)

    effective_date = all_dates[0] if all_dates else None
    if len(all_dates) >= 2:
        end_date, end_date_source = all_dates[-1], "explicit"
    else:
        end_date, end_date_source = None, None

    if DERIVE_END_DATE and end_date is None and effective_date and duration:
        # weaker than the hybrid derivation, but still anchor-checked:
        # the first three candidates serve as the evidence text
        derived = derive_end_date_safe(effective_date, duration, " ".join(cands[:3]))
        if derived:
            end_date, end_date_source = derived, "derived"

    if effective_date or end_date or duration:
        status = "found"
    elif cands:
        status = "uncertain"
    else:
        status = "absent"

    return {
        "effective_date": effective_date,
        "end_date": end_date,
        "duration": duration,
        "end_date_source": end_date_source,
        "validity_status": status
    }


In [16]:
def process_document(doc_id: str, source_path: str, raw_text: str) -> Dict[str, Any]:
    """
    Run all baselines and hybrid extractors on one document.

    The text is normalized once. The returned flat record holds the document
    identifiers, the hybrid validity/renewal/evaluation outputs, and the
    keyword and rules baselines under ``baseline_*`` keys.
    """
    text = normalize_text(raw_text)

    # Baselines are computed first, then the hybrid extractors.
    keyword_validity = baseline_keyword(text, VALIDITY_PATTERNS)
    keyword_renewal = baseline_keyword(text, RENEWAL_PATTERNS)
    keyword_eval = baseline_keyword(text, EVAL_PATTERNS)
    rules = baseline_temporal_rules(text)

    validity = extract_validity(text)
    renewal = extract_renewal(text)
    evaluation = extract_evaluation(text)

    record: Dict[str, Any] = {"doc_id": doc_id, "source_path": source_path}

    # Hybrid outputs
    for part in (validity, renewal, evaluation):
        record.update(part)

    # Baselines for thesis comparison
    record["baseline_keyword_validity"] = keyword_validity
    record["baseline_keyword_renewal"] = keyword_renewal
    record["baseline_keyword_eval"] = keyword_eval

    record["baseline_rules_effective_date"] = rules["effective_date"]
    record["baseline_rules_end_date"] = rules["end_date"]
    record["baseline_rules_duration"] = rules["duration"]
    record["baseline_rules_end_date_source"] = rules["end_date_source"]
    record["baseline_rules_validity_status"] = rules["validity_status"]

    return record

def list_agreements(root: str) -> List[Tuple[str, str]]:
    """
    Find agreement files under ``root`` and return ``(doc_id, path)`` pairs.

    Only ``.txt`` and ``.json`` files are considered (extension compared
    case-insensitively). The doc_id is the file name without its extension;
    when both variants exist for a doc_id, the ``.txt`` file is preferred.
    The result is sorted by doc_id.
    """
    by_doc: Dict[str, Dict[str, str]] = {}
    for dirpath, _, filenames in os.walk(root):
        for filename in filenames:
            if not filename.lower().endswith((".txt", ".json")):
                continue
            stem, extension = os.path.splitext(filename)
            by_doc.setdefault(stem, {})[extension.lower()] = os.path.join(dirpath, filename)

    pairs = [
        (doc_id, variants[".txt"] if ".txt" in variants else variants[".json"])
        for doc_id, variants in by_doc.items()
    ]
    return sorted(pairs, key=lambda pair: pair[0])

def run_poc(root: str, n: int = 10, strategy: str = "first", seed: int = RANDOM_SEED) -> pd.DataFrame:
    """
    Process up to ``n`` agreements found under ``root`` into a DataFrame.

    With more than ``n`` documents available, ``strategy="random"`` draws a
    seeded random sample (the global ``random`` generator is re-seeded with
    ``seed``); any other value takes the first ``n`` by doc_id. ``.txt`` files
    are read as plain text, everything else as OCR JSON with page markers.

    Raises:
        ValueError: If no .txt/.json file exists under ``root``.
    """
    docs = list_agreements(root)
    if not docs:
        raise ValueError(f"No .txt/.json files found under: {root}")

    if len(docs) <= n:
        chosen = docs
    elif strategy == "random":
        random.seed(seed)
        chosen = random.sample(docs, n)
    else:
        chosen = docs[:n]

    rows = []
    for doc_id, path in tqdm(chosen, desc=f"POC ({len(chosen)} agreements)"):
        is_plain_text = path.lower().endswith(".txt")
        raw_text = load_txt(path) if is_plain_text else load_ocr_json_with_pages(path)
        rows.append(process_document(doc_id, path, raw_text))

    return pd.DataFrame(rows)


## 5) Batch processing and outputs
- Iterate over a folder of OCR exports, apply the hybrid extractors, and aggregate results.
- Output is a pandas DataFrame suitable for error analysis and thesis tables.

In [18]:
# Folder holding the OCR exports (.txt / .json) to process.
root = r"OCR_output/California"

df = run_poc(root, n=12, strategy="random")   # or strategy="first"
df


POC (12 agreements): 100%|██████████| 12/12 [04:09<00:00, 20.79s/it]


Unnamed: 0,doc_id,source_path,effective_date,end_date,duration,end_date_source,validity_status,validity_evidence,renewal_type,notice_period,...,evaluation_status,evaluation_evidence,baseline_keyword_validity,baseline_keyword_renewal,baseline_keyword_eval,baseline_rules_effective_date,baseline_rules_end_date,baseline_rules_duration,baseline_rules_end_date_source,baseline_rules_validity_status
0,1October28,OCR_output/California\1October28.txt,,,,,uncertain,[{'text': 'represent any obligation of funds b...,automatic,,...,absent,[],True,True,False,,,,,uncertain
1,13142023,OCR_output/California\13142023.txt,,,,,absent,[],by_mutual_agreement,,...,found,[{'text': 'these actions contribute to establi...,False,True,True,,,,,absent
2,110252023,OCR_output/California\110252023.txt,2023-10-25,,3 months,,found,[{'text': 'This MOU may be modified at any tim...,by_mutual_agreement,,...,found,[{'text': '5 SECTION IX Final Provisions This ...,True,True,True,2023-10-25,,3 months,,found
3,1September19,OCR_output/California\1September19.txt,,,5 years,,found,[{'text': 'consultations between the Participa...,automatic,,...,found,"[{'text': 'conservation, water use efficiency,...",True,True,True,,,5 years,,found
4,1August26,OCR_output/California\1August26.txt,,,,,uncertain,[{'text': '3 V. CONTACT POINTS In order to ens...,by_mutual_agreement,,...,absent,[],True,True,False,,,,,uncertain
5,1April30,OCR_output/California\1April30.txt,2022-12-31,2022-12-31,,explicit,found,"[{'text': 'FURTHER, Upon signature by the Part...",by_mutual_agreement,,...,absent,[],True,True,False,2018-04-30,2022-12-31,,explicit,found
6,1April15,OCR_output/California\1April15.txt,,,,,absent,[],absent,,...,absent,[],False,False,False,,,,,absent
7,1392021,OCR_output/California\1392021.txt,2021-03-09,,3 months,,found,[{'text': 'A Participant that intends to withd...,by_mutual_agreement,,...,found,[{'text': '4. Water policies contributing to t...,True,True,True,2021-03-09,,3 months,,found
8,13112022,OCR_output/California\13112022.txt,,,,,uncertain,[{'text': 'A Participant who intends to withdr...,by_mutual_agreement,,...,absent,[],True,True,False,2022-03-11,,,,found
9,1October4 (1),OCR_output/California\1October4 (1).txt,2019-10-04,,4 year,,found,[{'text': 'such modification is to become effe...,automatic,,...,found,[{'text': 'The Working Group must present a mi...,True,True,True,2019-10-04,,4 year,,found
