# In [None]:
"""
super7_docling_snippet_resolver.py

Pipeline:
- Input: list of Super7Input dicts (company_name required; others optional)
- For each company:
    1) Tavily search to get candidate URLs (+ title + snippet)
    2) Search-level filter (title/snippet similarity vs company name)
    3) ScraperTool (Docling-first) to get text from HTML-like pages
       (now explicitly skips PDFs/Word/Excel/PPT by extension)
    4) Doc-level filter (does text even mention the company?)
    5) Snippet extraction:
         - windows around company name
         - regex-based address/phone/zip candidates
    6) LLMExtractor runs on snippets (not full doc)
    7) Super7 summarizer:
         - same-company guard
         - light scoring
         - per-field best value + provenance

Additional rules:
- News domains (e.g. thetimes-tribune.com) are blacklisted as sources
  and are not scraped or used as primary URLs.
- Social media + DNB + news domains are never used as Super7 value sources.

Scraping ethics:
- Realistic browser User-Agent
- requests.Session() to reuse cookies
- small random delays between requests
- domain blacklist for clearly hostile/irrelevant sites
- size guard for huge documents
- you are responsible for only scraping sites whose ToS/robots.txt allow it
"""

import os
import json
import time
import random
import string
import re
from typing import List, Optional, Dict, Any, Tuple
from dataclasses import dataclass
from urllib.parse import urlparse

import logging
import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from pydantic import BaseModel, Field
from openai import OpenAI
from langchain_tavily import TavilySearch
from docling.document_converter import DocumentConverter

# ---------------------------------------------------------------------------
# Setup
# ---------------------------------------------------------------------------

# Load environment variables (API keys) via python-dotenv before they are
# read below.
load_dotenv()
# Only let httpx log records at WARNING level or above through.
logging.getLogger("httpx").setLevel(logging.WARNING)

# The OpenAI key is mandatory: importing this module fails without it.
if not os.getenv("OPENAI_API_KEY"):
    raise RuntimeError("Please set OPENAI_API_KEY in your environment.")

# The Tavily key is only warned about here; WebSearchTool raises later if it
# is still missing when a search tool is constructed.
if not os.getenv("TAVILY_API_KEY"):
    print("[WARN] TAVILY_API_KEY is not set. Tavily search will fail.")

# The seven output fields, in the order the summarizer fills them.
SUPER7_FIELDS = [
    "company_name",
    "street_address",
    "city",
    "state",
    "country",
    "zip",
    "phone",
]

# Basic corporate suffixes dropped during company-name normalization.
CORP_SUFFIXES = {
    "llc", "l.l.c", "inc", "inc.", "corp", "corp.", "corporation",
    "company", "co", "co.", "ltd", "ltd.", "limited", "plc", "s.a.",
    "gmbh", "oy", "ab", "bv", "srl", "sas", "spa", "holdings", "holding",
}

# News outlets we want to ignore for this use-case.
# NOTE: the lookups visible in this file go through get_domain(), which
# strips a leading "www.", so the "www." entries in the sets below are
# redundant but harmless.
NEWS_DOMAIN_BLACKLIST = {
    "thetimes-tribune.com",
    "www.thetimes-tribune.com",
    # add more if they show up as bad sources
}

# Domains we DO NOT want to use as sources for Super7 fields
# (social media + DNB + news).
SUMMARY_DOMAIN_EXCLUDE = {
    "facebook.com",
    "www.facebook.com",
    "instagram.com",
    "www.instagram.com",
    "twitter.com",
    "www.twitter.com",
    "x.com",
    "www.x.com",
    "linkedin.com",
    "www.linkedin.com",
    "tiktok.com",
    "www.tiktok.com",
    "dnb.com",
    "www.dnb.com",
}
# News domains are excluded as value sources too.
SUMMARY_DOMAIN_EXCLUDE |= NEWS_DOMAIN_BLACKLIST

# Domains we know are hostile / blocked / not worth scraping directly.
# Matching is exact on the host (after "www." stripping), which is why a
# specific subdomain is listed alongside its parent domain.
SCRAPER_DOMAIN_BLACKLIST = {
    "firesupport.uk",
    "www.firesupport.uk",
    "search.sunbiz.org",
    "bubba.ai",
    "govtribe.com",
    "www.govtribe.com",
    "brokersnapshot.com",
    "www.brokersnapshot.com",
    "dnb.com",
    "www.dnb.com",
    "b2bhint.com",
    "www.b2bhint.com",
    "yelp.com",
    "www.yelp.com",
    "davids-tire-shop-service.wheree.com",
    "wheree.com",

    # domains observed (in earlier run logs) to serve large, noisy PDFs
    "luke.af.mil",
    "www.luke.af.mil",
    "nrc.gov",
    "www.nrc.gov",
}
# News domains are never scraped either.
SCRAPER_DOMAIN_BLACKLIST |= NEWS_DOMAIN_BLACKLIST


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

def get_domain(url: str) -> Optional[str]:
    """
    Return the lowercase host of *url* without a leading "www.", or None.

    Only the hostname is returned: userinfo ("user:pass@") and an explicit
    port (":8080") are dropped, so "https://www.dnb.com:443/x" yields
    "dnb.com" and can still be matched against the domain sets.

    Returns None for empty input, for URLs without a network location
    (e.g. a bare path or a scheme-less "example.com/path"), and for values
    that cannot be parsed.
    """
    if not url:
        return None
    try:
        # .hostname (unlike .netloc) excludes credentials and the port.
        host = urlparse(url).hostname
        if not host:
            return None
        host = host.lower()
        if host.startswith("www."):
            host = host[4:]
        return host or None
    except (ValueError, TypeError, AttributeError):
        # Malformed URL (e.g. unbalanced IPv6 brackets) or a non-string value.
        return None


def normalize_company_name(name: str) -> str:
    """
    Normalize a company name for similarity comparison.

    Steps:
      - lowercase
      - "&" -> "and"
      - periods and apostrophes are deleted ("l.l.c." -> "llc",
        "david's" -> "davids")
      - every other punctuation mark becomes a space, so words that were
        only separated by punctuation stay separate
        ("construction,inc" -> "construction inc")
      - tokens that are typical corporate suffixes are dropped

    Returns the remaining tokens joined by single spaces ("" for empty input
    or when nothing remains).
    """
    if not name:
        return ""

    s = name.lower().replace("&", " and ")

    # Characters that sit *inside* a word are removed; all other punctuation
    # acts as a word separator.
    intra_word = ".'"
    table = str.maketrans(
        {ch: (None if ch in intra_word else " ") for ch in string.punctuation}
    )
    s = s.translate(table)

    # Compare against suffixes in the same dot-free form the tokens are in,
    # otherwise entries such as "s.a." could never match.
    suffixes = {suffix.replace(".", "") for suffix in CORP_SUFFIXES}
    return " ".join(tok for tok in s.split() if tok not in suffixes)


def jaccard_name_similarity(a: str, b: str) -> float:
    """
    Jaccard overlap (shared tokens / all distinct tokens) between two company
    names after normalize_company_name().

    Returns 0.0 when either name normalizes to nothing.
    """
    tokens_a = set(normalize_company_name(a).split())
    tokens_b = set(normalize_company_name(b).split())

    # An empty side means there is nothing meaningful to compare.
    if not (tokens_a and tokens_b):
        return 0.0

    shared = tokens_a & tokens_b
    combined = tokens_a | tokens_b
    return len(shared) / len(combined)


def should_consider_search_result(
    company_name: str,
    title: str,
    snippet: str,
    min_sim: float = 0.2,
) -> bool:
    """
    Cheap search-level gate deciding whether a result is worth scraping.

    Keeps the result when:
      - neither title nor snippet is available (nothing to judge by), or
      - the title's name similarity to company_name is at least min_sim, or
      - the snippet contains company_name (case-insensitive).
    Everything else is rejected.
    """
    # With no metadata at all, stay permissive.
    if not (title or snippet):
        return True

    if jaccard_name_similarity(company_name, title or "") >= min_sim:
        return True

    mentioned_in_snippet = bool(
        company_name and snippet and company_name.lower() in snippet.lower()
    )
    return mentioned_in_snippet


def doc_mentions_company(s7_name: str, text: str, min_occurrences: int = 1) -> bool:
    """
    Doc-level gate: does the text appear to mention this company at all?

    True when the raw company name occurs in the text (case-insensitive).
    Otherwise the first token of the normalized name must occur at least
    min_occurrences times (substring count) in the lowercased text.
    """
    if not (s7_name and text):
        return False

    haystack = text.lower()
    if s7_name.lower() in haystack:
        return True

    normalized_tokens = normalize_company_name(s7_name).split()
    if not normalized_tokens:
        return False

    # Fall back to the leading token of the normalized name.
    lead_token = normalized_tokens[0]
    return haystack.count(lead_token) >= min_occurrences


# ---------------------------------------------------------------------------
# Snippet extraction
# ---------------------------------------------------------------------------

@dataclass
class Snippet:
    """One excerpt of page text selected for inclusion in the LLM prompt."""
    # 1-based sequence number assigned in extraction order; shown to the LLM
    # as "[SNIPPET <id>]".
    snippet_id: int
    # How the excerpt was found; one of:
    snippet_type: str  # "name_context" | "address_candidate" | "phone_candidate" | "zip_candidate" | "generic"
    # The excerpt itself (whitespace-stripped by the extractor).
    text: str


def extract_snippets_for_company(
    full_text: str,
    company_name: str,
    max_snippets: int = 25,
    window_chars: int = 400,
) -> List[Snippet]:
    """
    Extract a small set of high-signal snippets from the full text.

    Sources, in priority order:
      1. windows of +/- window_chars around each company-name mention
         (raw name, raw name with "&" -> "and", normalized name)
      2. +/- 80 characters around phone-like digit runs
      3. +/- 80 characters around US-style zip codes
      4. whole lines that contain a street keyword and a digit
      5. if nothing matched at all, the first 800 characters ("generic")

    Spans overlapping an already selected span are skipped, and address
    lines already contained in a selected snippet are not repeated.
    Extraction stops as soon as max_snippets snippets were collected.

    Name variants are tried in a fixed order so that snippet ids and the
    resulting prompt are reproducible from run to run.

    Returns the snippets in extraction order with ids 1..n (empty list for
    empty/whitespace-only text).
    """
    snippets: List[Snippet] = []
    used_spans: List[Tuple[int, int]] = []

    text = full_text or ""
    if not text.strip():
        return snippets

    lower_text = text.lower()
    raw_name = (company_name or "").lower()
    # dict.fromkeys de-duplicates while keeping this priority order (a set
    # would make the iteration order, and therefore snippet ids, vary).
    name_variants = list(dict.fromkeys(
        variant
        for variant in (
            raw_name,
            raw_name.replace("&", "and"),
            normalize_company_name(company_name or ""),
        )
        if variant
    ))

    def is_full() -> bool:
        return len(snippets) >= max_snippets

    def add_snippet(start: int, end: int, snippet_type: str) -> None:
        # Skip spans overlapping one we already kept.
        for s, e in used_spans:
            if not (end <= s or start >= e):
                return
        chunk = text[start:end].strip()
        if not chunk:
            return
        snippets.append(
            Snippet(snippet_id=len(snippets) + 1, snippet_type=snippet_type, text=chunk)
        )
        used_spans.append((start, end))

    # --- 1) Name-anchored snippets ---
    for variant in name_variants:
        pos = lower_text.find(variant)
        while pos != -1:
            add_snippet(
                max(0, pos - window_chars),
                min(len(text), pos + len(variant) + window_chars),
                "name_context",
            )
            if is_full():
                return snippets
            pos = lower_text.find(variant, pos + len(variant))

    # --- 2) + 3) Regex-based phone, then zip candidates (approximate) ---
    context_chars = 80
    regex_sources = (
        (re.compile(r"\+?\d[\d\-\s\(\)]{7,}"), "phone_candidate"),
        (re.compile(r"\b\d{5}(?:-\d{4})?\b"), "zip_candidate"),
    )
    for pattern, snippet_type in regex_sources:
        for m in pattern.finditer(text):
            add_snippet(
                max(0, m.start() - context_chars),
                min(len(text), m.end() + context_chars),
                snippet_type,
            )
            if is_full():
                return snippets

    # --- 4) Address-ish lines fallback ---
    address_keywords = (
        "street", "st.", "st ", "road", "rd.", "rd ",
        "avenue", "ave.", "ave ", "boulevard", "blvd",
        "lane", "ln.", "ln ", "drive", "dr.", "dr ",
    )
    for line in text.splitlines():
        lowered = line.lower()
        if not any(kw in lowered for kw in address_keywords):
            continue
        if not any(ch.isdigit() for ch in lowered):
            continue
        chunk = line.strip()
        if not chunk:
            continue
        # Don't repeat text that an earlier snippet already carries.
        if any(chunk in existing.text for existing in snippets):
            continue
        snippets.append(
            Snippet(snippet_id=len(snippets) + 1, snippet_type="address_candidate", text=chunk)
        )
        if is_full():
            break

    # --- 5) Last resort: a generic leading excerpt ---
    if not snippets:
        chunk = text[:800].strip()
        if chunk:
            snippets.append(Snippet(snippet_id=1, snippet_type="generic", text=chunk))

    return snippets


def snippets_to_prompt_block(snippets: List[Snippet]) -> str:
    """
    Render snippets as the text block embedded in the LLM prompt.

    Each snippet becomes a "[SNIPPET <id>] type=<type>" header line, its
    text, and a blank separator line; a fixed notice is returned when there
    are no snippets.
    """
    if not snippets:
        return "No snippets were extracted; the document text was empty or uninformative."

    header = "Here are the extracted snippets (pre-filtered for likely relevance):"
    rendered = [
        piece
        for sn in snippets
        # header line, body, then an empty string that yields the blank line
        for piece in (f"[SNIPPET {sn.snippet_id}] type={sn.snippet_type}", sn.text, "")
    ]
    return "\n".join([header] + rendered)


# ---------------------------------------------------------------------------
# Models
# ---------------------------------------------------------------------------

class Super7Input(BaseModel):
    """
    Input record for one company: the seven Super7 fields.

    Only company_name is required; the remaining fields are optional hints
    that are used for building search queries and for scoring candidates.
    """
    company_name: str
    country: Optional[str] = None
    state: Optional[str] = None
    city: Optional[str] = None
    street_address: Optional[str] = None
    zip: Optional[str] = None
    phone: Optional[str] = None


class ExtractedEntity(BaseModel):
    """A single typed value the LLM extracted from a page's snippets."""
    # One of the entity_type strings listed in the extraction prompt
    # (e.g. "company_name", "phone", "other").
    entity_type: str
    # The extracted value as text.
    value: str
    # URLs the value is attributed to; the extractor always adds the page URL.
    source_urls: List[str] = Field(default_factory=list)
    confidence: Optional[float] = None  # should be in [0,1]


class PageExtractionResult(BaseModel):
    """
    Everything extracted from one URL: the entities plus the page-level
    scores the LLM is asked to report (each described as 0.0-1.0 in the
    prompt). All scores default to 0.0 / False / "" when nothing was
    extracted.
    """
    url: str
    entities: List[ExtractedEntity] = Field(default_factory=list)
    # How well the snippets match the target name / address / phone.
    match_score_name: float = 0.0
    match_score_address: float = 0.0
    match_score_phone: float = 0.0
    # Whether the page is likely the company's official site / main profile.
    looks_like_official_site: bool = False
    # Overall relevance of this URL to the company.
    overall_score: float = 0.0
    # Short free-text explanation from the LLM.
    reason: str = ""


@dataclass
class CandidateRecord:
    """A scraped search result paired with its LLM extraction."""
    # URL of the search result that was scraped.
    url: str
    # Origin of the candidate; search results carry "web_search".
    source_type: str
    # Entities and scores extracted from this URL.
    extraction: PageExtractionResult


# ---------------------------------------------------------------------------
# Web search
# ---------------------------------------------------------------------------

class WebSearchTool:
    """
    Thin wrapper around TavilySearch that runs several queries and merges
    the results, de-duplicated by URL.
    """

    def __init__(self, max_results: int = 5):
        """
        Args:
            max_results: maximum results requested from Tavily per query.

        Raises:
            RuntimeError: if TAVILY_API_KEY is not set in the environment.
        """
        key = os.getenv("TAVILY_API_KEY")
        if not key:
            raise RuntimeError("Missing TAVILY_API_KEY.")
        self.tool = TavilySearch(max_results=max_results, tavily_api_key=key)

    def search(self, queries: List[str]) -> List[Dict[str, Any]]:
        """
        Run every query and return the unique results in first-seen order.

        A query that raises (the search tool can raise, e.g. when a query
        yields nothing) or returns something other than a dict is logged and
        skipped, so one bad query does not discard the results of the others.

        Returns:
            A list of dicts with the keys "url", "title", "source_type"
            (always "web_search") and "content" (the result snippet).
        """
        seen: Dict[str, Dict[str, Any]] = {}
        for q in queries:
            try:
                res = self.tool.invoke({"query": q})
            except Exception as e:
                # Boundary to a third-party service: log and move on to the
                # next query instead of aborting the whole company.
                print(f"[SEARCH] Query failed for {q!r}: {e}")
                continue

            if not isinstance(res, dict):
                print(f"[SEARCH] Unexpected response type for {q!r}: {type(res).__name__}")
                continue

            for r in res.get("results") or []:
                if not isinstance(r, dict):
                    continue
                url = r.get("url")
                if not url or url in seen:
                    continue
                seen[url] = {
                    "url": url,
                    "title": r.get("title", ""),
                    "source_type": "web_search",
                    "content": r.get("content", ""),
                }
        return list(seen.values())


# ---------------------------------------------------------------------------
# ScraperTool using Docling + polite crawling + size guard
# ---------------------------------------------------------------------------

class ScraperTool:
    """
    Scraper that tries Docling first and falls back to a plain HTTP GET +
    BeautifulSoup when Docling fails or yields no text. PDFs/Word/Excel/PPT
    are skipped by file extension.

    - Realistic browser-like User-Agent
    - requests.Session() to persist cookies
    - small random delays between requests (throttling)
    - domain blacklist
    - size guard for large documents (checked via HEAD up front and again on
      the fallback GET response before its body is read)

    Returns plain text/markdown truncated to max_chars.
    """

    SKIP_EXTENSIONS = (
        ".pdf",
        ".doc",
        ".docx",
        ".xls",
        ".xlsx",
        ".ppt",
        ".pptx",
    )

    def __init__(
        self,
        timeout: int = 10,
        delay_range: tuple = (1.0, 3.0),
        use_markdown: bool = True,
        max_content_length_bytes: int = 8_000_000,  # ~8 MB limit
    ):
        """
        Args:
            timeout: timeout in seconds for HEAD/GET requests.
            delay_range: (low, high) bounds in seconds for the random pause
                before each outgoing request; a high of 0 disables pauses.
            use_markdown: kept for API compatibility; Docling output is
                exported as Markdown for either value.
            max_content_length_bytes: documents whose Content-Length exceeds
                this are skipped; 0/None disables the size guard.
        """
        self.timeout = timeout
        self.delay_range = delay_range
        self.use_markdown = use_markdown
        self.max_content_length_bytes = max_content_length_bytes

        self.session = requests.Session()
        self.session.headers.update({
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/120.0.0.0 Safari/537.36"
            ),
            "Accept": (
                "text/html,application/xhtml+xml,application/xml;"
                "q=0.9,image/avif,image/webp,*/*;q=0.8"
            ),
            "Accept-Language": "en-US,en;q=0.9",
            "Accept-Encoding": "gzip, deflate, br",
            "Connection": "keep-alive",
        })

        self.converter = DocumentConverter()

    def _delay(self):
        """Sleep for a random duration within delay_range (throttling)."""
        lo, hi = self.delay_range
        if hi > 0:
            time.sleep(random.uniform(lo, hi))

    def _scrape_html_basic(self, html: str) -> str:
        """Strip script/style/noscript tags and return the non-empty text lines."""
        soup = BeautifulSoup(html, "html.parser")
        for tag in soup(["script", "style", "noscript"]):
            tag.decompose()
        return "\n".join(
            line.strip()
            for line in soup.get_text("\n").splitlines()
            if line.strip()
        )

    def _exceeds_size_limit(self, content_length, url: str) -> bool:
        """
        Return True (and log) when a Content-Length header value is larger
        than max_content_length_bytes. A missing or non-numeric header never
        blocks the download.
        """
        if not self.max_content_length_bytes or content_length is None:
            return False

        try:
            size = int(content_length)
        except (TypeError, ValueError):
            return False

        if size > self.max_content_length_bytes:
            print(f"[SCRAPER] Skipping {url} (size {size} > {self.max_content_length_bytes} bytes).")
            return True
        return False

    def _head_too_large(self, url: str) -> bool:
        """
        Lightweight HEAD to check Content-Length before we download/convert.
        If the file is larger than max_content_length_bytes, we skip it.
        """
        if not self.max_content_length_bytes:
            return False

        try:
            self._delay()
            resp = self.session.head(url, timeout=self.timeout, allow_redirects=True)
        except requests.RequestException:
            # If HEAD fails, don't block; we'll let normal flow decide.
            return False

        return self._exceeds_size_limit(resp.headers.get("Content-Length"), url)

    def fetch(self, url: str, max_chars: int = 50000) -> str:
        """
        Return the page text for *url*, truncated to max_chars.

        Returns "" when the domain is blacklisted, the URL path ends in a
        skipped extension, the document is too large, the request fails, or
        the server answers with an HTTP error.
        """
        domain = get_domain(url) or ""
        if domain in SCRAPER_DOMAIN_BLACKLIST:
            # known problematic or unwanted domains
            return ""

        # For this project, we ignore non-HTML docs (PDF / Word / Excel / PPT).
        path = urlparse(url).path.lower()
        if path.endswith(self.SKIP_EXTENSIONS):
            return ""

        # Quick size check first (for large docs)
        if self._head_too_large(url):
            return ""

        # 1) Try Docling directly with URL (for HTML-like content)
        try:
            self._delay()
            result = self.converter.convert(url)
            # Markdown is fine for the LLM, so it is used regardless of
            # use_markdown.
            text = result.document.export_to_markdown()
            if text:
                return text[:max_chars]
        except Exception as e:
            # Any Docling failure is non-fatal: fall through to raw HTML.
            print(f"[SCRAPER] Docling failed for {url}: {e}")

        # 2) Fallback: raw HTML. stream=True defers the body download so the
        # size guard can inspect the headers first (HEAD may have failed or
        # omitted Content-Length).
        try:
            self._delay()
            resp = self.session.get(
                url, timeout=self.timeout, allow_redirects=True, stream=True
            )
        except requests.RequestException as e:
            print(f"[SCRAPER] Failed {url}: {e}")
            return ""

        # The context manager releases the connection on every exit path.
        with resp:
            if resp.status_code in (401, 403, 429):
                print(f"[SCRAPER] HTTP {resp.status_code} for {url}, skipping.")
                return ""

            try:
                resp.raise_for_status()
            except requests.HTTPError as e:
                print(f"[SCRAPER] HTTP error {resp.status_code} for {url}: {e}")
                return ""

            if self._exceeds_size_limit(resp.headers.get("Content-Length"), url):
                return ""

            try:
                # With stream=True the body is downloaded here, so network
                # errors can surface at this point as well.
                html = resp.text
            except requests.RequestException as e:
                print(f"[SCRAPER] Failed {url}: {e}")
                return ""

        return self._scrape_html_basic(html)[:max_chars]


# ---------------------------------------------------------------------------
# LLM Extractor (snippet-based)
# ---------------------------------------------------------------------------

class LLMExtractor:
    """
    Runs the OpenAI chat model on pre-extracted snippets and converts its
    JSON reply into a PageExtractionResult.

    The reply is treated as untrusted: malformed JSON, missing keys, null or
    non-numeric scores and wrongly typed fields degrade to empty/zero values
    instead of raising.
    """

    def __init__(self, model: str = "gpt-4o-mini", temperature: float = 0.0):
        self.client = OpenAI()
        self.model = model
        self.temperature = temperature

    def build_prompt_from_snippets(
        self,
        s7: Super7Input,
        url: str,
        snippets: List[Snippet],
    ) -> str:
        """Build the extraction prompt: instructions, Super7 hints, URL and snippets."""
        s7_json = json.dumps(s7.model_dump(), indent=2)
        snippet_block = snippets_to_prompt_block(snippets)

        return f"""
You are an information extraction assistant.

You are given:
- A target company Super7 input
- A URL
- A small set of text snippets extracted from that URL, pre-filtered for relevance

Your job is to extract entities using these exact entity_type values when applicable:

Super7-related:
- "company_name"
- "street_address"
- "city"
- "state"
- "country"
- "zip"
- "phone"

Identifier-related:
- "dot_number"
- "registration_id"
- "tax_id"
- "mc_number"

Other:
- "industry"
- "email"
- "website"
- "social_link"
- "director"
- "other"

For each entity:
- entity_type: one of the above strings
- value: string
- source_urls: array of URLs (MUST include "{url}" at minimum)
- confidence: 0.0 to 1.0

Also compute:
- match_score_name: 0.0 to 1.0 (how well the snippets match the company name)
- match_score_address: 0.0 to 1.0
- match_score_phone: 0.0 to 1.0
- looks_like_official_site: true/false (is this likely the official website / main profile?)
- overall_score: 0.0 to 1.0 (summary of how relevant this URL is to the company)
- reason: short explanation

Important:
- The snippets may contain other companies or entities; only extract entities that clearly belong to the target company.
- Be conservative with confidence and scores; if unsure, use lower values.

Return STRICT JSON ONLY in this shape (no extra commentary):

{{
  "url": "{url}",
  "entities": [
    {{
      "entity_type": "company_name" | "street_address" | "city" | "state" | "country" | "zip" | "phone" |
                      "dot_number" | "registration_id" | "tax_id" | "mc_number" |
                      "industry" | "email" | "website" | "social_link" | "director" | "other",
      "value": "<string>",
      "source_urls": ["<url1>", "<url2>", "..."],
      "confidence": <number between 0 and 1 or null>
    }}
  ],
  "match_score_name": <0..1>,
  "match_score_address": <0..1>,
  "match_score_phone": <0..1>,
  "looks_like_official_site": <true or false>,
  "overall_score": <0..1>,
  "reason": "<short explanation>"
}}

Super7 input (hints, may be null):
{s7_json}

URL: {url}

{snippet_block}
"""

    @staticmethod
    def _parse_json_object(raw: str) -> Dict[str, Any]:
        """
        Best-effort parse of the model reply into a dict.

        Tries the whole reply first, then the outermost "{...}" span (to get
        past code fences or commentary). Returns {} when no JSON object can
        be recovered.
        """
        attempts = [raw]
        start = raw.find("{")
        end = raw.rfind("}")
        if start != -1 and end > start:
            attempts.append(raw[start:end + 1])

        for attempt in attempts:
            try:
                data = json.loads(attempt)
            except (TypeError, ValueError):  # JSONDecodeError is a ValueError
                continue
            if isinstance(data, dict):
                return data
        return {}

    @staticmethod
    def _unit_score(value: Any) -> float:
        """Coerce a reported score to a float in [0, 1]; unusable values give 0.0."""
        try:
            score = float(value)
        except (TypeError, ValueError):
            return 0.0
        if score != score:  # NaN
            return 0.0
        return max(0.0, min(score, 1.0))

    @staticmethod
    def _as_bool(value: Any) -> bool:
        """Interpret a JSON-ish truth value; the string "false" is False."""
        if isinstance(value, str):
            return value.strip().lower() in {"true", "yes", "1"}
        return bool(value)

    def extract_from_snippets(
        self,
        s7: Super7Input,
        url: str,
        snippets: List[Snippet],
    ) -> PageExtractionResult:
        """
        Ask the model to extract entities and page scores from the snippets.

        Returns an empty PageExtractionResult for *url* when there are no
        snippets (no API call is made) or when the reply cannot be parsed.
        Entity confidences and page scores are clamped to [0, 1], and the
        page URL is always present in each entity's source_urls.

        Errors raised by the OpenAI client itself are not caught here.
        """
        if not snippets:
            return PageExtractionResult(url=url)

        prompt = self.build_prompt_from_snippets(s7, url, snippets)
        resp = self.client.chat.completions.create(
            model=self.model,
            messages=[{"role": "user", "content": prompt}],
            temperature=self.temperature,
        )
        raw = (resp.choices[0].message.content or "") if resp.choices else ""
        data = self._parse_json_object(raw)

        url_out = str(data.get("url") or url)

        raw_entities = data.get("entities")
        if not isinstance(raw_entities, list):
            raw_entities = []

        ents: List[ExtractedEntity] = []
        for e in raw_entities:
            if not isinstance(e, dict):
                continue

            raw_type = e.get("entity_type")
            entity_type = "other" if raw_type is None else (str(raw_type) or "other")

            raw_value = e.get("value")
            value = "" if raw_value is None else str(raw_value)

            raw_srcs = e.get("source_urls") or []
            if isinstance(raw_srcs, str):
                # A bare string would otherwise be iterated character by character.
                raw_srcs = [raw_srcs]
            elif not isinstance(raw_srcs, (list, tuple)):
                raw_srcs = []
            srcs = [str(s) for s in raw_srcs if s]
            if url_out not in srcs:
                srcs.append(url_out)

            ents.append(
                ExtractedEntity(
                    entity_type=entity_type,
                    value=value,
                    source_urls=srcs,
                    confidence=self._unit_score(e.get("confidence")),
                )
            )

        return PageExtractionResult(
            url=url_out,
            entities=ents,
            match_score_name=self._unit_score(data.get("match_score_name")),
            match_score_address=self._unit_score(data.get("match_score_address")),
            match_score_phone=self._unit_score(data.get("match_score_phone")),
            looks_like_official_site=self._as_bool(data.get("looks_like_official_site")),
            overall_score=self._unit_score(data.get("overall_score")),
            reason=str(data.get("reason") or ""),
        )


# ---------------------------------------------------------------------------
# Simple scoring + same-company guard
# ---------------------------------------------------------------------------

def score_field_candidate(
    field: str,
    s7: Super7Input,
    ent: ExtractedEntity,
    page: PageExtractionResult,
) -> float:
    """
    Simple scoring for a candidate entity (raw score, not normalized).

    raw_score =
        entity confidence (0.0 if missing)
      + 0.5 * page.overall_score
      + 0.2 if the page looks like the official site
      + hint bonus: 0.3 when the value equals the Super7 hint for this field,
        0.15 when one contains the other, otherwise 0

    Hint comparison is case-insensitive and ignores surrounding whitespace.
    For the "phone" field only the digits are compared, so differently
    formatted numbers can still match. No bonus is given when either side is
    empty after normalization (an empty string is a substring of everything).
    """
    conf = ent.confidence if isinstance(ent.confidence, (int, float)) else 0.0
    score = conf + 0.5 * page.overall_score
    if page.looks_like_official_site:
        score += 0.2

    hint = getattr(s7, field, None)
    if hint:
        h = str(hint).lower().strip()
        v = (ent.value or "").lower().strip()
        if field == "phone":
            # Compare phone numbers by digits only ("(570) 555-1234" vs
            # "570-555-1234").
            h = "".join(ch for ch in h if ch.isdigit())
            v = "".join(ch for ch in v if ch.isdigit())

        if h and v:
            if v == h:
                score += 0.3
            elif h in v or v in h:
                score += 0.15

    return score


def is_page_same_company(
    s7: Super7Input,
    page: PageExtractionResult,
    min_sim: float = 0.6,
) -> bool:
    """
    Minimal same-company guard for a page.

    The page passes when at least one extracted company_name entity has a
    name similarity of min_sim or more to the target company name. Pages
    are also allowed when there is no target name or when no company_name
    entity was extracted (nothing to decide on).
    """
    target = s7.company_name
    if not target:
        return True

    extracted_names = [
        ent.value
        for ent in page.entities
        if ent.entity_type == "company_name" and ent.value
    ]
    # No explicit company_name extracted: don't block the page.
    if not extracted_names:
        return True

    return any(
        jaccard_name_similarity(target, candidate) >= min_sim
        for candidate in extracted_names
    )


def summarize_super7_simple(
    s7: Super7Input,
    candidates: List[CandidateRecord],
) -> Dict[str, Optional[Dict[str, Any]]]:
    """
    Collect all entities from all pages and pick the best one per Super7 field.

    For non-company_name fields, the page must be the "same company"
    according to is_page_same_company(). Entities whose sources are all in
    SUMMARY_DOMAIN_EXCLUDE are ignored.

    Returns a dict with one key per Super7 field. The value is None when no
    candidate reached the minimum raw score, otherwise a dict with:
      - "value": the chosen entity value
      - "source": the first source URL that is NOT on an excluded domain
      - "confidence": the raw score normalized into [0, 1]
      - "all_sources": every distinct source URL of the entity, in the order
        the entity lists them (may include excluded domains, for provenance)
    """
    summary: Dict[str, Optional[Dict[str, Any]]] = {}

    # max possible raw_score ~ 2.0 (conf 1 + 0.5*1 + 0.2 + 0.3)
    RAW_SCORE_MAX = 2.0
    # Below this raw score a candidate is not considered good enough.
    MIN_RAW_SCORE = 0.3

    # The same-company verdict depends only on the page, not on the field,
    # so compute it once per candidate.
    same_company_flags = [is_page_same_company(s7, rec.extraction) for rec in candidates]

    for field in SUPER7_FIELDS:
        best_ent = None
        best_page = None
        best_allowed: List[str] = []
        best_raw_score = -1.0

        for rec, is_same_company in zip(candidates, same_company_flags):
            page = rec.extraction

            # For non-name fields, enforce same-company guard
            if field != "company_name" and not is_same_company:
                continue

            for ent in page.entities:
                if ent.entity_type != field or not ent.value:
                    continue

                # Skip if ALL sources are excluded domains. Sources whose
                # domain cannot be determined are allowed.
                allowed_sources = [
                    src for src in ent.source_urls
                    if get_domain(src) not in SUMMARY_DOMAIN_EXCLUDE
                ]
                if not allowed_sources:
                    continue

                raw_score = score_field_candidate(field, s7, ent, page)
                if raw_score > best_raw_score:
                    best_raw_score = raw_score
                    best_ent = ent
                    best_page = page
                    best_allowed = allowed_sources

        # if no good candidate, set None
        if not best_ent or best_raw_score < MIN_RAW_SCORE:
            summary[field] = None
            continue

        # normalize raw_score into [0,1] for exposed confidence
        norm_conf = max(0.0, min(best_raw_score / RAW_SCORE_MAX, 1.0))

        # dict.fromkeys de-duplicates while keeping the original order, so
        # the reported sources are deterministic.
        all_sources = list(dict.fromkeys(s for s in best_ent.source_urls if s))
        allowed_unique = list(dict.fromkeys(s for s in best_allowed if s))
        # The primary source must come from a non-excluded domain.
        if allowed_unique:
            primary_source = allowed_unique[0]
        else:
            primary_source = best_page.url if best_page else ""

        summary[field] = {
            "value": best_ent.value,
            "source": primary_source,
            "confidence": norm_conf,
            "all_sources": all_sources,
        }

    return summary


# ---------------------------------------------------------------------------
# Resolver
# ---------------------------------------------------------------------------

class Super7Resolver:
    """
    Orchestrates the per-company pipeline: search -> filter -> scrape ->
    snippet extraction -> LLM extraction -> Super7 summary.
    """

    def __init__(
        self,
        search: WebSearchTool,
        scraper: ScraperTool,
        extractor: LLMExtractor,
    ):
        self.search = search
        self.scraper = scraper
        self.extractor = extractor

    def build_queries(self, s7: Super7Input) -> List[str]:
        """
        Build the search queries for a company: two queries on the name plus
        any city/state/country hints, the exact quoted name, and (when a
        phone hint exists) the quoted name together with the quoted phone.
        """
        name = s7.company_name.strip()
        location_hints = [hint for hint in (s7.city, s7.state, s7.country) if hint]
        base = " ".join([name] + location_hints)

        queries = [
            f"{base} official website",
            f"{base} company",
            f"\"{name}\"",
        ]
        if s7.phone:
            queries.append(f"\"{name}\" \"{s7.phone}\"")
        return queries

    def _candidate_from_result(
        self,
        s7: Super7Input,
        meta: Dict[str, Any],
    ) -> Optional[CandidateRecord]:
        """
        Turn one search result into a CandidateRecord, or return None when
        the result is filtered out at any stage.
        """
        url = meta["url"]

        # For this use-case, we ignore news outlets entirely.
        if (get_domain(url) or "") in NEWS_DOMAIN_BLACKLIST:
            return None

        title = meta.get("title") or ""
        snippet_text = meta.get("content") or ""

        # 1) Search-level filter
        if not should_consider_search_result(s7.company_name, title, snippet_text):
            return None

        # 2) Scrape / convert; fall back to the search snippet if the page
        #    produced no text.
        full_text = self.scraper.fetch(url)
        if not full_text.strip():
            if not snippet_text:
                return None
            full_text = snippet_text

        # 3) Doc-level filter
        if not doc_mentions_company(s7.company_name, full_text):
            return None

        # 4) Snippet extraction
        snippets = extract_snippets_for_company(full_text, s7.company_name)
        if not snippets:
            return None

        # 5) LLM extraction on snippets
        return CandidateRecord(
            url=url,
            source_type=meta.get("source_type", "web_search"),
            extraction=self.extractor.extract_from_snippets(s7, url, snippets),
        )

    def process_company(self, s7: Super7Input) -> Dict[str, Any]:
        """
        Resolve one company.

        Returns a dict with the echoed input, the URL with the highest
        overall_score ("primary_url", None if no page scored above 0) and
        that score, a brief list of all candidates, and the per-field
        Super7 summary.
        """
        candidate_records: List[CandidateRecord] = []
        primary_url = None
        primary_conf = 0.0

        for meta in self.search.search(self.build_queries(s7)):
            record = self._candidate_from_result(s7, meta)
            if record is None:
                continue
            candidate_records.append(record)

            # Track the best-scoring page; ties keep the earlier result.
            page_score = record.extraction.overall_score
            if page_score > primary_conf:
                primary_conf = page_score
                primary_url = record.url

        candidate_overview = []
        for rec in candidate_records:
            candidate_overview.append({
                "url": rec.url,
                "overall_score": rec.extraction.overall_score,
                "reason": rec.extraction.reason,
            })

        return {
            "input": s7.model_dump(),
            "primary_url": primary_url,
            "primary_confidence": primary_conf,
            "candidates": candidate_overview,
            "super7_summary": summarize_super7_simple(s7, candidate_records),
        }


# ---------------------------------------------------------------------------
# Batch API
# ---------------------------------------------------------------------------

def resolve_super7_batch(super7_payloads: List[Dict[str, Any]]) -> Dict[str, Any]:
    """
    Resolve a batch of companies with one shared search/scrape/extract stack.

    super7_payloads: list of dicts with at least "company_name".
    Returns: {"results": [ ... per-company dict ... ]} in input order.
    """
    resolver = Super7Resolver(
        WebSearchTool(max_results=5),
        ScraperTool(
            timeout=10,
            delay_range=(1.5, 4.0),
            max_content_length_bytes=8_000_000,
        ),
        LLMExtractor(model="gpt-4o-mini", temperature=0.0),
    )

    # Each payload is validated into a Super7Input right before it is processed.
    return {
        "results": [
            resolver.process_company(Super7Input(**payload))
            for payload in super7_payloads
        ]
    }


# ---------------------------------------------------------------------------
# Manual test
# ---------------------------------------------------------------------------

if __name__ == "__main__":
    # Sample company names for a manual end-to-end run of the pipeline.
    sample_company_names = (
        "r&k firesupport llc",
        "Home Fit solutions LLC",
        "David's Tireshop",
        "Closhare LLc",
        "Nexapoint Holding",
        "Making you happy logistics llc",
        "butler & associates construction,inc",
        "focus wound care centre",
    )
    batch_input = [{"company_name": company} for company in sample_company_names]
    res = resolve_super7_batch(batch_input)
    print(json.dumps(res, indent=2))
