In [None]:
# """
# super7_docling_snippet_resolver.py

# Pipeline:
# - Input: list of Super7Input dicts (company_name required; others optional)
# - For each company:
#     1) Tavily search to get candidate URLs (+ title + snippet)
#     2) Search-level filter (title/snippet similarity vs company name)
#     3) ScraperTool (Docling-first) to get text from HTML-like pages
#        (now explicitly skips PDFs/Word/Excel/PPT by extension)
#     4) Doc-level filter (does text even mention the company?)
#     5) Snippet extraction:
#          - windows around company name
#          - regex-based address/phone/zip candidates
#     6) LLMExtractor runs on snippets (not full doc)
#     7) Super7 summarizer:
#          - same-company guard
#          - light scoring
#          - per-field best value + provenance

# Additional rules:
# - News domains (e.g. thetimes-tribune.com) are blacklisted as sources
#   and are not scraped or used as primary URLs.
# - Social media + DNB + news domains are never used as Super7 value sources.

# Scraping ethics:
# - Realistic browser User-Agent
# - requests.Session() to reuse cookies
# - small random delays between requests
# - domain blacklist for clearly hostile/irrelevant sites
# - size guard for huge documents
# - you are responsible for only scraping sites whose ToS/robots.txt allow it
# """

# import os
# import json
# import time
# import random
# import string
# import re
# from typing import List, Optional, Dict, Any, Tuple
# from dataclasses import dataclass
# from urllib.parse import urlparse

# import logging
# import requests
# from bs4 import BeautifulSoup
# from dotenv import load_dotenv
# from pydantic import BaseModel, Field
# from openai import OpenAI
# from langchain_tavily import TavilySearch
# from docling.document_converter import DocumentConverter

# # ---------------------------------------------------------------------------
# # Setup
# # ---------------------------------------------------------------------------

# load_dotenv()
# logging.getLogger("httpx").setLevel(logging.WARNING)

# if not os.getenv("OPENAI_API_KEY"):
#     raise RuntimeError("Please set OPENAI_API_KEY in your environment.")

# if not os.getenv("TAVILY_API_KEY"):
#     print("[WARN] TAVILY_API_KEY is not set. Tavily search will fail.")

# SUPER7_FIELDS = [
#     "company_name",
#     "street_address",
#     "city",
#     "state",
#     "country",
#     "zip",
#     "phone",
# ]

# # Basic corp suffixes for name normalization
# CORP_SUFFIXES = {
#     "llc", "l.l.c", "inc", "inc.", "corp", "corp.", "corporation",
#     "company", "co", "co.", "ltd", "ltd.", "limited", "plc", "s.a.",
#     "gmbh", "oy", "ab", "bv", "srl", "sas", "spa", "holdings", "holding",
# }

# # News outlets we want to ignore for this use-case
# NEWS_DOMAIN_BLACKLIST = {
#     "thetimes-tribune.com",
#     "www.thetimes-tribune.com",
#     # add more if they show up as bad sources
# }

# # Domains we DO NOT want to use for Super7 fields (social + DNB + news)
# SUMMARY_DOMAIN_EXCLUDE = {
#     "facebook.com",
#     "www.facebook.com",
#     "instagram.com",
#     "www.instagram.com",
#     "twitter.com",
#     "www.twitter.com",
#     "x.com",
#     "www.x.com",
#     "linkedin.com",
#     "www.linkedin.com",
#     "tiktok.com",
#     "www.tiktok.com",
#     "dnb.com",
#     "www.dnb.com",
# }
# SUMMARY_DOMAIN_EXCLUDE |= NEWS_DOMAIN_BLACKLIST

# # Domains we know are hostile / blocked / not worth scraping directly
# SCRAPER_DOMAIN_BLACKLIST = {
#     "firesupport.uk",
#     "www.firesupport.uk",
#     "search.sunbiz.org",
#     "bubba.ai",
#     "govtribe.com",
#     "www.govtribe.com",
#     "brokersnapshot.com",
#     "www.brokersnapshot.com",
#     "dnb.com",
#     "www.dnb.com",
#     "b2bhint.com",
#     "www.b2bhint.com",
#     "yelp.com",
#     "www.yelp.com",
#     "davids-tire-shop-service.wheree.com",
#     "wheree.com",

#     # noisy big-PDF domains from your logs
#     "luke.af.mil",
#     "www.luke.af.mil",
#     "nrc.gov",
#     "www.nrc.gov",
# }
# SCRAPER_DOMAIN_BLACKLIST |= NEWS_DOMAIN_BLACKLIST


# # ---------------------------------------------------------------------------
# # Helpers
# # ---------------------------------------------------------------------------

# def get_domain(url: str) -> Optional[str]:
#     if not url:
#         return None
#     try:
#         netloc = urlparse(url).netloc.lower()
#         if netloc.startswith("www."):
#             netloc = netloc[4:]
#         return netloc or None
#     except Exception:
#         return None


# def normalize_company_name(name: str) -> str:
#     """
#     Normalize company name for similarity:
#       - lowercase
#       - & -> and
#       - remove punctuation
#       - drop typical corp suffixes
#     """
#     if not name:
#         return ""

#     s = name.lower()
#     s = s.replace("&", " and ")

#     trans_table = str.maketrans("", "", string.punctuation.replace("&", ""))
#     s = s.translate(trans_table)

#     tokens = s.split()
#     cleaned = [t for t in tokens if t not in CORP_SUFFIXES]
#     return " ".join(cleaned).strip()


# def jaccard_name_similarity(a: str, b: str) -> float:
#     """
#     Token Jaccard similarity between normalized names.
#     """
#     na = normalize_company_name(a)
#     nb = normalize_company_name(b)
#     if not na or not nb:
#         return 0.0

#     set_a = set(na.split())
#     set_b = set(nb.split())
#     if not set_a or not set_b:
#         return 0.0

#     inter = len(set_a & set_b)
#     union = len(set_a | set_b)
#     return inter / union if union > 0 else 0.0


# def should_consider_search_result(
#     company_name: str,
#     title: str,
#     snippet: str,
#     min_sim: float = 0.2,
# ) -> bool:
#     """
#     Cheap pre-filter: decide whether a Tavily result is even worth scraping.
#     - If title is somewhat similar OR snippet mentions the company name, keep.
#     - Otherwise, skip.
#     """
#     if not title and not snippet:
#         return True  # be permissive if we know nothing

#     sim = jaccard_name_similarity(company_name, title or "")
#     if sim >= min_sim:
#         return True

#     if company_name and snippet:
#         if company_name.lower() in snippet.lower():
#             return True

#     return False


# def doc_mentions_company(s7_name: str, text: str, min_occurrences: int = 1) -> bool:
#     """
#     Doc-level filter: does the text even look like it's about this company?

#     - Check if raw company_name (lowercased) appears.
#     - If not, check main token of normalized name.
#     """
#     if not s7_name or not text:
#         return False

#     text_lower = text.lower()
#     if s7_name.lower() in text_lower:
#         return True

#     norm = normalize_company_name(s7_name)
#     tokens = norm.split()
#     if not tokens:
#         return False

#     main_token = tokens[0]
#     if not main_token:
#         return False

#     return text_lower.count(main_token) >= min_occurrences


# # ---------------------------------------------------------------------------
# # Snippet extraction
# # ---------------------------------------------------------------------------

# @dataclass
# class Snippet:
#     snippet_id: int
#     snippet_type: str  # "name_context" | "address_candidate" | "phone_candidate" | "zip_candidate" | "generic"
#     text: str


# def extract_snippets_for_company(
#     full_text: str,
#     company_name: str,
#     max_snippets: int = 25,
#     window_chars: int = 400,
# ) -> List[Snippet]:
#     """
#     Extract a small set of high-signal snippets from the full text:
#     - windows around company-name mentions
#     - regex candidates for phone / address / zip
#     """
#     snippets: List[Snippet] = []
#     used_spans: List[Tuple[int, int]] = []

#     text = full_text or ""
#     if not text.strip():
#         return snippets

#     lower_text = text.lower()
#     norm_name = normalize_company_name(company_name)
#     raw_name = company_name.lower()
#     name_variants = set()
#     if norm_name:
#         name_variants.add(norm_name)
#     if raw_name:
#         name_variants.add(raw_name)
#     if "&" in raw_name:
#         name_variants.add(raw_name.replace("&", "and"))

#     def add_snippet(start: int, end: int, snippet_type: str):
#         nonlocal snippets, used_spans
#         # de-duplicate overlapping spans
#         for s, e in used_spans:
#             if not (end <= s or start >= e):
#                 return
#         chunk = text[start:end].strip()
#         if not chunk:
#             return
#         snippet_id = len(snippets) + 1
#         snippets.append(Snippet(snippet_id=snippet_id, snippet_type=snippet_type, text=chunk))
#         used_spans.append((start, end))

#     # --- 1) Name-anchored snippets ---
#     for variant in name_variants:
#         if not variant:
#             continue
#         idx = 0
#         while True:
#             idx = lower_text.find(variant, idx)
#             if idx == -1:
#                 break
#             start = max(0, idx - window_chars)
#             end = min(len(text), idx + len(variant) + window_chars)
#             add_snippet(start, end, "name_context")
#             idx = idx + len(variant)
#             if len(snippets) >= max_snippets:
#                 return snippets

#     # --- 2) Regex-based phone candidates ---
#     phone_pattern = re.compile(r"\+?\d[\d\-\s\(\)]{7,}")
#     for m in phone_pattern.finditer(text):
#         start = max(0, m.start() - 80)
#         end = min(len(text), m.end() + 80)
#         add_snippet(start, end, "phone_candidate")
#         if len(snippets) >= max_snippets:
#             return snippets

#     # --- 3) Regex-based zip candidates (US-style, approximate) ---
#     zip_pattern = re.compile(r"\b\d{5}(?:-\d{4})?\b")
#     for m in zip_pattern.finditer(text):
#         start = max(0, m.start() - 80)
#         end = min(len(text), m.end() + 80)
#         add_snippet(start, end, "zip_candidate")
#         if len(snippets) >= max_snippets:
#             return snippets

#     # --- 4) Address-ish lines fallback (only if still few snippets) ---
#     if len(snippets) < max_snippets:
#         lines = text.splitlines()
#         address_keywords = [
#             "street", "st.", "st ", "road", "rd.", "rd ",
#             "avenue", "ave.", "ave ", "boulevard", "blvd",
#             "lane", "ln.", "ln ", "drive", "dr.", "dr ",
#         ]
#         for line in lines:
#             l = line.lower()
#             if any(kw in l for kw in address_keywords) and any(ch.isdigit() for ch in l):
#                 chunk = line.strip()
#                 if not chunk:
#                     continue
#                 snippet_id = len(snippets) + 1
#                 snippets.append(Snippet(snippet_id=snippet_id, snippet_type="address_candidate", text=chunk))
#                 if len(snippets) >= max_snippets:
#                     break

#     # If we somehow got nothing, add a generic first N chars as a last resort
#     if not snippets:
#         chunk = text[:800].strip()
#         if chunk:
#             snippets.append(Snippet(snippet_id=1, snippet_type="generic", text=chunk))

#     return snippets


# def snippets_to_prompt_block(snippets: List[Snippet]) -> str:
#     """
#     Convert snippets into a textual block for the LLM prompt.
#     """
#     if not snippets:
#         return "No snippets were extracted; the document text was empty or uninformative."

#     lines = ["Here are the extracted snippets (pre-filtered for likely relevance):"]
#     for sn in snippets:
#         lines.append(f"[SNIPPET {sn.snippet_id}] type={sn.snippet_type}")
#         lines.append(sn.text)
#         lines.append("")  # blank line between snippets
#     return "\n".join(lines)


# # ---------------------------------------------------------------------------
# # Models
# # ---------------------------------------------------------------------------

# class Super7Input(BaseModel):
#     company_name: str
#     country: Optional[str] = None
#     state: Optional[str] = None
#     city: Optional[str] = None
#     street_address: Optional[str] = None
#     zip: Optional[str] = None
#     phone: Optional[str] = None


# class ExtractedEntity(BaseModel):
#     entity_type: str
#     value: str
#     source_urls: List[str] = Field(default_factory=list)
#     confidence: Optional[float] = None  # should be in [0,1]


# class PageExtractionResult(BaseModel):
#     url: str
#     entities: List[ExtractedEntity] = Field(default_factory=list)
#     match_score_name: float = 0.0
#     match_score_address: float = 0.0
#     match_score_phone: float = 0.0
#     looks_like_official_site: bool = False
#     overall_score: float = 0.0
#     reason: str = ""


# @dataclass
# class CandidateRecord:
#     url: str
#     source_type: str
#     extraction: PageExtractionResult


# # ---------------------------------------------------------------------------
# # Web search
# # ---------------------------------------------------------------------------

# class WebSearchTool:
#     def __init__(self, max_results: int = 5):
#         key = os.getenv("TAVILY_API_KEY")
#         if not key:
#             raise RuntimeError("Missing TAVILY_API_KEY.")
#         self.tool = TavilySearch(max_results=max_results, tavily_api_key=key)

#     def search(self, queries: List[str]) -> List[Dict[str, Any]]:
#         seen: Dict[str, Dict[str, Any]] = {}
#         for q in queries:
#             res = self.tool.invoke({"query": q})
#             for r in res.get("results", []):
#                 url = r.get("url")
#                 if not url:
#                     continue
#                 if url not in seen:
#                     seen[url] = {
#                         "url": url,
#                         "title": r.get("title", ""),
#                         "source_type": "web_search",
#                         "content": r.get("content", ""),
#                     }
#         return list(seen.values())


# # ---------------------------------------------------------------------------
# # ScraperTool using Docling + polite crawling + size guard
# # ---------------------------------------------------------------------------

# class ScraperTool:
#     """
#     Scraper that prefers Docling for rich formats (but now explicitly
#     skips PDFs/Word/Excel/PPT by extension), and falls back to HTML +
#     BeautifulSoup if Docling fails.

#     - Realistic browser-like User-Agent
#     - requests.Session() to persist cookies
#     - small random delays between requests (throttling)
#     - domain blacklist
#     - size guard for large documents

#     Returns plain text/markdown truncated to max_chars.
#     """

#     SKIP_EXTENSIONS = (
#         ".pdf",
#         ".doc",
#         ".docx",
#         ".xls",
#         ".xlsx",
#         ".ppt",
#         ".pptx",
#     )

#     def __init__(
#         self,
#         timeout: int = 10,
#         delay_range: tuple = (1.0, 3.0),
#         use_markdown: bool = True,
#         max_content_length_bytes: int = 8_000_000,  # ~8 MB limit
#     ):
#         self.timeout = timeout
#         self.delay_range = delay_range
#         self.use_markdown = use_markdown
#         self.max_content_length_bytes = max_content_length_bytes

#         self.session = requests.Session()
#         self.session.headers.update({
#             "User-Agent": (
#                 "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
#                 "AppleWebKit/537.36 (KHTML, like Gecko) "
#                 "Chrome/120.0.0.0 Safari/537.36"
#             ),
#             "Accept": (
#                 "text/html,application/xhtml+xml,application/xml;"
#                 "q=0.9,image/avif,image/webp,*/*;q=0.8"
#             ),
#             "Accept-Language": "en-US,en;q=0.9",
#             "Accept-Encoding": "gzip, deflate, br",
#             "Connection": "keep-alive",
#         })

#         self.converter = DocumentConverter()

#     def _delay(self):
#         lo, hi = self.delay_range
#         if hi > 0:
#             time.sleep(random.uniform(lo, hi))

#     def _scrape_html_basic(self, html: str) -> str:
#         soup = BeautifulSoup(html, "html.parser")
#         for tag in soup(["script", "style", "noscript"]):
#             tag.decompose()
#         return "\n".join(
#             line.strip()
#             for line in soup.get_text("\n").splitlines()
#             if line.strip()
#         )

#     def _head_too_large(self, url: str) -> bool:
#         """
#         Lightweight HEAD to check Content-Length before we download/convert.
#         If the file is larger than max_content_length_bytes, we skip it.
#         """
#         if not self.max_content_length_bytes:
#             return False

#         try:
#             self._delay()
#             resp = self.session.head(url, timeout=self.timeout, allow_redirects=True)
#         except requests.RequestException:
#             # If HEAD fails, don't block; we'll let normal flow decide.
#             return False

#         cl = resp.headers.get("Content-Length")
#         if cl is None:
#             return False

#         try:
#             size = int(cl)
#         except ValueError:
#             return False

#         if size > self.max_content_length_bytes:
#             print(f"[SCRAPER] Skipping {url} (size {size} > {self.max_content_length_bytes} bytes).")
#             return True
#         return False

#     def fetch(self, url: str, max_chars: int = 50000) -> str:
#         domain = get_domain(url) or ""
#         if domain in SCRAPER_DOMAIN_BLACKLIST:
#             # known problematic or unwanted domains
#             return ""

#         # Skip PDFs / Word / Excel / PPT by extension
#         path = urlparse(url).path.lower()
#         if any(path.endswith(ext) for ext in self.SKIP_EXTENSIONS):
#             # For this project, we ignore non-HTML docs
#             return ""

#         # Quick size check first (for large docs)
#         if self._head_too_large(url):
#             return ""

#         # 1) Try Docling directly with URL (for HTML-like content)
#         try:
#             self._delay()
#             result = self.converter.convert(url)
#             doc = result.document

#             if self.use_markdown:
#                 text = doc.export_to_markdown()
#             else:
#                 text = doc.export_to_markdown()  # markdown is fine for LLM

#             if text:
#                 if len(text) > max_chars:
#                     text = text[:max_chars]
#                 return text

#         except Exception as e:
#             print(f"[SCRAPER] Docling failed for {url}: {e}")

#         # 2) Fallback: raw HTML
#         try:
#             self._delay()
#             resp = self.session.get(url, timeout=self.timeout, allow_redirects=True)
#         except requests.RequestException as e:
#             print(f"[SCRAPER] Failed {url}: {e}")
#             return ""

#         if resp.status_code in (401, 403, 429):
#             print(f"[SCRAPER] HTTP {resp.status_code} for {url}, skipping.")
#             return ""

#         try:
#             resp.raise_for_status()
#         except requests.HTTPError as e:
#             print(f"[SCRAPER] HTTP error {resp.status_code} for {url}: {e}")
#             return ""

#         text = self._scrape_html_basic(resp.text)
#         if len(text) > max_chars:
#             text = text[:max_chars]
#         return text


# # ---------------------------------------------------------------------------
# # LLM Extractor (snippet-based)
# # ---------------------------------------------------------------------------

# class LLMExtractor:
#     """
#     Handles calls to ChatGPT to:
#     - Extract entities from snippets
#     - Compute match scores vs Super7Input
#     """

#     def __init__(self, model_name: str = "gpt-4o-mini", temperature: float = 0.0):
#         self.llm = ChatOpenAI(
#             model=model_name,
#             temperature=temperature,
#         )

#     @staticmethod
#     def _safe_float(value, default: float = 0.0) -> float:
#         """Convert value to float, handling None and bad types gracefully."""
#         if value is None:
#             return default
#         try:
#             return float(value)
#         except (TypeError, ValueError):
#             return default

#     def build_prompt_from_snippets(
#         self,
#         s7: Super7Input,
#         url: str,
#         snippets: List[Snippet],
#     ) -> str:
#         s7_json = json.dumps(s7.model_dump(), indent=2)
#         snippet_block = snippets_to_prompt_block(snippets)

#         return f"""
# You are an information extraction assistant.

# You are given:
# - A target company Super7 input
# - A URL
# - A small set of text snippets extracted from that URL, pre-filtered for relevance

# Your job is to extract entities using these exact entity_type values when applicable:

# Super7-related:
# - "company_name"
# - "street_address"
# - "city"
# - "state"
# - "country"
# - "zip"
# - "phone"

# Identifier-related:
# - "dot_number"
# - "registration_id"
# - "tax_id"
# - "mc_number"

# Other:
# - "industry"
# - "email"
# - "website"
# - "social_link"
# - "director"
# - "other"

# For each entity:
# - entity_type: one of the above strings
# - value: string
# - source_urls: array of URLs (MUST include "{url}" at minimum)
# - confidence: 0.0 to 1.0

# Also compute:
# - match_score_name: 0.0 to 1.0 (how well the snippets match the company name)
# - match_score_address: 0.0 to 1.0
# - match_score_phone: 0.0 to 1.0
# - looks_like_official_site: true/false (is this likely the official website / main profile?)
# - overall_score: 0.0 to 1.0 (summary of how relevant this URL is to the company)
# - reason: short explanation

# Important:
# - The snippets may contain other companies or entities; only extract entities that clearly belong to the target company.
# - Be conservative with confidence and scores; if unsure, use lower values.

# Return STRICT JSON ONLY in this shape (no extra commentary):

# {{
#   "url": "{url}",
#   "entities": [
#     {{
#       "entity_type": "company_name" | "street_address" | "city" | "state" | "country" | "zip" | "phone" |
#                       "dot_number" | "registration_id" | "tax_id" | "mc_number" |
#                       "industry" | "email" | "website" | "social_link" | "director" | "other",
#       "value": "<string>",
#       "source_urls": ["<url1>", "<url2>", "..."],
#       "confidence": <number between 0 and 1 or null>
#     }}
#   ],
#   "match_score_name": <0..1>,
#   "match_score_address": <0..1>,
#   "match_score_phone": <0..1>,
#   "looks_like_official_site": <true or false>,
#   "overall_score": <0..1>,
#   "reason": "<short explanation>"
# }}

# Super7 input (hints, may be null):
# {s7_json}

# URL: {url}

# {snippet_block}
# """

#     def extract_from_snippets(
#         self,
#         s7: Super7Input,
#         url: str,
#         snippets: List[Snippet],
#     ) -> PageExtractionResult:
#         if not snippets:
#             return PageExtractionResult(url=url)

#         prompt = self.build_prompt_from_snippets(s7, url, snippets)
#         resp = self.client.chat.completions.create(
#             model=self.model,
#             messages=[{"role": "user", "content": prompt}],
#             temperature=self.temperature,
#         )
#         raw = resp.choices[0].message.content or ""

#         try:
#             data = json.loads(raw)
#         except Exception:
#             start = raw.find("{")
#             end = raw.rfind("}")
#             if start != -1 and end != -1 and end > start:
#                 try:
#                     data = json.loads(raw[start:end + 1])
#                 except Exception:
#                     data = {}
#             else:
#                 data = {}

#         url_out = str(data.get("url") or url)
#         ents: List[ExtractedEntity] = []

#         for e in data.get("entities", []):
#             raw_type = e.get("entity_type")
#             entity_type = "other" if raw_type is None else (str(raw_type) or "other")

#             raw_value = e.get("value")
#             value = "" if raw_value is None else str(raw_value)

#             raw_srcs = e.get("source_urls") or []
#             srcs = [str(s) for s in raw_srcs if s]
#             if url_out not in srcs:
#                 srcs.append(url_out)

#             raw_conf = e.get("confidence")
#             if isinstance(raw_conf, (int, float)):
#                 confidence = max(0.0, min(float(raw_conf), 1.0))
#             else:
#                 confidence = 0.0

#             ents.append(
#                 ExtractedEntity(
#                     entity_type=entity_type,
#                     value=value,
#                     source_urls=srcs,
#                     confidence=confidence,
#                 )
#             )

#         return PageExtractionResult(
#             url=url_out,
#             entities=ents,
#             match_score_name=float(data.get("match_score_name", 0.0)),
#             match_score_address=float(data.get("match_score_address", 0.0)),
#             match_score_phone=float(data.get("match_score_phone", 0.0)),
#             looks_like_official_site=bool(data.get("looks_like_official_site", False)),
#             overall_score=float(data.get("overall_score", 0.0)),
#             reason=str(data.get("reason", "")),
#         )


# # ---------------------------------------------------------------------------
# # Simple scoring + same-company guard
# # ---------------------------------------------------------------------------

# def score_field_candidate(
#     field: str,
#     s7: Super7Input,
#     ent: ExtractedEntity,
#     page: PageExtractionResult,
# ) -> float:
#     """
#     Simple scoring for a candidate entity (raw score, not normalized).

#     raw_score =
#         ent_confidence
#       + 0.5 * page_overall_score
#       + 0.2 if looks_like_official_site
#       + bonus if matches Super7 hint
#     """
#     conf = ent.confidence if isinstance(ent.confidence, (int, float)) else 0.0
#     score = conf + 0.5 * page.overall_score
#     if page.looks_like_official_site:
#         score += 0.2

#     hint = getattr(s7, field, None)
#     if hint:
#         h = hint.lower().strip()
#         v = ent.value.lower().strip()
#         if v == h:
#             score += 0.3
#         elif h in v or v in h:
#             score += 0.15

#     return score


# def is_page_same_company(
#     s7: Super7Input,
#     page: PageExtractionResult,
#     min_sim: float = 0.6,
# ) -> bool:
#     """
#     Minimal same-company guard:
#     - Look at extracted company_name entities on this page.
#     - Compute name similarity vs. target company_name.
#     - If any >= min_sim, treat as same company.
#     - If no company_name entities at all, we allow the page (can't decide).
#     """
#     target = s7.company_name
#     if not target:
#         return True

#     sims = []
#     for e in page.entities:
#         if e.entity_type == "company_name" and e.value:
#             sims.append(jaccard_name_similarity(target, e.value))

#     if not sims:
#         # no explicit company_name extracted; don't block
#         return True

#     best_sim = max(sims)
#     return best_sim >= min_sim


# def summarize_super7_simple(
#     s7: Super7Input,
#     candidates: List[CandidateRecord],
# ) -> Dict[str, Optional[Dict[str, Any]]]:
#     """
#     Collect all entities from all pages and pick best per Super7 field.

#     For non-company_name fields, we require the page to be "same company"
#     according to is_page_same_company().

#     The returned "confidence" is normalized into [0,1].
#     """
#     summary: Dict[str, Optional[Dict[str, Any]]] = {}

#     # max possible raw_score ~ 2.0 (conf 1 + 0.5*1 + 0.2 + 0.3)
#     RAW_SCORE_MAX = 2.0

#     for field in SUPER7_FIELDS:
#         best_ent = None
#         best_page = None
#         best_raw_score = -1.0

#         for rec in candidates:
#             page = rec.extraction

#             # For non-name fields, enforce same-company guard
#             if field != "company_name" and not is_page_same_company(s7, page):
#                 continue

#             for ent in page.entities:
#                 if ent.entity_type != field:
#                     continue
#                 if not ent.value:
#                     continue

#                 # Skip if ALL sources are excluded domains
#                 allowed_sources = []
#                 for src in ent.source_urls:
#                     d = get_domain(src)
#                     if d and d in SUMMARY_DOMAIN_EXCLUDE:
#                         continue
#                     allowed_sources.append(src)
#                 if not allowed_sources:
#                     continue

#                 raw_score = score_field_candidate(field, s7, ent, page)
#                 if raw_score > best_raw_score:
#                     best_raw_score = raw_score
#                     best_ent = ent
#                     best_page = page

#         # if no good candidate, set None
#         if not best_ent or best_raw_score < 0.3:
#             summary[field] = None
#         else:
#             # normalize raw_score into [0,1] for exposed confidence
#             norm_conf = best_raw_score / RAW_SCORE_MAX
#             norm_conf = max(0.0, min(norm_conf, 1.0))

#             all_sources = list({s for s in best_ent.source_urls if s})
#             primary_source = all_sources[0] if all_sources else (best_page.url if best_page else "")
#             summary[field] = {
#                 "value": best_ent.value,
#                 "source": primary_source,
#                 "confidence": norm_conf,
#                 "all_sources": all_sources,
#             }

#     return summary


# # ---------------------------------------------------------------------------
# # Resolver
# # ---------------------------------------------------------------------------

# class Super7Resolver:
#     def __init__(
#         self,
#         search: WebSearchTool,
#         scraper: ScraperTool,
#         extractor: LLMExtractor,
#     ):
#         self.search = search
#         self.scraper = scraper
#         self.extractor = extractor

#     def build_queries(self, s7: Super7Input) -> List[str]:
#         name = s7.company_name.strip()
#         parts = [name]
#         if s7.city:
#             parts.append(s7.city)
#         if s7.state:
#             parts.append(s7.state)
#         if s7.country:
#             parts.append(s7.country)
#         base = " ".join(parts)

#         queries = [
#             f"{base} official website",
#             f"{base} company",
#             f"\"{name}\"",
#         ]
#         if s7.phone:
#             queries.append(f"\"{name}\" \"{s7.phone}\"")
#         return queries

#     def process_company(self, s7: Super7Input) -> Dict[str, Any]:
#         queries = self.build_queries(s7)
#         search_results = self.search.search(queries)

#         candidate_records: List[CandidateRecord] = []
#         primary_url = None
#         primary_conf = 0.0

#         for meta in search_results:
#             url = meta["url"]
#             domain = get_domain(url) or ""
#             if domain in NEWS_DOMAIN_BLACKLIST:
#                 # For this use-case, we ignore news outlets entirely
#                 continue

#             title = meta.get("title") or ""
#             snippet_text = meta.get("content") or ""

#             # 1) Search-level filter
#             if not should_consider_search_result(s7.company_name, title, snippet_text):
#                 continue

#             # 2) Scrape / convert
#             full_text = self.scraper.fetch(url)
#             if not full_text.strip():
#                 # fallback to Tavily snippet if nothing else
#                 if not snippet_text:
#                     continue
#                 full_text = snippet_text

#             # 3) Doc-level filter
#             if not doc_mentions_company(s7.company_name, full_text):
#                 continue

#             # 4) Snippet extraction
#             snippets = extract_snippets_for_company(full_text, s7.company_name)
#             if not snippets:
#                 continue

#             # 5) LLM extraction on snippets
#             extraction = self.extractor.extract_from_snippets(s7, url, snippets)
#             candidate_records.append(
#                 CandidateRecord(
#                     url=url,
#                     source_type=meta.get("source_type", "web_search"),
#                     extraction=extraction,
#                 )
#             )

#             if extraction.overall_score > primary_conf:
#                 primary_conf = extraction.overall_score
#                 primary_url = url

#         super7_summary = summarize_super7_simple(s7, candidate_records)

#         return {
#             "input": s7.model_dump(),
#             "primary_url": primary_url,
#             "primary_confidence": primary_conf,
#             "candidates": [
#                 {
#                     "url": r.url,
#                     "overall_score": r.extraction.overall_score,
#                     "reason": r.extraction.reason,
#                 }
#                 for r in candidate_records
#             ],
#             "super7_summary": super7_summary,
#         }


# # ---------------------------------------------------------------------------
# # Batch API
# # ---------------------------------------------------------------------------

# def resolve_super7_batch(super7_payloads: List[Dict[str, Any]]) -> Dict[str, Any]:
#     """
#     super7_payloads: list of dicts with at least "company_name".
#     Returns: {"results": [ ... per-company dict ... ]}
#     """
#     search = WebSearchTool(max_results=5)
#     scraper = ScraperTool(
#         timeout=10,
#         delay_range=(1.5, 4.0),
#         max_content_length_bytes=8_000_000,
#     )
#     extractor = LLMExtractor(model="gpt-4o-mini", temperature=0.0)
#     resolver = Super7Resolver(search, scraper, extractor)

#     results = []
#     for payload in super7_payloads:
#         s7 = Super7Input(**payload)
#         out = resolver.process_company(s7)
#         results.append(out)

#     return {"results": results}


# # ---------------------------------------------------------------------------
# # Manual test
# # ---------------------------------------------------------------------------

# # if __name__ == "__main__":
# #     batch_input  =  [
# #         {
# #             "company_name": "2"
# #         },
# #         {
# #             "company_name": "Home Fit solutions LLC",

# #         },
# #         {
# #             "company_name": "David's Tireshop",
# #         },
# #         {
# #             "company_name": "Closhare LLc",

# #         },
# #         {
# #             "company_name": "Nexapoint Holding",
# #         },
# #         {
# #             "company_name": "Making you happy logistics llc",
# #         },
# #         {
# #             "company_name": "butler & associates construction,inc",
# #         },
# #         {
# #             "company_name": "focus wound care centre",
# #         }
# #     ]
# #     res = resolve_super7_batch(batch_input)
# #     print(json.dumps(res, indent=2))


In [2]:
"""
super7_resolver.py

Pipeline:
- Input: Super7Input (company name + optional hints)
- Web search (Tavily) to get candidate URLs
- Scrape each URL (HTML) with polite rules
- Extract focused snippets (name, address, phone, zip)
- Use LLM (OpenAI via LangChain) to:
  - Extract entities (with per-entity source_urls + confidence)
  - Compute page-level match scores
- Score & select best candidate per Super7 field
- Output JSON with:
  - primary_url, primary_confidence
  - candidates (URLs + scores)
  - super7_summary (value, source, confidence, all_sources per field)
"""

import json
import logging
import math
import os
import random
import re
import time
from dataclasses import dataclass, field
from typing import List, Dict, Any, Optional, Tuple
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup
from pydantic import BaseModel, Field

from dotenv import load_dotenv
from langchain_openai import ChatOpenAI
from langchain_tavily import TavilySearch

load_dotenv()

# -----------------------------------------------------------------------------
# Logging
# -----------------------------------------------------------------------------

# Configure root logging once for this notebook/module, then grab the
# module-level logger used by every component below.
_LOG_FORMAT = "%(asctime)s - %(levelname)s - %(message)s"
logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)

logger = logging.getLogger(__name__)

# -----------------------------------------------------------------------------
# Constants / Config
# -----------------------------------------------------------------------------

# Domains we NEVER want to scrape at all (for HTML scraping).
SCRAPER_DOMAIN_BLACKLIST = {"dnb.com", "www.dnb.com"}

# Domains whose pages must not act as value sources in the Super7 summary.
SUMMARY_DOMAIN_EXCLUDE = {
    # Data vendors / noisy aggregators
    "dnb.com",
    "www.dnb.com",
    "b2bhint.com",
    "www.b2bhint.com",
    # Social / user-generated
    "facebook.com",
    "www.facebook.com",
    "instagram.com",
    "www.instagram.com",
    "twitter.com",
    "x.com",
    "tiktok.com",
    "www.tiktok.com",
    # News / media
    "thetimes-tribune.com",
    "www.thetimes-tribune.com",
}

# URL path suffixes that are skipped outright (PDF / Office / archives).
SKIP_EXTENSIONS = {
    ".pdf",
    ".doc",
    ".docx",
    ".xls",
    ".xlsx",
    ".ppt",
    ".pptx",
    ".zip",
    ".rar",
}

# Upper bound on the cleaned page text kept per page (characters).
MAX_HTML_CHARS = 50_000

# Bounds (seconds) of the random pause inserted before each HTTP request.
REQUEST_DELAY_MIN = 0.5
REQUEST_DELAY_MAX = 1.5

# -----------------------------------------------------------------------------
# Utility functions
# -----------------------------------------------------------------------------

# Legal-form tokens that carry no identity and are dropped when normalizing.
CORP_SUFFIXES = [
    "llc",
    "inc",
    "corp",
    "corporation",
    "ltd",
    "limited",
    "oy",
    "oyj",
    "sa",
    "gmbh",
    "plc",
    "lp",
    "llp",
    "bv",
    "srl",
    "sro",
    "pte",
    "sdn",
    "bhd",
    "ag",
    "nv",
]


def normalize_company_name(name: str) -> str:
    """
    Return a canonical, comparison-friendly form of a company name.

    The name is lowercased, "&" becomes the word "and", every character that
    is neither a word character nor whitespace turns into a space, and any
    token listed in CORP_SUFFIXES is removed (wherever it appears). The
    remaining tokens are joined with single spaces. Falsy input yields "".
    """
    if not name:
        return ""
    lowered = name.lower().replace("&", " and ")
    depunctuated = re.sub(r"[^\w\s]", " ", lowered)
    return " ".join(
        token for token in depunctuated.split() if token not in CORP_SUFFIXES
    )


def jaccard_name_similarity(a: str, b: str) -> float:
    """
    Jaccard similarity of the normalized token sets of two company names.

    Returns a value in [0, 1]; 0.0 when either name normalizes to no tokens.
    """
    tokens_a = set(normalize_company_name(a).split())
    tokens_b = set(normalize_company_name(b).split())
    if tokens_a and tokens_b:
        return len(tokens_a & tokens_b) / len(tokens_a | tokens_b)
    return 0.0


def get_domain(url: str) -> str:
    """
    Return the lowercase host name of ``url``, or "" when none can be parsed.

    The host is taken from ``hostname`` rather than ``netloc`` so that an
    explicit port or ``user:password@`` prefix does not defeat the exact-match
    lookups against the domain blacklist / exclusion sets
    (e.g. ``https://www.dnb.com:443/...`` yields ``www.dnb.com``).
    """
    try:
        # hostname is already lowercased and is None when the URL has no host.
        return urlparse(url).hostname or ""
    except (ValueError, TypeError, AttributeError):
        # Malformed URLs (e.g. unbalanced IPv6 brackets) or non-string input.
        return ""


def url_has_skip_extension(url: str) -> bool:
    """
    Return True when the URL path ends with one of SKIP_EXTENSIONS.

    Only the path component is inspected (query string and fragment are
    ignored) and the comparison is case-insensitive. A URL that cannot be
    parsed is reported as "no skip extension" instead of raising, matching
    the forgiving behavior of ``get_domain``; the subsequent HTTP request
    is then responsible for rejecting it.
    """
    try:
        path = urlparse(url).path.lower()
    except (ValueError, TypeError, AttributeError):
        return False
    # str.endswith accepts a tuple of candidate suffixes.
    return path.endswith(tuple(SKIP_EXTENSIONS))


def should_consider_search_result(company_name: str, title: str, snippet: str) -> bool:
    """
    Cheap relevance gate applied to a search hit before it is scraped.

    A hit passes when its title shares enough tokens with the company name
    (Jaccard similarity >= 0.2), or when the snippet contains the company
    name either in normalized form or as the raw lowercased string. Hits
    with neither a title nor a snippet are rejected.
    """
    if not title and not snippet:
        return False

    # Title check: token overlap with the target name.
    if jaccard_name_similarity(company_name, title or "") >= 0.2:
        return True

    # The remaining checks all look inside the snippet.
    if not snippet:
        return False
    haystack = snippet.lower()

    normalized = normalize_company_name(company_name)
    if normalized and normalized in haystack:
        return True

    # Last resort: the raw name as a substring.
    return bool(company_name) and company_name.lower() in haystack


def doc_mentions_company(company_name: str, text: str, min_token_hits: int = 2) -> bool:
    """
    Quick filter: does the document text look like it is about this company?

    Returns True when the (trimmed, lowercased) company name occurs verbatim
    in the text, or when the longest token of the normalized name occurs at
    least ``min_token_hits`` times. Matching is case-insensitive substring
    matching.

    An empty or whitespace-only company name never matches: without that
    guard ``"" in text`` would be true for every document.
    """
    if not text:
        return False

    haystack = text.lower()  # lowercase once; reused by both checks

    # Verbatim name match (surrounding whitespace in the input is ignored).
    raw_name = (company_name or "").strip().lower()
    if raw_name and raw_name in haystack:
        return True

    tokens = normalize_company_name(company_name).split()
    if not tokens:
        return False

    # The longest token is treated as the most distinctive part of the name.
    main_token = max(tokens, key=len)
    return haystack.count(main_token) >= min_token_hits


def extract_snippets_for_company(
    text: str,
    company_name: str,
    max_snippets: int = 25,
    window_chars: int = 300,
) -> List[Dict[str, Any]]:
    """
    Extract small text windows where the company is mentioned plus lines that
    look like they contain a phone number, ZIP code or street address.

    Each returned item is a dict:
      {
        "id": int,      # position in the returned list
        "type": "name_context" | "phone_context" | "zip_context" | "address_context" | "generic",
        "text": str
      }

    Order of collection:
      1. ``name_context`` windows of ``window_chars`` characters on each side
         of every occurrence of a company-name variant.
      2. Single lines matching the phone / ZIP / address heuristics. A line is
         emitted once, labelled with the first heuristic that matches
         (phone, then ZIP, then address).
      3. If nothing was found, one ``generic`` snippet with the first 800
         characters of the text.

    At most ``max_snippets`` snippets are returned. Snippets are de-duplicated
    by text as they are collected, so the cap counts unique snippets only and
    a line matching several heuristics consumes a single slot.
    """
    snippets: List[Dict[str, Any]] = []
    if not text or max_snippets <= 0:
        return snippets

    seen_text = set()

    def add_snippet(kind: str, body: str) -> None:
        # De-duplicate on insert and never exceed the cap.
        if not body or body in seen_text or len(snippets) >= max_snippets:
            return
        seen_text.add(body)
        snippets.append({"id": len(snippets), "type": kind, "text": body})

    lower_text = text.lower()
    norm_name = normalize_company_name(company_name)
    raw_name = (company_name or "").lower()

    # Name variants: normalized and raw spelling, "&" <-> "and" swaps, and a
    # whitespace-collapsed form of each (a plain "&" -> " and " replacement
    # on "a & b" yields doubled spaces that would never match normal text).
    variants = {v for v in (norm_name, raw_name) if v}
    variants |= {v.replace("&", " and ") for v in variants}
    variants |= {v.replace(" and ", " & ") for v in variants}
    variants |= {" ".join(v.split()) for v in variants}
    # Longest first and sorted, so the result does not depend on set
    # iteration order and the most specific spelling claims a position.
    ordered_variants = sorted(
        (v for v in variants if v.strip()),
        key=lambda v: (-len(v), v),
    )

    # 1) name-based windows
    matched_starts = set()
    for variant in ordered_variants:
        search_from = 0
        while len(snippets) < max_snippets:
            idx = lower_text.find(variant, search_from)
            if idx == -1:
                break
            search_from = idx + len(variant)
            if idx in matched_starts:
                # A longer variant already produced a window for this mention.
                continue
            matched_starts.add(idx)
            left = max(0, idx - window_chars)
            right = min(len(text), idx + len(variant) + window_chars)
            add_snippet("name_context", text[left:right].strip())
        if len(snippets) >= max_snippets:
            return snippets

    # 2) regex-based patterns (phone, zip, address-like lines)
    phone_pattern = re.compile(
        r"(\+?\d[\d\-\(\)\s]{6,}\d)",
        re.MULTILINE,
    )
    zip_pattern = re.compile(r"\b\d{5}(?:-\d{4})?\b")
    address_keywords = [
        "street", "st.", "st ", "road", "rd.", "rd ",
        "avenue", "ave", "blvd", "lane", "ln", "drive", "dr", "way",
    ]

    for line in text.splitlines():
        if len(snippets) >= max_snippets:
            break
        stripped = line.strip()
        if not stripped:
            continue

        if phone_pattern.search(stripped):
            add_snippet("phone_context", stripped)
        elif zip_pattern.search(stripped):
            add_snippet("zip_context", stripped)
        elif re.search(r"\d", stripped) and any(
            keyword in stripped.lower() for keyword in address_keywords
        ):
            # Address keywords are plain substrings; requiring a digit keeps
            # ordinary prose lines out.
            add_snippet("address_context", stripped)

    # 3) fallback: generic snippet from the start of the document
    if not snippets:
        add_snippet("generic", text[:800].strip())

    return snippets


# -----------------------------------------------------------------------------
# Data models
# -----------------------------------------------------------------------------

class Super7Input(BaseModel):
    """Target company to resolve: a required name plus optional known hints."""

    # The only mandatory field; drives search queries and all name matching.
    company_name: str
    # Optional hints. city/state/country/phone are appended to the search
    # queries when set; any hint can earn a scoring bonus when an extracted
    # value for the same field matches it.
    country: Optional[str] = None
    state: Optional[str] = None
    city: Optional[str] = None
    street_address: Optional[str] = None
    zip: Optional[str] = None
    phone: Optional[str] = None


class ExtractedEntity(BaseModel):
    """One value the LLM extracted from a page, with its provenance."""

    # Field label such as "company_name", "phone", "zip" (see the extraction prompt).
    entity_type: str
    # The extracted value as text.
    value: str
    # URLs that support this value.
    source_urls: List[str] = Field(default_factory=list)
    # LLM-reported confidence; None when it was missing or unparseable.
    confidence: Optional[float] = None


class PageExtractionResult(BaseModel):
    """LLM output for a single page: extracted entities plus page-level scores."""

    url: str
    entities: List[ExtractedEntity] = Field(default_factory=list)
    # Page-level match scores as reported by the LLM; all default to 0.0.
    match_score_name: float = 0.0
    match_score_address: float = 0.0
    match_score_phone: float = 0.0
    # Whether the LLM judged the page to be the company's own website.
    looks_like_official_site: bool = False
    overall_score: float = 0.0
    # Short free-text explanation from the LLM.
    reason: str = ""


@dataclass
class CandidateRecord:
    """A processed candidate page: where it came from and what was extracted."""

    url: str
    # How the URL was discovered (the search tool emits "web_search").
    source_type: str
    extraction: PageExtractionResult
    # Both timestamps default to the creation time (seconds since the epoch).
    first_seen_at: float = field(default_factory=time.time)
    last_checked_at: float = field(default_factory=time.time)


# -----------------------------------------------------------------------------
# Web search tool (Tavily)
# -----------------------------------------------------------------------------

class WebSearchTool:
    """Thin wrapper around TavilySearch that returns de-duplicated candidate URLs."""

    def __init__(self, max_results: int = 5):
        self.max_results = max_results
        self._tool = TavilySearch(max_results=max_results)

    def search_candidates(self, queries: List[str]) -> List[Dict[str, Any]]:
        """
        Run every query and collect the unique result URLs.

        Returns one dict per URL, in first-seen order, with the keys
        ``url``, ``title``, ``snippet`` and ``source_type`` ("web_search").
        The snippet is taken from the result's ``content`` field, falling
        back to ``snippet``.

        The search is best-effort per query: a query that raises, or whose
        response is not a dict, is logged and skipped so that one bad query
        does not abort the whole company (and with it the whole batch).
        """
        log = logging.getLogger(__name__)
        seen: Dict[str, Dict[str, Any]] = {}

        for query in queries:
            try:
                response = self._tool.invoke({"query": query})
            except Exception as exc:  # boundary to an external service
                # NOTE(review): depending on the installed version the tool may
                # also raise when a query simply has no results - confirm.
                log.warning("[SEARCH] Query failed (%r): %s", query, exc)
                continue

            if not isinstance(response, dict):
                log.warning("[SEARCH] Unexpected response type for %r: %s",
                            query, type(response).__name__)
                continue

            for result in response.get("results") or []:
                if not isinstance(result, dict):
                    continue
                url = result.get("url")
                if not url or url in seen:
                    continue
                seen[url] = {
                    "url": url,
                    # Explicit nulls in the payload are normalized to "".
                    "title": result.get("title") or "",
                    "snippet": result.get("content") or result.get("snippet") or "",
                    "source_type": "web_search",
                }

        return list(seen.values())


# -----------------------------------------------------------------------------
# Scraper tool (HTML only, no PDFs/Office)
# -----------------------------------------------------------------------------

class ScraperTool:
    """
    Polite HTML fetcher that returns cleaned page text.

    Uses one ``requests.Session`` with browser-like headers, waits a random
    delay before every request, and refuses blacklisted domains, non-HTML
    file extensions and non-HTML content types.
    """

    def __init__(self, timeout: int = 10):
        self.timeout = timeout
        # One session for all requests so cookies and connections are reused.
        self.session = requests.Session()
        self.session.headers.update(
            {
                "User-Agent": (
                    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                    "AppleWebKit/537.36 (KHTML, like Gecko) "
                    "Chrome/120.0.0.0 Safari/537.36"
                ),
                "Accept-Language": "en-US,en;q=0.9",
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                "Connection": "keep-alive",
            }
        )

    def _polite_delay(self):
        """Sleep for a random REQUEST_DELAY_MIN..REQUEST_DELAY_MAX seconds."""
        time.sleep(random.uniform(REQUEST_DELAY_MIN, REQUEST_DELAY_MAX))

    def fetch_html(self, url: str) -> str:
        """
        Fetch an HTML page and return its visible text, one non-empty line
        per text node, truncated to MAX_HTML_CHARS characters.

        Returns "" (never raises) when the URL is blacklisted, has a skipped
        extension, the request fails, or the response is not HTML.
        No PDFs/Office documents are downloaded - they are skipped.
        """
        domain = get_domain(url)
        if domain in SCRAPER_DOMAIN_BLACKLIST:
            logger.info("[SCRAPER] Domain blacklisted: %s, skipping %s", domain, url)
            return ""

        if url_has_skip_extension(url):
            logger.info("[SCRAPER] Skipping non-HTML extension: %s", url)
            return ""

        self._polite_delay()

        try:
            resp = self.session.get(url, timeout=self.timeout)
            resp.raise_for_status()
        except requests.HTTPError as e:
            status = e.response.status_code if e.response is not None else None
            if status == 403:
                logger.info("[SCRAPER] HTTP 403 for %s, skipping.", url)
            else:
                logger.info("[SCRAPER] HTTP error %s for %s: %s", status, url, e)
            return ""
        except Exception as e:
            # Best-effort scraping: any transport/URL problem just skips the page.
            logger.info("[SCRAPER] Failed %s: %s", url, e)
            return ""

        content_type = (resp.headers.get("Content-Type") or "").lower()
        if "text/html" not in content_type and "application/xhtml+xml" not in content_type:
            logger.info(
                "[SCRAPER] Non-HTML Content-Type (%s) for %s, skipping.",
                content_type,
                url,
            )
            return ""

        # Parse the raw bytes instead of resp.text: when the header carries no
        # charset, requests decodes text/* as ISO-8859-1, which garbles UTF-8
        # pages (accents, curly apostrophes in company names). BeautifulSoup
        # detects the encoding itself (BOM, <meta charset>, heuristics); a
        # charset explicitly declared in the header is passed along as a hint.
        declared_encoding = resp.encoding if "charset=" in content_type else None
        soup = BeautifulSoup(
            resp.content,
            "html.parser",
            from_encoding=declared_encoding,
        )

        # Drop elements whose content is never visible text.
        for tag in soup(["script", "style", "noscript"]):
            tag.decompose()

        raw_text = soup.get_text(separator="\n")
        text = "\n".join(
            line.strip() for line in raw_text.splitlines() if line.strip()
        )

        return text[:MAX_HTML_CHARS]


# -----------------------------------------------------------------------------
# LLM extractor
# -----------------------------------------------------------------------------

class LLMExtractor:
    """
    Uses OpenAI Chat model via LangChain to:
    - Extract entities from snippets
    - Compute page-level match scores

    The model's reply is treated as untrusted input: everything read from it
    is type-checked and clamped before it is put into the result models.
    """

    def __init__(self, model_name: str = "gpt-4o-mini", temperature: float = 0.0):
        self.llm = ChatOpenAI(
            model=model_name,
            temperature=temperature,
        )

    @staticmethod
    def _safe_float(value, default: float = 0.0) -> float:
        """
        Convert value to float, handling None and bad types gracefully.

        Non-finite results (NaN, +/-inf) also yield ``default``: NaN would
        otherwise slip through ``max``/``min`` clamping and come out as 1.0.
        """
        if value is None:
            return default
        try:
            result = float(value)
        except (TypeError, ValueError):
            return default
        return result if math.isfinite(result) else default

    @staticmethod
    def _unit_score(value) -> Optional[float]:
        """Parse a score clamped to [0, 1]; None if missing, unparseable or non-finite."""
        if value is None:
            return None
        try:
            result = float(value)
        except (TypeError, ValueError):
            return None
        if not math.isfinite(result):
            return None
        return max(0.0, min(1.0, result))

    @staticmethod
    def _safe_bool(value) -> bool:
        """Interpret a JSON-ish flag; the strings "false"/"no"/"0" are False."""
        if isinstance(value, str):
            return value.strip().lower() in {"true", "yes", "1"}
        return bool(value)

    @staticmethod
    def _parse_json_object(raw) -> Dict[str, Any]:
        """
        Parse the model reply into a dict, or return {} when that is impossible.

        Tries the whole reply first, then the span from the first "{" to the
        last "}" (covers replies wrapped in prose or code fences).
        """
        text = raw if isinstance(raw, str) else str(raw)
        data: Any = None
        try:
            data = json.loads(text)
        except (ValueError, RecursionError):
            start = text.find("{")
            end = text.rfind("}")
            if start != -1 and end > start:
                try:
                    data = json.loads(text[start:end + 1])
                except (ValueError, RecursionError):
                    data = None
        return data if isinstance(data, dict) else {}

    def build_extraction_prompt(
        self,
        super7: Super7Input,
        url: str,
        snippets: List[Dict[str, Any]],
    ) -> str:
        """Render the extraction prompt for one page from its snippets and the target hints."""
        s7_dict = super7.model_dump()
        snippets_text = "\n\n".join(
            f"[SNIPPET {sn['id']} - {sn['type']}]\n{sn['text']}"
            for sn in snippets
        )

        instructions = f"""
You are a precise data extraction assistant.

We are trying to extract the **Super7** identity fields for a company from web page snippets:

Super7 fields:
- company_name
- street_address
- city
- state
- country
- zip
- phone

You are given:
1) A target Super7Input (company_name is mandatory; others are optional hints).
2) The URL of a web page.
3) A set of focused text snippets from that page.

Tasks:
1. Decide if this page is about the SAME company as the target.
2. Extract entities related to the company's identity:
   - Use entity_type values exactly from this set when relevant:
     ["company_name", "street_address", "city", "state", "country", "zip", "phone",
      "email", "website", "social_link", "other_id", "other"]
   - For each entity, include:
     - value (string)
     - source_urls: list of URLs where this value is supported (at least include the page URL)
     - confidence: number between 0 and 1

3. Compute page-level scores (0.0 to 1.0):
   - match_score_name
   - match_score_address
   - match_score_phone
   - overall_score
   - looks_like_official_site: boolean
   - reason: short explanation

Be conservative:
- If the page is unrelated, set scores near 0 and return few/no entities.
- If unsure about a value, use a lower confidence.

Return **STRICT JSON** only, no extra commentary, with this shape:

{{
  "url": "<page URL>",
  "entities": [
    {{
      "entity_type": "company_name" | "street_address" | "city" | "state" | "country" | "zip" | "phone" |
                     "email" | "website" | "social_link" | "other_id" | "other",
      "value": "<string>",
      "source_urls": ["<url1>", "<url2>", ...],
      "confidence": <number between 0 and 1 or null>
    }}
  ],
  "match_score_name": <number between 0 and 1>,
  "match_score_address": <number between 0 and 1>,
  "match_score_phone": <number between 0 and 1>,
  "looks_like_official_site": <true or false>,
  "overall_score": <number between 0 and 1>,
  "reason": "<short string>"
}}

Super7Input (hints):

{json.dumps(s7_dict, indent=2)}

Page URL: {url}

Snippets:
{snippets_text}
"""
        return instructions

    def extract_from_snippets(
        self,
        super7: Super7Input,
        url: str,
        snippets: List[Dict[str, Any]],
    ) -> PageExtractionResult:
        """
        Run the LLM on snippets for a single page.

        Returns a PageExtractionResult whose ``url`` is always the page URL
        passed in (the URL echoed back by the model is ignored, since a
        rewritten or null URL would corrupt provenance and domain filtering).
        All scores and confidences are clamped to [0, 1]; entity types are
        lowercased; every entity lists the page URL among its sources.
        An unparseable reply yields an empty, zero-scored result.
        """
        if not snippets:
            return PageExtractionResult(
                url=url,
                entities=[],
                match_score_name=0.0,
                match_score_address=0.0,
                match_score_phone=0.0,
                looks_like_official_site=False,
                overall_score=0.0,
                reason="No snippets extracted.",
            )

        prompt = self.build_extraction_prompt(super7, url, snippets)
        response = self.llm.invoke(prompt)
        data = self._parse_json_object(response.content)

        entities_raw = data.get("entities", [])
        if not isinstance(entities_raw, list):
            entities_raw = []

        entities: List[ExtractedEntity] = []
        for item in entities_raw:
            if not isinstance(item, dict):
                continue

            # Keep only usable string URLs and always include the page itself.
            raw_sources = item.get("source_urls")
            sources = (
                [s for s in raw_sources if isinstance(s, str) and s]
                if isinstance(raw_sources, list)
                else []
            )
            if url not in sources:
                sources.append(url)

            entities.append(
                ExtractedEntity(
                    # Downstream matching compares entity_type to lowercase field names.
                    entity_type=str(item.get("entity_type") or "unknown").strip().lower(),
                    value=str(item.get("value") or ""),
                    source_urls=sources,
                    confidence=self._unit_score(item.get("confidence")),
                )
            )

        def page_score(key: str) -> float:
            score = self._unit_score(data.get(key))
            return 0.0 if score is None else score

        return PageExtractionResult(
            url=url,
            entities=entities,
            match_score_name=page_score("match_score_name"),
            match_score_address=page_score("match_score_address"),
            match_score_phone=page_score("match_score_phone"),
            looks_like_official_site=self._safe_bool(
                data.get("looks_like_official_site", False)
            ),
            overall_score=page_score("overall_score"),
            reason=str(data.get("reason") or ""),
        )


# -----------------------------------------------------------------------------
# Same-company guard & scoring
# -----------------------------------------------------------------------------

def is_page_same_company(
    target_company_name: str,
    page_entities: List[ExtractedEntity],
    threshold: float = 0.6,
) -> bool:
    """
    Same-company guard for a page.

    Compares the target name with every extracted ``company_name`` entity
    and reports whether the best Jaccard similarity reaches ``threshold``.
    A page without any ``company_name`` entity scores 0.0.
    """
    similarities = (
        jaccard_name_similarity(target_company_name, entity.value)
        for entity in page_entities
        if entity.entity_type == "company_name"
    )
    return max(similarities, default=0.0) >= threshold


def score_field_candidate(
    s7: Super7Input,
    field: str,
    ent: ExtractedEntity,
    page: PageExtractionResult,
) -> float:
    """
    Raw score of one extracted entity as a candidate for one Super7 field.

    Components, added in this order:
      - the entity's own confidence (0.0 when absent)
      - half of the page's overall score
      - +0.2 when the page looks like the official site
      - +0.3 when the value equals the input hint for this field
        (case-insensitive), or +0.15 when one contains the other
    """
    score = 0.0 if ent.confidence is None else ent.confidence
    score += 0.5 * page.overall_score

    if page.looks_like_official_site:
        score += 0.2

    hint = getattr(s7, field, None)
    if not hint or not ent.value:
        return score

    wanted = str(hint).lower()
    found = ent.value.lower()
    if wanted == found:
        score += 0.3
    elif wanted in found or found in wanted:
        score += 0.15
    return score


# -----------------------------------------------------------------------------
# Super7 summarization
# -----------------------------------------------------------------------------

def summarize_super7_simple(
    s7: Super7Input,
    candidates: List[CandidateRecord],
) -> Dict[str, Optional[Dict[str, Any]]]:
    """
    For each Super7 field, pick the best entity across all candidate pages.

    Rules:
      - For every field except ``company_name``, only pages that pass the
        same-company guard contribute.
      - Entities with an empty value, or whose sources ALL lie on excluded
        domains, are ignored.
      - The highest ``score_field_candidate`` wins (first one on ties); it
        must reach 0.3 to be reported.

    Returns a dict keyed by field name. Each value is None, or a dict with
    ``value``, ``source`` (first source not on an excluded domain),
    ``confidence`` (raw score / 2, clamped to [0, 1]) and ``all_sources``
    (de-duplicated, in the order they were reported).
    """
    fields = [
        "company_name",
        "street_address",
        "city",
        "state",
        "country",
        "zip",
        "phone",
    ]

    summary: Dict[str, Optional[Dict[str, Any]]] = {f: None for f in fields}

    # The same-company verdict depends only on the page, so compute it once
    # per candidate instead of once per (field, candidate) pair.
    pages = [
        (
            cand.extraction,
            is_page_same_company(s7.company_name, cand.extraction.entities),
        )
        for cand in candidates
    ]

    for field_name in fields:
        best_score = -1.0
        best_ent: Optional[ExtractedEntity] = None
        best_sources: List[str] = []

        for page, same_company in pages:
            # For non-name fields, enforce same-company guard
            if field_name != "company_name" and not same_company:
                continue

            for ent in page.entities:
                if ent.entity_type != field_name or not ent.value:
                    continue

                # filter out entities where ALL sources are excluded domains
                all_srcs = ent.source_urls or [page.url]
                if all(get_domain(src) in SUMMARY_DOMAIN_EXCLUDE for src in all_srcs):
                    continue

                raw_score = score_field_candidate(s7, field_name, ent, page)
                if raw_score > best_score:
                    best_score = raw_score
                    best_ent = ent
                    # Order-preserving de-duplication keeps the reported
                    # source and all_sources stable between runs.
                    best_sources = list(dict.fromkeys(all_srcs))

        if best_ent is None or best_score < 0.3:
            continue  # summary[field_name] stays None

        # Normalize raw_score ~ [0,2] -> [0,1]
        conf = min(max(best_score / 2.0, 0.0), 1.0)

        # pick a primary source not excluded if possible
        primary_source = next(
            (src for src in best_sources if get_domain(src) not in SUMMARY_DOMAIN_EXCLUDE),
            best_sources[0] if best_sources else None,
        )

        summary[field_name] = {
            "value": best_ent.value,
            "source": primary_source,
            "confidence": conf,
            "all_sources": best_sources,
        }

    return summary


# -----------------------------------------------------------------------------
# Resolver Orchestrator
# -----------------------------------------------------------------------------

class Super7Resolver:
    """Orchestrates search -> scrape -> snippet extraction -> LLM -> summary for one company."""

    def __init__(
        self,
        search_tool: WebSearchTool,
        scraper: ScraperTool,
        extractor: LLMExtractor,
    ):
        self.search_tool = search_tool
        self.scraper = scraper
        self.extractor = extractor

    def build_queries(self, s7: Super7Input) -> List[str]:
        """
        Build the search queries for a company.

        Always three queries (name + location hints with "official website",
        with "company", and the quoted name alone); a fourth combining the
        quoted name and quoted phone is added when a phone hint exists.
        """
        name = s7.company_name.strip()
        location_hints = [hint for hint in (s7.city, s7.state, s7.country) if hint]
        base = " ".join([name, *location_hints])

        queries = [
            f"{base} official website",
            f"{base} company",
            f'"{name}"',
        ]
        if s7.phone:
            queries.append(f'"{name}" "{s7.phone}"')
        return queries

    def process_company(self, s7: Super7Input) -> Dict[str, Any]:
        """
        Full pipeline for one company.

        Returns a dict with ``company_id`` (normalized name), the echoed
        ``input``, the best-scoring page as ``primary_url`` /
        ``primary_confidence``, all processed ``candidates`` sorted by
        overall score (descending), and the per-field ``super7_summary``.
        """
        target_name = s7.company_name
        records: List[CandidateRecord] = []
        top_url: Optional[str] = None
        top_score: float = 0.0

        hits = self.search_tool.search_candidates(self.build_queries(s7))
        for hit in hits:
            page_url = hit["url"]
            hit_snippet = hit.get("snippet", "")

            # Gate 1: does the search hit itself look relevant?
            if not should_consider_search_result(
                target_name, hit.get("title", ""), hit_snippet
            ):
                continue

            # Page text, or the search snippet when the page could not be fetched.
            page_text = self.scraper.fetch_html(page_url) or hit_snippet
            if not page_text:
                continue

            # Gate 2: does the text mention the company at all?
            if not doc_mentions_company(target_name, page_text):
                continue

            page_snippets = extract_snippets_for_company(page_text, target_name)
            if not page_snippets:
                continue

            result = self.extractor.extract_from_snippets(s7, page_url, page_snippets)
            records.append(
                CandidateRecord(
                    url=page_url,
                    source_type=hit.get("source_type", "web_search"),
                    extraction=result,
                )
            )

            # Track the strictly best page seen so far.
            if result.overall_score > top_score:
                top_score = result.overall_score
                top_url = page_url

        # Summarize Super7 fields
        field_summary = summarize_super7_simple(s7, records)

        # Compact view of the candidates, best first.
        ranked = sorted(
            records,
            key=lambda rec: rec.extraction.overall_score,
            reverse=True,
        )
        compact_candidates = []
        for rec in ranked:
            compact_candidates.append(
                {
                    "url": rec.url,
                    "overall_score": rec.extraction.overall_score,
                    "reason": rec.extraction.reason,
                }
            )

        return {
            "company_id": normalize_company_name(target_name),
            "input": s7.model_dump(),
            "primary_url": top_url,
            "primary_confidence": top_score,
            "candidates": compact_candidates,
            "super7_summary": field_summary,
        }


# -----------------------------------------------------------------------------
# Batch interface
# -----------------------------------------------------------------------------

def resolve_super7_batch(super7_payloads: List[Dict[str, Any]]) -> Dict[str, Any]:
    """
    Resolve a batch of Super7 payloads, one company at a time.

    Each payload dict is turned into a ``Super7Input`` and handed to a single
    shared ``Super7Resolver``; the per-company outputs are collected in input
    order.

    Args:
        super7_payloads: Dicts of ``Super7Input`` keyword arguments, e.g.
            ``{"company_name": "Home Fit Solutions LLC", "city": "Honesdale"}``
            or just ``{"company_name": "r&k firesupport llc"}``.

    Returns:
        ``{"results": [...]}`` with one ``process_company`` output per payload.

    Raises:
        RuntimeError: If ``OPENAI_API_KEY`` is missing or empty in the
            environment.
    """
    # The OpenAI key is a hard requirement; a missing Tavily key only warns.
    if not os.getenv("OPENAI_API_KEY"):
        raise RuntimeError("Please set OPENAI_API_KEY in your environment.")
    if not os.getenv("TAVILY_API_KEY"):
        logger.warning("TAVILY_API_KEY not set; TavilySearch will fail.")

    # One resolver (and one set of tools) is reused for the whole batch.
    resolver = Super7Resolver(
        WebSearchTool(max_results=5),
        ScraperTool(timeout=10),
        LLMExtractor(model_name="gpt-4o-mini", temperature=0.0),
    )

    return {
        "results": [
            resolver.process_company(Super7Input(**payload))
            for payload in super7_payloads
        ]
    }



In [None]:


# Name-only payloads: each entry carries just "company_name" and no other fields.
batch_input_noaddress = [
    {"company_name": name}
    for name in (
        "2ClickFit, Inc.",
        "529 TECH LLC",
        "901 Tax Pros LLC",
        "A2 Exteriors LLC",
        "AAF Logistics LLC",
        "Able Path Care & Staffing LLC",
        "Acirdek Solutions LLC",
        "Adams Site Works, LLC",
        "Adonai's Touch Cleaning LLC",
    )
]

# Run the resolver over the name-only batch and pretty-print the returned
# {"results": [...]} dict as JSON (2-space indent).
res = resolve_super7_batch(batch_input_noaddress)
print(json.dumps(res, indent=2))

[SCRAPER] Docling failed for https://twoclickfit.com/privacy/: 404 Client Error: Not Found for url: https://twoclickfit.com/privacy/
[SCRAPER] HTTP error 404 for https://twoclickfit.com/privacy/: 404 Client Error: Not Found for url: https://twoclickfit.com/privacy/


2025-11-17 17:42:37,605 - INFO - detected formats: [<InputFormat.HTML: 'html'>]
2025-11-17 17:42:37,627 - INFO - Going to convert document batch...
2025-11-17 17:42:37,628 - INFO - Initializing pipeline for SimplePipeline with options hash 995a146ad601044538e6a923bea22f4e
2025-11-17 17:42:37,674 - INFO - Loading plugin 'docling_defaults'
2025-11-17 17:42:37,685 - INFO - Registered picture descriptions: ['vlm', 'api']
2025-11-17 17:42:37,686 - INFO - Processing document file
2025-11-17 17:42:37,710 - INFO - Finished converting document file in 1.53 sec.


[SCRAPER] Docling failed for https://www.crunchbase.com/organization/2clickfit: 403 Client Error: Forbidden for url: https://www.crunchbase.com/organization/2clickfit
[SCRAPER] HTTP 403 for https://www.crunchbase.com/organization/2clickfit, skipping.


2025-11-17 17:43:09,787 - INFO - detected formats: [<InputFormat.HTML: 'html'>]
2025-11-17 17:43:09,832 - INFO - Going to convert document batch...
2025-11-17 17:43:09,833 - INFO - Processing document 2clickfit-inc.html
2025-11-17 17:43:09,888 - INFO - Finished converting document 2clickfit-inc.html in 1.30 sec.


[SCRAPER] Docling failed for https://www.secinfo.com/$/SEC/Registrant.asp?CIK=2094237: HTTPSConnectionPool(host='www.secinfo.com', port=443): Max retries exceeded with url: /$/SEC/Registrant.asp?CIK=2094237 (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:992)')))
[SCRAPER] Failed https://www.secinfo.com/$/SEC/Registrant.asp?CIK=2094237: HTTPSConnectionPool(host='www.secinfo.com', port=443): Max retries exceeded with url: /$/SEC/Registrant.asp?CIK=2094237 (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:992)')))


2025-11-17 17:43:27,649 - INFO - detected formats: [<InputFormat.HTML: 'html'>]
2025-11-17 17:43:27,671 - INFO - Going to convert document batch...
2025-11-17 17:43:27,672 - INFO - Processing document 2clickfit-inc
2025-11-17 17:43:27,679 - INFO - Finished converting document 2clickfit-inc in 0.70 sec.


[SCRAPER] Docling failed for https://www.streetinsider.com/SEC+Filings/Form+D+2ClickFit%2C+Inc./25521440.html: 403 Client Error: Forbidden for url: https://www.streetinsider.com/SEC+Filings/Form+D+2ClickFit%2C+Inc./25521440.html
[SCRAPER] HTTP 403 for https://www.streetinsider.com/SEC+Filings/Form+D+2ClickFit%2C+Inc./25521440.html, skipping.


2025-11-17 17:44:02,652 - INFO - detected formats: [<InputFormat.HTML: 'html'>]
2025-11-17 17:44:02,694 - INFO - Going to convert document batch...
2025-11-17 17:44:02,694 - INFO - Processing document 529-technologies
2025-11-17 17:44:02,713 - INFO - Finished converting document 529-technologies in 0.51 sec.
2025-11-17 17:44:16,214 - INFO - detected formats: [<InputFormat.HTML: 'html'>]
2025-11-17 17:44:16,220 - INFO - Going to convert document batch...
2025-11-17 17:44:16,221 - INFO - Processing document file
2025-11-17 17:44:16,231 - INFO - Finished converting document file in 0.03 sec.
2025-11-17 17:44:29,629 - INFO - detected formats: [<InputFormat.HTML: 'html'>]
2025-11-17 17:44:29,653 - INFO - Going to convert document batch...
2025-11-17 17:44:29,654 - INFO - Processing document file
2025-11-17 17:44:29,709 - INFO - Finished converting document file in 0.48 sec.
2025-11-17 17:44:42,681 - INFO - detected formats: [<InputFormat.HTML: 'html'>]
2025-11-17 17:44:42,719 - INFO - Going

[SCRAPER] Docling failed for https://iphone.apkpure.com/app/flyenjoy/com.test.flyenjoy: 403 Client Error: Forbidden for url: https://iphone.apkpure.com/app/flyenjoy/com.test.flyenjoy
[SCRAPER] HTTP 403 for https://iphone.apkpure.com/app/flyenjoy/com.test.flyenjoy, skipping.


2025-11-17 17:45:36,470 - INFO - detected formats: [<InputFormat.HTML: 'html'>]
2025-11-17 17:45:36,485 - INFO - Going to convert document batch...
2025-11-17 17:45:36,485 - INFO - Processing document file
2025-11-17 17:45:36,501 - INFO - Finished converting document file in 0.11 sec.
2025-11-17 17:45:45,384 - INFO - detected formats: [<InputFormat.HTML: 'html'>]
2025-11-17 17:45:45,403 - INFO - Going to convert document batch...
2025-11-17 17:45:45,403 - INFO - Processing document 901-Tax-Pros-100095787630640
2025-11-17 17:45:45,403 - INFO - Finished converting document 901-Tax-Pros-100095787630640 in 0.62 sec.


[SCRAPER] HTTP error 400 for https://www.facebook.com/p/901-Tax-Pros-100095787630640/: 400 Client Error: Bad Request for url: https://www.facebook.com/p/901-Tax-Pros-100095787630640/


2025-11-17 17:45:59,558 - INFO - detected formats: [<InputFormat.HTML: 'html'>]
2025-11-17 17:45:59,569 - INFO - Going to convert document batch...
2025-11-17 17:45:59,569 - INFO - Processing document DGBbTs8xXJ4
2025-11-17 17:45:59,569 - INFO - Finished converting document DGBbTs8xXJ4 in 0.91 sec.
2025-11-17 17:46:11,798 - INFO - detected formats: [<InputFormat.HTML: 'html'>]
2025-11-17 17:46:11,839 - INFO - Going to convert document batch...
2025-11-17 17:46:11,839 - INFO - Processing document file
2025-11-17 17:46:11,884 - INFO - Finished converting document file in 2.34 sec.
2025-11-17 17:46:17,703 - INFO - detected formats: [<InputFormat.HTML: 'html'>]
2025-11-17 17:46:17,715 - INFO - Going to convert document batch...
2025-11-17 17:46:17,716 - INFO - Processing document 508644161510000
2025-11-17 17:46:17,718 - INFO - Finished converting document 508644161510000 in 0.92 sec.
2025-11-17 17:46:28,870 - INFO - detected formats: [<InputFormat.HTML: 'html'>]
2025-11-17 17:46:28,879 - 

[SCRAPER] HTTP error 400 for https://www.facebook.com/groups/864305987876251/posts/1480557592917751/: 400 Client Error: Bad Request for url: https://www.facebook.com/groups/864305987876251/posts/1480557592917751/


2025-11-17 17:48:24,479 - INFO - detected formats: [<InputFormat.HTML: 'html'>]
2025-11-17 17:48:24,497 - INFO - Going to convert document batch...
2025-11-17 17:48:24,500 - INFO - Processing document 864305987876251
2025-11-17 17:48:24,502 - INFO - Finished converting document 864305987876251 in 1.03 sec.


[SCRAPER] HTTP error 400 for https://www.facebook.com/groups/864305987876251/: 400 Client Error: Bad Request for url: https://www.facebook.com/groups/864305987876251/


2025-11-17 17:48:39,755 - INFO - detected formats: [<InputFormat.HTML: 'html'>]
2025-11-17 17:48:39,764 - INFO - Going to convert document batch...
2025-11-17 17:48:39,764 - INFO - Processing document 7762970760384647
2025-11-17 17:48:39,764 - INFO - Finished converting document 7762970760384647 in 0.70 sec.


[SCRAPER] HTTP error 400 for https://www.facebook.com/groups/1364647736883680/posts/7762970760384647/: 400 Client Error: Bad Request for url: https://www.facebook.com/groups/1364647736883680/posts/7762970760384647/


2025-11-17 17:48:56,501 - INFO - detected formats: [<InputFormat.XML_USPTO: 'xml_uspto'>]
2025-11-17 17:48:56,502 - ERROR - Input document pressure-washing with format None does not match any allowed format: (dict_keys([<InputFormat.DOCX: 'docx'>, <InputFormat.PPTX: 'pptx'>, <InputFormat.HTML: 'html'>, <InputFormat.IMAGE: 'image'>, <InputFormat.PDF: 'pdf'>, <InputFormat.ASCIIDOC: 'asciidoc'>, <InputFormat.MD: 'md'>, <InputFormat.CSV: 'csv'>, <InputFormat.XLSX: 'xlsx'>, <InputFormat.XML_USPTO: 'xml_uspto'>, <InputFormat.XML_JATS: 'xml_jats'>, <InputFormat.METS_GBS: 'mets_gbs'>, <InputFormat.JSON_DOCLING: 'json_docling'>, <InputFormat.AUDIO: 'audio'>, <InputFormat.VTT: 'vtt'>]))
2025-11-17 17:48:56,503 - INFO - Going to convert document batch...


[SCRAPER] Docling failed for https://www.thumbtack.com/ct/norwich/pressure-washing: File format not allowed: pressure-washing
[SCRAPER] Docling failed for https://www.zoominfo.com/c/aaf-logistics-inc/346190257: 403 Client Error: Forbidden for url: https://www.zoominfo.com/c/aaf-logistics-inc/346190257
[SCRAPER] HTTP 403 for https://www.zoominfo.com/c/aaf-logistics-inc/346190257, skipping.
[SCRAPER] Docling failed for https://seamless.ai/b/aaf-logistics-inc-147131241: 403 Client Error: Forbidden for url: https://seamless.ai/b/aaf-logistics-inc-147131241


2025-11-17 17:49:44,733 - INFO - detected formats: [<InputFormat.HTML: 'html'>]
2025-11-17 17:49:44,779 - INFO - Going to convert document batch...
2025-11-17 17:49:44,780 - INFO - Processing document logistics
2025-11-17 17:49:44,830 - INFO - Finished converting document logistics in 0.25 sec.
2025-11-17 17:49:57,089 - INFO - detected formats: [<InputFormat.HTML: 'html'>]
2025-11-17 17:49:57,093 - INFO - Going to convert document batch...
2025-11-17 17:49:57,094 - INFO - Processing document aaf-logistics-llc-usdot-3746208
2025-11-17 17:49:57,095 - INFO - Finished converting document aaf-logistics-llc-usdot-3746208 in 0.92 sec.
2025-11-17 17:50:06,815 - INFO - detected formats: [<InputFormat.HTML: 'html'>]
2025-11-17 17:50:06,822 - INFO - Going to convert document batch...
2025-11-17 17:50:06,822 - INFO - Processing document 3425130
2025-11-17 17:50:06,822 - INFO - Finished converting document 3425130 in 1.05 sec.
2025-11-17 17:50:20,878 - INFO - detected formats: [<InputFormat.HTML: '

[SCRAPER] Docling failed for https://www.buzzfile.com/business/AAF-Logistics-LLC-405-924-7957: 403 Client Error: Forbidden for url: https://www.buzzfile.com/business/AAF-Logistics-LLC-405-924-7957
[SCRAPER] HTTP 403 for https://www.buzzfile.com/business/AAF-Logistics-LLC-405-924-7957, skipping.


2025-11-17 17:51:03,871 - INFO - detected formats: [<InputFormat.HTML: 'html'>]
2025-11-17 17:51:03,875 - INFO - Going to convert document batch...
2025-11-17 17:51:03,876 - INFO - Processing document alvin-fabre-7ab66b1a9
2025-11-17 17:51:03,877 - INFO - Finished converting document alvin-fabre-7ab66b1a9 in 0.88 sec.
2025-11-17 17:51:19,659 - INFO - detected formats: [<InputFormat.XML_USPTO: 'xml_uspto'>]
2025-11-17 17:51:19,659 - ERROR - Input document query.asp with format None does not match any allowed format: (dict_keys([<InputFormat.DOCX: 'docx'>, <InputFormat.PPTX: 'pptx'>, <InputFormat.HTML: 'html'>, <InputFormat.IMAGE: 'image'>, <InputFormat.PDF: 'pdf'>, <InputFormat.ASCIIDOC: 'asciidoc'>, <InputFormat.MD: 'md'>, <InputFormat.CSV: 'csv'>, <InputFormat.XLSX: 'xlsx'>, <InputFormat.XML_USPTO: 'xml_uspto'>, <InputFormat.XML_JATS: 'xml_jats'>, <InputFormat.METS_GBS: 'mets_gbs'>, <InputFormat.JSON_DOCLING: 'json_docling'>, <InputFormat.AUDIO: 'audio'>, <InputFormat.VTT: 'vtt'>]))
2

[SCRAPER] Docling failed for https://safer.fmcsa.dot.gov/query.asp?searchtype=ANY&query_type=queryCarrierSnapshot&query_param=USDOT&query_string=3425130: File format not allowed: query.asp


2025-11-17 17:51:48,054 - INFO - detected formats: [<InputFormat.HTML: 'html'>]
2025-11-17 17:51:48,069 - INFO - Going to convert document batch...
2025-11-17 17:51:48,070 - INFO - Processing document aaf-logistics-llc
2025-11-17 17:51:48,084 - INFO - Finished converting document aaf-logistics-llc in 0.78 sec.


[SCRAPER] Docling failed for https://npidb.org/organizations/nursing_service/home-health_163wh0200x/1518848704.aspx: 403 Client Error: Forbidden for url: https://npidb.org/organizations/nursing_service/home-health_163wh0200x/1518848704.aspx
[SCRAPER] HTTP 403 for https://npidb.org/organizations/nursing_service/home-health_163wh0200x/1518848704.aspx, skipping.


2025-11-17 17:52:33,524 - INFO - detected formats: [<InputFormat.HTML: 'html'>]
2025-11-17 17:52:33,545 - INFO - Going to convert document batch...
2025-11-17 17:52:33,551 - INFO - Processing document privacy-policy
2025-11-17 17:52:33,575 - INFO - Finished converting document privacy-policy in 0.12 sec.
2025-11-17 17:52:50,865 - INFO - detected formats: [<InputFormat.HTML: 'html'>]
2025-11-17 17:52:50,896 - INFO - Going to convert document batch...
2025-11-17 17:52:50,897 - INFO - Processing document services
2025-11-17 17:52:50,919 - INFO - Finished converting document services in 0.14 sec.
2025-11-17 17:53:05,960 - INFO - detected formats: [<InputFormat.HTML: 'html'>]
2025-11-17 17:53:05,975 - INFO - Going to convert document batch...
2025-11-17 17:53:05,975 - INFO - Processing document 61583722310879
2025-11-17 17:53:05,980 - INFO - Finished converting document 61583722310879 in 0.78 sec.


[SCRAPER] HTTP error 400 for https://www.facebook.com/61583722310879/: 400 Client Error: Bad Request for url: https://www.facebook.com/61583722310879/


2025-11-17 17:53:22,746 - INFO - detected formats: [<InputFormat.HTML: 'html'>]
2025-11-17 17:53:22,789 - INFO - Going to convert document batch...
2025-11-17 17:53:22,789 - INFO - Processing document file
2025-11-17 17:53:22,845 - INFO - Finished converting document file in 0.20 sec.
2025-11-17 17:53:41,639 - INFO - detected formats: [<InputFormat.HTML: 'html'>]
2025-11-17 17:53:41,653 - INFO - Going to convert document batch...
2025-11-17 17:53:41,653 - INFO - Processing document able-path-care-staffing-llc-791324234
2025-11-17 17:53:41,676 - INFO - Finished converting document able-path-care-staffing-llc-791324234 in 1.38 sec.
2025-11-17 17:54:00,324 - INFO - detected formats: [<InputFormat.HTML: 'html'>]
2025-11-17 17:54:00,395 - INFO - Going to convert document batch...
2025-11-17 17:54:00,395 - INFO - Processing document ablecarecompany
2025-11-17 17:54:00,451 - INFO - Finished converting document ablecarecompany in 1.11 sec.


[SCRAPER] Docling failed for https://providerwire.com/home-health-registered-nurse/georgia/atlanta/able-path-care-staffing-llc-1518848704: 403 Client Error: Forbidden for url: https://providerwire.com/home-health-registered-nurse/georgia/atlanta/able-path-care-staffing-llc-1518848704
[SCRAPER] HTTP 403 for https://providerwire.com/home-health-registered-nurse/georgia/atlanta/able-path-care-staffing-llc-1518848704, skipping.
[SCRAPER] Docling failed for https://npiprofile.com/npi/1518848704: 403 Client Error: Forbidden for url: https://npiprofile.com/npi/1518848704
[SCRAPER] HTTP 403 for https://npiprofile.com/npi/1518848704, skipping.


2025-11-17 17:54:52,916 - INFO - detected formats: [<InputFormat.HTML: 'html'>]
2025-11-17 17:54:52,916 - INFO - Going to convert document batch...
2025-11-17 17:54:52,916 - INFO - Processing document able-path-care-staffing-llc-b39196679
2025-11-17 17:54:52,932 - INFO - Finished converting document able-path-care-staffing-llc-b39196679 in 1.38 sec.


[SCRAPER] Docling failed for https://npir.org/providers/nursing-providers/163wh0200x/p4a75s: 403 Client Error: Forbidden for url: https://npir.org/providers/nursing-providers/163wh0200x/p4a75s
[SCRAPER] HTTP 403 for https://npir.org/providers/nursing-providers/163wh0200x/p4a75s, skipping.


2025-11-17 17:55:24,093 - INFO - detected formats: [<InputFormat.HTML: 'html'>]
2025-11-17 17:55:24,118 - INFO - Going to convert document batch...
2025-11-17 17:55:24,119 - INFO - Processing document file
2025-11-17 17:55:24,162 - INFO - Finished converting document file in 0.20 sec.
2025-11-17 17:55:30,589 - INFO - detected formats: [<InputFormat.HTML: 'html'>]
2025-11-17 17:55:30,646 - INFO - Going to convert document batch...
2025-11-17 17:55:30,647 - INFO - Processing document accurisk-solutions
2025-11-17 17:55:30,674 - INFO - Finished converting document accurisk-solutions in 0.64 sec.
2025-11-17 17:55:37,689 - INFO - detected formats: [<InputFormat.HTML: 'html'>]
2025-11-17 17:55:37,696 - INFO - Going to convert document batch...
2025-11-17 17:55:37,696 - INFO - Processing document aci-engineering-llc
2025-11-17 17:55:37,717 - INFO - Finished converting document aci-engineering-llc in 0.31 sec.
2025-11-17 17:56:03,632 - INFO - detected formats: [<InputFormat.HTML: 'html'>]
2025

[SCRAPER] HTTP error 400 for https://www.facebook.com/people/Adonai-Prestige-Touch-Cleaning-Services/61555008185849/: 400 Client Error: Bad Request for url: https://www.facebook.com/people/Adonai-Prestige-Touch-Cleaning-Services/61555008185849/
{
  "results": [
    {
      "input": {
        "company_name": "2ClickFit, Inc.",
        "country": null,
        "state": null,
        "city": null,
        "street_address": null,
        "zip": null,
        "phone": null
      },
      "primary_url": "https://twoclickfit.com/privacy/",
      "primary_confidence": 0.9,
      "candidates": [
        {
          "url": "https://twoclickfit.com/privacy/",
          "overall_score": 0.9,
          "reason": "The URL contains the company name prominently in the privacy policy."
        },
        {
          "url": "https://2clickfit.com/",
          "overall_score": 0.8,
          "reason": "The URL matches the company name and contains relevant information about the company and its founders."

In [6]:
# Second name-only batch; every payload again carries just "company_name".
batch_input_address_found = [
    {"company_name": name}
    for name in (
        "ELITE CAPITAL OPTIMIZATION LLC",
        "JET COAST CARRIERS LLC",
        "R&K FIRE SUPPORT LLC",
        "Home Fit Solutions LLC",
        "CHRISTINE TUFTS BCBA INC",
        "Beacon Retirement Strategies",
        "DAVID'S TIRE SHOP",
        "The Leverage Line Group LLC",
        "Raquel's Tax Service",
        "1KG Da Label",
    )
]


In [None]:
# batch_input_address_found = [
    
#     {
#         "company_name": "The Leverage Line Group LLC"
#     }
# ]


In [None]:
# Run the resolver over batch_input_address_found and pretty-print the
# returned {"results": [...]} dict as JSON (2-space indent).
res2 = resolve_super7_batch(batch_input_address_found)
print(json.dumps(res2, indent=2))

2025-11-17 18:35:48,231 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-11-17 18:36:04,249 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-11-17 18:36:10,094 - INFO - [SCRAPER] HTTP 403 for https://search.sunbiz.org/Inquiry/corporationsearch/SearchResults?inquiryType=EntityName&searchTerm=ELITE%20CAPITAL%20GROUP,%20CORP, skipping.
2025-11-17 18:36:15,033 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-11-17 18:36:25,621 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-11-17 18:36:26,546 - INFO - [SCRAPER] HTTP 403 for https://search.sunbiz.org/Inquiry/corporationsearch/SearchResults?inquiryType=EntityName&searchTerm=ELITE%20CAPITAL%20INVESTMENTS,%20LLC, skipping.
2025-11-17 18:36:31,753 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-11-17 18:36:45,099 - INFO -

{
  "results": [
    {
      "company_id": "elite capital optimization",
      "input": {
        "company_name": "ELITE CAPITAL OPTIMIZATION LLC",
        "country": null,
        "state": null,
        "city": null,
        "street_address": null,
        "zip": null,
        "phone": null
      },
      "primary_url": "https://www.bizprofile.net/fl/orlando/elite-capital-optimization-llc",
      "primary_confidence": 0.8,
      "candidates": [
        {
          "url": "https://www.bizprofile.net/fl/orlando/elite-capital-optimization-llc",
          "overall_score": 0.8,
          "reason": "The page contains detailed information about Elite Capital Optimization LLC, confirming the company name and address."
        },
        {
          "url": "https://search.sunbiz.org/Inquiry/corporationsearch/SearchResults?inquiryType=EntityName&searchTerm=ELITE%20CAPITAL%20GROUP,%20CORP",
          "overall_score": 0.5,
          "reason": "The page contains the exact company name, indicating 