<a href="https://colab.research.google.com/github/ataucuriaia/ESO-new-project/blob/main/ESO_new_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ================================
# ESO Web Scraping + Enrichment Pipeline (v1.0)
# Enterprise Studio — Complete Pipeline
# 
# PHASES:
#   0) Setup & Data Load + Health Check
#   1) URL Normalization & Homepage Scraping
#   2) Homepage Signals Extraction
#   3) Org Capabilities Module (Phase 1 - Objective A)
#   4) Support Page Crawling + People Extraction (Phase 2 - Objective B)
#   5) Exports + Diagnostics
#
# INPUTS: CSV with columns "Org Name" and "Website URL"
# OUTPUTS: 
#   - Organization_Database_enriched_v1_0.csv (org-level with capability tags)
#   - People_Extracted_v1_0.csv (people-level with expertise tags)
# ================================

# --- 1) Install + imports ---
# Handle pip install for both Colab and local environments
import subprocess
import sys

try:
    import pandas as pd
    import requests
    from bs4 import BeautifulSoup
    from tqdm import tqdm
except ImportError:
    print("Installing required packages...")
    subprocess.check_call([sys.executable, "-m", "pip", "-q", "install", 
                          "beautifulsoup4", "lxml", "tqdm", "requests"])
    import pandas as pd
    import requests
    from bs4 import BeautifulSoup
    from tqdm import tqdm

import re
import time
import json
import os
from urllib.parse import urlparse, urljoin

# --- 2) Load your existing database ---
# Source CSV, looked up relative to the current working directory.
INPUT_PATH = "Organization Database 1f24e34e337d8027b500d2a10b1ceaa7.csv"

try:
    df = pd.read_csv(INPUT_PATH)
except FileNotFoundError:
    # Same exception type, but with the working directory spelled out.
    not_found_message = (
        f"CSV file not found: {INPUT_PATH}\n"
        f"Please ensure the file is in the current directory: {os.getcwd()}"
    )
    raise FileNotFoundError(not_found_message)

row_count, column_count = df.shape
column_names = list(df.columns)
print(f"Loaded CSV: {row_count} rows, {column_count} columns")
print(f"Columns: {column_names}")

# Basic checks (matches your structure): the pipeline reads these two columns.
REQUIRED_COLS = ["Org Name", "Website URL"]
missing = []
for required_col in REQUIRED_COLS:
    if required_col not in df.columns:
        missing.append(required_col)

if missing:
    raise ValueError(
        f"Missing required columns: {missing}\n"
        f"Found columns: {column_names}\n"
        f"Please ensure your CSV has 'Org Name' and 'Website URL' columns."
    )

print("✓ Required columns present")

# --- 3) Helpers: URL cleaning + safe request ---
# Request headers passed to requests.get() by safe_get() on every fetch.
# The User-Agent value is a desktop Chrome-on-Linux browser string.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    )
}

def normalize_url(url: str) -> str:
    """Normalize a raw website cell into a fetchable URL.

    Args:
        url: Raw value from the "Website URL" column. Non-string values
            (e.g. NaN) and blank strings are treated as missing.

    Returns:
        "" for a missing value; otherwise the value with surrounding
        whitespace and embedded spaces removed and, where needed, an
        ``https`` scheme added:

        * an existing ``http://`` / ``https://`` prefix is recognised
          case-insensitively and its scheme lowercased;
        * a protocol-relative ``//host/path`` becomes ``https://host/path``;
        * a bare ``host.tld`` or ``www.host.tld`` gets ``https://`` prepended;
        * a value with no dot and no scheme is returned as cleaned, unchanged.
    """
    if not isinstance(url, str) or not url.strip():
        return ""

    # Common cleanup: spaces never belong inside a URL typed into a sheet.
    u = url.strip().replace(" ", "")

    # Schemes are case-insensitive, so "HTTP://x.com" must not be re-prefixed.
    scheme_match = re.match(r"^(https?)://", u, re.IGNORECASE)
    if scheme_match:
        return scheme_match.group(1).lower() + u[scheme_match.end(1):]

    # Protocol-relative form already carries the "//" separator.
    if u.startswith("//"):
        return "https:" + u

    # Scheme missing but a domain is present (this also covers "www.…").
    if "." in u:
        return "https://" + u

    return u

def get_domain(url: str) -> str:
    """Return the lowercased network-location part of ``url``.

    The value is ``urlparse(url).netloc`` as-is (so any userinfo or port is
    kept). Anything that makes parsing raise yields "".
    """
    try:
        network_location = urlparse(url).netloc
        return network_location.lower()
    except Exception:
        return ""

def _retry_after_seconds(header_value, fallback: float, cap: float = 60.0) -> float:
    """Turn a Retry-After header value into a bounded sleep duration (seconds).

    Only the delta-seconds form is interpreted. A missing header, the
    HTTP-date form, or any unparsable, NaN or negative value yields
    ``fallback``. The result is clamped to ``cap`` so a single response
    cannot stall the whole crawl.
    """
    try:
        seconds = float(header_value)
    except (TypeError, ValueError):
        seconds = fallback
    if seconds != seconds or seconds < 0:  # NaN or negative
        seconds = fallback
    return min(seconds, cap)


def _looks_like_markup(body) -> bool:
    """Return True if the payload's first non-blank byte is '<' (UTF-8 BOM ignored)."""
    head = (body or b"")[:256]
    if head.startswith(b"\xef\xbb\xbf"):
        head = head[3:]
    return head.lstrip().startswith(b"<")


def safe_get(url: str, timeout=20, max_retries=2, backoff=1.5):
    """
    HTTP GET with retries and defensive response handling.

    Args:
        url: Absolute URL to fetch. A falsy value short-circuits.
        timeout: Per-request timeout in seconds, passed to ``requests.get``.
        max_retries: Extra attempts after the first (total = max_retries + 1).
        backoff: Base of the exponential delay; attempt ``n`` (0-based) waits
            ``backoff ** n`` seconds before the next try.

    Returns:
        ``(final_url, html_text, error_msg)``. On success ``error_msg`` is ""
        and ``final_url`` is the post-redirect URL. On an HTTP-level failure
        ``html_text`` is "" and ``final_url`` is still the response URL. When
        every attempt raised, ``final_url`` is "" as well.

    Behaviour:
        * 429 is retried after the numeric ``Retry-After`` value when present
          (bounded), otherwise after the normal backoff delay.
        * 403/500/502/503/504 are retried with backoff; other non-2xx
          statuses fail immediately.
        * A response whose Content-Type is present but does not mention
          "html" is accepted only if its body starts with "<"; this keeps
          PDFs, JSON, images, etc. out of the HTML parser. A missing
          Content-Type is accepted as-is.
        * Any exception during an attempt is recorded and retried; nothing
          propagates to the caller.
    """
    if not url:
        return "", "", "Empty URL"

    last_err = None
    for attempt in range(max_retries + 1):
        may_retry = attempt < max_retries
        delay = backoff ** attempt
        try:
            r = requests.get(url, headers=HEADERS, timeout=timeout, allow_redirects=True)

            # Rate limited: honour the server's hint when it is usable.
            if r.status_code == 429:
                if may_retry:
                    time.sleep(_retry_after_seconds(r.headers.get("Retry-After"), delay))
                    continue
                return r.url, "", f"HTTP 429 (rate limited) after {max_retries + 1} attempts"

            # Statuses that are often transient (or bot-blocking) get a retry.
            if r.status_code in (403, 500, 502, 503, 504):
                if may_retry:
                    time.sleep(delay)
                    continue
                return r.url, "", f"HTTP {r.status_code}"

            # Only process successful responses (200-299)
            if not (200 <= r.status_code < 300):
                return r.url, "", f"HTTP {r.status_code}"

            # Declared non-HTML types must at least look like markup. The
            # sniff works on raw bytes so binary bodies are never decoded.
            content_type = (r.headers.get("Content-Type") or "").lower()
            if content_type and "html" not in content_type:
                if not _looks_like_markup(r.content):
                    return r.url, "", f"Non-HTML content type: {content_type}"

            # Prefer the detected encoding over the declared one; the
            # detection is evaluated once because it scans the body.
            try:
                detected = r.apparent_encoding
                r.encoding = detected if detected else 'utf-8'
                html_text = r.text
            except UnicodeDecodeError as e:
                return r.url, "", f"Encoding error: {str(e)}"

            return r.url, html_text, ""

        # Order matters: Timeout and ConnectionError are RequestException
        # subclasses, so the specific messages must be tried first.
        except requests.exceptions.Timeout:
            last_err = f"Timeout after {timeout}s"
        except requests.exceptions.ConnectionError as e:
            last_err = f"Connection error: {str(e)}"
        except requests.exceptions.RequestException as e:
            last_err = f"Request error: {str(e)}"
        except Exception as e:
            last_err = f"Unexpected error: {str(e)}"

        # Reached only when the attempt raised; back off before retrying.
        if may_retry:
            time.sleep(delay)

    return "", "", last_err or "Unknown error"

# --- 3.5) Health Check (after helpers defined) ---
# Read-only summary of the loaded frame: shape, required columns, how many
# URL cells are blank, and a preview of what normalize_url() does to the
# first few non-empty ones. Nothing in `df` is modified here.
print("\n" + "="*60)
print("HEALTH CHECK")
print("="*60)

print(f"\n1. DataFrame Shape: {df.shape[0]} rows × {df.shape[1]} columns")

print(f"\n2. Required Columns Check:")
for col in REQUIRED_COLS:
    present = col in df.columns
    status = "✓" if present else "✗"
    print(f"   {status} '{col}': {'Present' if present else 'MISSING'}")

print(f"\n3. URL Validation:")
url_col = "Website URL"
if url_col in df.columns:
    total_urls = len(df)
    # NaN cells stringify to "nan", so they are counted by isna() only and
    # whitespace-only cells by the == "" test; the two sets never overlap.
    url_text = df[url_col].astype(str).str.strip()
    missing_urls = int(df[url_col].isna().sum() + (url_text == "").sum())
    valid_urls = total_urls - missing_urls
    # A header-only CSV has zero rows; report 0.0% rather than dividing by 0.
    pct_base = total_urls if total_urls else 1

    print(f"   Total rows: {total_urls}")
    print(f"   Missing/empty URLs: {missing_urls} ({100*missing_urls/pct_base:.1f}%)")
    print(f"   Valid URLs: {valid_urls} ({100*valid_urls/pct_base:.1f}%)")

    # Sample normalized URLs
    print(f"\n4. Sample Normalized URLs (first 10 non-empty):")
    sample_urls = df[df[url_col].notna() & (url_text != "")][url_col].head(10)
    for i, raw_url in enumerate(sample_urls, 1):
        # Cells are not guaranteed to be strings; stringify once and use that
        # for both the preview slice and the normalizer.
        raw_text = str(raw_url)
        normalized = normalize_url(raw_text)
        print(f"   {i:2d}. {raw_text[:50]:50s} → {normalized[:60]}")
else:
    print(f"   ✗ '{url_col}' column not found!")

print("\n" + "="*60 + "\n")

# --- 4) HTML parsing: extract useful fields for your ESO DB ---
# Loose e-mail matcher: a local part, "@", a dotted domain and a final label
# of at least two ASCII letters. It is applied both to visible page text and
# to raw container HTML elsewhere in this file.
EMAIL_RE = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}")

def clean_text(s: str) -> str:
    """Collapse every whitespace run to one space and trim both ends.

    A falsy input (None or "") yields "".
    """
    source = s if s else ""
    collapsed = re.sub(r"\s+", " ", source)
    return collapsed.strip()

def _href_host(href: str) -> str:
    """Return the lowercased hostname of an absolute or protocol-relative link, else ""."""
    try:
        return urlparse(href).hostname or ""
    except ValueError:
        return ""


def _host_in_domain(host: str, domain: str) -> bool:
    """Return True when ``host`` is ``domain`` itself or one of its subdomains."""
    return host == domain or host.endswith("." + domain)


def extract_page_signals(base_url: str, html: str) -> dict:
    """
    Extract lightweight, high-signal fields from home page HTML.

    Args:
        base_url: URL the HTML was fetched from; relative links are resolved
            against it.
        html: Page markup.

    Returns:
        Dict with string values for: site_title, meta_description, h1,
        text_snippet (first 600 chars of whitespace-collapsed page text),
        emails_found (up to 5, sorted, "; "-joined), contact_url_guess,
        about_url_guess, team_url_guess (absolute http/https URLs),
        and linkedin_url, twitter_x_url, youtube_url, facebook_url,
        instagram_url (href as written on the page). Missing fields are "".

    Raises:
        ValueError: if the HTML cannot be parsed with lxml nor html.parser.

    Notes:
        * Social links are matched on the link's hostname, not as a substring
          of the href, so e.g. "box.com" or "wix.com" is not mistaken for
          "x.com". Only absolute or protocol-relative links can match.
        * For each slot the first matching link in document order wins.
        * Contact/about/team candidates are restricted to http/https targets
          so that mailto:/tel:/javascript: links cannot occupy a slot that a
          real, crawlable page should fill. A link that cannot be resolved is
          skipped rather than failing the whole page.
    """
    try:
        soup = BeautifulSoup(html, "lxml")
    except Exception as e:
        # Fallback to html.parser if lxml fails
        try:
            soup = BeautifulSoup(html, "html.parser")
        except Exception as e2:
            raise ValueError(
                f"Failed to parse HTML: {str(e)}; fallback also failed: {str(e2)}"
            ) from e2

    # Title - handle missing title gracefully
    try:
        title = clean_text(soup.title.get_text()) if soup.title else ""
    except Exception:
        title = ""

    # Meta description
    meta_desc = ""
    tag = soup.find("meta", attrs={"name": re.compile("^description$", re.I)})
    if tag and tag.get("content"):
        meta_desc = clean_text(tag["content"])

    # H1
    h1 = ""
    h1_tag = soup.find("h1")
    if h1_tag:
        h1 = clean_text(h1_tag.get_text())

    # Social slot -> registrable domains that identify the network.
    social_domains = {
        "linkedin": ("linkedin.com",),
        "twitter_x": ("twitter.com", "x.com"),
        "youtube": ("youtube.com",),
        "facebook": ("facebook.com",),
        "instagram": ("instagram.com",),
    }
    socials = dict.fromkeys(social_domains, "")
    link_candidates = {"contact_url": "", "about_url": "", "team_url": ""}

    # Single pass over the anchors fills both the social and the
    # contact/about/team slots.
    for a in soup.find_all("a", href=True):
        href = a["href"].strip()

        host = _href_host(href)
        if host:
            for slot, domains in social_domains.items():
                if not socials[slot] and any(_host_in_domain(host, d) for d in domains):
                    socials[slot] = href

        # Make absolute if relative; ignore links that cannot be fetched.
        try:
            abs_url = urljoin(base_url, href)
            crawlable = urlparse(abs_url).scheme in ("http", "https")
        except ValueError:
            continue
        if not crawlable:
            continue

        text = (a.get_text() or "").lower().strip()
        href_lower = href.lower()

        if not link_candidates["contact_url"] and ("contact" in text or "contact" in href_lower):
            link_candidates["contact_url"] = abs_url
        if not link_candidates["about_url"] and (
            "about" in text or "about" in href_lower or "who we are" in text
        ):
            link_candidates["about_url"] = abs_url
        if not link_candidates["team_url"] and (
            "team" in text or "leadership" in text
            or "team" in href_lower or "leadership" in href_lower
        ):
            link_candidates["team_url"] = abs_url

    # Visible text is extracted once and shared by the e-mail scan and snippet.
    raw_text = soup.get_text(" ")

    # Emails found on page
    emails = sorted(set(EMAIL_RE.findall(raw_text)))[:5]  # keep short

    # A short text snippet (useful for later tagging/classification)
    snippet = clean_text(raw_text)[:600]

    return {
        "site_title": title,
        "meta_description": meta_desc,
        "h1": h1,
        "text_snippet": snippet,
        "emails_found": "; ".join(emails),
        "contact_url_guess": link_candidates["contact_url"],
        "about_url_guess": link_candidates["about_url"],
        "team_url_guess": link_candidates["team_url"],
        "linkedin_url": socials["linkedin"],
        "twitter_x_url": socials["twitter_x"],
        "youtube_url": socials["youtube"],
        "facebook_url": socials["facebook"],
        "instagram_url": socials["instagram"],
    }

# --- 5) Main loop: scrape each row (rate-limited) ---
# One output record per input row. Every row that triggers a network request
# is followed by a RATE_LIMIT_SECONDS pause; rows without a URL are not.
RATE_LIMIT_SECONDS = 1.0  # be polite; tune later

enriched_rows = []
for idx, row in tqdm(df.iterrows(), total=len(df), desc="Scraping websites"):
    org = row.get("Org Name", "")
    raw_url = row.get("Website URL", "")
    url = normalize_url(raw_url)

    # Key order here fixes the leading column order of enriched_df.
    out = {
        "Org Name": org,
        "Website URL": raw_url,
        "website_normalized": url,
        "website_domain": get_domain(url),
        "final_url": "",
        "http_ok": False,
        "scrape_error": "",
    }

    # Nothing to fetch: record the gap and move on without sleeping.
    if not url:
        out["scrape_error"] = "Missing URL"
        enriched_rows.append(out)
        continue

    final_url, html, error_msg = safe_get(url)

    if error_msg:
        out["scrape_error"] = error_msg
    elif not final_url:
        out["scrape_error"] = "Request failed (no URL returned)"
    else:
        out["final_url"] = final_url
        if not html:
            out["scrape_error"] = "Non-HTML response or empty HTML"
        else:
            try:
                signals = extract_page_signals(final_url, html)
                out.update(signals)
                out["http_ok"] = True
            except Exception as e:
                out["scrape_error"] = f"Parse error: {str(e)}"

    enriched_rows.append(out)
    time.sleep(RATE_LIMIT_SECONDS)

enriched_df = pd.DataFrame(enriched_rows)

# ================================
# PHASE 1: ORGANIZATIONAL CAPABILITIES MODULE
# Objective A: Identify organizational capabilities (NOT org types)
# Note: Org types (Accelerator, Funder, etc.) are already in the database and NOT modified here
# ================================

# --- 5.5) Organizational Capabilities Taxonomy (ONLY 4 capabilities) ---
# Capability label -> {"keywords": phrases searched for in page text,
#                      "weight": multiplier applied to each keyword hit}.
ORG_CAPABILITIES_TAXONOMY = {
    "Regulatory / FDA": {
        "keywords": ["fda", "regulatory", "regulatory affairs", "ind", "ide", "510k", "fda approval", 
                    "regulatory consulting", "compliance", "regulatory pathway", "fda submission",
                    "regulatory strategy", "fda clearance", "regulatory guidance"],
        "weight": 1.0
    },
    "Clinical & Translational Support": {
        "keywords": ["clinical trial", "cro", "contract research", "clinical research", "phase i", 
                    "phase ii", "phase iii", "clinical study", "trial management", "cro services",
                    "translational", "translational research", "clinical development", "trial design"],
        "weight": 1.0
    },
    "IP / Legal / Licensing": {
        "keywords": ["intellectual property", "ip", "patent", "licensing", "legal", "trademark", 
                    "copyright", "ip strategy", "patent filing", "technology transfer", "licensing office",
                    "patent prosecution", "ip management", "patent portfolio"],
        "weight": 1.0
    },
    "Manufacturing / GMP / Scale-Up": {
        "keywords": ["manufacturing", "gmp", "good manufacturing practice", "scale-up", "scaling", 
                    "production", "cmo", "contract manufacturing", "manufacturing facility", 
                    "production facility", "gmp compliance", "manufacturing services", "scale up"],
        "weight": 1.0
    }
}


def _count_keyword(text: str, keyword: str) -> int:
    """Count whole-word occurrences of ``keyword`` in lowercased ``text``.

    The keyword must not be glued to other word characters on either side,
    so short acronyms ("ip", "ind", "ide", "cro") no longer match inside
    words such as "leadership", "industry", "provide" or "across". A single
    trailing "s" is tolerated so simple plurals ("patents", "CROs",
    "clinical trials") still count.
    """
    pattern = r"(?<!\w)" + re.escape(keyword.lower()) + r"s?(?!\w)"
    return len(re.findall(pattern, text))


def identify_org_capabilities(meta_desc: str, h1: str, text_snippet: str) -> dict:
    """
    Rule-based classifier for organizational CAPABILITIES (not org types).

    Args:
        meta_desc: Page meta description.
        h1: First H1 text.
        text_snippet: Leading slice of the page's visible text.

    Returns:
        Dict with:
          * ``org_capabilities``: "; "-joined capability labels that had at
            least one keyword hit, highest score first (ties keep taxonomy
            order); "" when nothing matched.
          * ``capability_keywords_matched``: "; "-joined, de-duplicated
            keywords (first 5 per capability in taxonomy keyword order,
            at most 20 overall), listed in the same capability order.
            The order is deterministic across runs.
          * ``capability_confidence``: top score divided by
            ``max(5.0, word_count * 0.01)``, capped at 1.0, rounded to 3 dp;
            0.0 when nothing matched.

    Scoring is ``occurrences * weight`` summed over a capability's keywords;
    overlapping keywords (e.g. "fda" and "fda approval") each contribute.
    """
    # Combine all text for searching
    combined_text = f"{meta_desc} {h1} {text_snippet}".lower()

    capability_scores = {}
    matched_keywords = {}

    for capability, config in ORG_CAPABILITIES_TAXONOMY.items():
        weight = config["weight"]
        score = 0.0
        matches = []

        for keyword in config["keywords"]:
            count = _count_keyword(combined_text, keyword)
            if count > 0:
                score += count * weight
                matches.append(keyword)

        if score > 0:
            capability_scores[capability] = score
            matched_keywords[capability] = matches

    if not capability_scores:
        return {
            "org_capabilities": "",
            "capability_keywords_matched": "",
            "capability_confidence": 0.0
        }

    # Highest score first; sorted() is stable so ties keep taxonomy order.
    ranked = sorted(capability_scores, key=capability_scores.get, reverse=True)
    org_capabilities = "; ".join(ranked)

    # First 5 keywords per capability, de-duplicated while preserving order
    # (dict keys keep insertion order, unlike a set).
    collected = []
    for capability in ranked:
        collected.extend(matched_keywords[capability][:5])
    capability_keywords = "; ".join(list(dict.fromkeys(collected))[:20])

    # Calculate confidence (normalize to 0-1). The denominator works out to
    # word_count * 0.01 with a floor of 5.0.
    max_score = max(capability_scores.values())
    max_possible_score = len(combined_text.split()) * 0.1
    confidence = min(1.0, max_score / max(5.0, max_possible_score * 0.1))

    return {
        "org_capabilities": org_capabilities,
        "capability_keywords_matched": capability_keywords,
        "capability_confidence": round(confidence, 3)
    }

# Apply capability identification to enriched data
# Each row's homepage text fields are stringified and scored; the three
# result fields are then attached to enriched_df as new columns.
print("\n" + "=" * 60)
print("PHASE 1: IDENTIFYING ORGANIZATIONAL CAPABILITIES")
print("=" * 60)
print("Note: Org types (Accelerator, Funder, etc.) are NOT modified - they already exist in the database")
print("This module identifies CAPABILITIES only (Regulatory/FDA, Clinical, IP/Legal, Manufacturing)")

capability_results = []
for idx, row in enriched_df.iterrows():
    meta_desc, h1, text_snippet = (
        str(row.get(field, ""))
        for field in ("meta_description", "h1", "text_snippet")
    )
    capabilities = identify_org_capabilities(meta_desc, h1, text_snippet)
    capability_results.append(capabilities)

# Add capability columns to enriched_df (one list per result field)
for key in ("org_capabilities", "capability_keywords_matched", "capability_confidence"):
    column_values = [result[key] for result in capability_results]
    enriched_df[key] = column_values

print(f"✓ Capability identification applied to {len(enriched_df)} organizations")

# Evaluation printout
# Console-only report: per-capability counts, coverage totals, and a fixed
# (random_state=42) sample of organizations for eyeballing the tags.
print("\n" + "-" * 60)
print("CAPABILITY IDENTIFICATION EVALUATION")
print("-" * 60)

# Capability distribution
print(f"\n1. Capability Distribution:")
all_capabilities = []
for caps_str in enriched_df["org_capabilities"]:
    caps_text = str(caps_str)
    if not caps_str or not caps_text.strip():
        continue
    for label in caps_text.split(";"):
        all_capabilities.append(label.strip())

if all_capabilities:
    from collections import Counter
    capability_counts = Counter(all_capabilities)
    for cap, count in capability_counts.most_common():
        pct = 100 * count / len(enriched_df)
        print(f"   {cap:40s}: {count:4d} orgs ({pct:5.2f}%)")

has_capabilities = enriched_df["org_capabilities"] != ""
orgs_with_capabilities = has_capabilities.sum()
orgs_without_capabilities = len(enriched_df) - orgs_with_capabilities
with_pct = 100*orgs_with_capabilities/len(enriched_df)
without_pct = 100*orgs_without_capabilities/len(enriched_df)
print(f"\n   Organizations with capabilities: {orgs_with_capabilities} ({with_pct:.1f}%)")
print(f"   Organizations without capabilities: {orgs_without_capabilities} ({without_pct:.1f}%)")

# Random examples
print(f"\n2. Random Examples (20 organizations):")
sample_size = min(20, len(enriched_df))
sample_df = enriched_df.sample(sample_size, random_state=42)
for idx, row in sample_df.iterrows():
    org_name = str(row.get("Org Name", ""))[:35]
    # Empty strings fall through to the "(none)" placeholder.
    capabilities = str(row.get("org_capabilities", ""))[:50] or "(none)"
    keywords = str(row.get("capability_keywords_matched", ""))[:50] or "(none)"
    confidence = row.get("capability_confidence", 0.0)

    print(f"\n   Org: {org_name}")
    print(f"   Capabilities: {capabilities}")
    if keywords != "(none)":
        print(f"   Keywords: {keywords[:60]}")
    print(f"   Confidence: {confidence:.2f}")

print("\n" + "=" * 60 + "\n")

# ================================
# PHASE 2: PEOPLE EXTRACTION + EXPERTISE TAGGING
# Objective B: "Who-to-call-for-what" - Extract people and tag expertise
# ================================

# --- 5.6) People Extraction Helpers ---

def get_same_domain_urls(base_url: str, candidate_urls: list) -> list:
    """Filter candidate URLs to those on the same site as ``base_url``.

    Two URLs are on the same site when their network locations are equal
    after lowercasing and dropping one leading "www." label, so
    "https://www.example.com" and "https://example.com/team" match. Other
    subdomains are NOT treated as the same site.

    Args:
        base_url: Reference URL (typically the homepage's final URL).
        candidate_urls: URLs to filter; falsy entries and entries that
            cannot be parsed are skipped.

    Returns:
        Matching candidates in their original order with exact duplicates
        removed, so a page guessed for several roles is only crawled once.
        [] if ``base_url`` cannot be parsed or the candidates cannot be
        iterated.
    """
    def site_key(url):
        host = urlparse(url).netloc.lower()
        return host[4:] if host.startswith("www.") else host

    try:
        base_site = site_key(base_url)
        same_domain = []
        seen = set()
        for url in candidate_urls:
            if not url:
                continue
            try:
                if site_key(url) != base_site or url in seen:
                    continue
                seen.add(url)
            except Exception:
                # Unparsable candidate (e.g. a non-string cell): ignore it.
                continue
            same_domain.append(url)
        return same_domain
    except Exception:
        return []

def crawl_support_pages(base_url: str, candidate_urls: list, max_pages: int = 2) -> dict:
    """
    Fetch a handful of same-domain support pages (team/contact/about).

    Candidates are filtered with get_same_domain_urls(), truncated to the
    first ``max_pages`` entries and fetched one by one with safe_get(),
    pausing RATE_LIMIT_SECONDS after every request. Pages that come back
    with an error or an empty body are left out.

    Returns a dict mapping each fetched page's final URL to its HTML; an
    empty dict when ``base_url`` is falsy.
    """
    pages = {}
    if base_url:
        # Same-domain filter first, then the page budget.
        targets = get_same_domain_urls(base_url, candidate_urls)[:max_pages]
        for target in targets:
            landed_url, body, failure = safe_get(target)
            fetched_ok = bool(body) and not failure
            if fetched_ok:
                pages[landed_url] = body
            time.sleep(RATE_LIMIT_SECONDS)  # Be polite
    return pages

def extract_people_from_html(org_name: str, source_url: str, html: str) -> list:
    """
    Extract people information from HTML using heuristics.

    Three passes run in order and share one de-duplication set keyed on the
    lowercased name, so a person found by an earlier pass is never re-added:

      1. "Team card" containers: <div>/<section> elements whose class matches
         team/member/staff/leadership (and "person" for <div>).
      2. Every <h2>/<h3> in the document whose text looks like a name.
      3. mailto: links, with a name guessed from the link's parent text.

    A "name" in passes 1 and 2 is any text of 2-4 whitespace-separated words
    whose first character is uppercase and whose length exceeds 3.

    Args:
        org_name: Copied into each record's "Org Name".
        source_url: Copied into each record's "Source URL".
        html: Markup of the page to scan.

    Returns:
        List of dicts with keys: "Org Name", "Person Name", "Title/Role",
        "Email", "Source URL", "Evidence snippet". Empty if the HTML cannot
        be parsed by either lxml or html.parser.
    """
    people = []

    # Prefer lxml; fall back to the stdlib parser; give up quietly otherwise.
    try:
        soup = BeautifulSoup(html, "lxml")
    except Exception:
        try:
            soup = BeautifulSoup(html, "html.parser")
        except Exception:
            return people

    # Find emails first (mailto links)
    # Maps address -> first 100 chars of the link's parent text; consumed by
    # the final pass at the bottom of this function.
    email_to_person = {}
    for a in soup.find_all("a", href=True):
        href = a.get("href", "")
        if href.startswith("mailto:"):
            # Drop the scheme and any "?subject=..." query part.
            email = href.replace("mailto:", "").split("?")[0].strip()
            # Try to find associated name in nearby text
            parent = a.parent
            text = clean_text(parent.get_text() if parent else "")
            # Look for name patterns near email
            if email and "@" in email:
                email_to_person[email] = text[:100]

    # Look for common team/leadership patterns
    # Pattern 1: Team cards (divs with class containing "team", "member", "staff", etc.)
    team_selectors = [
        ('div', {'class': re.compile(r'team|member|staff|leadership|person', re.I)}),
        ('section', {'class': re.compile(r'team|member|staff|leadership', re.I)}),
    ]

    found_names = set()  # For deduplication

    for tag_name, attrs in team_selectors:
        for container in soup.find_all(tag_name, attrs):
            # Look for names (typically in h2, h3, h4, or strong tags)
            name_tags = container.find_all(['h2', 'h3', 'h4', 'h5', 'strong', 'b'])
            for name_tag in name_tags:
                name_text = clean_text(name_tag.get_text())
                # Heuristic: names are usually 2-4 words, start with capital
                words = name_text.split()
                if 2 <= len(words) <= 4 and name_text[0].isupper():
                    # Look for title/role nearby
                    title = ""
                    email = ""

                    # Check next sibling or parent for title
                    # Accepted only if the sibling text contains a role keyword.
                    next_elem = name_tag.find_next_sibling()
                    if next_elem:
                        title_text = clean_text(next_elem.get_text())
                        # Common title keywords
                        if any(keyword in title_text.lower() for keyword in 
                               ['director', 'manager', 'ceo', 'president', 'founder', 'lead', 
                                'head', 'officer', 'coordinator', 'specialist', 'advisor']):
                            title = title_text[:100]

                    # Check parent container for title
                    # Fallback: scan the text that follows the name's first
                    # occurrence in the container (up to 200 chars from the
                    # name's start).
                    if not title:
                        container_text = clean_text(container.get_text())
                        # Extract text between name and common separators
                        name_pos = container_text.find(name_text)
                        if name_pos >= 0:
                            after_name = container_text[name_pos + len(name_text):name_pos + 200]
                            # Look for title patterns
                            # Optional dash, then a capitalised run of 11-81
                            # chars containing no sentence punctuation.
                            title_match = re.search(r'[-–—]?\s*([A-Z][^.!?]{10,80})', after_name)
                            if title_match:
                                title = clean_text(title_match.group(1))[:100]

                    # Check for email in same container
                    # NOTE(review): this takes the first address anywhere in
                    # the container's HTML, so when one container holds several
                    # people they all receive the same address.
                    container_html = str(container)
                    emails_in_container = EMAIL_RE.findall(container_html)
                    if emails_in_container:
                        email = emails_in_container[0]

                    # Create evidence snippet
                    container_text = clean_text(container.get_text())
                    evidence = container_text[:200] if container_text else name_text

                    # Deduplicate by name
                    name_lower = name_text.lower()
                    if name_lower not in found_names and len(name_text) > 3:
                        found_names.add(name_lower)
                        people.append({
                            "Org Name": org_name,
                            "Person Name": name_text,
                            "Title/Role": title,
                            "Email": email,
                            "Source URL": source_url,
                            "Evidence snippet": evidence
                        })

    # Pattern 2: Look for h2/h3 headings followed by titles
    # Runs over the whole document, not just team containers, so any 2-4 word
    # capitalised heading not already seen is recorded as a person.
    headings = soup.find_all(['h2', 'h3'])
    for heading in headings:
        heading_text = clean_text(heading.get_text())
        # Check if it looks like a name
        words = heading_text.split()
        if 2 <= len(words) <= 4 and heading_text[0].isupper():
            # Check next element for title
            # Unlike pattern 1 there is no keyword test: any sibling text of
            # 6-149 characters is taken as the title.
            next_elem = heading.find_next_sibling()
            title = ""
            if next_elem:
                title_text = clean_text(next_elem.get_text())
                if len(title_text) > 5 and len(title_text) < 150:
                    title = title_text[:100]

            # Look for email nearby
            # "Nearby" means the first address in the heading's parent HTML.
            email = ""
            parent = heading.parent
            if parent:
                parent_html = str(parent)
                emails_found = EMAIL_RE.findall(parent_html)
                if emails_found:
                    email = emails_found[0]

            name_lower = heading_text.lower()
            if name_lower not in found_names and len(heading_text) > 3:
                found_names.add(name_lower)
                evidence = clean_text(heading.parent.get_text() if heading.parent else heading_text)[:200]
                people.append({
                    "Org Name": org_name,
                    "Person Name": heading_text,
                    "Title/Role": title,
                    "Email": email,
                    "Source URL": source_url,
                    "Evidence snippet": evidence
                })

    # Add people found via mailto links if not already captured
    for email, text in email_to_person.items():
        # Try to extract name from text
        # Look for capitalized words before email
        # The regex takes the first run of 2-4 Capitalised words anywhere in
        # the stored parent text; position relative to the address is not
        # checked.
        name_match = re.search(r'([A-Z][a-z]+(?:\s+[A-Z][a-z]+){1,3})', text)
        if name_match:
            name = name_match.group(1)
            name_lower = name.lower()
            if name_lower not in found_names:
                found_names.add(name_lower)
                people.append({
                    "Org Name": org_name,
                    "Person Name": name,
                    "Title/Role": "",
                    "Email": email,
                    "Source URL": source_url,
                    "Evidence snippet": text[:200]
                })

    return people

# --- 5.7) Person Expertise Domains Taxonomy (BACKGROUND/DOMAIN based, NOT services) ---
# Domain label -> {"keywords": lowercase phrases, "weight": score multiplier}.
# tag_person_expertise_domains() counts how often each keyword occurs as a
# substring of the lowercased title/bio/page text and adds count * weight to
# the domain's score.
# "former fda" and "ex-fda" are listed under both "Regulatory Affairs" and
# "Former FDA / Industry Operator", so one mention scores in both domains.
PERSON_EXPERTISE_DOMAINS = {
    "Regulatory Affairs": {
        "keywords": ["regulatory affairs", "regulatory", "fda", "regulatory strategy", 
                    "regulatory compliance", "regulatory consultant", "former fda", "ex-fda",
                    "regulatory expert", "regulatory professional"],
        "weight": 1.0
    },
    "Clinical Research / Trials": {
        "keywords": ["clinical research", "clinical trial", "clinical study", "cro", 
                    "clinical development", "trial design", "clinical investigator",
                    "clinical operations", "phase i", "phase ii", "phase iii"],
        "weight": 1.0
    },
    "Biotech / Pharma": {
        "keywords": ["biotech", "biotechnology", "pharma", "pharmaceutical", "biopharma",
                    "biopharmaceutical", "drug development", "therapeutics", "biologics"],
        "weight": 1.0
    },
    "Medical Devices": {
        "keywords": ["medical device", "medical devices", "device development", "medtech",
                    "device design", "device engineering", "510k", "device regulatory"],
        "weight": 1.0
    },
    "Digital Health": {
        "keywords": ["digital health", "health tech", "healthcare technology", "health it",
                    "telemedicine", "health informatics", "healthcare innovation"],
        "weight": 1.0
    },
    "AI / Data Science": {
        "keywords": ["artificial intelligence", "ai", "machine learning", "data science",
                    "data scientist", "ml engineer", "deep learning", "neural network",
                    "data analytics", "predictive analytics"],
        "weight": 1.0
    },
    "Materials / Advanced Manufacturing": {
        "keywords": ["materials science", "advanced materials", "manufacturing", "materials engineering",
                    "nanomaterials", "composite materials", "materials research"],
        "weight": 1.0
    },
    "Robotics / Hardware": {
        "keywords": ["robotics", "robotic", "hardware", "hardware engineering", "robotics engineer",
                    "mechatronics", "embedded systems", "control systems"],
        "weight": 1.0
    },
    "Energy / Climate": {
        "keywords": ["energy", "renewable energy", "clean energy", "climate", "climate tech",
                    "sustainability", "sustainable", "carbon", "solar", "wind energy"],
        "weight": 1.0
    },
    "Education / EdTech": {
        "keywords": ["education", "edtech", "educational technology", "learning", "teaching",
                    "curriculum", "pedagogy", "educational innovation"],
        "weight": 1.0
    },
    "Policy / Government": {
        "keywords": ["policy", "public policy", "government", "government affairs", "policy analyst",
                    "regulatory policy", "health policy", "science policy"],
        "weight": 1.0
    },
    "Former FDA / Industry Operator": {
        "keywords": ["former fda", "ex-fda", "fda veteran", "fda alumni", "industry veteran",
                    "former regulator", "regulatory veteran", "industry operator"],
        "weight": 1.2  # Higher weight for explicit mentions
    }
}

def _count_keyword_hits(text: str, keyword: str) -> int:
    """
    Count whole-word occurrences of `keyword` in lower-cased `text`.

    A plain substring count makes short keywords such as "ai" match inside
    unrelated words ("training", "email"), so word boundaries are required on
    both sides. A trailing plural "s" is tolerated ("data scientists").
    """
    pattern = r"(?<!\w)" + re.escape(keyword.lower()) + r"s?(?!\w)"
    return len(re.findall(pattern, text))


def tag_person_expertise_domains(bio_text: str, title: str = "", page_text: str = "",
                                 domains: "dict | None" = None) -> dict:
    """
    Tag a person with BACKGROUND/DOMAIN expertise based on bio, title, and page text.
    These are domain/industry tags, NOT service capabilities.

    Args:
        bio_text: Free-text evidence about the person (None is treated as empty).
        title: The person's title/role (None is treated as empty).
        page_text: Extra page context (None is treated as empty).
        domains: Optional mapping of domain name -> {"keywords": [...], "weight": float}.
            Defaults to the module-level PERSON_EXPERTISE_DOMAINS.

    Returns:
        dict with:
          - "person_expertise_domains": "; "-joined primary + 0-2 secondary domains,
            strongest first ("" when nothing matched).
          - "expertise_keywords_matched": "; "-joined matched keywords (up to 3 per
            selected domain, max 15, in a stable order).
          - "expertise_confidence": confidence of the primary domain (0.0-1.0).
    """
    if domains is None:
        domains = PERSON_EXPERTISE_DOMAINS

    parts = ("" if part is None else str(part) for part in (title, bio_text, page_text))
    combined_text = " ".join(parts).lower()
    text_length = len(combined_text.split())

    # Confidence normaliser: at least 3 weighted hits, or 5% of the word count.
    denominator = max(3.0, text_length * 0.05)

    raw_scores = {}
    matched_keywords = {}

    for domain, config in domains.items():
        weight = config["weight"]
        score = 0.0
        matches = []

        for keyword in config["keywords"]:
            hits = _count_keyword_hits(combined_text, keyword)
            if hits:
                score += hits * weight
                matches.append(keyword)

        if score > 0:
            raw_scores[domain] = score
            matched_keywords[domain] = matches

    if not raw_scores:
        # It's acceptable for people to have NO expertise tags if insufficient evidence
        return {
            "person_expertise_domains": "",
            "expertise_keywords_matched": "",
            "expertise_confidence": 0.0
        }

    # Confidence is a capped, monotonic function of the raw score, so ranking by
    # raw score gives the same order while still separating domains that both
    # saturate at 1.0. The sort is stable: exact ties keep definition order.
    ranked = sorted(raw_scores, key=raw_scores.get, reverse=True)[:3]  # primary + 0-2 secondary

    selected_keywords = []
    for domain in ranked:
        selected_keywords.extend(matched_keywords[domain][:3])  # Top 3 per domain
    # dict.fromkeys de-duplicates while preserving order (a set would shuffle it).
    unique_keywords = list(dict.fromkeys(selected_keywords))[:15]

    primary_confidence = round(min(1.0, raw_scores[ranked[0]] / denominator), 3)

    return {
        "person_expertise_domains": "; ".join(ranked),
        "expertise_keywords_matched": "; ".join(unique_keywords),
        "expertise_confidence": primary_confidence
    }

# --- 5.8) Extract People from Support Pages ---
print("\n" + "="*60)
print("PHASE 2: EXTRACTING PEOPLE FROM SUPPORT PAGES")
print("="*60)


def _people_cell_text(value) -> str:
    """
    Return a DataFrame cell as text, mapping missing values to "".

    str(NaN) is the truthy string "nan", which would defeat `a or b` fallbacks
    and be treated as a real URL / org name. None, NaN and their stringified
    forms ("nan", "none", "<na>") or blank strings are all returned as "".
    Any other value is returned as str(value), unmodified.
    """
    if value is None or (isinstance(value, float) and value != value):
        return ""
    text = str(value)
    if text.strip().lower() in ("", "nan", "none", "<na>"):
        return ""
    return text


all_people = []

for idx, row in tqdm(enriched_df.iterrows(), total=len(enriched_df), desc="Extracting people"):
    org_name = _people_cell_text(row.get("Org Name", ""))
    # Prefer the post-redirect URL; fall back to the normalized input URL.
    base_url = (_people_cell_text(row.get("final_url", ""))
                or _people_cell_text(row.get("website_normalized", "")))

    if not base_url or not org_name:
        continue

    # Get candidate URLs from homepage scraping (missing guesses stay as "")
    candidate_urls = [
        _people_cell_text(row.get("team_url_guess", "")),
        _people_cell_text(row.get("about_url_guess", "")),
        _people_cell_text(row.get("contact_url_guess", ""))
    ]

    # Crawl support pages (max 2 pages per org)
    crawled_pages = crawl_support_pages(base_url, candidate_urls, max_pages=2)

    # Extract people from each crawled page
    for source_url, html in crawled_pages.items():
        people_from_page = extract_people_from_html(org_name, source_url, html)
        if not people_from_page:
            continue

        # Page context is identical for everyone on the page: compute it once.
        page_text = clean_text(html)[:1000]

        # Tag each person with domain expertise (background/industry, NOT services)
        for person in people_from_page:
            bio_text = person.get("Evidence snippet", "")
            title = person.get("Title/Role", "")

            expertise_tags = tag_person_expertise_domains(bio_text, title, page_text)
            person.update(expertise_tags)

        all_people.extend(people_from_page)

# Collapse repeated people: one entry per (org, lower-cased person name),
# keeping the first occurrence and dropping entries with an empty name.
print(f"\n✓ Extracted {len(all_people)} people entries before deduplication")

seen_keys = set()
deduplicated_people = []
for candidate in all_people:
    normalized_name = candidate.get("Person Name", "").lower().strip()
    dedupe_key = (candidate.get("Org Name", ""), normalized_name)

    if dedupe_key in seen_keys or not normalized_name:
        continue

    seen_keys.add(dedupe_key)
    deduplicated_people.append(candidate)

print(f"✓ After deduplication: {len(deduplicated_people)} unique people")

# Build the people-level table (empty DataFrame when nothing was found)
if not deduplicated_people:
    people_df = pd.DataFrame()
    print("\n⚠ No people extracted")
else:
    people_df = pd.DataFrame(deduplicated_people)

    # Preferred column layout; columns absent from the data are skipped
    column_order = ["Org Name", "Person Name", "Title/Role", "Email",
                    "Source URL", "Evidence snippet", "person_expertise_domains",
                    "expertise_keywords_matched", "expertise_confidence"]
    available_columns = [col for col in column_order if col in people_df.columns]
    people_df = people_df[available_columns]

    org_total = people_df['Org Name'].nunique()
    print(f"\n✓ People extraction complete: {len(people_df)} people from {org_total} organizations")

    # Preview the first few rows
    print("\nSample extracted people (first 5):")
    for _, sample in people_df.head(5).iterrows():
        person_label = sample.get('Person Name', '')
        org_label = sample.get('Org Name', '')
        role_label = sample.get('Title/Role', 'N/A')
        print(f"  • {person_label} ({org_label}) - {role_label}")

        if not sample.get('person_expertise_domains'):
            print("    Domain Expertise: (none - insufficient evidence)")
        else:
            print(f"    Domain Expertise: {sample.get('person_expertise_domains', '')[:60]}")

print("\n" + "="*60 + "\n")

# ================================
# VALIDATION: Verify Phase 1 & Phase 2 Corrections
# ================================
from collections import Counter


def _validation_split_tags(cell) -> list:
    """Split a ';'-separated cell into stripped, non-empty tags (None/NaN -> [])."""
    # str(NaN) would otherwise be counted as a literal "nan" tag.
    if cell is None or (isinstance(cell, float) and cell != cell):
        return []
    return [tag.strip() for tag in str(cell).split(";") if tag.strip()]


def _validation_combine(*results):
    """Merge check outcomes: False if any failed, True if all passed, else None (not checked)."""
    if any(result is False for result in results):
        return False
    if all(result is True for result in results):
        return True
    return None


def _validation_confirm(outcome, message: str) -> None:
    """Print one confirmation line whose marker reflects the actual check outcome."""
    if outcome is True:
        print(f"   ✓ {message}")
    elif outcome is False:
        print(f"   ⚠ FAILED: {message}")
    else:
        print(f"   ? NOT CHECKED: {message}")


print("\n" + "="*60)
print("VALIDATION: PHASE 1 & PHASE 2 CORRECTIONS")
print("="*60)

allowed_caps = {"Regulatory / FDA", "Clinical & Translational Support",
                "IP / Legal / Licensing", "Manufacturing / GMP / Scale-Up"}

# Check outcomes: True = passed, False = failed, None = could not be checked.
caps_allowed_ok = None
caps_keywords_ok = None
domains_allowed_ok = None
domains_distinct_ok = None
max_domains_ok = None

# --- Validation 1: Org Capabilities Distribution ---
print("\n1. ORGANIZATIONAL CAPABILITIES DISTRIBUTION:")
if "org_capabilities" in enriched_df.columns:
    all_caps = []
    for caps_cell in enriched_df["org_capabilities"]:
        all_caps.extend(_validation_split_tags(caps_cell))

    if all_caps:
        cap_counts = Counter(all_caps)
        print("   Capability counts:")
        for cap, count in cap_counts.most_common():
            pct = 100 * count / len(enriched_df)
            print(f"     {cap:40s}: {count:4d} ({pct:5.2f}%)")

    # Verify only allowed capabilities exist
    forbidden_caps = set(all_caps) - allowed_caps
    caps_allowed_ok = not forbidden_caps
    if forbidden_caps:
        print(f"\n   ⚠ ERROR: Found forbidden capabilities: {forbidden_caps}")
    else:
        print(f"\n   ✓ All capabilities are in allowed set: {allowed_caps}")

    # Check for forbidden keywords: flagged when a keyword appears both in the
    # page text and in the keywords that drove the capability match.
    forbidden_keywords = ["funding", "fund", "grant", "sbir", "sttr", "investor",
                          "fundraising", "customer discovery", "prototyping", "product development"]
    found_forbidden = []
    for idx, row in enriched_df.iterrows():
        text = f"{row.get('meta_description', '')} {row.get('h1', '')} {row.get('text_snippet', '')}".lower()
        matched_text = str(row.get('capability_keywords_matched', '')).lower()
        for keyword in forbidden_keywords:
            if keyword in text and keyword in matched_text:
                found_forbidden.append(keyword)
                break

    caps_keywords_ok = not found_forbidden
    if found_forbidden:
        print(f"   ⚠ WARNING: Found forbidden keywords in capability matches: {set(found_forbidden)}")
    else:
        print(f"   ✓ No forbidden keywords (funding, SBIR, customer discovery, prototyping) in capabilities")
else:
    print("   (No org_capabilities column found - capability checks skipped)")

# --- Validation 2: People Expertise Domains Distribution ---
print("\n2. PERSON EXPERTISE DOMAINS DISTRIBUTION:")
if 'people_df' in locals() and len(people_df) > 0:
    if "person_expertise_domains" in people_df.columns:
        domains_per_person = [_validation_split_tags(cell)
                              for cell in people_df["person_expertise_domains"]]
        all_domains = [domain for tags in domains_per_person for domain in tags]

        if all_domains:
            domain_counts = Counter(all_domains)
            print("   Domain counts:")
            for domain, count in domain_counts.most_common():
                pct = 100 * count / len(people_df)
                print(f"     {domain:40s}: {count:4d} ({pct:5.2f}%)")

        # Check % with no expertise (empty string and missing values both count)
        no_expertise = sum(1 for tags in domains_per_person if not tags)
        pct_no_expertise = 100 * no_expertise / len(people_df)
        print(f"\n   People with NO expertise tags: {no_expertise} ({pct_no_expertise:.1f}%)")
        print(f"   ✓ This is acceptable - people may have insufficient evidence")

        # Verify no forbidden expertise domains
        forbidden_domains = ["funding", "fund", "grant", "sbir", "sttr", "investor",
                             "fundraising", "customer discovery", "prototyping", "product development",
                             "capital", "venture capital"]
        found_forbidden_domains = [
            domain for domain in all_domains
            if any(forbidden in domain.lower() for forbidden in forbidden_domains)
        ]

        domains_allowed_ok = not found_forbidden_domains
        if found_forbidden_domains:
            print(f"\n   ⚠ ERROR: Found forbidden expertise domains: {set(found_forbidden_domains)}")
        else:
            print(f"\n   ✓ No forbidden expertise domains (funding, SBIR, customer discovery, prototyping)")

        # Verify people expertise ≠ org capabilities: no domain may reuse a capability name
        overlapping = set(all_domains) & allowed_caps
        domains_distinct_ok = not overlapping
        if overlapping:
            print(f"\n   ⚠ ERROR: Expertise domains reuse org capability names: {overlapping}")
        else:
            print(f"\n   ✓ People expertise represents BACKGROUND/DOMAIN (not org capabilities)")
            print(f"   ✓ Org capabilities represent SERVICES (not people backgrounds)")

        # Verify the primary + 0-2 secondary limit actually holds
        max_domains_ok = all(len(tags) <= 3 for tags in domains_per_person)
    else:
        print("   (No person_expertise_domains column found - domain checks skipped)")
else:
    print("   (No people extracted yet)")

# --- Validation 3: Explicit Confirmations ---
# Each line reports the outcome of the checks above instead of asserting success.
print("\n3. EXPLICIT CONFIRMATIONS:")
caps_outcome = _validation_combine(caps_allowed_ok, caps_keywords_ok)
_validation_confirm(caps_outcome, "No funding-related categories in org capabilities")
_validation_confirm(caps_outcome, "No customer discovery categories in org capabilities")
_validation_confirm(caps_outcome, "No prototyping/product development categories in org capabilities")
_validation_confirm(_validation_combine(domains_allowed_ok, domains_distinct_ok),
                    "People expertise domains ≠ org capabilities (conceptual separation)")
print("   ✓ People may have NO expertise tags (acceptable)")
_validation_confirm(max_domains_ok, "Max 3 expertise domains per person")
print("   ✓ Org types (Accelerator, Funder, etc.) NOT modified by this pipeline")

print("\n" + "="*60 + "\n")

# --- 6) Merge back to your original DB (keep your existing columns unchanged) ---
# This keeps all your current fields, and appends new scraped fields.
# Handle potential duplicates by keeping first match
merge_keys = ["Org Name", "Website URL"]
enriched_df_dedup = enriched_df.drop_duplicates(subset=merge_keys, keep="first")

# If a scraped column shares a name with an original column, pandas would rename
# BOTH sides (default "_x"/"_y"), silently changing the original column names.
# An empty left suffix keeps the original names intact; only the colliding
# scraped column is renamed with "_scraped". With no collisions this is a no-op.
final_df = df.merge(
    enriched_df_dedup,
    on=merge_keys,
    how="left",
    suffixes=("", "_scraped")
)

print(f"Merged dataframe: {final_df.shape[0]} rows × {final_df.shape[1]} columns")
print(f"Original columns preserved: {len(df.columns)}")
print(f"New columns added: {final_df.shape[1] - len(df.columns)}")

# --- 6.5) Diagnostics Report ---
print("\n" + "="*60)
print("DIAGNOSTICS REPORT")
print("="*60)

total_rows = len(final_df)
has_http_ok = "http_ok" in final_df.columns

# A row is a success only when http_ok is exactly True; rows left unmatched by
# the merge (NaN) count as failures, matching the failure listing below.
success_count = int(final_df["http_ok"].eq(True).sum()) if has_http_ok else 0
failure_count = total_rows - success_count

# Guard against an empty dataframe (would otherwise raise ZeroDivisionError)
success_pct = 100 * success_count / total_rows if total_rows else 0.0
failure_pct = 100 * failure_count / total_rows if total_rows else 0.0

print(f"\n1. Overall Statistics:")
print(f"   Total rows processed: {total_rows}")
print(f"   Successful scrapes: {success_count} ({success_pct:.1f}%)")
print(f"   Failed scrapes: {failure_count} ({failure_pct:.1f}%)")

if "scrape_error" in final_df.columns:
    print(f"\n2. Top 5 Scrape Error Reasons:")
    scrape_errors = final_df["scrape_error"]
    error_counts = scrape_errors[scrape_errors.notna() & (scrape_errors != "")].value_counts().head(5)
    for i, (error, count) in enumerate(error_counts.items(), 1):
        pct = 100 * count / failure_count if failure_count > 0 else 0
        print(f"   {i}. {str(error)[:60]:60s} ({count} occurrences, {pct:.1f}% of failures)")

    print(f"\n3. Sample Failed Organizations:")
    # Without an http_ok column every row was counted as a failure above,
    # so the sample is drawn from the whole dataframe.
    failed_rows = final_df[final_df["http_ok"].ne(True)] if has_http_ok else final_df
    failures = failed_rows[["Org Name", "Website URL", "scrape_error"]].head(10)
    for idx, row in failures.iterrows():
        org = str(row.get("Org Name", ""))[:40]
        url = str(row.get("Website URL", ""))[:40]
        error = str(row.get("scrape_error", ""))[:50]
        print(f"   • {org:40s} | {url:40s} | {error}")
else:
    print("\n   No scrape_error column found.")

print("\n" + "="*60 + "\n")

# --- 7) Save outputs ---
print("\n" + "="*60)
print("SAVING OUTPUTS")
print("="*60)

# Save enriched organization database
OUTPUT_CSV = "Organization_Database_enriched_v1_0.csv"
final_df.to_csv(OUTPUT_CSV, index=False)

print(f"\n✓ Saved enriched organization database: {OUTPUT_CSV}")
print(f"  Total rows: {len(final_df)}")
print(f"  Total columns: {final_df.shape[1]}")
print(f"  Includes: homepage signals + org_capabilities (4 capabilities only) + all original columns")
print(f"  Note: Org types (Accelerator, Funder, etc.) are NOT modified - they exist in original CSV")

# Save people database (if any were extracted)
if 'people_df' in locals() and len(people_df) > 0:
    PEOPLE_CSV = "People_Extracted_v1_0.csv"
    people_df.to_csv(PEOPLE_CSV, index=False)
    print(f"\n✓ Saved people database: {PEOPLE_CSV}")
    print(f"  Total people: {len(people_df)}")
    print(f"  Organizations represented: {people_df['Org Name'].nunique()}")
    print(f"  Includes: names, titles, emails, person_expertise_domains (background/domain), source URLs")
    print(f"  Note: People expertise is domain/background based (NOT org capabilities)")
else:
    print(f"\n⚠ No people extracted - skipping People_Extracted CSV")

# Optional: also save a quick QA file for failures.
# The scrape status columns are checked first so a missing column cannot raise
# KeyError here, after the main outputs have already been written.
qa_columns = ["Org Name", "Website URL", "scrape_error"]
missing_qa_columns = [col for col in ["http_ok"] + qa_columns if col not in final_df.columns]
if missing_qa_columns:
    print(f"\n⚠ Skipping scrape_failures.csv - missing columns: {missing_qa_columns}")
else:
    # Anything that is not exactly True (False or NaN from the merge) is a failure
    failures = final_df[final_df["http_ok"] != True][qa_columns]
    if len(failures) > 0:
        failures.to_csv("scrape_failures.csv", index=False)
        print(f"\n✓ Failures saved: scrape_failures.csv ({len(failures)} rows)")
    else:
        print("\n✓ No failures to save - all scrapes successful!")

print("\n" + "="*60)
print("PIPELINE COMPLETE")
print("="*60)
print("Phase 0: ✓ Error fixes and diagnostics")
print("Phase 1: ✓ Organizational capabilities identification (4 capabilities only)")
print("Phase 2: ✓ People extraction and domain expertise tagging (background/industry)")
print("Validation: ✓ All corrections verified")
print("="*60)
print("\nKEY CHANGES:")
print("  • Org capabilities: Regulatory/FDA, Clinical, IP/Legal, Manufacturing ONLY")
print("  • Removed: Funding, SBIR/STTR, Customer Discovery, Prototyping from capabilities")
print("  • People expertise: Domain/background based (NOT service capabilities)")
print("  • People may have 0-3 expertise domains (acceptable if insufficient evidence)")
print("="*60)

  9%|▊         | 42/492 [02:13<16:40,  2.22s/it]