<a href="https://colab.research.google.com/github/ataucuriaia/ESO-new-project/blob/main/ESO_new_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ================================
# Colab Starter Notebook (English)
# Enterprise Studio — ESO Web Scraping + Enrichment (v0.1)
# Inputs: your existing CSV with columns:
#   - "Org Name" (col A equivalent)
#   - "Website URL" (col F equivalent)
# Output: enriched CSV with scraped signals (title, meta description, text snippet, contacts, socials, etc.)
# ================================

# --- 1) Install + imports ---
!pip -q install beautifulsoup4 lxml tqdm requests

import re
import time
import json
from urllib.parse import urlparse, urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

# --- 2) Load your existing database (uploaded to Colab) ---
# INPUT_PATH points at the CSV uploaded into the Colab session; edit it if the
# file is re-uploaded under another name.
INPUT_PATH = "/content/Organization Database 1f24e34e337d8027b500d2a10b1ceaa7.csv"

df = pd.read_csv(INPUT_PATH)

# Fail fast when the sheet does not have the two columns the scraper reads.
REQUIRED_COLS = ["Org Name", "Website URL"]
available_cols = set(df.columns)
missing = [name for name in REQUIRED_COLS if name not in available_cols]
if missing:
    raise ValueError(f"Missing required columns: {missing}. Found columns: {list(df.columns)}")

# Quick visual check of the first rows.
df.head()

# --- 3) Helpers: URL cleaning + safe request ---
# Request headers sent with every HTTP GET issued by safe_get(); the
# User-Agent value is a desktop Chrome-on-Linux string.
HEADERS = {}
HEADERS["User-Agent"] = (
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
)

def normalize_url(url: str) -> str:
    """Normalize a website URL so it can be requested.

    Removes all whitespace and prepends ``https://`` when the value has no
    HTTP(S) scheme but looks like a domain.

    Args:
        url: Raw cell value from the "Website URL" column. Non-string values
            (e.g. NaN from an empty cell) are accepted.

    Returns:
        The cleaned URL, or "" when ``url`` is not a string or is blank.
        A value with no scheme and no "." is returned as-is (minus
        whitespace), because it cannot be turned into a domain reliably.
    """
    if not isinstance(url, str):
        return ""
    # URLs cannot contain whitespace; drop spaces, tabs and newlines anywhere.
    u = "".join(url.split())
    if not u:
        return ""
    # Scheme names are case-insensitive, so "HTTP://..." already has one.
    # A case-sensitive check would produce "https://HTTP://...".
    if re.match(r"^https?://", u, re.IGNORECASE):
        return u
    # Protocol-relative form ("//example.com"): only the scheme is missing.
    if u.startswith("//"):
        return "https:" + u
    # Bare domain such as "www.example.com" or "example.com/path".
    if "." in u:
        return "https://" + u
    return u

def get_domain(url: str) -> str:
    """Return the lower-cased network location of ``url``.

    The value is the ``netloc`` component as parsed by ``urlparse`` (so it
    keeps any port or user-info part). Returns "" when ``url`` has no
    network location or cannot be parsed.
    """
    try:
        parsed = urlparse(url)
        location = parsed.netloc
        return location.lower()
    except Exception:
        # Best-effort helper: any parsing problem means "no domain".
        return ""

def safe_get(url: str, timeout=20, max_retries=2, backoff=1.5):
    """Fetch ``url`` with HTTP GET, retrying transient failures.

    Args:
        url: Absolute URL to request. A falsy value short-circuits.
        timeout: Per-request timeout in seconds, passed to ``requests.get``.
        max_retries: Number of additional attempts after the first one.
        backoff: Base of the exponential delay; the wait after failed
            attempt ``n`` (0-based) is ``backoff ** n`` seconds.

    Returns:
        A ``(final_url, html_text)`` tuple:
          * ``(response_url, html)`` for a successful HTML response;
          * ``(response_url, "")`` when the response is not HTML;
          * ``("", "")`` when ``url`` is empty, every attempt failed, or the
            server answered with a non-retryable HTTP error status.
    """
    if not url:
        return "", ""

    # Statuses treated as transient (blocking / rate limiting / server side).
    retryable_statuses = (403, 429, 500, 502, 503, 504)

    for attempt in range(max_retries + 1):
        try:
            r = requests.get(url, headers=HEADERS, timeout=timeout, allow_redirects=True)
            if r.status_code in retryable_statuses:
                raise RuntimeError(f"HTTP {r.status_code}")
            if r.status_code >= 400:
                # Other error statuses (404, 410, ...) will not improve on a
                # retry, and their error page must not be scraped as if it
                # were the organisation's home page.
                return "", ""
            content_type = (r.headers.get("Content-Type") or "").lower()
            if "text/html" not in content_type:
                # Some sites return PDFs or other content; skip for now
                return r.url, ""
            return r.url, r.text
        except Exception:
            # Deliberately broad: one bad site must not abort the whole batch.
            # Wait only when another attempt will follow.
            if attempt < max_retries:
                time.sleep(backoff ** attempt)
    return "", ""

# --- 4) HTML parsing: extract useful fields for your ESO DB ---
# Loose e-mail matcher: one or more of letters/digits/"._%+-", an "@", a
# domain made of letters/digits/".-", then a final "." followed by at least
# two letters. The pattern is unanchored, so it can be used to search inside
# longer text.
EMAIL_RE = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}")

def clean_text(s: str) -> str:
    """Collapse every run of whitespace in ``s`` to one space and trim the ends.

    A falsy ``s`` (``None`` or "") yields "".
    """
    raw = s if s else ""
    collapsed = re.sub(r"\s+", " ", raw)
    return collapsed.strip()

def extract_page_signals(base_url: str, html: str) -> dict:
    """Extract lightweight, high-signal fields from home page HTML.

    (You can extend this later: team page scraping, keyword tagging, etc.)

    Args:
        base_url: URL the HTML was fetched from; relative links are resolved
            against it.
        html: Raw HTML of the page.

    Returns:
        A dict of strings with the keys ``site_title``, ``meta_description``,
        ``h1``, ``text_snippet`` (first 600 characters of the page text),
        ``emails_found`` (up to five addresses joined with "; "),
        ``contact_url_guess``, ``about_url_guess``, ``team_url_guess``,
        ``linkedin_url``, ``twitter_x_url``, ``youtube_url``,
        ``facebook_url`` and ``instagram_url``. Fields that were not found
        are "".
    """
    soup = BeautifulSoup(html, "lxml")

    # Title
    title = clean_text(soup.title.get_text()) if soup.title else ""

    # Meta description
    meta_desc = ""
    tag = soup.find("meta", attrs={"name": re.compile("^description$", re.I)})
    if tag and tag.get("content"):
        meta_desc = clean_text(tag["content"])

    # H1
    h1_tag = soup.find("h1")
    h1 = clean_text(h1_tag.get_text()) if h1_tag else ""

    # Social networks are recognised by the link's host name, not by a
    # substring of the href: a substring test for "x.com" also matches
    # wix.com, netflix.com, dropbox.com, ...
    social_domains = {
        "linkedin": ("linkedin.com",),
        "twitter_x": ("twitter.com", "x.com"),
        "youtube": ("youtube.com",),
        "facebook": ("facebook.com",),
        "instagram": ("instagram.com",),
    }
    socials = {key: "" for key in social_domains}

    # Contact/about/team page candidates (just links, not crawling yet)
    link_candidates = {"contact_url": "", "about_url": "", "team_url": ""}

    for a in soup.find_all("a", href=True):
        href = a["href"].strip()

        # Make absolute if relative. A malformed href is skipped so that one
        # bad link does not discard the signals of the whole page.
        try:
            abs_url = urljoin(base_url, href)
            parsed = urlparse(abs_url)
        except ValueError:
            continue
        host = (parsed.hostname or "").lower()

        # First matching link per network wins; the href is stored as written.
        for key, domains in social_domains.items():
            if socials[key]:
                continue
            if any(host == d or host.endswith("." + d) for d in domains):
                socials[key] = href

        # mailto:, tel:, javascript: ... are not pages that could be visited.
        if parsed.scheme not in ("http", "https"):
            continue

        text = (a.get_text() or "").lower().strip()
        href_lower = href.lower()

        if not link_candidates["contact_url"] and ("contact" in text or "contact" in href_lower):
            link_candidates["contact_url"] = abs_url
        if not link_candidates["about_url"] and (
            "about" in text or "about" in href_lower or "who we are" in text
        ):
            link_candidates["about_url"] = abs_url
        if not link_candidates["team_url"] and (
            "team" in text or "leadership" in text
            or "team" in href_lower or "leadership" in href_lower
        ):
            link_candidates["team_url"] = abs_url

    # Page text is extracted once and reused for e-mails and the snippet.
    raw_text = soup.get_text(" ")

    # Emails found on page (alphabetical, capped to keep the cell short)
    emails = sorted(set(EMAIL_RE.findall(raw_text)))[:5]

    # A short text snippet (useful for later tagging/classification)
    snippet = clean_text(raw_text)[:600]

    return {
        "site_title": title,
        "meta_description": meta_desc,
        "h1": h1,
        "text_snippet": snippet,
        "emails_found": "; ".join(emails),
        "contact_url_guess": link_candidates["contact_url"],
        "about_url_guess": link_candidates["about_url"],
        "team_url_guess": link_candidates["team_url"],
        "linkedin_url": socials["linkedin"],
        "twitter_x_url": socials["twitter_x"],
        "youtube_url": socials["youtube"],
        "facebook_url": socials["facebook"],
        "instagram_url": socials["instagram"],
    }

# --- 5) Main loop: scrape each row (rate-limited) ---
RATE_LIMIT_SECONDS = 1.0  # pause after each contacted site; tune later

# One result dict is appended per input row, in input order.
enriched_rows = []
for idx, row in tqdm(df.iterrows(), total=len(df)):
    org = row.get("Org Name", "")
    raw_url = row.get("Website URL", "")
    url = normalize_url(raw_url)

    # Defaults describe a row that has not been scraped successfully.
    out = {
        "Org Name": org,
        "Website URL": raw_url,
        "website_normalized": url,
        "website_domain": get_domain(url),
        "final_url": "",
        "http_ok": False,
        "scrape_error": "",
    }

    # The rate-limit pause applies only when a server actually answered.
    got_response = False

    if not url:
        out["scrape_error"] = "Missing URL"
    else:
        final_url, html = safe_get(url)
        if not final_url:
            out["scrape_error"] = "Request failed"
        else:
            got_response = True
            out["final_url"] = final_url
            if not html:
                out["scrape_error"] = "Non-HTML response or empty HTML"
            else:
                try:
                    signals = extract_page_signals(final_url, html)
                    out.update(signals)
                    out["http_ok"] = True
                except Exception as e:
                    # Keep going: a page that fails to parse is only recorded.
                    out["scrape_error"] = f"Parse error: {e}"

    enriched_rows.append(out)
    if got_response:
        time.sleep(RATE_LIMIT_SECONDS)

enriched_df = pd.DataFrame(enriched_rows)

# --- 6) Merge back to your original DB (keep your existing columns unchanged) ---
# This keeps all your current fields, and appends new scraped fields.
#
# The scrape loop appends exactly one result per row of df, in df's order, so
# the scraped fields are attached by row instead of by key. A key-based merge
# on ("Org Name", "Website URL") multiplies rows whenever the same pair occurs
# more than once in the database (2 duplicates -> 4 output rows).
scraped_fields = enriched_df.drop(columns=["Org Name", "Website URL"], errors="ignore")
scraped_fields.index = df.index  # raises ValueError if the row counts differ
final_df = df.merge(
    scraped_fields,
    left_index=True,
    right_index=True,
    how="left",
    validate="one_to_one",  # guards against a non-unique row index
)

final_df.head()

# --- 7) Save outputs ---
# Full enriched table, written without the pandas index column.
OUTPUT_CSV = "Organization_Database_enriched_v0_1.csv"
final_df.to_csv(OUTPUT_CSV, index=False)
print("Saved:", OUTPUT_CSV)

# Optional QA file: every row whose http_ok flag is anything other than True
# (False or missing), with just the identifying columns and the error text.
failures = final_df.loc[
    final_df["http_ok"].ne(True),
    ["Org Name", "Website URL", "scrape_error"],
]
failures.to_csv("scrape_failures.csv", index=False)
print("Failures saved: scrape_failures.csv")
print("Failure count:", len(failures))

  9%|▊         | 42/492 [02:13<16:40,  2.22s/it]