In [None]:
import re, os, io, requests, pandas as pd
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse, parse_qs
from datetime import datetime

# Site root; links found while scraping are resolved against it with urljoin.
BASE = "https://www.espon.eu"
# Procurement listing page that the crawl starts from and paginates over.
URL = "https://www.espon.eu/engage/procurements"
# Destination of the CSV written at the end of this cell (absolute, machine-specific path).
OUT = "/Users/cedricjansen/Desktop/bak-economics/Cedric/espon_tenders.csv"
# Tenders dated before this day are treated as "old" by the crawl loop.
CUTOFF = datetime(2018, 1, 1)
MAX_PAGES = 20  # hard cap; we will stop earlier if no more links

# NOTE: this only echoes the configured value; no request or check is performed here.
print(f"BASE verified: {BASE}")
# Request headers sent with every GET issued by fetch().
HEADERS = {"User-Agent": "Mozilla/5.0"}
# NOTE(review): not referenced anywhere in the visible code — confirm before relying on it.
KEYWORDS = ["procure", "tender", "call", "contract", "rfp", "rft", "rfq", "invitation"]

# Optional PDF text extraction: `pdf_text` is a callable when pypdf can be
# imported and None otherwise, so callers can test it for truthiness.
try:
    from pypdf import PdfReader
except Exception:
    pdf_text = None
else:
    def pdf_text(content: bytes) -> str:
        """Return the text of all pages in the PDF bytes, or "" if reading fails."""
        try:
            reader = PdfReader(io.BytesIO(content))
            page_texts = ((page.extract_text() or "") for page in reader.pages)
            return "\n".join(page_texts)
        except Exception:
            # Best effort: an unreadable PDF yields no text.
            return ""


def fetch(url: str, timeout: int = 25) -> bytes:
    """Download *url* and return the raw response body.

    The request is sent with the module-level HEADERS. Any failure (network
    error, timeout, HTTP error status) is logged and reported to the caller
    as an empty bytes object instead of an exception.
    """
    try:
        log(f"   GET: {url}")
        response = requests.get(url, headers=HEADERS, timeout=timeout)
        # Turn 4xx/5xx answers into exceptions so they take the failure path.
        response.raise_for_status()
        return response.content
    except Exception as err:
        log(f"   GET FAILED: {url} -> {err}")
        return b""


def parse_date_any(text: str) -> datetime | None:
    if not text:
        return None
    patterns = [
        r"\b(\d{1,2})[./-](\d{1,2})[./-](\d{2,4})\b",  # 20/09/2028 or 20-09-2028
        r"\b(\d{1,2})\s+(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Sept|Oct|Nov|Dec)[a-z]*\s+(\d{2,4})\b",  # 20 Sep 2028
        r"\b(January|February|March|April|May|June|July|August|September|October|November|December)\s+(\d{1,2}),\s*(\d{2,4})\b",  # September 20, 2028
    ]
    months = {
        'jan':1,'feb':2,'mar':3,'apr':4,'may':5,'jun':6,'jul':7,'aug':8,'sep':9,'sept':9,'oct':10,'nov':11,'dec':12,
        'january':1,'february':2,'march':3,'april':4,'may':5,'june':6,'july':7,'august':8,'september':9,'october':10,'november':11,'december':12
    }
    for pat in patterns:
        m = re.search(pat, text, flags=re.I)
        if not m:
            continue
        g = list(m.groups())
        # Case 1: d/m/y numeric
        if g[0].isdigit() and g[1].isdigit():
            d, mth, y = int(g[0]), int(g[1]), int(g[2])
            if y < 100: y += 2000
            return datetime(y, mth, d)
        # Case 2: d Month y (e.g., 20 Sep 2028)
        if g[0].isdigit() and not g[1].isdigit():
            d, mname, y = int(g[0]), g[1].lower(), int(g[2])
            if y < 100: y += 2000
            mth = months.get(mname[:3], months.get(mname))
            if mth:
                return datetime(y, mth, d)
        # Case 3: Month d, y (e.g., September 20, 2028)
        if not g[0].isdigit() and g[1].isdigit():
            mname, d, y = g[0].lower(), int(g[1]), int(g[2])
            if y < 100: y += 2000
            mth = months.get(mname[:3], months.get(mname))
            if mth:
                return datetime(y, mth, d)
    return None


def norm_date(text: str) -> str:
    """Return the date found in *text* formatted as YYYY-MM-DD, or "" if none is found."""
    parsed = parse_date_any(text)
    if not parsed:
        return ""
    return parsed.strftime("%Y-%m-%d")


def detect_languages(text: str) -> str:
    """Guess which of de/fr/it/en are mentioned in *text*.

    Returns the detected codes sorted and joined with ";", or "en" when the
    text is empty or nothing is detected. A language counts as mentioned when
    the text contains (case-insensitively) one of its names, its code in
    parentheses, or its bare two-letter code followed by a space.

    The bare code must start a word: previously it was matched as a raw
    substring, so ordinary English words such as "made ", "submit " or
    "when " flagged German, Italian and English on almost any page.
    """
    if not text:
        return "en"
    t = text.lower()
    markers = {
        "de": ("deutsch", "german", "(de)"),
        "fr": ("français", "french", "(fr)"),
        "it": ("italiano", "italian", "(it)"),
        "en": ("english", "(en)"),
    }
    langs = set()
    for code, names in markers.items():
        # NOTE(review): the standalone English pronoun "it " still counts as Italian.
        if any(name in t for name in names) or re.search(rf"(?<!\w){code} ", t):
            langs.add(code)
    return ";".join(sorted(langs)) or "en"


def _labelled_date(body_text: str, labels: list[str]) -> str:
    """Return the normalised date found after the first label that yields one, else ""."""
    for lab in labels:
        m = re.search(lab + r"[^\d]{0,80}(.*)$", body_text, flags=re.I)
        if m:
            found = norm_date(m.group(0)) or norm_date(m.group(1))
            if found:
                return found
    return ""


def extract_detail(url: str) -> dict:
    """Scrape one tender detail page into a flat dict.

    Returned keys: ``title``, ``description`` ("N/A" when not found),
    ``publication_date``, ``deadline`` (YYYY-MM-DD or ""), ``doc_url`` and
    ``pdf_desc`` (first link whose text looks like tender documentation, plus
    up to 250 characters of its parent's text), ``pdfs`` (sorted unique
    .pdf/.xls/.xlsx links), ``lang`` and ``detail_url``.

    If the page cannot be fetched, a placeholder dict with the same keys is
    returned instead of raising.

    Links are resolved against *url* (the page they appear on) so relative
    hrefs point to the right place, and attachment detection looks at the
    path only, so ``file.pdf?download=1`` is recognised.
    """
    html = fetch(url)
    if not html:
        return {"title":"N/A","description":"N/A","publication_date":"","deadline":"","doc_url":"","pdf_desc":"","pdfs":[],"lang":"en","detail_url":url}
    soup = BeautifulSoup(html, "html.parser")

    # Prefer main content/article container; fall back to the whole document.
    content = soup.find("article") or soup.select_one("main .node, .node__content, .region-content, #content, main") or soup

    # Title: h1 within content; fallback to og:title; ensure not navigation placeholder
    placeholders = ["main navigation", "navigation"]
    title = "N/A"
    h1 = content.find("h1")
    if h1:
        t = h1.get_text(" ", strip=True)
        if t and t.lower() not in placeholders:
            title = t
    if title == "N/A":
        og = soup.find("meta", attrs={"property":"og:title"})
        if og and og.get("content") and og["content"].strip().lower() not in placeholders:
            title = og["content"].strip()

    # Description: first paragraph of at least 40 characters that is not site boilerplate
    boilerplate = [
        "select filters below", "cookies", "main navigation", "subscribe", "share this", "related content"
    ]
    description = "N/A"
    for p in content.find_all("p"):
        txt = p.get_text(" ", strip=True)
        if not txt or len(txt) < 40:
            continue
        low = txt.lower()
        if any(bad in low for bad in boilerplate):
            continue
        description = txt
        break

    body_text = content.get_text(" ", strip=True)

    # Dates: labelled date first; publication date falls back to any date in the body
    publication_date = _labelled_date(
        body_text, ["published", "publication date", "date of publication", "date"]
    ) or norm_date(body_text)
    deadline = _labelled_date(
        body_text, ["deadline", "submission", "closing", "application deadline", "tenders must be submitted"]
    )

    links = content.find_all("a", href=True)

    # Tendering documents URL and description (first matching link wins)
    doc_markers = ["tender document","documentation","technical specification","annex","download","documents","tender dossier"]
    doc_url = ""; pdf_desc = ""
    for a in links:
        txt = a.get_text(" ", strip=True)
        if any(k in txt.lower() for k in doc_markers):
            doc_url = urljoin(url, a["href"].strip())
            parent = a.find_parent()
            parent_text = parent.get_text(" ", strip=True) if parent else txt
            pdf_desc = parent_text[:250]
            break

    # Collect PDFs (and spreadsheets), in document order
    pdfs = []
    for a in links:
        href = a["href"].strip()
        # Compare the path only so query strings and fragments do not hide the extension.
        if urlparse(href).path.lower().endswith((".pdf",".xlsx",".xls")):
            pdfs.append(urljoin(url, href))

    # Languages
    lang = detect_languages(body_text)

    # PDF snippet fallback: only when pypdf is available and the page gave no description
    if (description == "N/A") and pdf_text and pdfs:
        content_bytes = fetch(pdfs[0])
        snippet = pdf_text(content_bytes)[:800] if content_bytes else ""
        if snippet:
            description = snippet

    return {
        "title": title,
        "description": description,
        "publication_date": publication_date,
        "deadline": deadline,
        "doc_url": doc_url,
        "pdf_desc": pdf_desc,
        "pdfs": sorted(set(pdfs)),
        "lang": lang,
        "detail_url": url
    }


# Crawl state shared with the pagination loop below.
rows = []  # one extract_detail() dict per tender kept
visited = set()  # detail URLs already handled, so each tender is scraped once
page_num = 1  # 1-based listing page counter
more_pages = True  # cleared by the loop when pagination should stop

# Logging helper

def log(msg: str) -> None:
    """Print *msg* to stdout, flushing immediately instead of waiting for the buffer."""
    print(msg, flush=True)

# Helper: find listing links reliably on listing pages

def find_listing_links(soup: BeautifulSoup) -> list[tuple[str,str]]:
    """Return ``(absolute_url, anchor_text)`` for every tender detail link on a page.

    Only anchors inside ``<main>`` (or the whole document when there is no
    ``<main>``) are considered. A link qualifies when it stays on BASE, is not
    in an excluded site section, carries no query string or fragment, and its
    path contains ``/engage/procurements/`` followed by a non-empty slug.

    The slug is checked explicitly: the previous ``count("/") >= 4`` test was
    always true for absolute URLs, so the listing page itself
    (``/engage/procurements/``) was accepted as a detail page.

    Every qualifying anchor is logged; the result keeps the first occurrence
    of each URL, in document order.
    """
    main = soup.find("main") or soup
    marker = "/engage/procurements/"
    excluded_sections = ("/contact", "/privacy", "/legal", "/cookies", "/media")
    links: dict[str, str] = {}
    for a in main.find_all("a", href=True):
        href = (a["href"] or "").strip()
        if not href or href.startswith("#"):
            continue
        full = urljoin(BASE, href)
        low = full.lower()
        if not low.startswith(BASE + "/"):
            continue
        if any(x in low for x in excluded_sections):
            continue
        if "?" in low or "#" in low:
            continue
        # Require something after /engage/procurements/ besides slashes.
        _, found, slug = urlparse(low).path.partition(marker)
        if not found or not slug.strip("/"):
            continue
        text = a.get_text(" ", strip=True)
        snippet = str(a)[:180].replace('\n',' ')
        log(f"   link(detail): {text[:80]} -> {full} | a-snippet: {snippet}")
        # dict insertion order gives order-preserving de-duplication (first text wins).
        links.setdefault(full, text)
    return list(links.items())

# Helper: resolve listing/taxonomy pages to detail pages by scanning their internal links

def resolve_to_detail(url: str) -> str:
    """Return a tender detail URL for *url*.

    A URL that already points at ``/engage/procurements/<slug>`` (no query
    string or fragment) is returned unchanged without any request. Otherwise
    the page is fetched and the first detail link found on it is returned;
    if the fetch fails or no such link exists, *url* is returned as given.

    The slug must be non-empty: the previous ``count("/") >= 4`` test was
    always true for absolute URLs, so ``/engage/procurements/`` itself passed
    as a detail page and was never resolved.
    """
    def is_detail(candidate: str) -> bool:
        low = candidate.lower()
        if "?" in low or "#" in low:
            return False
        _, found, slug = urlparse(low).path.partition("/engage/procurements/")
        return bool(found and slug.strip("/"))

    if is_detail(url):
        return url
    html = fetch(url)
    if not html:
        return url
    page = BeautifulSoup(html, "html.parser")
    for full, _text in find_listing_links(page):
        if is_detail(full):
            return full
    return url

# Walk the listing page by page. The loop ends when MAX_PAGES is reached, when
# the page offers no "next" link, or when a page contained a tender dated
# before CUTOFF (page_has_old used to be computed but never consulted, unlike
# the multi-filter sweep further down).
while more_pages and page_num <= MAX_PAGES:
    # The first page has no query string; later pages use a zero-based ?page=N.
    page_url = URL if page_num == 1 else f"{URL}?page={page_num-1}"
    log(f"== Page {page_num} URL: {page_url}")
    html = fetch(page_url)
    soup = BeautifulSoup(html, "html.parser") if html else BeautifulSoup("", "html.parser")

    items = find_listing_links(soup)
    log(f"Page {page_num}: {len(items)} candidate links (listing)")

    kept = 0
    page_has_old = False
    for url_found, anchor_text in items:
        detail_url = resolve_to_detail(url_found)
        if detail_url in visited:
            continue
        visited.add(detail_url)
        # Direct file links are attachments, not tender pages.
        if detail_url.lower().endswith((".pdf",".xls",".xlsx")):
            continue
        log(f" -> Detail: {detail_url}")
        det = extract_detail(detail_url)
        # Fallback: use anchor text as title if detail page yielded N/A
        if (not det.get("title") or det.get("title") == "N/A") and (anchor_text and anchor_text.strip() and anchor_text.strip().lower() not in ["read more", "more", "details"]):
            det["title"] = anchor_text.strip()
            log(f"    title fallback from anchor: {det['title'][:100]}")
        else:
            log(f"    title: {det.get('title','N/A')[:100]}")
        if det.get("title") == "N/A" or det.get("description") == "N/A":
            log(f"    Warn: N/A fields (title/description) for {detail_url}")
        dt = parse_date_any(det.get("publication_date","")) or parse_date_any(det.get("deadline",""))
        if dt and dt < CUTOFF:
            page_has_old = True
        # Old tenders are still recorded; they only end the pagination.
        rows.append(det)
        kept += 1

    log(f"Page {page_num}: kept {kept} tenders")

    next_link = soup.find("a", string=lambda s: isinstance(s,str) and s.strip().lower() in ["next","older","more",">"])
    if page_has_old:
        log(f"Page {page_num}: found tenders dated before {CUTOFF:%Y-%m-%d}; stopping pagination")
    if next_link:
        page_num += 1
    more_pages = bool(next_link and not page_has_old)

# Shape the scraped tenders into the export schema and write the CSV.
# Fields the scraper does not extract are filled with the literal "N/A".
cols = [
    "organization","languages","submission_language","deadline","Round of questions 1","Submit by",
    "cpv_code","cpv_label","title","publication_id","publication_date","url","Description","pdf_description","pdf_urls"
]
out_rows = [
    {
        "organization": "ESPON",
        "languages": r.get("lang","en"),
        "submission_language": r.get("lang","en"),
        "deadline": r.get("deadline",""),
        "Round of questions 1": "N/A",
        "Submit by": "N/A",
        "cpv_code": "N/A",
        "cpv_label": "N/A",
        "title": r.get("title",""),
        "publication_id": "N/A",
        "publication_date": r.get("publication_date",""),
        "url": r.get("detail_url",""),
        "Description": r.get("description",""),
        # At most 12 attachment links per tender, separated by ";".
        "pdf_description": r.get("pdf_desc",""),
        "pdf_urls": ";".join(r.get("pdfs", [])[:12])
    }
    for r in rows
]

df = pd.DataFrame(out_rows, columns=cols)
df.to_csv(OUT, index=False, encoding="utf-8-sig")
print(f"Wrote {len(df)} rows to {OUT}")


BASE verified: https://www.espon.eu
   GET: https://www.espon.eu/engage/procurements
   link(detail): Open PIN - [WEBHOUSE] An ESPON barometer on European local housing markets Annou -> https://www.espon.eu/engage/procurements/pin-webhouse-espon-barometer-european-local-housing-markets | a-snippet: <a class="link-to-node" href="/engage/procurements/pin-webhouse-espon-barometer-european-local-housing-markets"> <div class="views-row"> <div class="engage-status-wrapper"> <div cl
   link(detail): Closed Prior Information Notice (PIN) for 4 Targeted Analysis Projects Announcem -> https://www.espon.eu/engage/procurements/prior-information-notice-pin-4-targeted-analysis-projects | a-snippet: <a class="link-to-node" href="/engage/procurements/prior-information-notice-pin-4-targeted-analysis-projects"> <div class="views-row"> <div class="engage-status-wrapper"> <div clas
   link(detail): Open Call for expression of interest to establish a pool of editors to develop c -> https://www.espon.eu/eng

In [None]:
# Multi-filter sweep to capture open + closed tenders
# Reuses helpers defined above: log, fetch, find_listing_links, resolve_to_detail, extract_detail, parse_date_any

from datetime import datetime

# Destination of the CSV written by this cell (absolute, machine-specific path).
OUT_ALL = "/Users/cedricjansen/Desktop/bak-economics/Cedric/espon_tenders_all.csv"
# A listing page containing a tender dated before this day ends pagination for the current filter.
CUTOFF_HIST = datetime(2018, 1, 1)
# Upper bound on listing pages requested per entry in FILTERS.
MAX_PAGES_PER_FILTER = 20

# Query-string suffixes appended to URL; each one is crawled in turn so that
# both open and closed tenders are collected.
FILTERS = [
    "",  # default listing (the captured run output shows both Open and Closed items on it)
    "?f[0]=status%3AClosed",  # closed only
    "?f[0]=status%3AOpen",    # open only (explicit)
    "?f[0]=status%3AOpen&f[1]=status%3AClosed",  # both open and closed
]

# Sweep state: one extract_detail() dict per tender, and the detail URLs already
# handled (shared across filters so a tender listed under several is scraped once).
rows_h = []
visited_h = set()

log("Starting multi-filter sweep (open + closed)...")

for filter_idx, filter_suffix in enumerate(FILTERS):
    log(f"=== Filter {filter_idx + 1}/{len(FILTERS)}: {filter_suffix or 'default'}")

    # Pagination restarts for every filter.
    page_num_h = 1
    more_pages_h = True
    # The default filter has no query string, so the page parameter has to open
    # one with "?"; appending "&page=N" straight to the path gave a bad URL.
    page_sep = "&" if "?" in filter_suffix else "?"

    while more_pages_h and page_num_h <= MAX_PAGES_PER_FILTER:
        # Page 1 is the bare filter URL; later pages use a zero-based page=N.
        page_url = f"{URL}{filter_suffix}" if page_num_h == 1 else f"{URL}{filter_suffix}{page_sep}page={page_num_h-1}"
        log(f"== Filter {filter_idx + 1} Page {page_num_h} URL: {page_url}")
        html = fetch(page_url)
        soup = BeautifulSoup(html, "html.parser") if html else BeautifulSoup("", "html.parser")

        items = find_listing_links(soup)
        log(f"Filter {filter_idx + 1} Page {page_num_h}: {len(items)} candidate links")

        kept = 0
        page_has_old = False
        for url_found, anchor_text in items:
            detail_url = resolve_to_detail(url_found)
            if detail_url in visited_h:
                continue
            visited_h.add(detail_url)
            # Direct file links are attachments, not tender pages.
            if detail_url.lower().endswith((".pdf",".xls",".xlsx")):
                continue
            log(f" -> Detail: {detail_url}")
            det = extract_detail(detail_url)
            # Fallback: use anchor text as title if the detail page yielded none
            if (not det.get("title") or det.get("title") == "N/A") and (anchor_text and anchor_text.strip() and anchor_text.strip().lower() not in ["read more", "more", "details"]):
                det["title"] = anchor_text.strip()
                log(f"    title fallback from anchor: {det['title'][:100]}")
            dt = parse_date_any(det.get("publication_date","")) or parse_date_any(det.get("deadline",""))
            if dt and dt < CUTOFF_HIST:
                page_has_old = True
            # Old tenders are still recorded; they only end the pagination.
            rows_h.append(det)
            kept += 1

        log(f"Filter {filter_idx + 1} Page {page_num_h}: kept {kept} tenders")

        next_link = soup.find("a", string=lambda s: isinstance(s,str) and s.strip().lower() in ["next","older","more",">"])
        if next_link:
            page_num_h += 1
        more_pages_h = bool(next_link and not page_has_old)

        # Stop if no more pages for this filter
        if not next_link:
            log(f"Filter {filter_idx + 1}: no more pages")
            break

# Shape the swept tenders into the export schema, drop repeated URLs and save.
# Fields the scraper does not extract are filled with the literal "N/A".
import pandas as pd

cols = [
    "organization","languages","submission_language","deadline","Round of questions 1","Submit by",
    "cpv_code","cpv_label","title","publication_id","publication_date","url","Description","pdf_description","pdf_urls"
]
out_rows_h = [
    {
        "organization": "ESPON",
        "languages": r.get("lang","en"),
        "submission_language": r.get("lang","en"),
        "deadline": r.get("deadline",""),
        "Round of questions 1": "N/A",
        "Submit by": "N/A",
        "cpv_code": "N/A",
        "cpv_label": "N/A",
        "title": r.get("title",""),
        "publication_id": "N/A",
        "publication_date": r.get("publication_date",""),
        "url": r.get("detail_url",""),
        "Description": r.get("description",""),
        "pdf_description": r.get("pdf_desc",""),
        # At most 12 attachment links per tender, separated by ";".
        "pdf_urls": ";".join(r.get("pdfs", [])[:12])
    }
    for r in rows_h
]

if out_rows_h:
    df_all = pd.DataFrame(out_rows_h, columns=cols).drop_duplicates(subset=["url"])
else:
    # Nothing scraped: still write a header-only file with the expected columns.
    df_all = pd.DataFrame(columns=cols)

df_all.to_csv(OUT_ALL, index=False, encoding="utf-8-sig")
log(f"Historical sweep wrote {len(df_all)} rows to {OUT_ALL}")


Starting historical sweep (includes closed if listed in pagination)...
== Hist Page 1 URL: https://www.espon.eu/engage/procurements
   GET: https://www.espon.eu/engage/procurements
   link(detail): Open PIN - [WEBHOUSE] An ESPON barometer on European local housing markets Annou -> https://www.espon.eu/engage/procurements/pin-webhouse-espon-barometer-european-local-housing-markets | a-snippet: <a class="link-to-node" href="/engage/procurements/pin-webhouse-espon-barometer-european-local-housing-markets"> <div class="views-row"> <div class="engage-status-wrapper"> <div cl
   link(detail): Closed Prior Information Notice (PIN) for 4 Targeted Analysis Projects Announcem -> https://www.espon.eu/engage/procurements/prior-information-notice-pin-4-targeted-analysis-projects | a-snippet: <a class="link-to-node" href="/engage/procurements/prior-information-notice-pin-4-targeted-analysis-projects"> <div class="views-row"> <div class="engage-status-wrapper"> <div clas
   link(detail): Open Call fo