In [11]:
from playwright.async_api import async_playwright

async def capture_endpoints(
    start_url="https://financiamento.iapmei.pt/inicio/home",
    url_marker="/webapi/api/",
    headless=False,
    settle_ms=3000,
    scroll_px=3000,
):
    """Open a page in Chromium and collect the URLs of matching network responses.

    Args:
        start_url: Page to navigate to.
        url_marker: Substring a response URL must contain to be recorded.
        headless: Passed to ``chromium.launch``; ``False`` shows the browser window.
        settle_ms: Milliseconds to wait before and after scrolling.
        scroll_px: Vertical wheel delta used to trigger lazy-loaded requests.

    Returns:
        A set of the distinct response URLs that contained ``url_marker``.
    """
    endpoints = set()

    def record_response(res):
        # Only responses whose URL contains the marker are of interest.
        if url_marker in res.url:
            endpoints.add(res.url)

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=headless)
        try:
            page = await browser.new_page()
            page.on("response", record_response)

            await page.goto(start_url, wait_until="domcontentloaded")

            # goto() returns at DOMContentLoaded, so requests issued afterwards
            # are only seen if we keep the page open for a while. Scrolling in
            # between triggers lazy loading.
            await page.wait_for_timeout(settle_ms)
            await page.mouse.wheel(0, scroll_px)
            await page.wait_for_timeout(settle_ms)
        finally:
            # Close the browser even when navigation or waiting fails.
            await browser.close()

    return endpoints


In [12]:
endpoints = await capture_endpoints()
for e in endpoints:
    print(e)


https://financiamento.iapmei.pt/webapi/api/ItensPesquisas?caraterizacao=2
https://financiamento.iapmei.pt/webapi/api/Paginas/Footer
https://financiamento.iapmei.pt/webapi/api/indicadores
https://financiamento.iapmei.pt/webapi/api/ItensPesquisas/arrayvalores
https://financiamento.iapmei.pt/webapi/api/paginas
https://financiamento.iapmei.pt/webapi/api/sliders
https://financiamento.iapmei.pt/webapi/api/ItensPesquisas
https://financiamento.iapmei.pt/webapi/api/ItensPesquisas?caraterizacao=1
https://financiamento.iapmei.pt/webapi/api/Itens/arrayvalores
https://financiamento.iapmei.pt/webapi/api/Configuracao/GetRedesSociais


In [18]:
import asyncio
from playwright.async_api import async_playwright
import requests
from bs4 import BeautifulSoup
import json
from pathlib import Path

# Directory for generated output; main() writes financiamento_complete.json here.
# It is created when this cell runs; exist_ok makes re-running the cell harmless.
OUTPUT_DIR = Path("data")
OUTPUT_DIR.mkdir(exist_ok=True)

# Root of the site's web API.
# NOTE(review): not referenced by any code visible in this cell; the endpoint
# URLs are discovered at runtime by capture_endpoints() instead.
BASE_URL = "https://financiamento.iapmei.pt/webapi/api"

# ------------------------------
# Helper functions
# ------------------------------
def clean_html(html_str):
    """Return the visible text of an HTML fragment.

    Args:
        html_str: HTML (or plain text) string; falsy values yield ``""``.

    Returns:
        The text content with surrounding whitespace removed. Text nodes of
        parsed markup are joined with newlines.
    """
    if not html_str:
        return ""
    # Strings with neither tags nor entities have nothing to parse: the result
    # is just the stripped string. Skipping the parser here also avoids the
    # warning Beautiful Soup emits when it is handed a short tag-free string
    # that looks like a URL or file name.
    if isinstance(html_str, str) and "<" not in html_str and "&" not in html_str:
        return html_str.strip()
    return BeautifulSoup(html_str, "html.parser").get_text(separator="\n", strip=True)

def fetch_json(url, params=None):
    """GET ``url`` and return the decoded JSON body.

    Args:
        url: Address to request.
        params: Optional query parameters forwarded to ``requests.get``.

    Returns:
        The parsed JSON payload.

    Raises:
        requests.HTTPError: If the response status signals an error.
    """
    response = requests.get(url, timeout=30, params=params)
    # Fail loudly on 4xx/5xx instead of trying to decode an error page.
    response.raise_for_status()
    payload = response.json()
    return payload

def extract_text_from_data(data):
    """Pull human-readable text out of a decoded JSON payload.

    For a dict, the ``titulo``/``descricao``/``conteudo``/``texto`` fields are
    taken when they are non-blank strings, followed by every long string value
    found in the dicts listed under ``componentes``. For a list, long string
    values of dict elements and non-blank string elements are taken. Any other
    input produces an empty result. Nested structures are not descended into.

    Args:
        data: Decoded JSON value (dict, list, or anything else).

    Returns:
        The cleaned text fragments joined with newlines; ``""`` if none.
    """
    # Strings at or below this length are treated as labels/ids, not content.
    min_len = 50

    def long_strings(record):
        # Cleaned versions of the string values of `record` longer than min_len.
        return [
            clean_html(v)
            for v in record.values()
            if isinstance(v, str) and len(v.strip()) > min_len
        ]

    texts = []

    if isinstance(data, dict):
        for key in ["titulo", "descricao", "conteudo", "texto"]:
            val = data.get(key)
            if isinstance(val, str) and val.strip():
                texts.append(clean_html(val))

        # Rich text components. The key may be present with a JSON null (or
        # another non-list value); iterating that directly would raise.
        components = data.get("componentes")
        if isinstance(components, (list, tuple)):
            for comp in components:
                if isinstance(comp, dict):
                    texts.extend(long_strings(comp))

    elif isinstance(data, list):
        for el in data:
            if isinstance(el, dict):
                texts.extend(long_strings(el))
            elif isinstance(el, str) and el.strip():
                texts.append(clean_html(el))

    return "\n".join(texts)

# ------------------------------
# Step 1: Capture endpoints via Playwright
# ------------------------------
async def capture_endpoints(
    start_url="https://financiamento.iapmei.pt/inicio/home",
    url_marker="/webapi/api/",
    headless=False,
    settle_ms=3000,
    scroll_px=3000,
):
    """Open a page in Chromium and collect the URLs of matching network responses.

    Args:
        start_url: Page to navigate to.
        url_marker: Substring a response URL must contain to be recorded.
        headless: Passed to ``chromium.launch``; set True to hide the browser.
        settle_ms: Milliseconds to wait before and after scrolling.
        scroll_px: Vertical wheel delta used to trigger lazy-loaded content.

    Returns:
        A set of the distinct response URLs that contained ``url_marker``.
    """
    endpoints = set()

    def record_response(res):
        # Only responses whose URL contains the marker are of interest.
        if url_marker in res.url:
            endpoints.add(res.url)

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=headless)
        try:
            page = await browser.new_page()
            page.on("response", record_response)

            # Open the start page
            await page.goto(start_url, wait_until="domcontentloaded")

            # goto() returns at DOMContentLoaded; keep the page open and scroll
            # so that later and lazy-loaded requests are observed too.
            await page.wait_for_timeout(settle_ms)
            await page.mouse.wheel(0, scroll_px)
            await page.wait_for_timeout(settle_ms)
        finally:
            # Close the browser even when navigation or waiting fails.
            await browser.close()

    return endpoints

# ------------------------------
# Step 2: Fetch content from discovered endpoints
# ------------------------------
async def main():
    """Discover API endpoints, download the relevant ones and save their text.

    Endpoints whose lower-cased URL contains one of the Itens / Paginas /
    ItensPesquisas markers are fetched. One ``{"url", "text"}`` record per
    successful fetch is written to ``financiamento_complete.json`` inside
    ``OUTPUT_DIR``. A failure on one endpoint is reported and skipped.
    """
    endpoints = await capture_endpoints()
    print(f"Discovered {len(endpoints)} API endpoints")

    # Keep only the content-bearing endpoints.
    markers = ("/itens", "/paginas", "/itenspesquisas")
    relevant = []
    for endpoint in endpoints:
        lowered = endpoint.lower()
        if any(marker in lowered for marker in markers):
            relevant.append(endpoint)
    print(f"Processing {len(relevant)} relevant endpoints")

    dataset = []
    for url in relevant:
        try:
            text = extract_text_from_data(fetch_json(url))
            dataset.append({"url": url, "text": text})
            print(f"Fetched {url} → {len(text)} chars")
        except Exception as e:
            # Best effort: one bad endpoint must not abort the whole run.
            print(f"Error fetching {url}: {e}")

    # Persist everything that was collected.
    output_file = OUTPUT_DIR / "financiamento_complete.json"
    with output_file.open("w", encoding="utf-8") as f:
        json.dump(dataset, f, ensure_ascii=False, indent=2)

    print(f"All content saved to {output_file}")
    print(f"Total documents: {len(dataset)}")

# ------------------------------
# Run
# ------------------------------
# Patch asyncio so that an event loop can be re-entered; the notebook kernel
# already has a loop running when this cell executes.
import nest_asyncio
nest_asyncio.apply()

import asyncio

# Top-level await runs main() on the kernel's existing event loop.
# asyncio.run(main()) is not used because it cannot be called while a loop
# is already running.
await main()


Discovered 10 API endpoints
Processing 7 relevant endpoints
Fetched https://financiamento.iapmei.pt/webapi/api/ItensPesquisas?caraterizacao=2 → 0 chars
Fetched https://financiamento.iapmei.pt/webapi/api/Paginas/Footer → 7578 chars
Fetched https://financiamento.iapmei.pt/webapi/api/ItensPesquisas/arrayvalores → 0 chars
Fetched https://financiamento.iapmei.pt/webapi/api/paginas → 11209 chars
Fetched https://financiamento.iapmei.pt/webapi/api/ItensPesquisas → 0 chars
Fetched https://financiamento.iapmei.pt/webapi/api/ItensPesquisas?caraterizacao=1 → 0 chars
Fetched https://financiamento.iapmei.pt/webapi/api/Itens/arrayvalores → 0 chars
All content saved to data/financiamento_complete.json
Total documents: 7


In [41]:
import asyncio
from pathlib import Path
import requests
from bs4 import BeautifulSoup
import json
from playwright.async_api import async_playwright
import nest_asyncio

# ------------------------------
# Settings
# ------------------------------
# Directory for generated output; main() writes dataset.json here.
# It is created when this cell runs; exist_ok makes re-running the cell harmless.
OUTPUT_DIR = Path("data")
OUTPUT_DIR.mkdir(exist_ok=True)

# Entry pages to crawl; main() runs process_site() once for each entry.
SITES = [
    "https://financiamento.iapmei.pt/inicio/home",
    "https://transparencia.gov.pt/",
    # Add more websites here
]

# ------------------------------
# Helper functions
# ------------------------------
def clean_html(html_str):
    """Return the visible text of an HTML fragment.

    Args:
        html_str: HTML (or plain text) string; falsy values yield ``""``.

    Returns:
        The text content with surrounding whitespace removed. Text nodes of
        parsed markup are joined with newlines.
    """
    if not html_str:
        return ""
    # Strings with neither tags nor entities have nothing to parse: the result
    # is just the stripped string. Skipping the parser here also avoids the
    # warning Beautiful Soup emits when it is handed a short tag-free string
    # that looks like a URL or file name.
    if isinstance(html_str, str) and "<" not in html_str and "&" not in html_str:
        return html_str.strip()
    return BeautifulSoup(html_str, "html.parser").get_text(separator="\n", strip=True)

def fetch_json(url, params=None):
    """GET ``url`` and return the decoded JSON body.

    Args:
        url: Address to request.
        params: Optional query parameters forwarded to ``requests.get``.

    Returns:
        The parsed JSON payload.

    Raises:
        requests.HTTPError: If the response status signals an error.
    """
    response = requests.get(url, timeout=30, params=params)
    # Fail loudly on 4xx/5xx instead of trying to decode an error page.
    response.raise_for_status()
    payload = response.json()
    return payload

def extract_all_text(data):
    """Walk a decoded JSON value and collect the cleaned text of its longer strings.

    Dict values and list elements are visited depth-first in their stored
    order. A string is kept only when its stripped length exceeds 20
    characters; it is passed through ``clean_html`` before being collected.
    Every other kind of value contributes nothing.

    Returns:
        A list of cleaned strings (possibly empty).
    """
    if isinstance(data, dict):
        children = data.values()
    elif isinstance(data, list):
        children = data
    elif isinstance(data, str):
        # Leaf: keep it only if it is long enough to be real content.
        if len(data.strip()) > 20:
            return [clean_html(data)]
        return []
    else:
        return []

    collected = []
    for child in children:
        collected += extract_all_text(child)
    return collected

def extract_nextjs_data(html):
    """Extract the JSON payload embedded in a Next.js page.

    Looks for ``<script id="__NEXT_DATA__">`` and decodes its content.

    Args:
        html: Page markup.

    Returns:
        The decoded JSON value, or ``{}`` when the script tag is missing,
        has no single string content, or does not contain valid JSON.
    """
    soup = BeautifulSoup(html, "html.parser")
    script = soup.find("script", {"id": "__NEXT_DATA__"})
    if script and script.string:
        try:
            return json.loads(script.string)
        except json.JSONDecodeError:
            # Malformed payload: treat it as "no embedded data". Only decode
            # errors are swallowed; anything else (e.g. KeyboardInterrupt)
            # propagates.
            pass
    return {}

def extract_text_from_html(html):
    """Collect the text of paragraph, heading, table-cell and list-item tags.

    Used as a fallback when no structured data is available. Every matching
    tag contributes its own stripped text, so text inside nested matches
    (for example a ``<p>`` inside an ``<li>``) appears once per enclosing match.

    Returns:
        The non-empty tag texts joined with newlines.
    """
    wanted_tags = ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'td', 'li']
    document = BeautifulSoup(html, "html.parser")
    stripped = (node.get_text(strip=True) for node in document.find_all(wanted_tags))
    return "\n".join(chunk for chunk in stripped if chunk)

# ------------------------------
# Capture endpoints via Playwright
# ------------------------------
async def capture_json_endpoints(site_url):
    """Load ``site_url`` in headless Chromium and record JSON responses.

    Args:
        site_url: Page to open.

    Returns:
        A tuple ``(endpoints, page_content)``: the set of response URLs whose
        ``content-type`` header mentions ``application/json``, and the page's
        HTML as returned by ``page.content()`` after scrolling.
    """
    endpoints = set()

    # Record responses that declare a JSON content-type.
    async def handle_response(res):
        try:
            ct = res.headers.get("content-type", "")
            if "application/json" in ct.lower():
                endpoints.add(res.url)
        except Exception:
            # Best effort: a response we cannot inspect is simply not recorded.
            # Catching Exception (not a bare except) lets task cancellation and
            # KeyboardInterrupt propagate.
            pass

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            page = await browser.new_page()
            page.on("response", handle_response)
            await page.goto(site_url, wait_until="domcontentloaded")

            # Scroll to trigger lazy-loaded requests
            for _ in range(3):
                await page.mouse.wheel(0, 2000)
                await page.wait_for_timeout(1000)

            page_content = await page.content()
        finally:
            # Close the browser even when navigation or scrolling fails.
            await browser.close()

    return endpoints, page_content

# ------------------------------
# Process one website
# ------------------------------
async def process_site(site_url):
    """Collect text documents for one website.

    Three sources are tried and every non-empty one adds a record:
    the JSON endpoints observed while loading the page, the Next.js
    ``__NEXT_DATA__`` payload embedded in the HTML, and the visible DOM text.

    Args:
        site_url: Entry page of the site.

    Returns:
        A list of ``{"url": ..., "text": ...}`` dicts.
    """
    print(f"\nProcessing site: {site_url}")
    endpoints, html_content = await capture_json_endpoints(site_url)
    print(f"  Discovered {len(endpoints)} JSON endpoints")

    dataset = []
    loop = asyncio.get_running_loop()

    # 1️⃣ Fetch JSON endpoints
    for ep in endpoints:
        try:
            # fetch_json uses blocking `requests`; run it in the default
            # executor so other sites being processed concurrently are not
            # stalled while this download is in flight.
            data = await loop.run_in_executor(None, fetch_json, ep)
            text = "\n".join(extract_all_text(data))
            if text.strip():
                dataset.append({"url": ep, "text": text})
                print(f"  Fetched {ep} → {len(text)} chars")
        except Exception as e:
            # Best effort: one bad endpoint must not abort the site.
            print(f"  Error fetching {ep}: {e}")

    # 2️⃣ Extract Next.js embedded data
    next_data = extract_nextjs_data(html_content)
    if next_data:
        text = "\n".join(extract_all_text(next_data))
        if text.strip():
            dataset.append({"url": site_url + " (__NEXT_DATA__)", "text": text})
            print(f"  Extracted Next.js data → {len(text)} chars")

    # 3️⃣ Fallback: scrape visible DOM content
    dom_text = extract_text_from_html(html_content)
    if dom_text.strip():
        dataset.append({"url": site_url + " (DOM)", "text": dom_text})
        print(f"  Extracted DOM text → {len(dom_text)} chars")

    return dataset

# ------------------------------
# Main function
# ------------------------------
async def main():
    """Process every site in ``SITES`` concurrently and save the combined records.

    The per-site record lists are concatenated in ``SITES`` order and written
    as JSON to ``dataset.json`` inside ``OUTPUT_DIR``.
    """
    per_site = await asyncio.gather(*[process_site(site) for site in SITES])

    # Flatten the list of per-site lists into one list of records.
    flat_data = []
    for site_records in per_site:
        flat_data.extend(site_records)

    output_file = OUTPUT_DIR / "dataset.json"
    with output_file.open("w", encoding="utf-8") as f:
        json.dump(flat_data, f, ensure_ascii=False, indent=2)

    print(f"\nSaved {len(flat_data)} documents to {output_file}")

# ------------------------------
# Run
# ------------------------------
# Allow re-entrant use of the event loop that the notebook kernel already runs,
# then execute the pipeline with top-level await.
nest_asyncio.apply()
await main()



Processing site: https://financiamento.iapmei.pt/inicio/home

Processing site: https://transparencia.gov.pt/
  Discovered 0 JSON endpoints
  Extracted DOM text → 208 chars
  Discovered 10 JSON endpoints
  Fetched https://financiamento.iapmei.pt/webapi/api/ItensPesquisas?caraterizacao=2 → 4398 chars
  Fetched https://financiamento.iapmei.pt/webapi/api/Paginas/Footer → 7787 chars
  Fetched https://financiamento.iapmei.pt/webapi/api/indicadores → 92 chars
  Fetched https://financiamento.iapmei.pt/webapi/api/ItensPesquisas/arrayvalores → 133 chars
  Fetched https://financiamento.iapmei.pt/webapi/api/paginas → 11528 chars



If you meant to use Beautiful Soup to parse the web page found at a certain URL, then something has gone wrong. You should use an Python package like 'requests' to fetch the content behind the URL. Once you have the content as a string, you can feed that string into Beautiful Soup.



    
  return BeautifulSoup(html_str, "html.parser").get_text(separator="\n", strip=True)


  Fetched https://financiamento.iapmei.pt/webapi/api/sliders → 2064 chars
  Fetched https://financiamento.iapmei.pt/webapi/api/ItensPesquisas → 10095 chars
  Fetched https://financiamento.iapmei.pt/webapi/api/ItensPesquisas?caraterizacao=1 → 5025 chars
  Fetched https://financiamento.iapmei.pt/webapi/api/Itens/arrayvalores → 6322 chars
  Fetched https://financiamento.iapmei.pt/webapi/api/Configuracao/GetRedesSociais → 190 chars
  Extracted DOM text → 729 chars

Saved 12 documents to data/dataset.json
