In [11]:
from playwright.async_api import async_playwright

async def capture_endpoints(
    url="https://financiamento.iapmei.pt/inicio/home",
    headless=False,
    settle_ms=3000,
    scroll_px=3000,
):
    """Open `url` in Chromium and collect the URLs of the API responses it triggers.

    Args:
        url: Page to open. Defaults to the IAPMEI financing home page.
        headless: Passed to `chromium.launch`; False shows the browser window.
        settle_ms: Milliseconds to wait after navigation and again after scrolling.
        scroll_px: Vertical mouse-wheel distance used to trigger lazy loading.

    Returns:
        A set of response URLs that contain "/webapi/api/".
    """
    endpoints = set()

    def record_api_response(res):
        # Only responses served from the site's web API are of interest.
        if "/webapi/api/" in res.url:
            endpoints.add(res.url)

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=headless)
        try:
            page = await browser.new_page()
            page.on("response", record_api_response)

            await page.goto(url, wait_until="domcontentloaded")

            # Let the initial requests fire, scroll to trigger lazy loading,
            # then wait again so the late requests are captured as well.
            await page.wait_for_timeout(settle_ms)
            await page.mouse.wheel(0, scroll_px)
            await page.wait_for_timeout(settle_ms)
        finally:
            # Close the browser even when navigation fails or times out.
            await browser.close()

    return endpoints


In [12]:
endpoints = await capture_endpoints()
for e in endpoints:
    print(e)


https://financiamento.iapmei.pt/webapi/api/ItensPesquisas?caraterizacao=2
https://financiamento.iapmei.pt/webapi/api/Paginas/Footer
https://financiamento.iapmei.pt/webapi/api/indicadores
https://financiamento.iapmei.pt/webapi/api/ItensPesquisas/arrayvalores
https://financiamento.iapmei.pt/webapi/api/paginas
https://financiamento.iapmei.pt/webapi/api/sliders
https://financiamento.iapmei.pt/webapi/api/ItensPesquisas
https://financiamento.iapmei.pt/webapi/api/ItensPesquisas?caraterizacao=1
https://financiamento.iapmei.pt/webapi/api/Itens/arrayvalores
https://financiamento.iapmei.pt/webapi/api/Configuracao/GetRedesSociais


In [18]:
import asyncio
from playwright.async_api import async_playwright
import requests
from bs4 import BeautifulSoup
import json
from pathlib import Path

# Directory (relative to the working directory) where the scraped dataset is written.
OUTPUT_DIR = Path("data")
# Create it up front; exist_ok=True makes re-running this cell harmless.
OUTPUT_DIR.mkdir(exist_ok=True)

# Common prefix of the site's web API endpoints.
# NOTE(review): not referenced by any of the visible cells — confirm whether it is still needed.
BASE_URL = "https://financiamento.iapmei.pt/webapi/api"

# ------------------------------
# Helper functions
# ------------------------------
def clean_html(html_str):
    """Return the visible text of an HTML fragment, one text node per line.

    Falsy input (None, empty string) yields an empty string without parsing.
    """
    if html_str:
        soup = BeautifulSoup(html_str, "html.parser")
        return soup.get_text(separator="\n", strip=True)
    return ""

def fetch_json(url, params=None):
    """GET `url` (with optional query `params`) and return the decoded JSON body.

    The request uses a 30 second timeout, and `raise_for_status()` is called
    before the body is decoded.
    """
    response = requests.get(url, params=params, timeout=30)
    response.raise_for_status()
    payload = response.json()
    return payload

def _clean_long_strings(mapping, min_len):
    """Return cleaned text for each string value of `mapping` longer than `min_len` once stripped."""
    return [
        clean_html(value)
        for value in mapping.values()
        if isinstance(value, str) and len(value.strip()) > min_len
    ]


def extract_text_from_data(data, min_len=50):
    """Collect human-readable text from a decoded JSON payload.

    Args:
        data: Decoded JSON value. Dicts and lists are inspected; any other
            type produces an empty result.
        min_len: Strings found in component dicts or list-element dicts are
            kept only when their stripped length exceeds this value.

    Returns:
        The extracted fragments, HTML-stripped and joined with newlines
        ("" when nothing qualifies).
    """
    texts = []

    if isinstance(data, dict):
        # Well-known text fields are taken regardless of length.
        for key in ["titulo", "descricao", "conteudo", "texto"]:
            val = data.get(key)
            if isinstance(val, str) and val.strip():
                texts.append(clean_html(val))

        # Rich text components. The key may be missing, null, or not a list at
        # all, so anything that is not a sequence is ignored instead of raising.
        components = data.get("componentes")
        if isinstance(components, (list, tuple)):
            for comp in components:
                if isinstance(comp, dict):
                    texts.extend(_clean_long_strings(comp, min_len))

    elif isinstance(data, list):
        for el in data:
            if isinstance(el, dict):
                texts.extend(_clean_long_strings(el, min_len))
            elif isinstance(el, str) and el.strip():
                texts.append(clean_html(el))

    return "\n".join(texts)

# ------------------------------
# Step 1: Capture endpoints via Playwright
# ------------------------------
async def capture_endpoints(
    url="https://financiamento.iapmei.pt/inicio/home",
    headless=False,
    settle_ms=3000,
    scroll_px=3000,
):
    """Open `url` in Chromium and collect the URLs of the API responses it triggers.

    Args:
        url: Page to open. Defaults to the IAPMEI financing home page.
        headless: Passed to `chromium.launch`; set True to hide the browser.
        settle_ms: Milliseconds to wait after navigation and again after scrolling.
        scroll_px: Vertical mouse-wheel distance used to trigger lazy-loaded content.

    Returns:
        A set of response URLs that contain "/webapi/api/".
    """
    endpoints = set()

    def record_api_response(res):
        # Capture only responses coming from the site's web API.
        if "/webapi/api/" in res.url:
            endpoints.add(res.url)

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=headless)
        try:
            page = await browser.new_page()
            page.on("response", record_api_response)

            # Open the page
            await page.goto(url, wait_until="domcontentloaded")

            # Trigger lazy-loaded content: wait, scroll, wait again.
            await page.wait_for_timeout(settle_ms)
            await page.mouse.wheel(0, scroll_px)
            await page.wait_for_timeout(settle_ms)
        finally:
            # Close the browser even when navigation fails or times out.
            await browser.close()

    return endpoints

# ------------------------------
# Step 2: Fetch content from discovered endpoints
# ------------------------------
async def main():
    """Discover the site's API endpoints, fetch the relevant ones and save their text.

    The result is written to OUTPUT_DIR / "financiamento_complete.json" as a
    list of {"url", "text"} records. Endpoints that fail to download or parse
    are reported on stdout and skipped.
    """
    endpoints = await capture_endpoints()
    print(f"Discovered {len(endpoints)} API endpoints")

    # Keep only the Itens / Paginas / ItensPesquisas endpoints. "/itens" is a
    # prefix of "/itenspesquisas", so two markers cover all three families.
    markers = ("/itens", "/paginas")
    # Sorted so the saved dataset has the same order on every run; iterating
    # the set directly would make the order vary between kernel restarts.
    relevant = sorted(
        endpoint
        for endpoint in endpoints
        if any(marker in endpoint.lower() for marker in markers)
    )
    print(f"Processing {len(relevant)} relevant endpoints")

    dataset = []
    for url in relevant:
        try:
            data = fetch_json(url)
            text = extract_text_from_data(data)

            dataset.append({
                "url": url,
                "text": text
            })
            print(f"Fetched {url} → {len(text)} chars")

        except Exception as e:
            # Deliberately best-effort: one bad endpoint must not abort the run.
            print(f"Error fetching {url}: {e}")

    # Save final dataset
    output_file = OUTPUT_DIR / "financiamento_complete.json"
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(dataset, f, ensure_ascii=False, indent=2)

    print(f"All content saved to {output_file}")
    print(f"Total documents: {len(dataset)}")

# ------------------------------
# Run
# ------------------------------
# NOTE(review): third-party import placed mid-notebook; consider moving it to the import cell.
import nest_asyncio
# Applied before main() is awaited below.
# NOTE(review): presumably only required when asyncio.run() is called from inside
# an already-running loop — confirm whether it is still needed with a bare `await`.
nest_asyncio.apply()

# NOTE(review): asyncio is already imported at the top of this cell; this repeat is redundant.
import asyncio

# The notebook supports top-level await, so the coroutine is awaited directly
# instead of being run through asyncio.run(main()).
await main()


Discovered 10 API endpoints
Processing 7 relevant endpoints
Fetched https://financiamento.iapmei.pt/webapi/api/ItensPesquisas?caraterizacao=2 → 0 chars
Fetched https://financiamento.iapmei.pt/webapi/api/Paginas/Footer → 7578 chars
Fetched https://financiamento.iapmei.pt/webapi/api/ItensPesquisas/arrayvalores → 0 chars
Fetched https://financiamento.iapmei.pt/webapi/api/paginas → 11209 chars
Fetched https://financiamento.iapmei.pt/webapi/api/ItensPesquisas → 0 chars
Fetched https://financiamento.iapmei.pt/webapi/api/ItensPesquisas?caraterizacao=1 → 0 chars
Fetched https://financiamento.iapmei.pt/webapi/api/Itens/arrayvalores → 0 chars
All content saved to data/financiamento_complete.json
Total documents: 7


In [1]:
import scrapy
import re
from bs4 import BeautifulSoup

class Portugal2030Spider(scrapy.Spider):
    """Spider that pages through portugal2030.pt posts via the site's wp-json endpoint.

    Every post in a response becomes one item holding its id, rendered title,
    date, link and the plain text extracted from the rendered HTML content.
    """

    name = "botscraper"

    # Single source of truth for the listing URL; only the page number varies.
    API_URL_TEMPLATE = (
        "https://portugal2030.pt/wp-json/wp/v2/posts?tags=138&per_page=100&page={page}"
    )

    start_urls = [API_URL_TEMPLATE.format(page=1)]

    custom_settings = {
        "DOWNLOAD_DELAY": 1,
        "USER_AGENT": "Mozilla/5.0"
    }

    # Matches the `page` query parameter only. Anchoring on `?`/`&` keeps
    # `per_page=` from matching, wherever `page=` sits in the query string.
    _PAGE_PARAM_RE = re.compile(r"[?&]page=(\d+)")

    def parse(self, response):
        """Yield one item per post, then request the next page if one remains.

        The number of pages is read from the `X-WP-TotalPages` response header
        (treated as 1 when absent); the current page comes from the response URL.
        """
        posts = response.json()

        for post in posts:
            html = post["content"]["rendered"]
            text = self.extract_text(html)

            yield {
                "id": post["id"],
                "title": post["title"]["rendered"],
                "date": post["date"],
                "url": post["link"],
                "text": text
            }

        # pagination
        total_pages = int(response.headers.get(b"X-WP-TotalPages", 1))
        current_page = self._current_page(response.url)

        if current_page < total_pages:
            yield scrapy.Request(
                url=self.API_URL_TEMPLATE.format(page=current_page + 1),
                callback=self.parse
            )

    @classmethod
    def _current_page(cls, url):
        """Return the `page` query parameter of `url` as an int (1 when absent)."""
        match = cls._PAGE_PARAM_RE.search(url)
        return int(match.group(1)) if match else 1

    def extract_text(self, html):
        """Return the visible text of `html` as a single whitespace-normalised line.

        `<script>`, `<style>` and `<noscript>` elements are removed before the
        text is extracted, and any run of whitespace is collapsed to one space.
        """
        soup = BeautifulSoup(html, "html.parser")
        for tag in soup(["script", "style", "noscript"]):
            tag.decompose()
        text = soup.get_text(" ", strip=True)
        text = re.sub(r"\s+", " ", text)
        return text
