In [None]:
# Mount Google Drive at /content/drive; the output directory created below
# lives on the mounted drive.
from google.colab import drive
drive.mount('/content/drive')

# Create the raw-data directory (and any missing parents) on the mounted drive.
!mkdir -p /content/drive/MyDrive/agnos-rag/data/raw

Mounted at /content/drive


In [None]:
!pip install -q playwright bs4 nest_asyncio tldextract
!playwright install chromium -q

error: unknown option '-q'


In [None]:
# Imports & Config
import os, re, asyncio, urllib.parse
from datetime import datetime, timezone, timedelta
from bs4 import BeautifulSoup
from playwright.async_api import async_playwright
import nest_asyncio, tldextract

# Patch asyncio so coroutines can be driven from inside the notebook, whose
# kernel already runs an event loop.
nest_asyncio.apply()

# Forum landing page: the crawl starts here and relative links resolve against it.
BASE_URL    = "https://www.agnoshealth.com/forums"
# Directory (on the mounted Drive) where the scraped HTML files are written.
SAVE_DIR    = "/content/drive/MyDrive/agnos-rag/data/raw"
# Upper bound on how many thread links from the home page are fetched.
MAX_THREADS = 10
# Passed to Playwright's page.goto(wait_until=...) for every navigation.
WAIT_UNTIL  = "networkidle"

# Make sure the output directory exists before anything is written to it.
os.makedirs(SAVE_DIR, exist_ok=True)

def slugify(text, maxlen=60):
    """Turn free text into a filename-safe slug.

    Whitespace runs become single hyphens. Every character that is not a
    hyphen or a Unicode letter, combining mark or digit is dropped, so
    non-Latin titles (e.g. Thai) survive instead of collapsing to
    ``"untitled"``. For pure-ASCII input the kept characters are exactly
    ``A-Z a-z 0-9 -``.

    Args:
        text: Source text; ``None`` is treated as an empty string.
        maxlen: Maximum number of characters in the returned slug.

    Returns:
        The slug, at most ``maxlen`` characters long and never starting or
        ending with a hyphen, or ``"untitled"`` when nothing usable remains.
    """
    import unicodedata  # local import keeps this block self-contained

    hyphenated = re.sub(r"\s+", "-", (text or "").strip())
    # Keep hyphens plus Unicode categories L* (letters), M* (combining marks,
    # needed for Thai vowels and tone marks) and N* (digits/numbers).
    kept = "".join(
        ch for ch in hyphenated
        if ch == "-" or unicodedata.category(ch)[0] in "LMN"
    )
    kept = re.sub(r"-{2,}", "-", kept).strip("-")
    # Truncation may cut right after a separator; do not leave it dangling.
    kept = kept[:maxlen].rstrip("-")
    return kept if kept else "untitled"

def abs_url(base, href):
    """Resolve ``href`` against ``base`` and return the absolute URL.

    Returns ``None`` when ``href`` is empty or ``None``.
    """
    if not href:
        return None
    return urllib.parse.urljoin(base, href)

# Thailand time (Asia/Bangkok, +07:00)
def now_iso_bkk():
    """Return the current time as an ISO-8601 string with a fixed +07:00 offset."""
    bangkok_offset = timezone(timedelta(hours=7))
    current = datetime.now(tz=bangkok_offset)
    return current.isoformat()

In [None]:
def _registered_domain(url):
    """Return the registrable domain of ``url`` as reported by tldextract.

    Recent tldextract releases deprecate ``registered_domain`` in favour of
    ``top_domain_under_public_suffix`` and warn on every access; use the new
    attribute when the installed version has it and fall back otherwise.
    """
    parts = tldextract.extract(url)
    if hasattr(parts, "top_domain_under_public_suffix"):
        return parts.top_domain_under_public_suffix
    return parts.registered_domain


def _save_html(path, source_url, title, html):
    """Write ``html`` to ``path`` preceded by an HTML-comment META block."""
    # Collapse whitespace so a multi-line link text cannot break the
    # one-field-per-line layout of the META block.
    title_line = " ".join(str(title).split())
    meta_block = (
        "<!--\nMETA:\n"
        f"source_url: {source_url}\n"
        f"title: {title_line}\n"
        f"scraped_at: {now_iso_bkk()}\n"
        "-->\n"
    )
    with open(path, "w", encoding="utf-8") as f:
        f.write(meta_block + html)


def _collect_thread_links(home_html, allowed_domain):
    """Return unique ``(link_text, absolute_url)`` pairs that look like threads.

    A link qualifies when it has non-empty text, its absolute URL contains
    ``/thread``, ``/topic`` or ``/forums/``, and its registrable domain equals
    ``allowed_domain``. Order of first appearance in the page is preserved.
    """
    soup = BeautifulSoup(home_html, "html.parser")
    candidates = []
    seen_urls = set()
    for a in soup.find_all("a"):
        text = a.get_text(strip=True) or ""
        url = abs_url(BASE_URL, a.get("href"))
        if not url or not text or url in seen_urls:
            continue
        # Cheap substring checks first; the domain lookup is the costlier test.
        if not any(marker in url for marker in ("/thread", "/topic", "/forums/")):
            continue
        if _registered_domain(url) != allowed_domain:
            continue
        seen_urls.add(url)
        candidates.append((text, url))
    return candidates


async def scrape_forum_html():
    """Save the forum home page and up to ``MAX_THREADS`` linked thread pages.

    Loads ``BASE_URL`` in headless Chromium, writes the rendered HTML to
    ``SAVE_DIR`` as ``000_home_<timestamp>.html``, then follows the same-domain
    thread-like links found on that page and writes each one as
    ``thread_<index>_<slug>.html``. Every file starts with a META comment
    block holding the source URL, title and scrape time.

    A failure on an individual thread is printed and skipped. A failure while
    loading the home page propagates to the caller; the browser is closed in
    either case.
    """
    domain_ok = _registered_domain(BASE_URL)
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            page = await browser.new_page(user_agent="Mozilla/5.0 (Colab/Playwright)")

            await page.goto(BASE_URL, wait_until=WAIT_UNTIL)
            home_html = await page.content()

            # NOTE: the filename timestamp is naive kernel-local time, whereas
            # the META block's scraped_at uses the fixed +07:00 offset.
            ts = datetime.now().strftime("%Y%m%d-%H%M%S")
            home_path = os.path.join(SAVE_DIR, f"000_home_{ts}.html")
            _save_html(home_path, BASE_URL, "Forums Home", home_html)
            print(f"Saved HOME -> {home_path}")

            # Cap the number of threads to fetch.
            thread_list = _collect_thread_links(home_html, domain_ok)[:MAX_THREADS]
            if not thread_list:
                print(" ไม่พบลิงก์กระทู้จากหน้าแรก ")

            for idx, (title_text, thread_url) in enumerate(thread_list, start=1):
                try:
                    await page.goto(thread_url, wait_until=WAIT_UNTIL)
                    thread_html = await page.content()

                    # Build the thread's filename; the index keeps names unique
                    # even when several titles produce the same slug.
                    fname = f"thread_{idx:04d}_{slugify(title_text)}.html"
                    fpath = os.path.join(SAVE_DIR, fname)

                    # The META block is prepended to the page HTML.
                    _save_html(fpath, thread_url, title_text, thread_html)

                    print(f" Saved THREAD {idx:02d} -> {fpath}")
                except Exception as e:
                    # Best effort: report the failure and move on to the next thread.
                    print(f" Error on thread {idx} ({thread_url}): {e}")
        finally:
            # Release the browser even when a navigation above raised.
            await browser.close()

# Run the scraper. A bare top-level `await` is only valid in a notebook/IPython
# cell; in a plain script this would need asyncio.run(...).
await scrape_forum_html()

  domain_ok = tldextract.extract(BASE_URL).registered_domain


Saved HOME -> /content/drive/MyDrive/agnos-rag/data/raw/000_home_20250917-144838.html


  if tldextract.extract(url).registered_domain != domain_ok:


 Saved THREAD 01 -> /content/drive/MyDrive/agnos-rag/data/raw/thread_0001_20-Acute-pericarditis72520222-15-8.html
 Saved THREAD 02 -> /content/drive/MyDrive/agnos-rag/data/raw/thread_0002_24-Dermatitis-unspecified7222022-5.html
 Saved THREAD 03 -> /content/drive/MyDrive/agnos-rag/data/raw/thread_0003_23-Major-depressive-disorder21820243-2-3-5.html
 Saved THREAD 04 -> /content/drive/MyDrive/agnos-rag/data/raw/thread_0004_20-Menieres-disease82220222-4-5-2.html
 Saved THREAD 05 -> /content/drive/MyDrive/agnos-rag/data/raw/thread_0005_21-Cystitis21620241-2.html
 Saved THREAD 06 -> /content/drive/MyDrive/agnos-rag/data/raw/thread_0006_untitled.html
 Saved THREAD 07 -> /content/drive/MyDrive/agnos-rag/data/raw/thread_0007_20-Cellulitis21520242-3-4-0.html
 Saved THREAD 08 -> /content/drive/MyDrive/agnos-rag/data/raw/thread_0008_20-Dysmenorrhea-unpecified112720233-20-27-3-2-6-0.html
 Saved THREAD 09 -> /content/drive/MyDrive/agnos-rag/data/raw/thread_0009_17-Chancroid11262023-555-555-0.html
 S