In [None]:
# Install the notebook's third-party dependencies. The `%pip` magic (instead of
# `!pip`) installs into the environment of the kernel that is running this notebook.
# NOTE(review): versions are unpinned; pin them once a known-good set is established.
%pip install -q sentence-transformers faiss-cpu pandas pyarrow tqdm beautifulsoup4

# STEP 1: Mount Google Drive and set up paths

In [None]:
# Mount Google Drive so the project folder under MyDrive is reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

import os, re, json
import pandas as pd
from bs4 import BeautifulSoup

# Project layout on Drive: the scraped HTML files are read from RAW_DIR, and the
# FAISS index plus the chunk metadata are written to INDEX_DIR.
BASE_DIR  = "/content/drive/MyDrive/agnos-rag"
RAW_DIR   = os.path.join(BASE_DIR, "data/raw")
INDEX_DIR = os.path.join(BASE_DIR, "index")
# Create the output folder up front; exist_ok makes re-running this cell harmless.
os.makedirs(INDEX_DIR, exist_ok=True)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# STEP 2: HTML parsing helpers

In [None]:
def read_meta_block(html_text):
    """Extract the key/value pairs stored in the ``<!-- META: ... -->`` comment.

    The first such comment found in ``html_text`` is scanned line by line.
    Every line containing a colon is split at its first colon into a stripped
    key and a stripped value (for example ``source_url``, ``title``,
    ``scraped_at``). Blank lines and lines without a colon are ignored.

    Returns:
        A dict of the parsed pairs, or an empty dict when no META comment
        is present.
    """
    found = re.search(r"<!--\s*META:(.*?)-->", html_text, flags=re.S)
    if found is None:
        return {}

    pairs = {}
    for raw_line in found.group(1).splitlines():
        key, sep, value = raw_line.strip().partition(":")
        if not sep:
            # No colon on this line, so it carries no metadata.
            continue
        pairs[key.strip()] = value.strip()
    return pairs

def parse_thread_html(filepath):
    """Parse one saved thread page into a list of post records.

    The file is read as UTF-8, its META comment is parsed with
    ``read_meta_block``, and every element matching
    ``article, div.post, div.message`` is treated as a post. Posts whose
    extracted text is shorter than 20 characters are skipped.

    Returns:
        A list of dicts with the keys ``file``, ``title``, ``url``,
        ``scraped_at``, ``author``, ``date`` and ``content``. ``author`` and
        ``date`` are ``None`` when the post has no ``.username`` / ``time``
        element.
    """
    with open(filepath, "r", encoding="utf-8") as fh:
        markup = fh.read()

    meta = read_meta_block(markup)
    # Thread-level fields are the same for every post taken from this file.
    shared = {
        "file": os.path.basename(filepath),
        "title": meta.get("title", ""),
        "url": meta.get("source_url", ""),
        "scraped_at": meta.get("scraped_at", ""),
    }

    records = []
    post_nodes = BeautifulSoup(markup, "html.parser").select("article, div.post, div.message")
    for node in post_nodes:
        body = node.get_text(" ", strip=True)
        if len(body) < 20:
            # Too short to be a real post (this also covers empty text).
            continue
        user_tag = node.select_one(".username")
        time_tag = node.select_one("time")
        records.append({
            **shared,
            "author": user_tag.get_text(strip=True) if user_tag is not None else None,
            "date": time_tag.get_text(strip=True) if time_tag is not None else None,
            "content": body,
        })
    return records

# STEP 3: รวมข้อมูลจากทุกไฟล์ HTML

In [None]:
# Gather post records from every .html file in RAW_DIR, in filename order.
# Files whose name starts with "000_home" are left out
# (presumably the forum landing page rather than a thread — TODO confirm).
thread_files = [
    name
    for name in sorted(os.listdir(RAW_DIR))
    if name.endswith(".html") and not name.startswith("000_home")
]

all_data = []
for name in thread_files:
    all_data += parse_thread_html(os.path.join(RAW_DIR, name))

df = pd.DataFrame(all_data)
print("จำนวนโพสต์ทั้งหมด:", len(df))
df.head(3)

จำนวนโพสต์ทั้งหมด: 71


Unnamed: 0,file,title,url,scraped_at,author,date,content
0,thread_0001_20-Acute-pericarditis72520222-15-8...,หญิง|อายุ20ปีเยื่อหุ้มหัวใจอักเสบ (Acute peric...,https://www.agnoshealth.com/forums/%E0%B9%80%E...,2025-09-17T21:48:44.803222+07:00,,7/25/2022,หญิง | อายุ 20 ปี เยื่อหุ้มหัวใจอักเสบ (Acute ...
1,thread_0001_20-Acute-pericarditis72520222-15-8...,หญิง|อายุ20ปีเยื่อหุ้มหัวใจอักเสบ (Acute peric...,https://www.agnoshealth.com/forums/%E0%B9%80%E...,2025-09-17T21:48:44.803222+07:00,,11/26/2022,หญิง | อายุ 16 ปี เยื่อหุ้มหัวใจอักเสบ (Acute ...
2,thread_0001_20-Acute-pericarditis72520222-15-8...,หญิง|อายุ20ปีเยื่อหุ้มหัวใจอักเสบ (Acute peric...,https://www.agnoshealth.com/forums/%E0%B9%80%E...,2025-09-17T21:48:44.803222+07:00,,8/9/2022,ชาย | อายุ 30 ปี เยื่อหุ้มหัวใจอักเสบ (Acute p...


# STEP 4: Clean & Preprocess

In [None]:
def clean_text(text):
    """Collapse every run of whitespace in ``text`` to a single space.

    Leading and trailing whitespace is removed. Falsy input (``None`` or an
    empty string) yields an empty string.
    """
    if not text:
        return ""
    tokens = text.split()
    return " ".join(tokens)

# Normalise whitespace in every post, then keep only the posts that still have text.
df["content_clean"] = df["content"].apply(clean_text)
has_text = df["content_clean"].str.len() > 0
df = df.loc[has_text].reset_index(drop=True)
print("หลังทำความสะอาด:", len(df))

หลังทำความสะอาด: 71


# STEP 5: Chunking

In [None]:
from typing import List

def chunk_text(text: str, chunk_size: int = 600, overlap: int = 100) -> List[str]:
    """Split ``text`` into overlapping chunks of whitespace-separated tokens.

    Tokens come from ``str.split()``, so they are delimited by whitespace only;
    in text written without spaces between words a single token can be a
    whole phrase.

    Args:
        text: Input string to split.
        chunk_size: Maximum number of tokens per chunk. Must be at least 1.
        overlap: Number of tokens shared between consecutive chunks.

    Returns:
        The chunks in order, each re-joined with single spaces. Empty or
        whitespace-only ``text`` yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be at least 1")

    words = text.split()
    # The window must always move forward: with overlap >= chunk_size the raw
    # stride would be zero or negative and the loop would never terminate.
    step = max(1, chunk_size - overlap)
    return [
        " ".join(words[start:start + chunk_size])
        for start in range(0, len(words), step)
    ]

# Expand each post into one record per chunk, carrying the post's metadata along.
# doc_id is the post's row label in df; chunk_id counts chunks within that post.
chunked_data = []
for doc_id, post in df.iterrows():
    shared_meta = {key: post[key] for key in ("title", "url", "author", "date")}
    chunked_data.extend(
        {"doc_id": doc_id, "chunk_id": chunk_id, **shared_meta, "content_chunk": piece}
        for chunk_id, piece in enumerate(chunk_text(post["content_clean"]))
    )

df_chunks = pd.DataFrame(chunked_data)
print("จำนวน chunks:", len(df_chunks))
df_chunks.head(3)

จำนวน chunks: 71


Unnamed: 0,doc_id,chunk_id,title,url,author,date,content_chunk
0,0,0,หญิง|อายุ20ปีเยื่อหุ้มหัวใจอักเสบ (Acute peric...,https://www.agnoshealth.com/forums/%E0%B9%80%E...,,7/25/2022,หญิง | อายุ 20 ปี เยื่อหุ้มหัวใจอักเสบ (Acute ...
1,1,0,หญิง|อายุ20ปีเยื่อหุ้มหัวใจอักเสบ (Acute peric...,https://www.agnoshealth.com/forums/%E0%B9%80%E...,,11/26/2022,หญิง | อายุ 16 ปี เยื่อหุ้มหัวใจอักเสบ (Acute ...
2,2,0,หญิง|อายุ20ปีเยื่อหุ้มหัวใจอักเสบ (Acute peric...,https://www.agnoshealth.com/forums/%E0%B9%80%E...,,8/9/2022,ชาย | อายุ 30 ปี เยื่อหุ้มหัวใจอักเสบ (Acute p...


# STEP 6: สร้าง Embeddings

In [None]:
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

# Load the sentence-embedding model by its Hugging Face Hub name.
# NOTE(review): the "-en-" in the model name suggests an English checkpoint, while the
# posts shown earlier in this notebook are Thai — confirm this is the intended model.
embedder = SentenceTransformer("BAAI/bge-small-en-v1.5")

# Encode every chunk, 32 texts per batch, in df_chunks row order.
embeddings = embedder.encode(
    df_chunks["content_chunk"].tolist(),
    batch_size=32,
    show_progress_bar=True
)
print("embedding shape:", embeddings.shape)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

embedding shape: (71, 384)


# STEP 7: สร้าง FAISS Index

In [None]:
import faiss
import numpy as np

# Index dimensionality is taken from the second axis of the embedding matrix.
d = embeddings.shape[1]
# Flat index that compares vectors by L2 distance.
index = faiss.IndexFlatL2(d)
# Vectors are handed to FAISS as float32, in df_chunks row order; the search cell
# relies on that order when it looks results up with df_chunks.iloc.
index.add(np.array(embeddings, dtype="float32"))

print("FAISS index สร้างแล้ว vectors:", index.ntotal)

FAISS index สร้างแล้ว vectors: 71


# STEP 8: Save Index + Metadata

In [None]:
# Persist the index and the chunk metadata side by side in INDEX_DIR.
# meta.parquet stores df_chunks without its pandas index, so rows are identified by position.
faiss.write_index(index, os.path.join(INDEX_DIR, "faiss.index"))
df_chunks.to_parquet(os.path.join(INDEX_DIR, "meta.parquet"), index=False)

print(" Saved FAISS index และ metadata ->", INDEX_DIR)

 Saved FAISS index และ metadata -> /content/drive/MyDrive/agnos-rag/index


# STEP 9: Test Search

In [None]:
# Smoke-test the index with a single query and print the three nearest chunks.
query = "heart disease symptoms"
q_emb = embedder.encode([query])

D, I = index.search(np.array(q_emb, dtype="float32"), k=3)
print("ผลลัพธ์การค้นหา:")
for idx, score in zip(I[0], D[0]):
    # FAISS fills missing result slots with label -1 when the index holds fewer
    # than k vectors; iloc[-1] would silently print the last chunk, so skip them.
    if idx < 0:
        continue
    row = df_chunks.iloc[idx]
    # "score" is the distance returned by the index (built as IndexFlatL2 in this
    # notebook), so a smaller value means a closer match.
    print(f"- {row['content_chunk'][:150]}... (score={score:.4f})")
    print(f"  source: {row['url']}")

ผลลัพธ์การค้นหา:
- ชาย | อายุ 30 ปี เยื่อหุ้มหัวใจอักเสบ (Acute pericarditis) 8/9/2022 เลือดออกตามไรฟัน เหนื่อยง่าย แน่นหน้าอก + 3 ปกติผมเป็นโรคความดันโลหิตสูงอยู่แล้ว ว... (score=0.6076)
  source: https://www.agnoshealth.com/forums/%E0%B9%80%E0%B8%A2%E0%B8%B7%E0%B9%88%E0%B8%AD%E0%B8%AB%E0%B8%B8%E0%B9%89%E0%B8%A1%E0%B8%AB%E0%B8%B1%E0%B8%A7%E0%B9%83%E0%B8%88%E0%B8%AD%E0%B8%B1%E0%B8%81%E0%B9%80%E0%B8%AA%E0%B8%9A/14
- หญิง | อายุ 22 ปี เยื่อหุ้มหัวใจอักเสบ (Acute pericarditis) 1/24/2024 แน่นหน้าอก แสบหน้าอก เจ็บหน้าอก + 2 คือมีอาการเจ็บตรงกลางหน้าอก เวลาหายใจเข้าลึก... (score=0.6092)
  source: https://www.agnoshealth.com/forums/%E0%B9%80%E0%B8%A2%E0%B8%B7%E0%B9%88%E0%B8%AD%E0%B8%AB%E0%B8%B8%E0%B9%89%E0%B8%A1%E0%B8%AB%E0%B8%B1%E0%B8%A7%E0%B9%83%E0%B8%88%E0%B8%AD%E0%B8%B1%E0%B8%81%E0%B9%80%E0%B8%AA%E0%B8%9A/14
- หญิง | อายุ 26 ปี เยื่อหุ้มหัวใจอักเสบ (Acute pericarditis) 5/19/2024 แน่นหน้าอก แสบหน้าอก เจ็บหน้าอก + 2 ขอบสอบถามหน่อยค่า พอดีว่ามีอาการเจ็บอกแบบจี๊... (score=0.6104)
  source: ht

# เชื่อมกับ 04: บันทึก pipeline เป็นไฟล์ ingest_build_index.py

In [None]:
# Export the ingest pipeline as a standalone Python file on Drive
# (presumably so another notebook can import it — TODO confirm).
# NOTE(review): a later cell in this notebook writes the same path with a
# near-identical module body, so on Run All this cell's file is overwritten.
CODE_DIR = "/content/drive/MyDrive/agnos-rag/code"
os.makedirs(CODE_DIR, exist_ok=True)

code_path = os.path.join(CODE_DIR, "ingest_build_index.py")

# Source text of the generated module. It is a raw string, so the backslash in the
# regex is written to the file verbatim.
code_str = r'''
import os, re, pandas as pd, faiss, numpy as np
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer

def build_index_from_html(raw_dir, index_dir, model_name="BAAI/bge-small-en-v1.5"):
    os.makedirs(index_dir, exist_ok=True)

    def read_meta_block(html_text):
        import re
        m = re.search(r"<!--\s*META:(.*?)-->", html_text, flags=re.S)
        meta = {}
        if m:
            block = m.group(1)
            for line in block.splitlines():
                if ":" not in line: continue
                k, v = line.split(":", 1)
                meta[k.strip()] = v.strip()
        return meta

    def parse_thread_html(filepath):
        with open(filepath, "r", encoding="utf-8") as f:
            html = f.read()
        meta = read_meta_block(html)
        soup = BeautifulSoup(html, "html.parser")
        posts = soup.select("article, div.post, div.message")
        rows = []
        for p in posts:
            text = p.get_text(" ", strip=True)
            if not text or len(text) < 20: continue
            author = p.select_one(".username")
            date   = p.select_one("time")
            rows.append({
                "file": os.path.basename(filepath),
                "title": meta.get("title", ""),
                "url": meta.get("source_url", ""),
                "scraped_at": meta.get("scraped_at", ""),
                "author": author.get_text(strip=True) if author else None,
                "date": date.get_text(strip=True) if date else None,
                "content": text
            })
        return rows

    # รวมข้อมูลจากไฟล์ .html
    all_rows = []
    for fn in sorted(os.listdir(raw_dir)):
        if not fn.endswith(".html"): continue
        if fn.startswith(("000_home", "home_")): continue
        all_rows.extend(parse_thread_html(os.path.join(raw_dir, fn)))

    if not all_rows:
        print(" ไม่พบโพสต์ในไฟล์ HTML")
        return

    df = pd.DataFrame(all_rows)
    df["content_clean"] = df["content"].apply(lambda x: " ".join(x.split()))
    df = df[df["content_clean"].str.len() > 0].reset_index(drop=True)

    # Chunk
    def chunk_text(text, chunk_size=600, overlap=100):
        words, out, i = text.split(), [], 0
        while i < len(words):
            out.append(" ".join(words[i:i+chunk_size]))
            i += max(1, chunk_size - overlap)
        return out

    chunks = []
    for i, r in df.iterrows():
        for j, ch in enumerate(chunk_text(r["content_clean"])):
            chunks.append({
                "doc_id": i,
                "chunk_id": j,
                "title": r["title"],
                "url": r["url"],
                "author": r["author"],
                "date": r["date"],
                "content_chunk": ch
            })

    df_chunks = pd.DataFrame(chunks)

    # Embedding
    embedder = SentenceTransformer(model_name)
    embs = embedder.encode(df_chunks["content_chunk"].tolist(),
                           batch_size=32, show_progress_bar=True)

    # FAISS index
    d = embs.shape[1]
    index = faiss.IndexFlatL2(d)
    index.add(np.array(embs, dtype="float32"))

    # Save
    faiss.write_index(index, os.path.join(index_dir, "faiss.index"))
    df_chunks.to_parquet(os.path.join(index_dir, "meta.parquet"), index=False)

    print(f" Build index เสร็จ | posts={len(df)} | chunks={len(df_chunks)}")
    print(f"   -> {os.path.join(index_dir, 'faiss.index')}")
    print(f"   -> {os.path.join(index_dir, 'meta.parquet')}")
'''

# Write the module to disk as UTF-8, replacing any existing file at code_path.
with open(code_path, "w", encoding="utf-8") as f:
    f.write(code_str)

print(f" Saved module -> {code_path}")

 Saved module -> /content/drive/MyDrive/agnos-rag/code/ingest_build_index.py


In [None]:
import os

# Export the ingest pipeline as a standalone Python file on Drive.
# NOTE(review): an earlier cell in this notebook writes the same path with a
# near-identical body; this cell's version is the one left on disk after Run All.
# NOTE(review): the module text repeats the parsing/chunking/indexing logic defined
# in the notebook cells, so the two copies have to be kept in sync by hand.
CODE_DIR = "/content/drive/MyDrive/agnos-rag/code"
os.makedirs(CODE_DIR, exist_ok=True)

code_path = os.path.join(CODE_DIR, "ingest_build_index.py")

# Source text of the generated module. It is a raw string, so the backslash in the
# regex is written to the file verbatim.
code_str = r'''
import os, re, pandas as pd, faiss, numpy as np
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer

def build_index_from_html(raw_dir, index_dir, model_name="BAAI/bge-small-en-v1.5"):
    os.makedirs(index_dir, exist_ok=True)

    def read_meta_block(html_text):
        m = re.search(r"<!--\s*META:(.*?)-->", html_text, flags=re.S)
        meta = {}
        if m:
            for line in m.group(1).splitlines():
                if ":" in line:
                    k, v = line.split(":", 1)
                    meta[k.strip()] = v.strip()
        return meta

    def parse_thread_html(filepath):
        with open(filepath, "r", encoding="utf-8") as f:
            html = f.read()
        meta = read_meta_block(html)
        soup = BeautifulSoup(html, "html.parser")
        posts = soup.select("article, div.post, div.message")
        rows = []
        for p in posts:
            text = p.get_text(" ", strip=True)
            if not text or len(text) < 20: continue
            author = p.select_one(".username")
            date   = p.select_one("time")
            rows.append({
                "file": os.path.basename(filepath),
                "title": meta.get("title", ""),
                "url": meta.get("source_url", ""),
                "scraped_at": meta.get("scraped_at", ""),
                "author": author.get_text(strip=True) if author else None,
                "date": date.get_text(strip=True) if date else None,
                "content": text
            })
        return rows

    # รวมข้อมูลจากไฟล์ .html
    all_rows = []
    for fn in sorted(os.listdir(raw_dir)):
        if not fn.endswith(".html"): continue
        if fn.startswith(("000_home", "home_")): continue
        all_rows.extend(parse_thread_html(os.path.join(raw_dir, fn)))

    if not all_rows:
        print(" ไม่พบโพสต์ในไฟล์ HTML")
        return

    df = pd.DataFrame(all_rows)
    df["content_clean"] = df["content"].apply(lambda x: " ".join(x.split()))
    df = df[df["content_clean"].str.len() > 0].reset_index(drop=True)

    # Chunk
    def chunk_text(text, chunk_size=600, overlap=100):
        words, out, i = text.split(), [], 0
        while i < len(words):
            out.append(" ".join(words[i:i+chunk_size]))
            i += max(1, chunk_size - overlap)
        return out

    chunks = []
    for i, r in df.iterrows():
        for j, ch in enumerate(chunk_text(r["content_clean"])):
            chunks.append({
                "doc_id": i,
                "chunk_id": j,
                "title": r["title"],
                "url": r["url"],
                "author": r["author"],
                "date": r["date"],
                "content_chunk": ch
            })

    df_chunks = pd.DataFrame(chunks)

    # Embedding
    embedder = SentenceTransformer(model_name)
    embs = embedder.encode(df_chunks["content_chunk"].tolist(),
                           batch_size=32, show_progress_bar=True)

    # FAISS index
    d = embs.shape[1]
    index = faiss.IndexFlatL2(d)
    index.add(np.array(embs, dtype="float32"))

    # Save
    faiss.write_index(index, os.path.join(index_dir, "faiss.index"))
    df_chunks.to_parquet(os.path.join(index_dir, "meta.parquet"), index=False)

    print(f" Build index เสร็จ | posts={len(df)} | chunks={len(df_chunks)}")
    print(f"   -> {os.path.join(index_dir, 'faiss.index')}")
    print(f"   -> {os.path.join(index_dir, 'meta.parquet')}")
'''


# Write the module to disk as UTF-8, replacing any existing file at code_path.
with open(code_path, "w", encoding="utf-8") as f:
    f.write(code_str)

print(f"Saved module -> {code_path}")

Saved module -> /content/drive/MyDrive/agnos-rag/code/ingest_build_index.py
