In [None]:
!pip install -q gradio sentence-transformers faiss-cpu pandas pyarrow transformers accelerate bitsandbytes

# STEP 1: เชื่อมต่อ Google Drive และโหลด index

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; BASE_DIR (the index location) lives under this mount.
drive.mount('/content/drive')

import os, time, numpy as np, pandas as pd, faiss
from sentence_transformers import SentenceTransformer

# Where the persisted retrieval artifacts live on the mounted Drive.
BASE_DIR = "/content/drive/MyDrive/agnos-rag"
INDEX_DIR = os.path.join(BASE_DIR, "index")
INDEX_PATH, META_PATH = (
    os.path.join(INDEX_DIR, filename)
    for filename in ("faiss.index", "meta.parquet")
)

# Shared state populated by load_assets(); everything is None until the first load.
meta = index = embedder = None

def load_assets():
    """Load (or reload) the metadata table, the FAISS index and the query embedder.

    Populates the module-level ``meta``, ``index`` and ``embedder`` globals.
    The parquet file and the index file are both read into locals first and
    only then published together, so a failed read never leaves a fresh
    ``meta`` paired with a stale ``index``. The embedder is created on the
    first call only and reused on every later call.

    Returns:
        dict: ``{"rows": len(meta), "vectors": index.ntotal}``.

    Raises:
        FileNotFoundError: if ``META_PATH`` or ``INDEX_PATH`` does not exist.
    """
    global meta, index, embedder

    # Explicit raise instead of `assert`, so the check is not stripped under `python -O`.
    for path in (META_PATH, INDEX_PATH):
        if not os.path.exists(path):
            raise FileNotFoundError(f"ไม่พบ {path}")

    new_meta = pd.read_parquet(META_PATH)
    new_index = faiss.read_index(INDEX_PATH)
    # Publish both only after both reads succeeded.
    meta, index = new_meta, new_index

    if embedder is None:
        embedder = SentenceTransformer("BAAI/bge-small-en-v1.5")

    return {
        "rows": len(meta),
        "vectors": index.ntotal,
    }

def get_index_mtime():
    """Return the last-modified times of the metadata and index files as one line.

    The result has the form ``"meta: <timestamp> | index: <timestamp>"`` with
    each timestamp rendered in local time as ``YYYY-MM-DD HH:MM:SS``.
    """
    stamps = {}
    for label, path in (("meta", META_PATH), ("index", INDEX_PATH)):
        modified = time.localtime(os.path.getmtime(path))
        stamps[label] = time.strftime("%Y-%m-%d %H:%M:%S", modified)
    return f"meta: {stamps['meta']} | index: {stamps['index']}"

# Load meta/index/embedder once when this cell runs, then report how many
# metadata rows and index vectors were loaded and when the files last changed.
stats = load_assets()
print(f"Loaded index: rows(meta)={stats['rows']} | vectors={stats['vectors']}")
print("Last modified:", get_index_mtime())

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loaded index: rows(meta)=71 | vectors=71
Last modified: meta: 2025-09-17 16:19:53 | index: 2025-09-17 16:19:52


# STEP 2: Retrieval (ใช้ระยะทาง L2 ตาม index ที่สร้างใน 02)

In [None]:
def do_retrieve_inline(query: str, top_k: int = 6):
    """Return the nearest metadata rows for ``query``.

    Encodes the query with the module-level ``embedder``, searches the
    module-level FAISS ``index`` and maps every returned id to a row of
    ``meta``.

    Args:
        query: Text to search for.
        top_k: Maximum number of hits; capped at ``index.ntotal``.

    Returns:
        list[dict]: One dict per hit, in the order returned by the index, with
        keys ``title``, ``url``, ``content_chunk``, ``author``, ``date`` and
        ``score_l2`` (the distance reported by the index). Empty when the
        index is empty or ``top_k`` is not positive.
    """
    k = min(int(top_k), index.ntotal)
    if k <= 0:
        # Empty index or non-positive top_k: there is nothing valid to search for.
        return []

    def as_text(value):
        # Missing values can come back as None or float NaN. NaN is truthy, so
        # `value or ""` would pass it on and break `.strip()` / `len()` later.
        return value if isinstance(value, str) else ""

    q_emb = embedder.encode([query])
    D, I = index.search(np.array(q_emb, dtype="float32"), k=k)

    n_rows = len(meta)
    hits = []
    for idx, dist in zip(I[0], D[0]):
        # Negative ids mark "no result"; ids past the table mean meta and index
        # are out of sync, so skip them instead of raising IndexError.
        if idx < 0 or idx >= n_rows:
            continue
        r = meta.iloc[int(idx)]
        hits.append({
            "title": as_text(r.get("title", "")).strip(),
            "url": r.get("url", ""),
            "content_chunk": as_text(r.get("content_chunk", "")),
            "author": r.get("author", None),
            "date": r.get("date", None),
            "score_l2": float(dist),
        })
    return hits

# STEP 3: เตรียม LLM และฟังก์ชันสร้างคำตอบ

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import torch

LLM_NAME = "microsoft/Phi-3-mini-4k-instruct"
# Half precision only when a GPU is available; CPU inference stays in float32.
dtype = torch.float16 if torch.cuda.is_available() else torch.float32
# Loaded without trust_remote_code, matching the model load below: there is no
# reason to allow repository code to execute for the tokenizer alone.
tok = AutoTokenizer.from_pretrained(LLM_NAME)
# NOTE(review): recent transformers releases warn that `torch_dtype` is deprecated
# in favour of `dtype`; it is kept here so older releases keep working. Switch
# once the installed version is pinned.
llm = AutoModelForCausalLM.from_pretrained(LLM_NAME, torch_dtype=dtype, device_map="auto")
# The model object is already placed by device_map="auto" above, so the pipeline
# only wraps it; repeating device_map here has no effect on a loaded model.
gen = pipeline("text-generation", model=llm, tokenizer=tok)

# System prompt (Thai) sent with every request. In English, roughly:
#   "You are an assistant that answers only on the basis of sources from the
#    Agnos Health Forum. Give reasoned, clear answers and append reference
#    numbers [1], [2] ... to the relevant sentences. If the information is
#    insufficient, say it was not found in the database instead of guessing."
# The text is part of the model input, so any wording change alters behaviour.
SYSTEM_INSTR = (
    "คุณเป็นผู้ช่วยที่ตอบบนพื้นฐานของแหล่งข้อมูลจาก Agnos Health Forum เท่านั้น "
    "ยกคำตอบอย่างมีเหตุผล ชัดเจน และใส่เลขอ้างอิง [1], [2] … ต่อท้ายประโยคที่เกี่ยวข้อง "
    "ถ้าข้อมูลไม่พอ ให้บอกว่าไม่พบในฐานข้อมูลแทนการเดา"
)

def build_context_and_citations(hits):
    """Render retrieved hits as a numbered context block and a numbered reference list.

    Args:
        hits: Iterable of dicts carrying ``content_chunk``, ``title`` and ``url``.

    Returns:
        tuple[str, str]: ``(context, references)``. Context entries look like
        ``"[n] <chunk>"`` and are separated by blank lines; reference entries
        look like ``"[n] <title> — <url>"``, one per line, with ``"(no title)"``
        standing in for an empty title. Numbering starts at 1.
    """
    numbered = list(enumerate(hits, start=1))
    context_block = "\n\n".join(
        f"[{n}] {hit['content_chunk']}" for n, hit in numbered
    )
    reference_block = "\n".join(
        f"[{n}] {hit['title'] or '(no title)'} — {hit['url']}" for n, hit in numbered
    )
    return context_block, reference_block

def make_prompt(question: str, hits, answer_style="Concise"):
    """Assemble the system prompt, the user prompt and the reference list for one question.

    Args:
        question: The user's question, inserted verbatim into the user prompt.
        hits: Retrieved hits, rendered via ``build_context_and_citations``.
        answer_style: ``"Concise"`` or ``"Detailed"``; any other value falls
            back to a generic style hint.

    Returns:
        tuple[str, str, str]: ``(SYSTEM_INSTR, user_prompt, references_text)``.
    """
    context_text, refs_text = build_context_and_citations(hits)

    style_hints = {
        "Concise": "ตอบสั้น กระชับ ตรงประเด็น ไม่เกิน 6 บรรทัด",
        "Detailed": "ตอบละเอียด มีหัวข้อย่อยและ bullet ชัดเจน",
    }
    if answer_style in style_hints:
        style_hint = style_hints[answer_style]
    else:
        style_hint = "ตอบชัดเจนเหมาะสม"

    # Question, numbered context, then the grounding / style / citation reminders.
    user_parts = [
        f"คำถาม: {question}\n\n",
        f"บริบท (อ้างอิงได้):\n{context_text}\n\n",
        "อย่าคิดเองหากไม่มีในบริบท ให้บอกว่าไม่พบข้อมูล\n",
        f"สไตล์คำตอบ: {style_hint}\n",
        "อย่าลืมใส่ [เลขอ้างอิง] ต่อท้ายประโยคที่ใช้อ้างอิง\n",
    ]
    return SYSTEM_INSTR, "".join(user_parts), refs_text

def generate_answer(question: str, hits, answer_style="Concise", max_new_tokens=320):
    """Generate an answer to ``question`` grounded in the retrieved ``hits``.

    Args:
        question: The user's question.
        hits: Retrieved hits; when empty, no generation is attempted.
        answer_style: Style key forwarded to ``make_prompt``.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        tuple[str, str]: ``(answer_text, references_text)``. With no hits the
        answer is a fixed "not found" message and the references are ``""``.
    """
    if not hits:
        return "ไม่พบข้อมูลในฐานความรู้ที่เกี่ยวข้องกับคำถามนี้", ""

    system, user, refs = make_prompt(question, hits, answer_style)
    if hasattr(tok, "apply_chat_template"):
        prompt = tok.apply_chat_template(
            [{"role": "system", "content": system}, {"role": "user", "content": user}],
            tokenize=False, add_generation_prompt=True
        )
    else:
        # Manual fallback for tokenizers that ship no chat template.
        prompt = f"<|system|>\n{system}\n<|user|>\n{user}\n<|assistant|>\n"

    # return_full_text=False makes the pipeline return only the continuation,
    # so the answer never depends on string-matching the echoed prompt.
    out = gen(
        prompt,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=0.2,
        top_p=0.9,
        pad_token_id=tok.eos_token_id,
        return_full_text=False,
    )[0]["generated_text"]
    return out.strip(), refs

`torch_dtype` is deprecated! Use `dtype` instead!


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cpu


# STEP 4: Compose Prompt + Generate (รวมขั้นตอน RAG และ Gradio UI)

In [None]:
import gradio as gr

def rag_ask(question, top_k=6, style="Concise"):
    """Answer one question end to end: retrieve, generate, format citations.

    Args:
        question: The user's question; blank input short-circuits with a prompt
            to type something.
        top_k: Number of chunks to retrieve (converted with ``int``).
        style: Answer style forwarded to ``generate_answer``.

    Returns:
        tuple[str, str, str]: ``(answer, sources_markdown, index_mtime_text)``.
        Any exception raised while retrieving or generating is reported in the
        answer slot as `` Error: <message>`` with empty sources.
    """
    if not (question and question.strip()):
        return " กรุณาพิมพ์คำถาม", "", get_index_mtime()

    try:
        hits = do_retrieve_inline(question, top_k=int(top_k))
        if not hits:
            return "ไม่พบข้อมูลที่เกี่ยวข้องในฐานความรู้", "", get_index_mtime()

        # Cap every chunk before it goes into the prompt.
        max_chars = 1200
        for hit in hits:
            chunk = hit["content_chunk"]
            if len(chunk) > max_chars:
                hit["content_chunk"] = chunk[:max_chars] + " ..."

        answer, _refs = generate_answer(question, hits, answer_style=style)

        # citations: one markdown entry per hit, numbered like the prompt context
        sources_md = "\n\n".join(
            f"**[{n}] {hit['title'] or '(no title)'}**  \n{hit['url']}"
            for n, hit in enumerate(hits, 1)
        )
        return answer, sources_md, get_index_mtime()
    except Exception as e:
        return f" Error: {e}", "", get_index_mtime()

def on_reload():
    """Reload the index assets and return a status line plus the file timestamps.

    Returns:
        tuple[str, str]: ``(status_message, index_mtime_text)``.
    """
    counts = load_assets()
    rows, vectors = counts["rows"], counts["vectors"]
    status = f"Reloaded  rows(meta)={rows} | vectors={vectors}"
    return status, get_index_mtime()

# Gradio UI: a question box, retrieval/style controls, Ask and Reload buttons,
# and two markdown panes for the answer and its citations.
with gr.Blocks(title="Agnos Forum RAG Chat") as demo:
    gr.Markdown("## Agnos Forum RAG Chat\nดึง index ล่าสุดอัตโนมัติเมื่อเปิด และสามารถ Reload ได้ด้วยปุ่มด้านขวา")

    # Inputs: free-text question, number of context chunks, answer style.
    with gr.Row():
        q = gr.Textbox(label="คำถาม", placeholder="พิมพ์คำถามที่นี่…", lines=3)
    with gr.Row():
        k = gr.Slider(3, 12, value=6, step=1, label="จำนวนชิ้นส่วนบริบท (Top-K)")
        style = gr.Radio(["Concise","Detailed"], value="Concise", label="สไตล์คำตอบ")

    # Actions, plus a read-only box showing when the index files were last modified.
    with gr.Row():
        ask_btn = gr.Button("ถาม", variant="primary")
        reload_btn = gr.Button("Reload Index")
        mtime_box = gr.Textbox(label="Index last modified", value=get_index_mtime(), interactive=False)

    # Outputs: generated answer and the list of cited sources.
    ans = gr.Markdown(label="คำตอบ")
    src = gr.Markdown(label="แหล่งอ้างอิง (Citations)")

    # rag_ask returns (answer, sources, mtime); on_reload returns (status, mtime),
    # so the reload status is shown in the answer pane.
    ask_btn.click(fn=rag_ask, inputs=[q, k, style], outputs=[ans, src, mtime_box])
    reload_btn.click(fn=on_reload, inputs=None, outputs=[ans, mtime_box])

# share=True publishes a public gradio.live URL; debug=True keeps this cell
# running so errors and logs stay visible in the notebook.
demo.launch(share=True, debug=True)

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://dfc1d9a2fd9934f928.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
