In [1]:
!pip install -U "transformers>=4.44.0" "sentence-transformers>=3.0.0" "accelerate>=0.26.0"
!pip install -q -U faiss-cpu bitsandbytes
!pip install openai google-generativeai


Collecting transformers>=4.44.0
  Downloading transformers-5.2.0-py3-none-any.whl.metadata (32 kB)
Downloading transformers-5.2.0-py3-none-any.whl (10.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.4/10.4 MB[0m [31m35.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 5.0.0
    Uninstalling transformers-5.0.0:
      Successfully uninstalled transformers-5.0.0
Successfully installed transformers-5.2.0
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.8/23.8 MB[0m [31m48.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.7/60.7 MB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m


In [2]:
import torch
from sentence_transformers import SentenceTransformer
from transformers import AutoModelForCausalLM, AutoTokenizer
from google.colab import userdata
from openai import OpenAI
import google.generativeai as genai
import faiss
import numpy as np
import os



All support for the `google.generativeai` package has ended. It will no longer be receiving 
updates or bug fixes. Please switch to the `google.genai` package as soon as possible.
See README for more details:

https://github.com/google-gemini/deprecated-generative-ai-python/blob/main/README.md

  loader.exec_module(module)


In [3]:
# Pick the compute device: the GPU when PyTorch can see one, the CPU otherwise.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

Using device: cpu


## load embedding model

In [4]:
# Sentence encoder used to embed both the knowledge-base passages and the user queries.
EMBEDDING_MODEL_NAME = "BAAI/bge-m3"
embed_model = SentenceTransformer(EMBEDDING_MODEL_NAME)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/123 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/54.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/687 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

Loading weights:   0%|          | 0/391 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/444 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

## Candidate model setup
- baseline model : gemini-3-flash-preview
- candidate 1 : Gemma-SEA-LION-v4-27B-IT
- candidate 2 : Kimi-K2.5

In [6]:
# Registry of the LLM back ends being compared, keyed by a short provider name.
# Each entry carries the client "type" that get_llm_client dispatches on, the
# API key fetched through `userdata.get`, the model identifier and, for
# OpenAI-compatible endpoints, the base URL.
LLM_CHOICES = dict(
    # baseline model
    gemini=dict(
        type="gemini",
        api_key=userdata.get("gemini_api"),
        model="gemini-3-flash-preview",
    ),
    # candidate 1
    sealions=dict(
        type="openai",
        api_key=userdata.get('sealion_api'),
        base_url="https://api.sea-lion.ai/v1",
        model="aisingapore/Gemma-SEA-LION-v4-27B-IT",
    ),
    # candidate 2
    kimi=dict(
        type="openai",
        api_key=userdata.get('openrouter_api'),
        base_url="https://openrouter.ai/api/v1",
        model="moonshotai/Kimi-K2.5",
    ),
)
def get_llm_client(provider_name):
    """Build the API client for one of the providers registered in ``LLM_CHOICES``.

    Args:
        provider_name: Key of ``LLM_CHOICES`` selecting the provider entry.

    Returns:
        A 3-tuple ``(client, model_name, provider_type)``. For ``"openai"``
        entries ``client`` is an ``OpenAI`` instance created with the entry's
        ``api_key`` and ``base_url``; for ``"gemini"`` entries it is a
        ``genai.GenerativeModel`` for the entry's model.

    Raises:
        KeyError: ``provider_name`` is not a key of ``LLM_CHOICES``.
        ValueError: the entry's ``"type"`` is neither ``"openai"`` nor ``"gemini"``.
    """
    try:
        config = LLM_CHOICES[provider_name]
    except KeyError:
        # Same exception type as a plain dict lookup, but tell the user what is valid.
        raise KeyError(
            f"unknown provider {provider_name!r}; choose one of {sorted(LLM_CHOICES)}"
        ) from None

    provider_type = config["type"]

    if provider_type == "openai":
        client = OpenAI(
            api_key=config["api_key"],
            base_url=config["base_url"]
        )
        return client, config["model"], "openai"

    if provider_type == "gemini":
        # The key is configured on the genai module itself, not on the model object.
        genai.configure(api_key=config["api_key"])
        model = genai.GenerativeModel(config["model"])
        return model, config["model"], "gemini"

    # Previously this fell through and returned None, which only surfaced later
    # as a confusing tuple-unpacking error in the caller.
    raise ValueError(
        f"unsupported client type {provider_type!r} for provider {provider_name!r}"
    )


## RAG Knowledge Ingestion

In [7]:
# store RAG data using faiss
# Small triage knowledge base. Every entry is a dict of Thai strings:
#   "text"     - symptom description; this is the field that gets embedded and retrieved
#   "dept"     - department label for the entry
#   "severity" - triage colour label for the entry
medical_kb = [
    {
        # Common cold: low fever, sore throat, cough, runny nose. Care: rest and drink plenty of water.
        "text": "ไข้หวัดทั่วไป: มีไข้ต่ำ เจ็บคอ ไอ น้ำมูกไหล การดูแล: พักผ่อนและดื่มน้ำมากๆ",
        "dept": "อายุรกรรมทั่วไป",  # general internal medicine
        "severity": "เขียว (ไม่รุนแรง)"  # green (not severe)
    },
    {
        # Chest tightness, difficulty breathing, sweating: sign of cardiac ischaemia or a critical condition.
        "text": "อาการแน่นหน้าอก หายใจลำบาก เหงื่อแตก: สัญญาณหัวใจขาดเลือดหรือภาวะวิกฤต",
        "dept": "แผนกฉุกเฉิน",  # emergency department
        "severity": "แดง (ฉุกเฉิน)"  # red (emergency)
    },
    {
        # Acute lower-right abdominal pain with fever: risk of appendicitis.
        "text": "ปวดท้องขวาล่างเฉียบพลันและมีไข้: เสี่ยงไส้ติ่งอักเสบ",
        "dept": "แผนกศัลยกรรม/ฉุกเฉิน",  # surgery / emergency department
        "severity": "แดง (ฉุกเฉิน)"  # red (emergency)
    }
]


# Embed every knowledge-base passage. The vectors are L2-normalised so that the
# inner product computed by the index equals cosine similarity.
documents = [kb_entry['text'] for kb_entry in medical_kb]
doc_embeddings = embed_model.encode(documents, normalize_embeddings=True)

# Flat inner-product index sized to the embedding width; FAISS is fed float32.
dimension = doc_embeddings.shape[1]
index = faiss.IndexFlatIP(dimension)
embedding_matrix = np.array(doc_embeddings)
index.add(embedding_matrix.astype('float32'))

## Setup model context

In [17]:
def medical_rag_system(user_query, provider="kimi", temperature=0.1):
    """Answer a symptom description with a retrieval-augmented triage reply.

    The query is embedded with ``embed_model``, the single closest entry of
    ``medical_kb`` is fetched from the FAISS ``index``, and that entry (symptom
    text, department and severity) is passed as reference material to the LLM
    selected by ``provider``.

    Args:
        user_query: The patient's free-text description of the symptoms.
        provider: Provider name handed to ``get_llm_client``.
        temperature: Sampling temperature forwarded to the model.

    Returns:
        The model's answer followed by a fixed Thai disclaimer stating that the
        reply is a preliminary screening, not a medical diagnosis.

    Raises:
        RuntimeError: the index search produced no match, or the model returned
            no answer text.
        ValueError: ``get_llm_client`` reported a provider type this function
            does not know how to call.
    """
    # Step: Search - embed the query and fetch the single nearest knowledge-base entry.
    query_embedding = embed_model.encode([user_query], normalize_embeddings=True)
    _, indices = index.search(np.array(query_embedding).astype('float32'), k=1)

    best_idx = int(indices[0][0])
    if best_idx < 0:
        # A negative label means "no hit"; using it as a list index would
        # silently select the last knowledge-base entry instead.
        raise RuntimeError("knowledge base search returned no match")
    match = medical_kb[best_idx]

    # The system prompt restricts the model to the supplied reference and asks
    # for assessment, severity and department, so the reference block has to
    # carry the entry's department and severity as well as its symptom text.
    messages = [
        {
            "role": "system",
            "content": """
        คุณคือผู้ช่วยคัดกรองอาการผู้ป่วย
        ใช้เฉพาะข้อมูลอ้างอิงที่ให้
        ต้องตอบเป็น 3 ส่วน:
        1. การประเมินอาการ
        2. ระดับความรุนแรง
        3. แผนกที่ควรไป
        ตอบภาษาเดียวกับคำถาม
        ห้ามทวนคำสั่ง
        """
        },
        {
            "role": "user",
            "content": f"""
ข้อมูลอ้างอิง:
{match['text']}
แผนกที่แนะนำ: {match['dept']}
ระดับความรุนแรง: {match['severity']}

คำถาม:
{user_query}
"""
        }
    ]
    client, model_name, provider_type = get_llm_client(provider)
    print("using model: " + model_name)

    if provider_type == "openai":
        response = client.chat.completions.create(
            model=model_name,
            messages=messages,
            temperature=temperature,
            max_tokens=1024
        )
        answer = response.choices[0].message.content

    elif provider_type == "gemini":
        # Gemini expects single prompt string
        full_prompt = "\n".join([m["content"] for m in messages])

        response = client.generate_content(
            full_prompt,
            generation_config={
                "temperature": temperature,
                "max_output_tokens": 1024
            }
        )
        answer = response.text

    else:
        # Without this branch `answer` would be unbound further down.
        raise ValueError(f"unsupported provider type: {provider_type!r}")

    if not answer:
        # The completion content can be None/empty; fail clearly rather than
        # crash on `None + str` or return a disclaimer with no triage above it.
        raise RuntimeError(f"model {model_name} returned an empty answer")

    disclaimer = "\n\n*หมายเหตุ: นี่คือการคัดกรองและการแนะนำเบื้องต้น ไม่ใช่การวินิจฉัยทางการแพทย์*"
    return answer + disclaimer


## Q&A Implementation

In [18]:
# Example query: "severe chest pain, difficulty breathing".
query = "เจ็บหน้าอกมาก หายใจลำบาก"
# Call first, then format: reusing double quotes inside a double-quoted
# f-string is a SyntaxError on Python versions before 3.12.
triage_result = medical_rag_system(query, provider="kimi")
print(f"ผลลัพธ์การคัดกรอง:\n{triage_result}")

using model: moonshotai/Kimi-K2.5
ผลลัพธ์การคัดกรอง:
**1. การประเมินอาการ**  
อาการเจ็บหน้าอกมากร่วมกับหายใจลำบาก เป็นสัญญาณของภาวะหัวใจขาดเลือดหรือภาวะวิกฤต

**2. ระดับความรุนแรง**  
วิกฤติ

**3. แผนกที่ควรไป**  
แผนกฉุกเฉิน (ห้องฉุกเฉิน) ทันที

*หมายเหตุ: นี่คือการคัดกรองและการแนะนำเบื้องต้น ไม่ใช่การวินิจฉัยทางการแพทย์*


# **What to fill in proposal**

## LLM Model Selection

## 1. Baseline Model: Gemini 3 Flash
**Gemini 3 Flash** serves as the primary benchmark for this study due to its balance of speed and advanced reasoning.

### Rationale for Selection
* **Instruction Following:** Demonstrates high proficiency in complex reasoning and adhering to strict prompt constraints.
* **Multilingual Support:** Native processing capabilities for Southeast Asian languages, specifically **Thai**.
* **Task Stability:** Provides consistent performance across structured tasks such as classification, extraction, and summarization.
* **Documentation:** Widely adopted and well-documented, making it an ideal reference point for comparative analysis.

### Comparison metric
![Artificial Analysis Intelligence Index](https://artificialanalysis.ai/img/articles/gemini-3-flash-everything-you-need-to-know/Artificial_Analysis_Intelligence_Index_%2816_Dec_25%29.png)

---
## 2. Candidate Models

### A. SEA-LION (Gemma-based 27B Instruction-tuned)
A model built specifically for the Southeast Asian (SEA) region.

* **Key Strengths:**
    * Optimized for regional linguistic nuances.
    * Strong alignment with local medical and cultural expressions.
    * Open-weight architecture, providing greater deployment flexibility and private hosting capabilities.
* **Comparison to Baseline:**
    * **Optimization:** Regionally focused vs. Gemini’s global training.
    * **Reasoning:** May trade off some general reasoning for superior local context.

### B. Kimi K2.5
A high-performance model known for its efficiency in handling large datasets.

* **Key Strengths:**
    * Exceptional **long-context handling** for processing lengthy documents.
    * Strong structured output consistency.
    * Competitive cost-to-performance ratio in production environments.
* **Comparison to Baseline:**
    * **Architecture:** Offers a different training approach compared to Google’s Gemini series.
    * **Context:** Prioritizes maintaining coherence over massive input tokens.
  ![kimi_metric](https://miro.medium.com/1*Ycy0aWssByBlhf0pb88CWg.png)

---

## 3. Comparative Summary

| Feature | Gemini 3 Flash (Baseline) | SEA-LION (27B) | Kimi K2.5 |
| :--- | :--- | :--- | :--- |
| **Optimization** | Global / General | Southeast Asian Regional | Long-Context / Efficiency |


---

### **Embedding Model** (default)
**1. bge-m3 :** vector embedding model
![bge-m3](https://scontent.fbkk12-5.fna.fbcdn.net/v/t39.30808-6/504256971_3984009171837940_2984735354200405157_n.jpg?_nc_cat=110&ccb=1-7&_nc_sid=aa7b47&_nc_ohc=GrsYXizIJ2MQ7kNvwExDhEt&_nc_oc=AdmT3VCJlAgoYPWfAQemWXXsGkjvCu6BzgA1XwU5cW_9oXT36Lxmsr9_seJD8gv8q5A&_nc_zt=23&_nc_ht=scontent.fbkk12-5.fna&_nc_gid=7_oZaUIZbMQJeefqnBHPkA&oh=00_AfuxZNLD1XXfXyoSGVaizHACVPIaUWjCbVxvPA9zFXLKlQ&oe=699A4510)
reference: https://huggingface.co/spaces/panuthept/thai_sentence_embedding_benchmark

BGE-M3 is specifically trained to handle over 100 languages. In the context of our project, it excels at semantic mapping: understanding that a patient’s casual description of a symptom (e.g., "ปวดจี๊ดๆ ที่อก") carries the same semantic weight as the formal medical terms in our database (e.g., "Chest pain" or "Angina").

