In [1]:
!pip -q install "sentence-transformers>=2.6.0" faiss-cpu "datasets>=2.20.0" \
                 "google-generativeai>=0.7.2" gradio "python-dotenv>=1.0.1"

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m50.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
!pip install google-cloud-secret-manager



In [3]:
import os, re, json, math, time, pickle, textwrap, uuid
from typing import List, Dict, Any, Tuple

import numpy as np
import pandas as pd

import google.generativeai as genai
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
import faiss
import gradio as gr

np.random.seed(42)


In [4]:
from google.colab import userdata
gemini_api_key = userdata.get('gemini')
hugging_api_key = userdata.get("huggingface")

In [5]:
CFG = {
    "dataset_id": "umutertugrul/turkish-hospital-medical-articles",
    "hospital_filter": "medipol",
    "title_fields": ["title", "page_title", "name"],
    "text_fields" : ["text", "content", "article_text", "body"],
    "url_fields"  : ["url", "link", "source_url"],
    "source_fields": ["source", "__source", "hospital", "site", "origin"],

    "embedding_model": "trmteb/turkish-embedding-model",
    "chunk_size_words": 300,
    "chunk_overlap_words": 40,

    "top_k": 3,
    "faiss_index_path": "/content/medipol_faiss.index",
    "metadata_path": "/content/medipol_metadata.pkl",

    "gemini_model": "gemini-2.0-flash"  # or "gemini-2.0-flash-exp" / "gemini-1.5-flash"
}
CFG


{'dataset_id': 'umutertugrul/turkish-hospital-medical-articles',
 'hospital_filter': 'medipol',
 'title_fields': ['title', 'page_title', 'name'],
 'text_fields': ['text', 'content', 'article_text', 'body'],
 'url_fields': ['url', 'link', 'source_url'],
 'source_fields': ['source', '__source', 'hospital', 'site', 'origin'],
 'embedding_model': 'trmteb/turkish-embedding-model',
 'chunk_size_words': 300,
 'chunk_overlap_words': 40,
 'top_k': 3,
 'faiss_index_path': '/content/medipol_faiss.index',
 'metadata_path': '/content/medipol_metadata.pkl',
 'gemini_model': 'gemini-2.0-flash'}

In [6]:
genai.configure(api_key=gemini_api_key)

In [7]:
from huggingface_hub import login
login(token=hugging_api_key)


In [8]:
ds = load_dataset(CFG["dataset_id"], split="medipol")
print("Columns:", ds.column_names, "Total rows:", len(ds))



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/5.45k [00:00<?, ?B/s]

data/acibadem-00000-of-00001.parquet:   0%|          | 0.00/28.8M [00:00<?, ?B/s]

data/anadolusaglik-00000-of-00001.parque(…):   0%|          | 0.00/3.78M [00:00<?, ?B/s]

data/atlas-00000-of-00001.parquet:   0%|          | 0.00/345k [00:00<?, ?B/s]

data/baskentistanbul-00000-of-00001.parq(…):   0%|          | 0.00/890k [00:00<?, ?B/s]

data/bayindir-00000-of-00001.parquet:   0%|          | 0.00/2.14M [00:00<?, ?B/s]

data/florence-00000-of-00001.parquet:   0%|          | 0.00/6.45M [00:00<?, ?B/s]

data/guven-00000-of-00001.parquet:   0%|          | 0.00/2.17M [00:00<?, ?B/s]

data/liv-00000-of-00001.parquet:   0%|          | 0.00/5.73M [00:00<?, ?B/s]

data/medicalpark-00000-of-00001.parquet:   0%|          | 0.00/1.70M [00:00<?, ?B/s]

data/medicalpoint-00000-of-00001.parquet:   0%|          | 0.00/1.20M [00:00<?, ?B/s]

data/medicana-00000-of-00001.parquet:   0%|          | 0.00/8.52M [00:00<?, ?B/s]

data/medipol-00000-of-00001.parquet:   0%|          | 0.00/4.00M [00:00<?, ?B/s]

data/memorial-00000-of-00001.parquet:   0%|          | 0.00/21.3M [00:00<?, ?B/s]

data/yeditepe-00000-of-00001.parquet:   0%|          | 0.00/2.71M [00:00<?, ?B/s]

Generating acibadem split:   0%|          | 0/6339 [00:00<?, ? examples/s]

Generating anadolusaglik split:   0%|          | 0/1012 [00:00<?, ? examples/s]

Generating atlas split:   0%|          | 0/130 [00:00<?, ? examples/s]

Generating baskentistanbul split:   0%|          | 0/394 [00:00<?, ? examples/s]

Generating bayindir split:   0%|          | 0/690 [00:00<?, ? examples/s]

Generating florence split:   0%|          | 0/1641 [00:00<?, ? examples/s]

Generating guven split:   0%|          | 0/666 [00:00<?, ? examples/s]

Generating liv split:   0%|          | 0/2836 [00:00<?, ? examples/s]

Generating medicalpark split:   0%|          | 0/371 [00:00<?, ? examples/s]

Generating medicalpoint split:   0%|          | 0/654 [00:00<?, ? examples/s]

Generating medicana split:   0%|          | 0/2163 [00:00<?, ? examples/s]

Generating medipol split:   0%|          | 0/1380 [00:00<?, ? examples/s]

Generating memorial split:   0%|          | 0/5338 [00:00<?, ? examples/s]

Generating yeditepe split:   0%|          | 0/998 [00:00<?, ? examples/s]

Columns: ['url', 'title', 'text', 'publish_date', 'update_date', 'scrape_date'] Total rows: 1380


In [9]:
HTML_TAG_RE = re.compile(r"<[^>]+>")
WS_RE = re.compile(r"\s+")

def pick_first(row: dict, keys: List[str], default=""):
    for k in keys:
        if k in row and row[k] not in (None, ""):
            return str(row[k])
    return default

def clean_text(t: str) -> str:
    if not t:
        return ""
    t = HTML_TAG_RE.sub(" ", t)
    t = WS_RE.sub(" ", t).strip()
    return t


In [10]:
records = []
for ex in ds:
    title = pick_first(ex, CFG["title_fields"], default="Medipol Makalesi")
    url   = pick_first(ex, CFG["url_fields"], "")
    text  = pick_first(ex, CFG["text_fields"], "")
    text  = clean_text(text)
    if not text or len(text) < 150:  # drop too-short pages
        continue
    records.append({"title": title, "url": url, "text": text})

df = pd.DataFrame(records).drop_duplicates(subset=["url", "title", "text"], keep="first")
print(df.shape)
df.head(10)


(1377, 3)


Unnamed: 0,title,url,text
0,Girişimsel Radyoloji Yöntemleri: Tanı ve Tedav...,https://medipol.com.tr/bilgi-kosesi/adan-zye-s...,"Girişimsel radyoloji, radyolojik görüntüleme t..."
1,"Anafilaksi: Belirtiler, Nedenler ve Hayat Kurt...",https://medipol.com.tr/bilgi-kosesi/adan-zye-s...,"Anafilaksi, ciddi ve hayatı tehdit edebilecek ..."
2,Ameliyatsız Bel Fıtığı Tedavi Yöntemleri ve Et...,https://medipol.com.tr/bilgi-kosesi/adan-zye-s...,"Bel fıtığı, günümüzde birçok kişinin yaşadığı ..."
3,"Adrenal Kitleler: Belirtiler, Teşhis Yöntemler...",https://medipol.com.tr/bilgi-kosesi/adan-zye-s...,"Adrenal kitleler, böbrek üstü bezlerinde (adre..."
4,"Akciğer Kanseri: Belirtileri, Nedenleri ve Ted...",https://medipol.com.tr/bilgi-kosesi/adan-zye-s...,"Akciğer kanseri, dünya genelinde milyonlarca i..."
5,Ağrı Yönetimi ve Tedavi Yöntemleri,https://medipol.com.tr/bilgi-kosesi/adan-zye-s...,"Ağrı, birçok kişinin yaşam kalitesini olumsuz ..."
6,"Böbrek Tümörleri: Belirtiler, Tedavi Yöntemler...",https://medipol.com.tr/bilgi-kosesi/adan-zye-s...,"Böbrek tümörleri, böbreklerde anormal hücre bü..."
7,Böbrek Kanseri ve Risk Faktörleri,https://medipol.com.tr/bilgi-kosesi/adan-zye-s...,"Böbrek kanseri, böbrek hücrelerinin kontrolsüz..."
8,"Covid Sonrası Kalp Sağlığı: Riskler, Belirtile...",https://medipol.com.tr/bilgi-kosesi/adan-zye-s...,"Covid-19, yalnızca solunum yollarını değil, ay..."
9,"Çocuklarda Miyopi: Nedenleri, Belirtileri ve T...",https://medipol.com.tr/bilgi-kosesi/adan-zye-s...,"Miyopi, çocukluk çağında en sık görülen görme ..."


In [11]:
def word_chunks(text: str, size=300, overlap=40) -> List[str]:
    words = text.split()
    chunks = []
    i = 0
    while i < len(words):
        chunk = words[i:i+size]
        if not chunk:
            break
        chunks.append(" ".join(chunk))
        i += size - overlap
    return chunks

chunk_rows = []
for i, row in df.iterrows():
    chunks = word_chunks(row["text"], CFG["chunk_size_words"], CFG["chunk_overlap_words"])
    for j, ch in enumerate(chunks):
        chunk_rows.append({
            "doc_id": f"doc-{i}",
            "chunk_id": f"doc-{i}-chunk-{j}",
            "title": row["title"],
            "url": row["url"],
            "hospital": "Medipol",
            "content": ch
        })

chunks_df = pd.DataFrame(chunk_rows)
print("Num chunks:", len(chunks_df), "Avg length:", int(chunks_df['content'].str.split().str.len().mean()))
chunks_df.head(2)


Num chunks: 4133 Avg length: 242


Unnamed: 0,doc_id,chunk_id,title,url,hospital,content
0,doc-0,doc-0-chunk-0,Girişimsel Radyoloji Yöntemleri: Tanı ve Tedav...,https://medipol.com.tr/bilgi-kosesi/adan-zye-s...,Medipol,"Girişimsel radyoloji, radyolojik görüntüleme t..."
1,doc-0,doc-0-chunk-1,Girişimsel Radyoloji Yöntemleri: Tanı ve Tedav...,https://medipol.com.tr/bilgi-kosesi/adan-zye-s...,Medipol,drenaj işlemleri 4. Ürolojik Hastalıklar - Böb...


In [12]:
model = SentenceTransformer(CFG["embedding_model"])
dim = model.get_sentence_embedding_dimension()
print("Embedding dim:", dim)

index = None
meta = None

if os.path.exists(CFG["faiss_index_path"]) and os.path.exists(CFG["metadata_path"]):
    print("Loading existing FAISS + metadata...")
    index = faiss.read_index(CFG["faiss_index_path"])
    with open(CFG["metadata_path"], "rb") as f:
        meta = pickle.load(f)
else:
    print("Building FAISS from scratch...")
    embeddings = model.encode(chunks_df["content"].tolist(), batch_size=64, show_progress_bar=True, convert_to_numpy=True, normalize_embeddings=True)
    index = faiss.IndexFlatIP(dim)  # cosine via normalized dot-product
    index.add(embeddings)

    # store metadata aligned with vector order
    meta = chunks_df[["chunk_id", "doc_id", "title", "url", "hospital"]].to_dict(orient="records")

    # persist
    faiss.write_index(index, CFG["faiss_index_path"])
    with open(CFG["metadata_path"], "wb") as f:
        pickle.dump(meta, f)

print("Index ntotal:", index.ntotal, "Meta items:", len(meta))


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/205 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/583 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/442M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

Embedding dim: 768
Building FAISS from scratch...


Batches:   0%|          | 0/65 [00:00<?, ?it/s]

Index ntotal: 4133 Meta items: 4133


In [13]:
def retrieve(query: str, top_k: int = 3) -> List[Dict[str, Any]]:
    q_emb = model.encode([query], convert_to_numpy=True, normalize_embeddings=True)
    scores, idxs = index.search(q_emb, top_k)
    idxs = idxs[0].tolist()
    scores = scores[0].tolist()
    results = []
    for rank, (i, s) in enumerate(zip(idxs, scores), 1):
        if i < 0:  # faiss padding if not enough
            continue
        item = meta[i].copy()
        item["score"] = float(s)
        item["rank"] = rank
        item["content"] = chunks_df.iloc[i]["content"]
        results.append(item)
    return results


In [14]:
def build_prompt(question: str, contexts: List[Dict[str, Any]]) -> str:
    # Safety + citation instructions
    header = (
        "Aşağıdaki içerik parçalarına dayanarak soruyu yanıtla.\n"
        "Tıbbi sorular dışında bir soru gelirse 'Benim alanım değil ama' dedikten sonra bildiğin kadarıyla cevap ver.\n"
        "Yetersiz bilgi varsa 'Bilmiyorum' veya 'Belgelerde yeterli bilgi yok' de.\n"
        "Tıbbi tavsiye verme; genel bilgilendirme yap ve uzman görüşüne yönlendir.\n"
        "Cevabın sonunda 'Kaynaklar:' altında başlık ve URL ver.\n\n"
    )

    ctx_txt = []
    for c in contexts:
        ctx_txt.append(
            f"Başlık: {c['title']}\nURL: {c['url']}\nİçerik: {c['content']}\n"
        )
    ctx_block = "\n---\n".join(ctx_txt)

    q = f"Soru: {question}\n"
    return header + "Belgeler:\n" + ctx_block + "\n\n" + q + "Yanıt:"


def generate_answer(question: str, top_k: int = None, model_name: str = None) -> Tuple[str, List[Dict[str, Any]]]:
    k = top_k or CFG["top_k"]
    contexts = retrieve(question, top_k=k)

    prompt = build_prompt(question, contexts)
    model_name = model_name or CFG["gemini_model"]
    model_g = genai.GenerativeModel(model_name)

    resp = model_g.generate_content(prompt)
    text = getattr(resp, "text", "").strip() or "Üzgünüm, bir yanıt oluşturulamadı."
    return text, contexts


In [16]:
import gradio as gr

# from your_module import CFG, generate_answer
FIXED_TOP_K = CFG["top_k"]
FIXED_MODEL = "gemini-2.0-flash-exp"

def chat_fn(message, history):
    try:
        answer, _ctxs = generate_answer(
            message, top_k=int(FIXED_TOP_K), model_name=FIXED_MODEL
        )
        history = history + [[message, answer]]
        return history, gr.update(value="")
    except Exception as e:
        err = f"⚠️ Hata: {e}"
        history = history + [[message, err]]
        return history, gr.update(value="")

with gr.Blocks(
    title="Medipol RAG Chatbot",
    theme=gr.themes.Soft(),
    css="""
    body { background-color: #0f1218; } /* dark background */
    .app-header {
        display:flex; align-items:center; gap:14px;
        padding:18px 20px; border-radius:16px;
        background:linear-gradient(135deg,#1e88ff 0%,#00b8a9 100%);
        color:white; box-shadow:0 4px 12px rgba(0,0,0,0.25);
        margin-bottom:16px;
    }
    .app-header h1 { font-size:1.4rem; margin:0; color:white; }
    .app-subtle { color:rgba(255,255,255,0.92); font-size:0.95rem; margin:3px 0 0 0; }

    /* center the initial welcome message */
    .welcome-msg {
        text-align:center;
        line-height:1.6;
    }
    .welcome-msg b {
        color:#1e88ff;
    }
    """
) as demo:
    # Header
    gr.HTML(
        """
        <div class="app-header">
            <div style="font-size:1.8rem">🏥</div>
            <div>
                <h1>Medipol Medical Articles — Chatbot</h1>
                <div class="app-subtle">Bilgilendirme amaçlıdır; tıbbi tavsiye değildir.</div>
            </div>
        </div>
        """
    )

    # Centered initial message
    chat = gr.Chatbot(
        label="Sohbet",
        height=480,
        bubble_full_width=False,
        value=[
            (
                None,
                """<div class='welcome-msg'>
                👋 Merhaba!<br><br>
                Ben <b>Medipol Chatbot</b>.<br>
                Tıbbi makalelerden derlenmiş bilgilerle sorularınıza yanıt veririm.<br><br>
                <i>Örnek sorular:</i><br>
                • Bel fıtığı tedavi yöntemleri nelerdir?<br>
                • Kütletme sağlıklı mıdır??<br>
                • Migren atağı için kanıta dayalı yaklaşımlar neler?<br>
                </div>"""
            )
        ],
        sanitize_html=False  # allow HTML for centering
    )

    msg = gr.Textbox(
        label="Soru",
        placeholder="Örn: Bel fıtığı tedavi yöntemleri nelerdir?",
        lines=3
    )

    with gr.Row():
        send_btn = gr.Button("Yanıtla", variant="primary")
        clear_btn = gr.Button("Sohbeti Temizle", variant="secondary")

    send_btn.click(chat_fn, inputs=[msg, chat], outputs=[chat, msg])
    msg.submit(chat_fn, inputs=[msg, chat], outputs=[chat, msg])
    clear_btn.click(lambda: [], None, chat)

demo.launch(debug=True)


  chat = gr.Chatbot(
  chat = gr.Chatbot(


It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://fa16b05c543ebb5500.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://fa16b05c543ebb5500.gradio.live


