In [37]:
pip install PyMuPDF

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [38]:
import fitz  # PyMuPDF
import re
import unicodedata

import re
import unicodedata

def clean_section_text(text):
    lines = text.strip().splitlines()
    cleaned = []

    for line in lines:
        line = line.strip()

        # Supprimer le contenu entre parenthèses (ex : "(see note)")
        line = re.sub(r"\([^)]*\)", "", line)

        # Supprimer les lignes ne contenant qu’un chiffre
        if re.fullmatch(r"\d{1,3}", line):
            continue

        # Supprimer les mentions spécifiques inutiles
        if "International Medical Guide" in line:
            continue

        # Supprimer une ligne si elle est une répétition directe de la précédente (même contenu)
        if len(cleaned) == 1 and cleaned[0].lower() in line.lower():
            cleaned.pop()
            continue

        # Supprimer les lignes commençant par "See table"
        if line.lower().startswith("see table"):
            continue

        # Supprimer les lignes comme "Table 3.1 The characteristics..." ou "Table 12"
        if re.match(r"^table\s+\d+(\.\d+)?(\s|$)", line.strip(), re.IGNORECASE):
            continue

        # Supprimer les lignes décrivant des types de douleur dans un tableau
        if "types of pain" in line.lower() or "nociceptive pain" in line.lower():
            continue


        # Nettoyage des puces et symboles spéciaux
        line = re.sub(r"[■●◊:•▪❯]", "", line)

        # Supprimer les références à des chapitres ou figures
        line = re.sub(r"\b(?:See\s+)?Chapter\s+\d+\b", "", line, flags=re.IGNORECASE)
        line = re.sub(r"figure\s?\d+(?:\.\d+)?", "", line, flags=re.IGNORECASE)

        # Corriger les mots séparés par erreur (ex : "s e c t i o n" → "section")
        line = re.sub(r"\b([A-Za-z])\s+([A-Za-z])\b", r"\1\2", line)

        # Corriger les ligatures typographiques
        line = line.replace("ﬁ", "fi").replace("ﬂ", "fl")

        # Supprimer les caractères de contrôle invisibles
        line = ''.join(c for c in line if unicodedata.category(c)[0] != 'C')

        # Ajouter la ligne nettoyée si elle est non vide
        if line:
            cleaned.append(line)

    # Fusionner les lignes et séparer les phrases par des sauts de ligne après ponctuation
    merged = ' '.join(cleaned)
    return re.sub(r'([?.:])\s+', r'\1\n', merged).strip()


def clean_title(title):
    # Supprimer le contenu entre parenthèses dans les titres
    title = re.sub(r"\([^)]*\)", "", title)
    return re.sub(r"[^\w\s\-]", "", title).strip().title()

def is_valid_section_title(text, size, bold, upper):
    """
    Détermine si le texte est probablement un titre de section valide.
    """
    if not text or len(text) < 2:
        return False
    if text.startswith('(') or text.endswith(')'):
        return False
    if text in ["OR", "AND"] and len(text) <= 3:
        return False
    if len(text.split()) == 1 and len(text) < 5 and not upper and not bold:
        return False
    return True

# === DÉCOUPAGE EN CHAPITRES ET SECTIONS AVEC NETTOYAGE ===
def extract_chapters_and_sections(
    pdf_path,
    chapter_size=18,
    section_size=14,
    start_page=0,
    end_page=None
):
    """
    Découpe le PDF en chapitres puis en sections :
    - Chapitres détectés par taille de police >= chapter_size ou texte en MAJUSCULES
    - Sections détectées par taille >= section_size ou style gras
    Nettoie chaque chapitre avant de le découper en sections.
    """
    doc = fitz.open(pdf_path)

    # 1️⃣ Charger tout le texte pour positions globales
    pages_text = []
    page_offsets = []
    for i, page in enumerate(doc):
        if i < start_page or (end_page is not None and i > end_page):
            continue
        txt = page.get_text("text")
        page_offsets.append(len("".join(pages_text)))
        pages_text.append(txt)
    full_text = "".join(pages_text)

    # 2️⃣ Collecter tous les titres candidats (chapitre + section)
    elems = []
    for i, page in enumerate(doc):
        if i < start_page or (end_page is not None and i > end_page):
            continue
        base = page_offsets[i - start_page]
        d = page.get_text("dict")
        for block in d.get("blocks", []):
            for line in block.get("lines", []):
                if len(line["spans"]) != 1:
                    continue  # Ignorer les lignes avec plusieurs spans
                span = line["spans"][0]
                txt = span.get("text", "").strip()
                if not txt:
                    continue
                sz = span.get("size", 0)
                flags = span.get("flags", 0)
                bold = bool(flags & 2)
                upper = txt.isupper() and len(txt.split()) < 8

                if not is_valid_section_title(txt, sz, bold, upper):
                    continue

                if sz >= section_size or bold or upper:
                    pos = full_text.find(txt, base)
                    elems.append({"text": txt, "pos": pos if pos >= 0 else base, "page": i, "size": sz, "bold": bold, "upper": upper})
    elems.sort(key=lambda e: e["pos"])

    # 3️⃣ Séparer chapitres et sections
    chapters = []
    current = None
    for e in elems:
        if e["size"] >= chapter_size or (e["upper"] and e["size"] >= section_size):
            current = {"title": e["text"], "start": e["pos"], "page": e["page"], "sections": []}
            chapters.append(current)
        else:
            if current:
                current["sections"].append({"title": e["text"], "start": e["pos"], "page": e["page"]})

    # 4️⃣ Extraire le contenu du chapitre et ajouter la section "Introduction"
    for i, chap in enumerate(chapters):
        start = chap["start"]
        end = chapters[i+1]["start"] if i+1 < len(chapters) else len(full_text)

        first_section_pos = chap["sections"][0]["start"] if chap["sections"] else end

        intro_text = full_text[start:first_section_pos].strip()
        intro_text = clean_section_text(intro_text)

        sections = []
        if intro_text and first_section_pos > start:
            intro_section = {
                "title": "Introduction",
                "start": start,
                "page": chap["page"],
                "body": intro_text
            }
            sections.append(intro_section)

        for idx, sec in enumerate(chap["sections"]):
            sec_start = sec["start"]
            sec_end = chap["sections"][idx+1]["start"] if idx+1 < len(chap["sections"]) else end
            sec_text = full_text[sec_start:sec_end].strip()
            sections.append({
                "title": clean_title(sec["title"]),
                "page": sec["page"],
                "body": clean_section_text(sec_text)
            })

        chap["sections"] = sections

    # 5️⃣ Formater les résultats finaux
    results = []
    for chap in chapters:
        chap_dict = {"title": chap["title"], "page": chap["page"], "sections": chap["sections"]}
        results.append(chap_dict)

    return results




In [39]:
if __name__ == "__main__":
    pdf = "/kaggle/input/chatmed-set/datasetmedicalboot.pdf"
    structure = extract_chapters_and_sections(
        pdf,
        chapter_size=18,
        section_size=14,
        start_page=20,
        end_page=428
    )

In [40]:

#     for chap in structure:
#         print(f"CHAPITRE: {chap['title']} (page {chap['page']})")
#         for sec in chap['sections']:
#             print(f"  - Section: {sec['title']} (page {sec['page']}), {len(sec['body'])} caractères")

In [41]:
def remove_short_sections(chapters, min_length=100):
    """
    Supprime :
    - les sections de chaque chapitre ayant un corps de texte avec moins de `min_length` caractères.
    - les chapitres ne contenant plus aucune section après le filtrage.
    """
    cleaned_chapters = []
    
    for chap in chapters:
        original_section_count = len(chap["sections"])
        chap["sections"] = [sec for sec in chap["sections"] if len(sec["body"]) >= min_length]
        removed = original_section_count - len(chap["sections"])
        
        if removed:
            print(f"{removed} section(s) supprimée(s) dans le chapitre '{chap['title']}'")
        
        if chap["sections"]:
            cleaned_chapters.append(chap)
        else:
            print(f"Chapitre '{chap['title']}' supprimé car il ne contient aucune section valide.")
    
    return cleaned_chapters


In [42]:
if __name__ == "__main__":
    pdf = "/kaggle/input/chatmed-set/datasetmedicalboot.pdf"
    structure = extract_chapters_and_sections(
        pdf,
        chapter_size=18,
        section_size=14,
        start_page=20,
        end_page=428
    )

    # 🔻 Nettoyage : supprimer les sections < 100 caractères
    structure = remove_short_sections(structure, min_length=100)

    # # 🔍 Affichage
    # for chap in structure:
    #     print(f"CHAPITRE: {chap['title']} (page {chap['page']})")
    #     for sec in chap['sections']:
    #         print(f"  - Section: {sec['title']} (page {sec['page']}), {len(sec['body'])} caractères")


1 section(s) supprimée(s) dans le chapitre 'First aid'
Chapitre 'First aid' supprimé car il ne contient aucune section valide.
1 section(s) supprimée(s) dans le chapitre 'FIRST AID ON BOARD'
8 section(s) supprimée(s) dans le chapitre 'A BASIC LIFE SUPPORT SEQUENCE'
1 section(s) supprimée(s) dans le chapitre 'BLEEDING'
2 section(s) supprimée(s) dans le chapitre 'ANATOMICAL NOTE'
1 section(s) supprimée(s) dans le chapitre 'Eye injuries and diseases'
Chapitre 'Eye injuries and diseases' supprimé car il ne contient aucune section valide.
5 section(s) supprimée(s) dans le chapitre 'EYE INJURIES'
4 section(s) supprimée(s) dans le chapitre 'NONINFECTIOUS EYE DISEASES'
4 section(s) supprimée(s) dans le chapitre 'INFECTIOUS EYE DISEASES'
22 section(s) supprimée(s) dans le chapitre 'SPECIFIC INJURIES'
Chapitre 'Abdominal and chest injuries' supprimé car il ne contient aucune section valide.
Chapitre 'Abdominal and chest injuries' supprimé car il ne contient aucune section valide.
2 section(s) su

In [43]:
def afficher_sections_chapitre(chapters, chapitre_titre, nb_exemples=2, extrait_longueur=200):
    """
    Affiche les premières sections d’un chapitre spécifique pour vérification.
    """
    for chap in chapters:
        if chap['title'].strip().lower() == chapitre_titre.strip().lower():
            print(f"\n📘 CHAPITRE : {chap['title']} (page {chap['page']})")
            print(f"Nombre de sections : {len(chap['sections'])}")
            print("-" * 60)

            for i, sec in enumerate(chap['sections'][:nb_exemples]):
                print(f"🔹 Section: {sec['title']} (page {sec['page']})")
                print(f"Longueur: {len(sec['body'])} caractères")
                extrait = sec['body'][:extrait_longueur].strip().replace('\n', ' ')
                print(f"Extrait: {extrait}...")
                print("-" * 40)
            return
    
    print(f"Aucun chapitre trouvé avec le titre '{chapitre_titre}'.")


In [44]:
afficher_sections_chapitre(structure, "Pain management", nb_exemples=3, extrait_longueur=700)



📘 CHAPITRE : Pain management (page 37)
Nombre de sections : 10
------------------------------------------------------------
🔹 Section: Introduction (page 37)
Longueur: 1261 caractères
Extrait: Pain management Pain is the result of the way in which the brain – and consequently the mind or consciousness  – interprets information about a sensation that the body is  experiencing. The brain receives the information in the form of signals that travel via nerve pathways to the brain. The sensation itself may originate in a tissue such as the skin or a bone, or in an internal organ, or even somewhere along the nerve pain pathways. How the brain receives or reacts to these signals to produce the perception of “pain” can be affected by many factors; for example stress or anxiety can make the mind more sensitive to pain, which is then experi- enced more intensely; inflammation of nerve pathw...
----------------------------------------
🔹 Section: Note On Assessing The Severity Of Pain (page 37)
L

In [45]:
for chap in structure:
    for section in chap["sections"]:
        word_count = len(section["body"].split())
        if word_count > 1000:
            print("=" * 80)
            print(f"📘 Chapitre: {chap['title']}")
            print(f"📄 Section: {section['title']} (Page {section['page']})")
            print(f"📝 Nombre de mots: {word_count}")
            print("-" * 80)
            print(section["body"][:1000] + "...")
            print("=" * 80 + "\n")


📘 Chapitre: THE PHYSICAL EXAMINATION
📄 Section: Introduction (Page 129)
📝 Nombre de mots: 1496
--------------------------------------------------------------------------------
THE PHYSICAL EXAMINATION Unlike the fi rst stage of the basic medical examination, which focuses on subjective s ymptoms experienced and described by the patient, the physical examination looks for objective signs of abnormal functioning of the body.
Again, a systematic head-to-toe approach should be adopted .
Organ or system Ask the patient about past or present occurrences of Head  wounds , headache; Eyes  blurred vision, double vision, pain, yellow colour of the sclera , pain or discomfort on looking at a light source; Ears  loss of hearing, dizziness, pain, or drainage of fl uid; Nose  bleeding, runny, or stuﬀ y; Mouth and throat  sores, pain, diﬃ  culty swallowing; Neck  stiﬀ ness, enlarged lymph glands, pain; Respiratory system  coughing, sputum production, coughing up of blood, chest pain when breathing, s

In [46]:
import json

with open("structure.json", "w", encoding="utf-8") as f:
    json.dump(structure, f, indent=2, ensure_ascii=False)


# aproche 1

### 

In [47]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Aplatir les chapitres et sous-sections extraites en une liste de documents
flat_subsections = []
for chapter in structure:
    for section in chapter['sections']:
        flat_subsections.append({
            "section": chapter['title'],
            "subheader": section['title'],
            "text": section['body']
        })

# 1) Aplatir tes sous-sections en une liste de documents
docs = [f"{item['subheader']}. {item['text']}" for item in flat_subsections]

# 2) Construire et entraîner le TF–IDF
vectorizer = TfidfVectorizer(
    lowercase=True,
    stop_words='english',    # ou 'english' selon la langue de ton contenu
    ngram_range=(1,2),       # unigrams + bigrams
    min_df=2,                # ignorer les termes qui apparaissent < 2 docs
)
tfidf_matrix = vectorizer.fit_transform(docs)  # shape = (n_docs, n_features)

# 3) Fonction de récupération top-k par similarité cosinus
def retrieve_tfidf_for_test(question, k=3):
    # Vectoriser la question
    q_vec = vectorizer.transform([question])
    # Calculer les similarités
    sims = cosine_similarity(q_vec, tfidf_matrix)[0]
    # Récupérer les indices des k meilleurs scores
    topk_idx = np.argsort(sims)[-k:][::-1]
    results = []
    for idx in topk_idx:
        results.append({
            "section": flat_subsections[idx]["section"],
            "subheader": flat_subsections[idx]["subheader"],
            "text": flat_subsections[idx]["text"],
            "score": float(sims[idx])
        })
    return results

# 4) Test rapide
question =" how applicate Mouth-To-Nose Rescue Breathing?"
top3 = retrieve_tfidf_for_test(question, k=3)
for r in top3:
    print(f"→ {r['section']} / {r['subheader']} (score={r['score']:.3f})")
    print(r['text'][:200].replace('\n',' ') + "...\n")


→ A BASIC LIFE SUPPORT SEQUENCE / Mouth-To-Nose Rescue Breathing (score=0.384)
the patient’s mouth cannot be opened; a tight seal cannot be obtained around the patient’s lips; an obstruction cannot be removed from the patient’s mouth; the patient has been rescued from water and ...

→ A BASIC LIFE SUPPORT SEQUENCE / Mouth-To-Mouth Rescue Breathing (score=0.371)
MOUTH-TO-MOUTH RESCUE BREATHING With one hand under the patient’s neck, keep the patient’s head tilted as far back as it will go – unless you suspect spinal injury, in which case use minimal tilt. Pla...

→ A BASIC LIFE SUPPORT SEQUENCE / Using A Bag And Mask Resuscitator (score=0.347)
USING A BAG AND MASK RESUSCITATOR A bag and mask resuscitator can be used for rescue breathing to replace mouth- to-mouth or mouth-to-nose breathing. The advantages of a bag and mask resuscitator are ...



In [48]:
question=" how applicate Mouth-To-Nose Rescue Breathing?"


In [49]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

model_name = "google/flan-t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

model = T5ForConditionalGeneration.from_pretrained(model_name)

# Fusionner les 3 meilleurs contextes pour donner à T5
combined_context = " ".join([r["text"] for r in top3])

# Préparer le prompt pour le modèle
input_text = f"question: {question} context: {combined_context} answer:"
inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)

# Générer la réponse
output = model.generate(**inputs, max_length=300)
answer = tokenizer.decode(output[0], skip_special_tokens=True)

print("🧠 Réponse générée:", answer)


🧠 Réponse générée: 10–12 times per minute


# approche 2

In [None]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
from sentence_transformers import SentenceTransformer, util
import spacy

# 🔹 Charger les modèles
qa_model_name = "google/flan-t5-base"
tokenizer = T5Tokenizer.from_pretrained(qa_model_name)
qa_model = T5ForConditionalGeneration.from_pretrained(qa_model_name)

sbert_model = SentenceTransformer("all-MiniLM-L6-v2")
nlp = spacy.load("en_core_web_sm")

# 🔹 Encoder les documents une seule fois
doc_embeddings = sbert_model.encode(docs, convert_to_tensor=True)

# 🔍 Récupération des documents pertinents avec score
def retrieve_relevant_docs(question, k=3):
    question_embedding = sbert_model.encode(question, convert_to_tensor=True)
    cosine_scores = util.cos_sim(question_embedding, doc_embeddings)[0]
    top_k = torch.topk(cosine_scores, k=k)

    # Renvoyer les documents + leurs scores
    return [(docs[i], float(cosine_scores[i])) for i in top_k.indices]




Batches:   0%|          | 0/30 [00:00<?, ?it/s]

In [None]:
# ❓ Question exemple
question = question=" how applicate Mouth-To-Nose Rescue Breathing?"


# 🔍 Récupérer les documents pertinents avec scores
relevant_docs = retrieve_relevant_docs(question)

print("\n🔹 Top documents with similarity scores:")
for doc, score in relevant_docs:
    print(f"- 📈 Score de similarité : {score:.4f} | Document: {doc}")





In [None]:
########################################  utilisation   ################################

In [None]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
from sentence_transformers import SentenceTransformer, util
import spacy

# 🔹 Charger les modèles
qa_model_name = "google/flan-t5-base"
tokenizer     = T5Tokenizer.from_pretrained(qa_model_name)
qa_model      = T5ForConditionalGeneration.from_pretrained(qa_model_name)

sbert_model = SentenceTransformer("all-MiniLM-L6-v2")
nlp         = spacy.load("en_core_web_sm")

# 🔹 Préparer tes docs (inchangé)
doc_embeddings = sbert_model.encode(docs, convert_to_tensor=True)

def retrieve_relevant_docs(question, k=3):
    q_emb         = sbert_model.encode(question, convert_to_tensor=True)
    cosine_scores = util.cos_sim(q_emb, doc_embeddings)[0]
    top_k         = torch.topk(cosine_scores, k=k)
    return [docs[i] for i in top_k.indices]

def extract_key_sentences(context, question, max_sentences=3):
    doc               = nlp(context)
    keywords          = [t.text.lower() for t in nlp(question) if not t.is_stop]
    relevant_sentences= []
    for sent in doc.sents:
        if any(kw in sent.text.lower() for kw in keywords):
            relevant_sentences.append(sent.text)
    return " ".join(relevant_sentences[:max_sentences])

# ➊ Extraction du passage de réponse
def extract_answer_span(question, context):
    prompt = f"""
You are an information extraction assistant.
Extract only the sentence(s) from the context that answer the question.

Context:
{context}

Question:
{question}

Answer (extract):
"""
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
    out    = qa_model.generate(**inputs, max_length=100, num_beams=5, early_stopping=True)
    return tokenizer.decode(out[0], skip_special_tokens=True)

# ➋ Paraphrase dans un style friendly
def paraphrase_text(text):
    prompt = f"""
Paraphrase the following text and details in your own words, using a warm conversational tone,
and end by inviting the user to ask a follow-up question .

Text to paraphrase:
\"\"\"{text}\"\"\"

Paraphrased answer:
"""
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
    out    = qa_model.generate(
        **inputs,
        min_length=20,
        max_length=300,
        num_beams=8,
        length_penalty=1.2,
        no_repeat_ngram_size=2,
        early_stopping=True
    )
    return tokenizer.decode(out[0], skip_special_tokens=True)

# ➌ Pipeline final
def chat_response(question, k_docs=3):
    # Récupérer et filtrer le contexte
    docs           = retrieve_relevant_docs(question, k=k_docs)
    ctx_raw        = " ".join(docs)
    refined_ctx    = extract_key_sentences(ctx_raw, question)
    # Extraction puis paraphrase
    span           = extract_answer_span(question, refined_ctx)
    answer         = paraphrase_text(span)
    return answer




In [None]:
# — Exemple d’utilisation —
if __name__ == "__main__":
    question = "Post-concussion syndrome is more common in women or men  ?"
    response = chat_response(question)
    print("💬 Bot :", response)
    # print("Similarity Score (with question):", response["scores"]["similarity_with_question"])
    # print("Similarity Score (with context):", response["scores"]["similarity_with_context"])

In [None]:
# — Exemple d’utilisation —
if __name__ == "__main__":
    question =" Low blood pressure does necessarily mean that the patient is in shock if not why ?"
    response = chat_response(question)
    print("💬 Bot :", response)
  

In [None]:
# — Exemple d’utilisation —
if __name__ == "__main__":
    question =" what is The first step in dealing with an eye injury?"
    response = chat_response(question)
    print("💬 Bot :", response)

In [None]:
# — Exemple d’utilisation —
if __name__ == "__main__":
    question =" what is the  most the most common cause of  shock ?"
    response = chat_response(question)
    print("💬 Bot :", response)

In [None]:
# — Exemple d’utilisation —
if __name__ == "__main__":
    question =" what are the types of shock ?"
    response = chat_response(question)
    print("💬 Bot :", response)

In [None]:
# — Exemple d’utilisation —
if __name__ == "__main__":
    question ="what are the domages cause by head inguries?"
    response = chat_response(question)
    print("💬 Bot :", response)

In [None]:
# — Exemple d’utilisation —
if __name__ == "__main__":
    question ="what is first aid"
    response = chat_response(question)
    print("💬 Bot :", response)

In [None]:
# — Exemple d’utilisation —
if __name__ == "__main__":
    question ="what is pain"
    response = chat_response(question)
    print("💬 Bot :", response)

In [None]:
# — Exemple d’utilisation —
if __name__ == "__main__":
    question ="how applicate Mouth-To-Nose Rescue Breathing?"
    response = chat_response(question)
    print("💬 Bot :", response)

In [None]:
# — Exemple d’utilisation —
if __name__ == "__main__":
    question ="How should abdominal thrusts be performed on a conscious person??"
    response = chat_response(question)
    print("💬 Bot :", response)

In [None]:
# — Exemple d’utilisation —
if __name__ == "__main__":
    question ="What are the two key questions to ask when managing a bleeding patient??"
    response = chat_response(question)
    print("💬 Bot :", response)

In [None]:
# — Exemple d’utilisation —
if __name__ == "__main__":
    question ="what are Signs And Symptoms of Shock?"
    response = chat_response(question)
    print("💬 Bot :", response)

In [None]:
torch.save(doc_embeddings, "doc_embeddings.pt")


In [None]:
# doc_embeddings = torch.load("doc_embeddings.pt")


# streamlit 

In [None]:
# import streamlit as st
# import torch
# import json
# from sentence_transformers import SentenceTransformer, util
# from transformers import T5Tokenizer, T5ForConditionalGeneration
# import spacy

# # --- Configuration GPU/CPU ---
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# # 👇 Ceci doit être AVANT tout autre st.*
# st.set_page_config(page_title="ChatMed Bot (CUDA)", layout="wide")

# # 🔒 Caching des modèles et ressources lourdes
# @st.cache_resource
# def load_models():
#     qa_model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base").to(device)
#     tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base")
#     sbert = SentenceTransformer("all-MiniLM-L6-v2", device=device)
#     nlp = spacy.load("en_core_web_sm")
#     return qa_model, tokenizer, sbert, nlp

# @st.cache_data
# def load_structure(path="structure (1).json"):
#     with open(path, "r", encoding="utf-8") as f:
#         return json.load(f)

# @st.cache_data
# def load_doc_embeddings(path="doc_embeddings.pt"):
#     return torch.load(path, map_location=device)

# # ⏬ Chargement des modèles et données
# qa_model, tokenizer, sbert_model, nlp = load_models()
# structure = load_structure()
# doc_embeddings = load_doc_embeddings()

# # 🔄 Reconstruction des docs (plat)
# docs = [f"{section['title']}. {section['body']}" 
#         for chapter in structure 
#         for section in chapter['sections']]

# # 🔍 Récupération des docs pertinents
# def retrieve_relevant_docs(question, k=3):
#     question_embedding = sbert_model.encode(
#         question, convert_to_tensor=True, device=device
#     )
#     cosine_scores = util.cos_sim(question_embedding, doc_embeddings)[0]
#     top_k = torch.topk(cosine_scores, k=k)
#     return [docs[i] for i in top_k.indices], top_k.values.tolist()

# # 🔬 Extraction des phrases clés
# def extract_key_sentences(context, question, max_sentences=3):
#     doc = nlp(context)
#     question_keywords = [token.text.lower() for token in nlp(question) if not token.is_stop]
#     relevant_sentences = []
#     for sent in doc.sents:
#         if any(keyword in sent.text.lower() for keyword in question_keywords):
#             relevant_sentences.append(sent.text)
#     return " ".join(relevant_sentences[:max_sentences])

# # 🧠 Génération de réponse
# def generate_answer(question, context):
#     prompt = f"""
# Based on the following context, answer the question concisely and include details.

# Context:
# {context}

# Question:
# {question}

# Answer:"""
#     inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512).to(device)
#     output = qa_model.generate(
#         **inputs,
#         max_length=150,
#         num_beams=5,
#         early_stopping=True
#     )
#     answer = tokenizer.decode(output[0], skip_special_tokens=True)
#     return answer

# # 🎯 Interface Streamlit
# st.title("🧠 Semantic QA - Medical Document Assistant ")
# st.markdown("Posez une question sur un document médical structuré.")

# question = st.text_input("❓ Votre question")
# if st.button("Get Answer") and question:
#     with st.spinner("🔍 Recherche en cours..."):
#         relevant_docs, scores = retrieve_relevant_docs(question, k=3)
#         context = extract_key_sentences(" ".join(relevant_docs), question)
#         answer = generate_answer(question, context)

#     st.subheader("📚 Documents pertinents (Top 3)")
#     for i, (doc, score) in enumerate(zip(relevant_docs, scores), 1):
#         st.markdown(f"**Doc {i} - Similarité cosinus :** `{score:.4f}`")
#         st.markdown(f"> {doc[:300]}...")  # Affiche un extrait (300 caractères)

#     st.subheader("✅ Réponse générée")
#     st.write(answer)

# # Footer
# st.markdown("---")
# st.markdown(f"**Device utilisé:** {device}")


In [None]:
# streamlit run app.py


In [None]:
# project/
# ├── app.py
# ├── structure.json
# ├── doc_embeddings.pt
