# 05 - Orchestration LLM

**Objectif** : Combiner tous les composants pour répondre en langage naturel

**Flow** :
1. Question utilisateur
2. Recherche vectorielle Mediatech → datasets pertinents
3. MCP datagouv → données fraîches
4. LLM Albert → réponse avec sources

## 1. Configuration et imports

In [None]:
import os
import json
import re
import numpy as np
import duckdb
import httpx
from dotenv import load_dotenv

# Load environment variables from the .env file one directory above the notebook.
load_dotenv("../.env")

# Albert API credentials and endpoint; the URL falls back to a built-in default.
# NOTE(review): ALBERT_API_KEY is None when the variable is unset — confirm the
# .env file defines it, otherwise requests are sent with "Bearer None".
ALBERT_API_KEY = os.getenv("ALBERT_API_KEY")
ALBERT_API_URL = os.getenv("ALBERT_API_URL", "https://albert.api.etalab.gouv.fr/v1")
# Endpoint of the MCP server, overridable through MCP_DATAGOUV_URL.
MCP_URL = os.getenv("MCP_DATAGOUV_URL", "https://mcp.data.gouv.fr/mcp")

# Echo the resolved endpoints (the API key is deliberately not printed).
print(f"‚úÖ Albert API : {ALBERT_API_URL}")
print(f"‚úÖ MCP URL : {MCP_URL}")

## 2. Charger Mediatech (recherche vectorielle)

In [None]:
%%time
# Load the Mediatech data
PARQUET_GLOB = "../huggingface/data_gouv_datasets_catalog_part_*.parquet"
con = duckdb.connect()

# Read every parquet part matching the glob into one DataFrame. Only rows with
# a non-NULL "embeddings_bge-m3" column are kept; that column is exposed as
# embedding_json.
df = con.execute(f"""
    SELECT 
        doc_id,
        title,
        organization,
        description,
        url,
        quality_score,
        "embeddings_bge-m3" as embedding_json
    FROM read_parquet('{PARQUET_GLOB}')
    WHERE "embeddings_bge-m3" IS NOT NULL
""").df()

print(f"‚úÖ {len(df):,} datasets charg√©s")

In [None]:
%%time
# Parse and normalize the embeddings
# Each embedding_json value is decoded with json.loads, then stacked as float32.
embeddings_list = [json.loads(e) for e in df["embedding_json"]]
embeddings_matrix = np.array(embeddings_list, dtype=np.float32)

# Pre-normalize the rows so cosine similarity reduces to a dot product
norms = np.linalg.norm(embeddings_matrix, axis=1, keepdims=True)
# Guard against all-zero vectors: dividing by 0 would turn the row into NaN,
# and NaN scores end up first in a reversed (descending) argsort ranking.
norms[norms == 0] = 1.0
embeddings_normalized = embeddings_matrix / norms

print(f"‚úÖ Embeddings : {embeddings_matrix.shape}")

## 3. Fonctions Albert API

In [None]:
def get_embedding(text: str) -> np.ndarray:
    """
    Fetch the BGE-M3 embedding of a text from the Albert API.

    Args:
        text: Text to embed.

    Returns:
        A float32 array built from the ``embedding`` field of the first
        item in the API response.

    Raises:
        httpx.HTTPStatusError: If the API answers with an error status.
    """
    auth_headers = {
        "Authorization": f"Bearer {ALBERT_API_KEY}",
        "Content-Type": "application/json",
    }
    request_body = {"model": "BAAI/bge-m3", "input": text}

    with httpx.Client(timeout=30) as http:
        reply = http.post(
            f"{ALBERT_API_URL}/embeddings",
            headers=auth_headers,
            json=request_body,
        )
        reply.raise_for_status()

    # The body was fully read inside the `with`, so it can be decoded afterwards.
    vector = reply.json()["data"][0]["embedding"]
    return np.array(vector, dtype=np.float32)


# LLM model available on the Albert API
LLM_MODEL = "mistralai/Mistral-Small-3.2-24B-Instruct-2506"  # alias: albert-large


def chat_completion(
    messages: list,
    model: str = LLM_MODEL,
    *,
    temperature: float = 0.3,
    max_tokens: int = 1024,
) -> str:
    """
    Call the Albert LLM and return the generated answer.

    Args:
        messages: Chat history as a list of ``{"role": ..., "content": ...}`` dicts.
        model: Model identifier sent to the API.
        temperature: Sampling temperature forwarded to the API (keyword-only).
        max_tokens: Upper bound on generated tokens forwarded to the API
            (keyword-only).

    Returns:
        The ``content`` of the first choice's message.

    Raises:
        httpx.HTTPStatusError: If the API answers with an error status.
    """
    url = f"{ALBERT_API_URL}/chat/completions"
    headers = {
        "Authorization": f"Bearer {ALBERT_API_KEY}",
        "Content-Type": "application/json",
    }
    payload = {
        "model": model,
        "messages": messages,
        "temperature": temperature,
        "max_tokens": max_tokens,
    }

    with httpx.Client(timeout=60) as client:
        response = client.post(url, headers=headers, json=payload)
        response.raise_for_status()

    data = response.json()
    return data["choices"][0]["message"]["content"]


# Smoke test: one short round-trip through chat_completion with the default model
print(f"ü§ñ Mod√®le LLM : {LLM_MODEL}")
test_response = chat_completion([{"role": "user", "content": "Dis bonjour en une phrase."}])
print(f"‚úÖ LLM test : {test_response}")

## 4. Recherche vectorielle

In [None]:
def search_datasets(query: str, top_k: int = 5) -> list[dict]:
    """
    Run a semantic search over the Mediatech catalogue.

    The query is embedded with ``get_embedding``, compared to every
    pre-normalized dataset embedding by dot product (cosine similarity), and
    the best matches are returned in decreasing similarity order.

    Args:
        query: Free-text search query.
        top_k: Maximum number of datasets to return. Values <= 0 yield an
            empty list without calling the embedding API.

    Returns:
        One dict per match with the keys ``doc_id``, ``title``,
        ``organization``, ``description`` (at most 500 characters, ``""``
        when missing), ``url`` and ``similarity`` (float).
    """
    # A negative top_k would otherwise slice "all but the last |k|" results.
    if top_k <= 0:
        return []

    # Query embedding
    query_embedding = get_embedding(query)
    query_length = np.linalg.norm(query_embedding)
    # A zero (or NaN) norm cannot be normalized; every score would be NaN.
    if not query_length > 0:
        return []
    query_norm = query_embedding / query_length

    # Cosine similarity against the pre-normalized matrix
    similarities = embeddings_normalized @ query_norm

    # Top-k, best first
    top_indices = np.argsort(similarities)[::-1][:top_k]

    results = []
    for idx in top_indices:
        row = df.iloc[idx]
        description = row["description"]
        results.append({
            "doc_id": row["doc_id"],
            "title": row["title"],
            "organization": row["organization"],
            # Missing values may surface as None or NaN; NaN is truthy and
            # cannot be sliced, so test for str explicitly.
            "description": description[:500] if isinstance(description, str) else "",
            "url": row["url"],
            "similarity": float(similarities[idx]),
        })

    return results

# Smoke test: print the three best matches with their similarity score
results = search_datasets("qualit√© de l'air", top_k=3)
for r in results:
    print(f"[{r['similarity']:.3f}] {r['title']}")

## 5. Client MCP

In [None]:
class MCPClient:
    """Minimal JSON-RPC client for an MCP server reached over HTTP.

    Every message is POSTed to ``base_url``. The server may answer with plain
    JSON or with a Server-Sent Events stream; both are handled. A
    ``Mcp-Session-Id`` response header, when present, is remembered and sent
    back on later requests.
    """

    def __init__(self, base_url: str):
        """
        Args:
            base_url: URL of the MCP endpoint.
        """
        self.base_url = base_url
        self.session_id = None  # filled from the Mcp-Session-Id response header
        self._request_id = 0  # last JSON-RPC id used; incremented per request

    def _post(self, payload: dict):
        """POST one JSON-RPC message and return the HTTP response.

        Raises:
            httpx.HTTPStatusError: If the server answers with an error status.
        """
        headers = {
            "Content-Type": "application/json",
            "Accept": "application/json, text/event-stream"
        }
        if self.session_id:
            headers["Mcp-Session-Id"] = self.session_id

        with httpx.Client(timeout=60, http2=False) as client:
            response = client.post(self.base_url, json=payload, headers=headers)
            # Record the session id even when the status is an error.
            if "mcp-session-id" in response.headers:
                self.session_id = response.headers["mcp-session-id"]
            response.raise_for_status()
        return response

    @staticmethod
    def _unwrap(message) -> dict:
        """Return the ``result`` of a JSON-RPC response.

        Raises:
            RuntimeError: If the response carries a JSON-RPC ``error`` member.
        """
        if not isinstance(message, dict):
            return {}
        error = message.get("error")
        if error:
            if isinstance(error, dict):
                detail = f"{error.get('code')}: {error.get('message')}"
            else:
                detail = str(error)
            raise RuntimeError(f"MCP error {detail}")
        return message.get("result", {})

    @staticmethod
    def _parse_sse(body: str, request_id) -> dict:
        """Extract the JSON-RPC result from a Server-Sent Events body.

        The stream may carry notifications or server requests before the
        actual response, so events without ``result``/``error`` are skipped
        and the response whose ``id`` equals ``request_id`` is preferred. If
        no id matches, the first response-shaped event is used.

        Returns:
            The ``result`` dict, or ``{}`` when the stream holds no response.
        """
        first_response = None
        # Split on "\n" only: JSON payloads may legally contain other
        # characters that str.splitlines() treats as line breaks.
        for line in body.split("\n"):
            if not line.startswith("data:"):
                continue
            data = line[5:].strip()
            if not data:
                continue
            message = json.loads(data)
            if not isinstance(message, dict):
                continue
            if "result" not in message and "error" not in message:
                continue  # notification or server-to-client request
            if message.get("id") == request_id:
                return MCPClient._unwrap(message)
            if first_response is None:
                first_response = message
        if first_response is None:
            return {}
        return MCPClient._unwrap(first_response)

    def _call(self, method: str, params: dict = None) -> dict:
        """Send a JSON-RPC request and return its ``result``.

        Args:
            method: JSON-RPC method name.
            params: Method parameters; ``None`` is sent as ``{}``.

        Raises:
            httpx.HTTPStatusError: On an HTTP error status.
            RuntimeError: When the server answers with a JSON-RPC error.
        """
        self._request_id += 1
        request_id = self._request_id
        response = self._post({
            "jsonrpc": "2.0",
            "id": request_id,
            "method": method,
            "params": params or {}
        })

        # Parse SSE if the server chose to stream the answer
        if "text/event-stream" in response.headers.get("content-type", ""):
            return self._parse_sse(response.text, request_id)

        return self._unwrap(response.json())

    def _notify(self, method: str, params: dict = None) -> None:
        """Send a JSON-RPC notification (no ``id``, no response body expected)."""
        payload = {"jsonrpc": "2.0", "method": method}
        if params:
            payload["params"] = params
        self._post(payload)

    def initialize(self):
        """Perform the MCP handshake and return the server's initialize result."""
        result = self._call("initialize", {
            "protocolVersion": "2024-11-05",
            "capabilities": {},
            "clientInfo": {"name": "poc-datagouv", "version": "0.1.0"}
        })
        # The MCP lifecycle requires this notification after a successful
        # initialize, before any other request is sent.
        self._notify("notifications/initialized")
        return result

    @staticmethod
    def _text_from_result(result: dict) -> str:
        """Join the text of every ``type == "text"`` content block with newlines."""
        blocks = result.get("content") or []
        texts = [
            block.get("text", "")
            for block in blocks
            if isinstance(block, dict) and block.get("type") == "text"
        ]
        return "\n".join(texts)

    def call_tool(self, name: str, arguments: dict = None) -> str:
        """Call a tool and return the text of its response.

        Args:
            name: Tool name.
            arguments: Tool arguments; ``None`` is sent as ``{}``.

        Returns:
            The concatenated text blocks, or ``""`` when there are none.

        Raises:
            RuntimeError: When the tool result is flagged ``isError``.
        """
        result = self._call("tools/call", {"name": name, "arguments": arguments or {}})
        text = self._text_from_result(result)
        if result.get("isError"):
            # Do not hand an error message back as if it were tool data.
            raise RuntimeError(f"MCP tool error: {text}")
        return text

# Initialize
mcp = MCPClient(MCP_URL)
mcp.initialize()
# session_id stays None when the server sends no Mcp-Session-Id header;
# slicing None would crash the cell, so fall back to a placeholder.
print(f"‚úÖ MCP connect√© (session: {(mcp.session_id or 'none')[:8]}...)")

## 6. Assistant complet

In [None]:
# System message sent as the first chat message by ask(). It is written in
# French and tells the model to answer from the supplied datasets, cite its
# sources, admit when the information is missing, and reply in French.
SYSTEM_PROMPT = """Tu es un assistant sp√©cialis√© dans les donn√©es publiques fran√ßaises (data.gouv.fr).

Tu r√©ponds aux questions en te basant sur les datasets fournis dans le contexte.
- Sois pr√©cis et factuel
- Cite tes sources (titre du dataset, organisation)
- Si tu ne trouves pas l'information, dis-le clairement
- R√©ponds en fran√ßais
"""


def ask(question: str, top_k: int = 3, fetch_mcp: bool = True) -> str:
    """
    Full pipeline: question -> search -> optional enrichment -> answer.

    Args:
        question: Question in natural language.
        top_k: Number of datasets to retrieve.
        fetch_mcp: If True, enrich each dataset with data fetched through MCP.
            Enrichment is best-effort: a failure is reported and the dataset
            is kept without the extra information.

    Returns:
        The answer text produced by the LLM.
    """
    print("üîç Recherche de datasets pertinents...")

    # 1. Vector search
    datasets = search_datasets(question, top_k=top_k)

    # 2. Build the context
    context_parts = []

    for i, ds in enumerate(datasets, 1):
        part = f"""### Dataset {i}: {ds['title']}
- Organisation: {ds['organization']}
- URL: {ds['url']}
- Score de pertinence: {ds['similarity']:.2f}
- Description: {ds['description'][:300]}..."""

        # Enrich through MCP when requested
        if fetch_mcp:
            try:
                mcp_info = mcp.call_tool("get_dataset_info", {"dataset_id": ds["doc_id"]})
            except Exception as exc:
                # Catch Exception rather than everything so Ctrl-C still
                # interrupts, and report the failure instead of hiding it.
                print(f"[MCP] Enrichissement impossible pour {ds['doc_id']} : {exc}")
            else:
                if mcp_info:
                    part += f"\n- Infos MCP:\n{mcp_info[:500]}"

        context_parts.append(part)

    context = "\n\n".join(context_parts)

    # 3. Call the LLM
    print("ü§ñ G√©n√©ration de la r√©ponse...")

    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": f"""Contexte (datasets trouv√©s sur data.gouv.fr) :

{context}

---

Question : {question}

R√©ponds de mani√®re concise en citant les sources."""}
    ]

    return chat_completion(messages)


# Confirm that the cell ran and ask() is now defined
print("‚úÖ Fonction ask() pr√™te")

## 7. Tests

In [None]:
# Test 1 - vector search only (fetch_mcp=False), three datasets in the context
question = "O√π trouver les donn√©es sur les bornes de recharge √©lectrique ?"
print(f"‚ùì {question}\n")

response = ask(question, top_k=3, fetch_mcp=False)
print(f"\nüí¨ R√©ponse :\n{response}")

In [None]:
# Test 2 - same question as Test 1, this time with MCP enrichment
question = "O√π trouver les donn√©es sur les bornes de recharge √©lectrique ?"
print(f"‚ùì {question}\n")

response = ask(question, top_k=3, fetch_mcp=True)
print(f"\nüí¨ R√©ponse :\n{response}")

In [None]:
# Test 3 - wider context (top_k=5), no MCP enrichment
question = "Quels sont les jeux de donn√©es les plus populaires sur les transports en commun ?"
print(f"‚ùì {question}\n")

response = ask(question, top_k=5, fetch_mcp=False)
print(f"\nüí¨ R√©ponse :\n{response}")

## 8. Résumé

**Pipeline implémenté** :
```
Question → Embedding (Albert) → Recherche vectorielle (Mediatech)
    → Enrichissement (MCP) → Génération réponse (Albert LLM)
```

**Fonction principale** : `ask(question, top_k, fetch_mcp)`

---

## Prochaine étape

**Notebook 06** : Interface utilisateur finale