# 06 - Démo : Assistant data.gouv.fr

Interface interactive pour interroger les données publiques françaises en langage naturel.

## 1. Initialisation

In [None]:
import os
import json
import numpy as np
import duckdb
import httpx
import ipywidgets as widgets
from IPython.display import display, Markdown, clear_output
from dotenv import load_dotenv

# Load environment variables from the .env file one directory up.
load_dotenv("../.env")

# Albert API credentials and endpoint; the URL falls back to a default
# when the environment variable is not set.
ALBERT_API_KEY = os.environ.get("ALBERT_API_KEY")
ALBERT_API_URL = os.environ.get(
    "ALBERT_API_URL",
    "https://albert.api.etalab.gouv.fr/v1",
)

# data.gouv MCP endpoint, overridable through MCP_DATAGOUV_URL.
MCP_URL = os.environ.get(
    "MCP_DATAGOUV_URL",
    "https://mcp.data.gouv.fr/mcp",
)

# Model name sent in the chat-completion payload.
LLM_MODEL = "mistralai/Mistral-Small-3.2-24B-Instruct-2506"

print("‚è≥ Chargement des donn√©es...")

In [None]:
%%time
# Load the Mediatech catalog from the local parquet shards.
PARQUET_GLOB = "../huggingface/data_gouv_datasets_catalog_part_*.parquet"
con = duckdb.connect()

# Only rows that carry a BGE-M3 embedding are kept; the embedding column is
# exposed as `embedding_json` and parsed with json.loads below.
df = con.execute(f"""
    SELECT 
        doc_id, title, organization, description, url, quality_score,
        "embeddings_bge-m3" as embedding_json
    FROM read_parquet('{PARQUET_GLOB}')
    WHERE "embeddings_bge-m3" IS NOT NULL
""").df()

# Parse the embeddings and L2-normalise each row so that a dot product with a
# unit-length query vector gives the cosine similarity.
embeddings_list = [json.loads(e) for e in df["embedding_json"]]
embeddings_matrix = np.array(embeddings_list, dtype=np.float32)
norms = np.linalg.norm(embeddings_matrix, axis=1, keepdims=True)

# An all-zero embedding has norm 0; dividing by it would fill the row with NaN.
# np.argsort places NaN last, so after the [::-1] reversal used by the search
# such rows would be ranked first. Dividing by 1 instead keeps the row at zero
# (similarity 0 with every query).
safe_norms = norms.copy()
safe_norms[safe_norms == 0] = 1.0
embeddings_normalized = embeddings_matrix / safe_norms

print(f"‚úÖ {len(df):,} datasets charg√©s")

In [None]:
# Fonctions API
def get_embedding(text: str) -> np.ndarray:
    """Embed ``text`` with the BAAI/bge-m3 model through the Albert API.

    Sends a POST request to ``{ALBERT_API_URL}/embeddings`` using a client
    with a 30-second timeout.

    Args:
        text: The string to embed.

    Returns:
        The first embedding of the response as a float32 array.

    Raises:
        httpx.HTTPStatusError: If the API answers with an error status.
    """
    request_headers = {
        "Authorization": f"Bearer {ALBERT_API_KEY}",
        "Content-Type": "application/json",
    }
    body = {"model": "BAAI/bge-m3", "input": text}

    with httpx.Client(timeout=30) as client:
        reply = client.post(
            f"{ALBERT_API_URL}/embeddings",
            headers=request_headers,
            json=body,
        )
        reply.raise_for_status()

    vector = reply.json()["data"][0]["embedding"]
    return np.array(vector, dtype=np.float32)


def chat_completion(messages: list) -> str:
    """Send ``messages`` to the Albert chat-completion endpoint.

    The request uses ``LLM_MODEL`` with temperature 0.3 and at most 1024
    tokens, on a client with a 60-second timeout.

    Args:
        messages: Chat messages forwarded as the ``messages`` payload field.

    Returns:
        The content of the first choice's message.

    Raises:
        httpx.HTTPStatusError: If the API answers with an error status.
    """
    request_headers = {
        "Authorization": f"Bearer {ALBERT_API_KEY}",
        "Content-Type": "application/json",
    }
    body = {
        "model": LLM_MODEL,
        "messages": messages,
        "temperature": 0.3,
        "max_tokens": 1024,
    }

    with httpx.Client(timeout=60) as client:
        reply = client.post(
            f"{ALBERT_API_URL}/chat/completions",
            headers=request_headers,
            json=body,
        )
        reply.raise_for_status()

    first_choice = reply.json()["choices"][0]
    return first_choice["message"]["content"]


def search_datasets(query: str, top_k: int = 5) -> list[dict]:
    """Return the catalog entries most similar to ``query``.

    The query is embedded with ``get_embedding``, scaled to unit length and
    compared against ``embeddings_normalized`` with a dot product; rows are
    ranked by decreasing similarity.

    Args:
        query: Natural-language search text.
        top_k: Maximum number of results; values below 1 yield an empty list.

    Returns:
        A list of dicts with the keys ``doc_id``, ``title``, ``organization``,
        ``description`` (at most 400 characters, ``""`` when missing),
        ``url`` and ``similarity`` (float).
    """
    query_embedding = get_embedding(query)
    query_norm = query_embedding / np.linalg.norm(query_embedding)
    similarities = embeddings_normalized @ query_norm
    # Clamp at 0: a negative top_k would otherwise slice from the wrong end
    # and return almost the whole catalog.
    top_indices = np.argsort(similarities)[::-1][:max(top_k, 0)]

    results = []
    for idx in top_indices:
        row = df.iloc[idx]
        description = row["description"]
        results.append({
            "doc_id": row["doc_id"],
            "title": row["title"],
            "organization": row["organization"],
            # A missing description may surface as None or as a float NaN
            # depending on how the DataFrame was built. NaN is truthy, so a
            # plain truthiness test would let it reach the slice and raise
            # TypeError; only real strings are sliced.
            "description": description[:400] if isinstance(description, str) else "",
            "url": row["url"],
            "similarity": float(similarities[idx])
        })
    return results


# System message placed first in the chat sent by ask(). It is written in
# French and asks for concise, sourced, Markdown-formatted answers in French.
SYSTEM_PROMPT = """Tu es un assistant sp√©cialis√© dans les donn√©es publiques fran√ßaises (data.gouv.fr).
Tu r√©ponds aux questions en te basant sur les datasets fournis.
- Sois pr√©cis et concis
- Cite tes sources avec les liens
- R√©ponds en fran√ßais
- Utilise le format Markdown"""


def ask(question: str, top_k: int = 5) -> str:
    """Answer ``question`` with the LLM, grounded on retrieved datasets.

    Retrieves ``top_k`` datasets with ``search_datasets``, renders each one as
    a Markdown section (title, organisation, URL, description followed by
    "..."), and sends that context together with the question to
    ``chat_completion`` after the ``SYSTEM_PROMPT`` message.

    Args:
        question: The user's question.
        top_k: Number of datasets to retrieve as context.

    Returns:
        The text returned by ``chat_completion``.
    """
    hits = search_datasets(question, top_k=top_k)

    # One Markdown section per dataset, separated by blank lines.
    context = "\n\n".join(
        f"""### Dataset {rank}: {hit['title']}
- Organisation: {hit['organization']}
- URL: {hit['url']}
- Description: {hit['description']}..."""
        for rank, hit in enumerate(hits, 1)
    )

    user_prompt = f"""Contexte (datasets data.gouv.fr) :\n\n{context}\n\n---\n\nQuestion : {question}"""

    return chat_completion([
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": user_prompt},
    ])


# Status message printed once every statement of this cell has run.
print("‚úÖ Fonctions pr√™tes")

## 2. Interface interactive

In [None]:
# Widgets

# Free-text box, full width, in which the user types the question.
question_input = widgets.Textarea(
    placeholder="Posez votre question sur les donn√©es publiques fran√ßaises...",
    value="",
    layout=widgets.Layout(height="80px", width="100%"),
)

# Number of datasets to retrieve: 1 to 10, default 5.
top_k_slider = widgets.IntSlider(
    description="Nb datasets:",
    min=1,
    max=10,
    step=1,
    value=5,
    style={"description_width": "100px"},
)

# Button that launches the search.
submit_button = widgets.Button(
    button_style="primary",
    description="üîç Rechercher",
    layout=widgets.Layout(width="150px"),
)

# Output area that receives the answer, the sources and any error message.
output_area = widgets.Output()

# Example questions
examples = [
    "O√π trouver les donn√©es sur les bornes de recharge √©lectrique ?",
    "Quels datasets contiennent des informations sur la qualit√© de l'air ?",
    "Donn√©es d√©mographiques par commune en France",
    "Statistiques sur les transports en commun √† Paris",
    "Donn√©es ouvertes sur les √©coles et √©tablissements scolaires"
]


def _example_label(text):
    """Return ``text`` cut to 50 characters plus "..." when it is longer."""
    if len(text) > 50:
        return text[:50] + "..."
    return text


# One "info" button per example, in the same order as `examples`.
example_buttons = [
    widgets.Button(
        description=_example_label(ex),
        button_style="info",
        layout=widgets.Layout(width="auto"),
    )
    for ex in examples
]

def on_example_click(b):
    """Copy the example matching the clicked button into the question box.

    The button's position in ``example_buttons`` selects the entry of
    ``examples`` at the same index.
    """
    question_input.value = examples[example_buttons.index(b)]


# Attach the same handler to every example button.
for btn in example_buttons:
    btn.on_click(on_example_click)

def on_submit(b):
    """Run the search and the LLM answer, then render both in ``output_area``.

    Does nothing when the question box is empty or whitespace only. Any
    exception raised while searching or answering is caught and printed in
    the output area instead of propagating.

    Args:
        b: The clicked button (unused; ``None`` is accepted).
    """
    query = question_input.value.strip()
    if not query:
        return

    with output_area:
        clear_output(wait=True)
        print("‚è≥ Recherche en cours...")

        try:
            # Retrieval, kept so the sources can be listed under the answer.
            hits = search_datasets(query, top_k=top_k_slider.value)

            clear_output(wait=True)
            print("‚è≥ G√©n√©ration de la r√©ponse...")

            # NOTE(review): ask() runs its own search_datasets call, so the
            # embedding request and the search are executed twice.
            answer = ask(query, top_k=top_k_slider.value)

            clear_output(wait=True)

            # Answer first, then the list of sources.
            display(Markdown(f"## üí¨ R√©ponse\n\n{answer}"))
            display(Markdown("\n---\n## üìö Sources"))
            for rank, hit in enumerate(hits, 1):
                source_md = f"""**{rank}. [{hit['title']}]({hit['url']})**  
*{hit['organization']}* ‚Äî Score: {hit['similarity']:.2f}"""
                display(Markdown(source_md))

        except Exception as err:
            clear_output(wait=True)
            print(f"‚ùå Erreur : {err}")

submit_button.on_click(on_submit)

# Allow Enter to submit
def on_enter(event):
    """Submit the question when the textarea value ends with a newline.

    Intended as a trait observer on ``question_input``. When the new value
    ends with a newline, the surrounding whitespace is stripped, the cleaned
    text is written back to the widget and ``on_submit`` is called.

    Args:
        event: Change notification with at least the keys ``"name"`` and
            ``"new"``.
    """
    if event["name"] == "value" and event["new"].endswith("\n"):
        # Writing the stripped text back triggers this observer again, but
        # the new value no longer ends with a newline, so it does not recurse.
        question_input.value = event["new"].strip()
        on_submit(None)


# Register the observer; without this call on_enter is never invoked and
# pressing Enter only inserts a newline.
question_input.observe(on_enter, names="value")

# Status message printed once the widgets and their callbacks are set up.
print("‚úÖ Interface pr√™te")

In [None]:
# Show the interface: title, example buttons, question form, output area.
display(
    Markdown("# üá´üá∑ Assistant data.gouv.fr"),
    Markdown("*Posez vos questions sur les donn√©es publiques fran√ßaises*"),
)

# Example buttons on two rows: the first three, then the rest.
display(
    Markdown("### üí° Exemples de questions"),
    widgets.HBox(example_buttons[:3]),
    widgets.HBox(example_buttons[3:]),
)

# Question box with the submit button and the slider on one row below it.
display(
    Markdown("### ‚ùì Votre question"),
    question_input,
    widgets.HBox([submit_button, top_k_slider]),
)

display(Markdown("---"), output_area)

## 3. Mode conversation (optionnel)

In [None]:
# For testing directly, without the widget interface
def demo(question: str):
    """Run one question end to end and print the answer with its sources.

    Retrieves 5 datasets with ``search_datasets``, asks the LLM through
    ``ask`` and prints the answer followed by each dataset's title,
    organisation and URL.

    Args:
        question: The question to ask.
    """
    print(f"‚ùì {question}\n")
    print("‚è≥ Recherche...")

    hits = search_datasets(question, top_k=5)
    answer = ask(question, top_k=5)

    print(f"\nüí¨ R√©ponse :\n{answer}")
    print("\nüìö Sources :")
    for rank, hit in enumerate(hits, start=1):
        print(f"{rank}. {hit['title']} ({hit['organization']})")
        print(f"   {hit['url']}")

In [None]:
# Exemple d'utilisation directe
# demo("Quelles donn√©es sont disponibles sur le logement social ?")

---

## 📋 Résumé du POC

**Architecture** :
```
Question → Embedding (Albert BGE-M3) → Recherche vectorielle (Mediatech)
    → Contexte → LLM (Albert Mistral) → Réponse avec sources
```

**Composants** :
- **Mediatech** : 99k datasets pré-vectorisés
- **Albert API** : Embeddings + LLM souverain
- **MCP data.gouv** : Données fraîches (optionnel)

**Notebooks** :
1. `01_setup_test` - Configuration
2. `02_mediatech_exploration` - Exploration données
3. `03_vector_search` - Recherche sémantique
4. `04_mcp_client` - Client MCP
5. `05_orchestration` - Pipeline complet
6. `06_demo` - Interface finale