In [None]:
import os
import json

from dotenv import load_dotenv
from docling.document_converter import DocumentConverter

from schemas.json_schema import JSON_SCHEMA

In [None]:
# Load variables from a .env file (if any) into the process environment
# before reading the API key from it.
load_dotenv()

# Completions endpoint that every request in this notebook targets.
ENDPOINT = "https://albert.api.etalab.gouv.fr/v1/agents/completions"
# None when ALBERT_API_KEY is not set in the environment.
API_KEY = os.environ.get("ALBERT_API_KEY")

In [None]:
# Documents to analyze: one PDF referenced by URL and one PDF referenced by a
# path relative to the notebook's working directory.
document_path = [
    "https://aides-redevances.eau-loire-bretagne.fr/files/live/sites/aides-redevances/files/Aides-12prog/Fiches-actions/AGRI_1.pdf",
    "./2017-561_final.pdf",
]

In [None]:
def get_document_context(file_path, converter=None):
    """Convert a document with Docling and return its content as plain text.

    Args:
        file_path: Location of the document, passed unchanged to
            ``converter.convert``.
        converter: Optional converter object to reuse. It only needs a
            ``convert(source)`` method whose result exposes ``.document``.
            When omitted, a fresh ``DocumentConverter`` is created for this
            call, which is the original behavior. Sharing one instance across
            a batch avoids constructing a converter per document.

    Returns:
        Whatever ``export_to_text()`` returns for the converted document.
    """
    if converter is None:
        converter = DocumentConverter()
    return converter.convert(file_path).document.export_to_text()

In [None]:
import requests

def get_header():
    """Build the HTTP headers sent with a request to the API.

    Returns:
        dict: JSON ``accept`` and ``Content-Type`` headers plus a bearer
        ``Authorization`` header built from the module-level ``API_KEY``.
    """
    # API_KEY must hold the real key for the request to be authorized.
    bearer_token = f"Bearer {API_KEY}"
    return {
        "accept": "application/json",
        "Authorization": bearer_token,
        "Content-Type": "application/json",
    }

def get_body_text(model_name, user_message, **kwargs):
    """Build a chat-completion request body that asks for a plain-text reply.

    Args:
        model_name: Value stored under the ``"model"`` key.
        user_message: Content of the single ``"user"`` message.
        **kwargs: Extra top-level fields. They are applied last, so a key
            that matches a default one replaces it.

    Returns:
        dict: The request payload.
    """
    user_turn = {"role": "user", "content": user_message}
    payload = {
        "model": model_name,
        "messages": [user_turn],
        "response_format": {"type": "text"},
    }
    # Applied after the defaults so caller-supplied options take precedence.
    payload.update(kwargs)
    return payload

def get_body(model_name, user_message, json_schema, **kwargs):
    """Build a chat-completion request body that asks for a JSON-schema reply.

    Args:
        model_name: Value stored under the ``"model"`` key.
        user_message: Content of the single ``"user"`` message.
        json_schema: Object stored as ``response_format["json_schema"]``.
        **kwargs: Extra top-level fields. They are applied last, so a key
            that matches a default one replaces it.

    Returns:
        dict: The request payload.
    """
    user_turn = {"role": "user", "content": user_message}
    structured_format = {"type": "json_schema", "json_schema": json_schema}
    payload = {
        "model": model_name,
        "messages": [user_turn],
        "response_format": structured_format,
    }
    # Applied after the defaults so caller-supplied options take precedence.
    payload.update(kwargs)
    return payload

def albert_structured_output(model_name, user_message, json_schema, *, request_timeout=120, **kwargs):
    """Request a JSON-schema-constrained completion from the API endpoint.

    Args:
        model_name: Model identifier placed in the request body.
        user_message: Content of the single user message.
        json_schema: Schema placed in the body's ``response_format``.
        request_timeout: Value passed to ``requests.post`` as ``timeout``
            (seconds). Keyword-only, and not forwarded in the request body.
            Without a timeout the call could block indefinitely on an
            unresponsive server.
        **kwargs: Extra fields forwarded to ``get_body`` and merged into the
            request body (for example ``temperature``).

    Returns:
        The decoded JSON response when the status code is 200; otherwise
        ``None``, after printing the status code and response text.

    Raises:
        requests.exceptions.RequestException: Propagated from
            ``requests.post``, including timeouts.
    """
    body = get_body(model_name, user_message, json_schema, **kwargs)
    response = requests.post(
        ENDPOINT,
        headers=get_header(),
        json=body,
        timeout=request_timeout,
    )

    # Check the status: report failures and signal them with an explicit None.
    if response.status_code != 200:
        print(f"Erreur {response.status_code}: {response.text}")
        return None
    return response.json()

In [None]:
# Instruction text (in French) telling the model to answer only with JSON that
# follows the requested schema, to respect the declared value types, and to
# use null for missing data. The string starts and ends with a newline.
INSTRUCTION_PROMPT = """
Tu es un modèle de langage spécialisé dans l’analyse et la synthèse d’informations.
Ta mission est de produire un résultat strictement structuré en JSON, conforme au schéma spécifié ci-dessous.
- Tu dois suivre fidèlement la structure : aucune information hors du format demandé.
- Ne mets aucun commentaire, texte explicatif ou balise en dehors du JSON.
- Les valeurs doivent respecter le type indiqué (string, integer, boolean, etc.).
- Si une donnée est manquante, utilise null plutôt qu’un texte libre.
"""

In [None]:
# Convert each document to text, send it to the model with the instruction
# prompt, and keep the raw replies in the same order as `document_path`.
responses = []

for idx, path in enumerate(document_path):
    print(f"Analyzing document {idx}")

    # The instruction prompt is prepended to the extracted document text.
    document_context = get_document_context(path)
    final_prompt = INSTRUCTION_PROMPT + document_context

    full_response = albert_structured_output(
        "albert-large",
        final_prompt,
        JSON_SCHEMA,
        temperature=1,
        max_completion_tokens=300,
    )
    responses.append(full_response)

In [None]:
# Show the message content of the first choice returned for document 0.
print(responses[0]["choices"][0]["message"]["content"])

In [None]:
# Show the message content of the first choice returned for document 1.
print(responses[1]["choices"][0]["message"]["content"])

In [None]:
# Report the prompt token count recorded in each response's usage section.
for idx, res in enumerate(responses):
    # Single quotes inside the replacement field: reusing the f-string's own
    # double quotes is a SyntaxError on Python versions before 3.12.
    print(f"Token number for response {idx}: {res['usage']['prompt_tokens']}")