In [None]:
import langextract as lx
import textwrap
import os
from openai import OpenAI


class AlbertSchema(lx.schema.BaseSchema):
    """Schema implementation for Albert structured output."""

    def __init__(self, schema_dict: dict):
        """Initialize the schema with a dictionary."""
        self._schema_dict = schema_dict

    @property
    def schema_dict(self) -> dict:
        """Return the schema dictionary."""
        return self._schema_dict

    @classmethod
    def from_examples(cls, examples_data, attribute_suffix="_attributes"):
        """Build schema from example extractions.

        Args:
            examples_data: Sequence of ExampleData objects.
            attribute_suffix: Suffix for attribute fields.

        Returns:
            A configured AlbertSchema instance.
        """
        extraction_types = {}
        for example in examples_data:
            for extraction in example.extractions:
                class_name = extraction.extraction_class
                if class_name not in extraction_types:
                    extraction_types[class_name] = set()
                if extraction.attributes:
                    extraction_types[class_name].update(extraction.attributes.keys())

        schema_dict = {
            "type": "object",
            "properties": {
                "extractions": {
                    "type": "array",
                    "items": {"type": "object"}
                }
            },
            "required": ["extractions"]
        }

        return cls(schema_dict)

    def to_provider_config(self) -> dict:
        """Convert to provider-specific configuration.

        Returns:
            Dictionary of provider-specific configuration.
        """
        return {
            "response_schema": self._schema_dict,
            "structured_output": True
        }

    @property
    def supports_strict_mode(self) -> bool:
        """Whether this schema guarantees valid structured output.

        Returns:
            True if the provider enforces valid JSON output.
        """
        return False  # Set to True only if your provider guarantees valid JSON


@lx.providers.registry.register(r'^albert', priority=10)
class MyProviderLanguageModel(lx.inference.BaseLanguageModel):
    def __init__(self, model_id: str, api_key: str = None, **kwargs):
        super().__init__()
        self.model_id = model_id
        self.api_key = api_key or os.environ.get('ALBERT_API_KEY')
        # Initialize your client
        # The user is trying to connect to an Albert API. The URL suggests it might be compatible with the OpenAI API format.
        # If so, the `openai` python client can be used.
        self.client = OpenAI(
            base_url=kwargs.get("model_url"),
            api_key=self.api_key,
        )

    def infer(self, batch_prompts, **kwargs):
        # Implement inference
        for prompt in batch_prompts:
            response = self.client.chat.completions.create(
                model=self.model_id,
                messages=[{"role": "user", "content": prompt}],
                # **kwargs
            )
            result = response.choices[0].message.content
            yield [lx.inference.ScoredOutput(score=1.0, output=result)]

            
# 1. Define the prompt and extraction rules
prompt = textwrap.dedent("""

.""")

# 2. Provide a high-quality example to guide the model
examples = [
    lx.data.ExampleData(
        text="""                                                                                                         DEVIS N° 01828383

                                                                                                           Date : 22/01/2025

                                                                                                    Code client : DIV45679457484
                                                                                                                Tél. :  /




 2 Avenue du Général Leclerc
 76250 DEVILLE LES ROUEN

 Tél. : 0138764979                                                                     Mr et Mme BORIS ET SARAH HERVOT
                                                                                       ET DELIMI

                                                                                       217 Rue Grieu
 E-mail : contact@gilles-fermetures.fr
                                                                                       76000 ROUEN

 Votre contact : Frédérick Horville                   Certificat n° E-E182016

 Délai : 10 à 12 semaines
 Date de visite préalable : 21/01/2025

 Adresse du chantier : Mr et Mme BORIS ET SARAH HERVOT ET DELIMI - 217 Rue Grieu -  - 76000 ROUEN
 
 
                                                        MEB Fermetures

                                           EURL au capital de 10 000€ - Siret 18736703100010 - APE 4332A
                                                            TVA FR20879267011
                                        Médiateur de consommation : CM2C Représenté par Monsieur René Jalin
                                                       AREAS contrat n° : 03652857F087

""",

        extractions=[
            lx.data.Extraction(
                extraction_class="adresses",
                extraction_text="2 Avenue du General Leclerc",
            ),
            lx.data.Extraction(
                extraction_class="adresses",
                extraction_text="217 Rue Grieu, 76000 ROUEN",
            ),
            lx.data.Extraction(
                extraction_class="assurances",
                extraction_text="03652857F087",
            ),
            lx.data.Extraction(
                extraction_class="capital_social",
                extraction_text="10 000",
            ),
            lx.data.Extraction(
                extraction_class="sirets",
                extraction_text="18736703100010",
            ),
            lx.data.Extraction(
                extraction_class="telephones",
                extraction_text="0138764979",
            ),
            lx.data.Extraction(
                extraction_class="email",
                extraction_text="contact@gilles-fermetures.fr",
            ),
        ]
    )
]

# The input text to be processed
input_text = """M. Thery Abraham
                                                                    37 Avenue de Castres
JF
Puylaurens                                                                                                 En date du : 09/11/2023

France                                                                                               Valable jusqu'au : 24/11/2023
TVA N° FR03834957664                                                                             Début des travaux le : 05/02/2024
Tél : 06 48 35 22 86                                                                                 Durée estimée à : 3 semaines

Email : florian.boisneault@gmail.com



                                                  rénovation énergétique
"""

# Run the extraction
result = lx.extract(
    text_or_documents=input_text,
    prompt_description=prompt,
    examples=examples,
    model_id="albert-small",
    model_url="https://albert.api.etalab.gouv.fr/v1",
    api_key=os.getenv("ALBERT_API_KEY")
)

  class AlbertSchema(lx.schema.BaseSchema):
  return inference.__getattr__(name)
  return inference.__getattr__(name)


In [2]:
result.extractions

[Extraction(extraction_class='prenom', extraction_text='Thery', char_interval=CharInterval(start_pos=3, end_pos=8), alignment_status=<AlignmentStatus.MATCH_EXACT: 'match_exact'>, extraction_index=1, group_index=0, description=None, attributes=None),
 Extraction(extraction_class='nom', extraction_text='Abraham', char_interval=CharInterval(start_pos=9, end_pos=16), alignment_status=<AlignmentStatus.MATCH_EXACT: 'match_exact'>, extraction_index=2, group_index=0, description=None, attributes=None),
 Extraction(extraction_class='adresse', extraction_text='37 Avenue de Castres', char_interval=CharInterval(start_pos=85, end_pos=105), alignment_status=<AlignmentStatus.MATCH_EXACT: 'match_exact'>, extraction_index=3, group_index=0, description=None, attributes=None),
 Extraction(extraction_class='code_postal', extraction_text='31250', char_interval=CharInterval(start_pos=175, end_pos=180), alignment_status=<AlignmentStatus.MATCH_EXACT: 'match_exact'>, extraction_index=4, group_index=0, descript