In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
from IPython.display import display_markdown, Markdown
import logging
from tqdm import tqdm

from archaeo_super_prompt.dataset import MagohDataset
from archaeo_super_prompt.dataset.thesaurus import load_comune
from archaeo_super_prompt.modeling.pdf_to_text import VLLM_Preprocessing
import archaeo_super_prompt.modeling.entity_extractor.model as ner_module
from archaeo_super_prompt.modeling.entity_extractor import NamedEntityField
from archaeo_super_prompt.visualization.entities import visualize_entities

# Chunk pre-selection with Named-Entity matching

For some fields with a known thesaurus set, the values occur directly in the document. To infer the class of those fields, we therefore use a Named-Entity-Extraction (NER) model to determine whether their thesaurus values are present and in which text chunks.

At the end of this pre-selection, the following information can be provided to the data extraction model:
- the $k$ most relevant chunks, i.e. those containing some thesaurus values of the field
- the thesaurus values already identified

The LLM data extraction model can then simply read these few chunks and output the most likely thesaurus value among the provided ones through a simple ChainOfThought.

## Sample ingestion

In [None]:
# Values passed positionally to MagohDataset below.
# NOTE(review): SEED is a float (0.5) — confirm this is what MagohDataset expects.
MAX_SAMPLES_FETCHED = 300
SEED = 0.5

# NOTE(review): the meaning of the third positional argument (True) is not
# visible from this notebook — confirm against MagohDataset.
dataset = MagohDataset(MAX_SAMPLES_FETCHED, SEED, True)
# Restrict this run to two hand-picked ids; the set form is what gets passed on.
_selected_ids = [35983, 31298]
selected_ids = set(_selected_ids)
# Files for the selected ids; read by ingest() as the converter input.
inputs = dataset.get_files_for_batch(selected_ids)


def ingest():
    """Run the vision-LLM preprocessing over the module-level ``inputs``.

    Sets up INFO-level logging, builds a ``VLLM_Preprocessing`` converter and
    applies its ``transform`` to ``inputs``.

    Returns:
        The value returned by ``VLLM_Preprocessing.transform(inputs)``. Later
        cells iterate over it with ``itertuples()`` and read ``chunk_content``
        from each row.
    """
    logging.basicConfig(level=logging.INFO)

    # 180; presumably seconds allowed per page — confirm against
    # VLLM_Preprocessing's ``allowed_timeout`` parameter.
    per_page_timeout = 3 * 60

    # Example using the Granite Vision model with Ollama:
    converter_settings = {
        "model": "granite3.2-vision:latest",
        "prompt": "OCR this part of Italian document for markdown-based processing.",
        "embedding_model_hf_id": "nomic-ai/nomic-embed-text-v1.5",
        "max_chunk_size": 512,
        "allowed_timeout": per_page_timeout,
    }
    return VLLM_Preprocessing(**converter_settings).transform(inputs)


# Run the ingestion once; the cells below read chunk texts from this result.
ready_to_be_processed = ingest()

In [None]:
# Display the ingestion result as the cell output.
ready_to_be_processed

## Named-Entity Extraction

In [None]:
# ner_module helpers called in this notebook: fetch_entities,
# postrocess_entities (spelled this way at its call site), filter_entities,
# extract_wanted_entities.

# Feed the text of every ingested chunk to the NER model in one batch.
batch_entities = ner_module.fetch_entities(
    [row.chunk_content for row in ready_to_be_processed.itertuples()]
)

In [None]:
# Display the raw NER output as the cell output.
batch_entities

In [None]:
# Post-process the raw NER output; 0.70 is passed as the second argument
# (presumably a confidence threshold — confirm against ner_module).
# NOTE(review): the function is spelled `postrocess_entities` here, while a
# comment in an earlier cell spells it `postprocess_entities` — verify which
# name ner_module actually defines.
ppss = ner_module.postrocess_entities(batch_entities, 0.70)

In [None]:
# Display the post-processed entities as the cell output.
ppss

In [None]:
# One rendering per chunk: pair each chunk's text with the post-processed
# entities found at the same position in `ppss`.
visualized = [
    visualize_entities(row.chunk_content, entities)
    for row, entities in zip(ready_to_be_processed.itertuples(), ppss)
]

In [None]:
def visualize(visualized):
    """Display each item as Markdown, printing a dashed line after each one.

    Args:
        visualized: iterable of values accepted by ``Markdown(...)``; in this
            notebook these are the outputs of ``visualize_entities``.
    """
    separator = "--" * 5
    for rendered_chunk in visualized:
        display_markdown(Markdown(rendered_chunk))
        print(separator)


# Render every chunk with all post-processed entities.
visualize(visualized)

## Filter extracted entities

In [None]:
# Keep only the entities labelled LUOGO, INDIRIZZO or CODICE_POSTALE
# (Italian for place, address and postal code).
filtered_ppss = list(
    ner_module.filter_entities(
        ppss,
        ["LUOGO", "INDIRIZZO", "CODICE_POSTALE"],
    )
)

# Same per-chunk rendering as before, restricted to the filtered entities.
f_visualized = [
    visualize_entities(row.chunk_content, entities)
    for row, entities in zip(
        ready_to_be_processed.itertuples(), filtered_ppss
    )
]

visualize(f_visualized)

## Filter chunks according to the entities that occur in them

In [None]:
# Describe the "comune" field: its name, the set of entity labels passed as
# second argument, and the thesaurus loader.
# NOTE(review): `load_comune` is passed uncalled; presumably NamedEntityField
# invokes it to produce `thesaurus_values` — confirm against its definition.
comune_field_to_be_figured_out = NamedEntityField(
    "comune",
    {"INDIRIZZO", "CODICE_POSTALE", "LUOGO"},
    load_comune
)

In [None]:
# Match the field-compatible entities of each chunk against the field's
# thesaurus values. 0.95 is passed as the last argument (presumably a
# matching threshold — confirm against ner_module.extract_wanted_entities).
with_comune = ner_module.extract_wanted_entities(
    [row.chunk_content for row in ready_to_be_processed.itertuples()],
    ner_module.filter_entities(
        ppss, comune_field_to_be_figured_out.compatible_entities
    ),
    comune_field_to_be_figured_out.thesaurus_values,
    0.95,
)

In [None]:
# Print each item yielded by the extraction, one per line.
for entities_matches in with_comune:
    print(entities_matches)