In [78]:
import time
import os

import dspy
from dspy.evaluate import Evaluate
from dotenv import load_dotenv

# Load variables from a local .env file (if one exists) into the process
# environment so that os.getenv() below can see them.
load_dotenv()

# Model routed through OpenRouter.
# Alternative that was tried: "openrouter/moonshotai/kimi-dev-72b:free"
OPENROUTER_MODEL = "openrouter/deepseek/deepseek-r1-0528-qwen3-8b"
OPENROUTER_API_BASE = "https://openrouter.ai/api/v1"

lm = dspy.LM(
    model=OPENROUTER_MODEL,
    api_base=OPENROUTER_API_BASE,
    # The key is read from the environment; it is never hardcoded in the notebook.
    api_key=os.getenv("OPENROUTER_API_KEY"),
)

# Make this client the LM used by the DSPy modules created below.
dspy.configure(lm=lm)


# 1. Define the signature for the task.
# Task contract for the extraction: one text input, one dict output.
# NOTE(review): DSPy is documented to use a Signature's docstring as the task
# instructions and each field's `desc` as prompt text, so the strings below are
# runtime behavior, not just documentation -- edit them deliberately.
class LabResultSignature(dspy.Signature):
    """Extract result information from a lab result document."""

    # Input: the document text. The dataset cell below passes at most the
    # first 10,000 characters of each file.
    document_text = dspy.InputField(desc="The full text of a lab result PDF.")
    # Output: mapping of test name -> result string (value and unit, or a
    # textual result), as spelled out in `desc`.
    results: dict[str, str] = dspy.OutputField(
        desc="The results of the lab test. The key is the name of the test and the value is the result. "
        "The result can be something like 'Inferior a 7 nmol/L' or 'Desprezível' or just regular number with units."
    )


In [79]:
import unicodedata

def normalize_key(key: str) -> str:
    """Return ``key`` lowercased and stripped of diacritics.

    The string is decomposed to NFD so that an accented letter becomes a base
    letter followed by combining marks; characters in Unicode category ``Mn``
    (non-spacing marks) are then dropped. Used to compare test names such as
    "HEMÁCIAS" and "HEMACIAS" as equal.

    Args:
        key: The raw key (test name) to normalize.

    Returns:
        The accent-free, lowercase form of ``key``.
    """
    decomposed = unicodedata.normalize("NFD", key)
    base_chars = [ch for ch in decomposed if unicodedata.category(ch) != "Mn"]
    return "".join(base_chars).lower()

def find_matching_key(target_key: str, available_keys: list[str]) -> str | None:
    """Find a matching key using accent-insensitive substring matching."""
    normalized_target = normalize_key(target_key)
    
    for available_key in available_keys:
        normalized_available = normalize_key(available_key)
        
        # Check if one contains the other
        if normalized_target in normalized_available or normalized_available in normalized_target:
            return available_key
    
    return None

def contain_result(example, pred, trace=None) -> bool:
    """Metric: does the prediction cover every gold result with the same value?

    Each key in ``example.results`` is looked up in ``pred.results`` through
    ``find_matching_key``; the value stored under the matched key must be
    equal to the gold value. Extra predicted keys are ignored. The first
    missing key or differing value is printed and ends the check.

    Args:
        example: Object whose ``results`` attribute is the gold dict.
        pred: Object whose ``results`` attribute is the predicted dict.
        trace: Unused; accepted so the function fits the metric call signature.

    Returns:
        ``True`` when every gold entry is matched with an equal value,
        ``False`` otherwise.

    Raises:
        ValueError: If ``pred`` or ``example`` has no ``results`` attribute or
            it is not a dict (the prediction is checked first).
    """
    # The LM may fail to return a dictionary.
    predicted = getattr(pred, "results", None)
    if not isinstance(predicted, dict):
        raise ValueError("The prediction does not have a 'results' field or is not a dictionary.")

    # The example must carry the gold dictionary.
    expected = getattr(example, "results", None)
    if not isinstance(expected, dict):
        raise ValueError("The example does not have a 'results' field or is not a dictionary.")

    candidate_keys = list(predicted)

    for gold_key, gold_value in expected.items():
        matched = find_matching_key(gold_key, candidate_keys)

        if matched is None:
            print(f"No matching key found for: {gold_key}")
            print(f"Available keys: {candidate_keys}")
            return False

        predicted_value = predicted[matched]
        if predicted_value != gold_value:
            print(f"{gold_key} (matched with {matched}): {predicted_value} != {gold_value}")
            return False

    return True


In [80]:
# Read the seven extracted documents, data/text_1.md .. data/text_7.md, in
# order, so that full_texts[k] holds the content of text_{k+1}.md.
# The encoding is pinned to UTF-8 because the files contain accented
# Portuguese text; without it open() falls back to the platform default and
# can mis-decode (or fail on) those characters.
full_texts = []
for i in range(1, 8):
    with open(f"data/text_{i}.md", "r", encoding="utf-8") as file:
        full_texts.append(file.read())

# Gold labels, one dict per document, in the same order as full_texts
# (entry k belongs to data/text_{k+1}.md). Keys are test names, values are
# the expected result strings exactly as they should be extracted.
expected_results_per_document = [
    {
        "Lipoproteina A (composto)": "Inferior a 7 nmol/L",
    },
    {
        "Ferritina": "68,4 ng/mL",
    },
    {
        "UREIA": "45,0 mg/dL",
        "CREATININA": "0,99 mg/dL",
    },
    {
        "HEMACIAS": "4,63 milhões/mm3",
        "HEMOGLOBINA": "13,6 g/dL",
        "LEUCOCITOS": "5.780 /mm3",
        "HEMATOCRITO": "40,2 %",
    },
    {
        "HEMÁCIAS": "5,62 milhões/mm3",
        "HEMOGLOBINA": "16,7 g/dL",
        "LEUCOCITOS": "8.840 /mm3",
        "VCM": "84,9 fl",
        "HCM": "29,7 pg",
    },
    {
        "GLICOSE": "112.00 mg/dL",
    },
    {
        "DOSAGEM DE GLICOSE EM JEJUM": "100 mg/dL",
        "COLESTEROL TOTAL": "257 mg/dL",
        "TRIGLICERÍDEOS": "65 mg/dL",
    },
]

# One example per document: the input is the first 10,000 characters of the
# text, and only `document_text` is marked as an input field.
dataset = [
    dspy.Example(
        document_text=full_texts[doc_index][:10_000],
        results=expected_results,
    ).with_inputs("document_text")
    for doc_index, expected_results in enumerate(expected_results_per_document)
]

In [81]:
# Show the first 5,000 characters of the first document.
# UTF-8 is specified explicitly because the text contains accented characters.
with open("data/text_1.md", "r", encoding="utf-8") as file:
    text = file.read()
    print(text[:5000])

# The gold label for this document is "Inferior a 7 nmol/L"; check whether the
# unit appears anywhere in the extracted text at all.
print("nmol" in text.lower())

![](_page_0_Picture_0.jpeg)

Atendimento ao cliente: Rio de Janeiro(21) 2538 3939

![](_page_0_Figure_2.jpeg)

DASA - Rua Xavier Pinheiro, 439 Quadra 29 - Pq. Duque de Caxias CNES:7402074 - Duque de Caxias, RJ

*Lipoproteina A (composto)*

A interpretação dos resultados deste(s) exame(s) e a conclusão diagnóstica são atos médicos, dependem de análise conjunta dos dados clínicos e demais exames do(a) paciente.

Data da geração: 10/06/2024 • 11:08 Sob a responsabilidade do Dr. Cristovam Scapulatempo Neto - CRM nº 52- 0105890-8 Laudo também disponível ao médico prescritor no [Nav PRO](http://nav.pro.br/)

Laboratório registrado no CRM/RJ sob o número 0111212-0 Licença de funcionamento 52425

Valide seu laudo com o QR Code valida.dasa.com.br Token: **bFdCGM8EC**

![](_page_0_Picture_9.jpeg)

Pág. 1 de 1

NAM - Núcleo de Assessoria Médica: 4020-2446
False


In [82]:
# Create the predictor instance: a dspy.Predict module built from LabResultSignature.
# The cells below call it as lab_result_predictor(document_text=...) and the
# metric reads the `results` attribute of what it returns.
lab_result_predictor = dspy.Predict(LabResultSignature)


In [83]:
# Evaluation harness over the full dataset. The thread count and the number
# of table rows are both 7, which is the number of examples in `dataset`.
evaluator = Evaluate(
    devset=dataset,
    num_threads=7,
    display_progress=True,
    display_table=7,
)


In [84]:
# Time the evaluation with perf_counter(): it is monotonic and meant for
# measuring intervals, whereas time.time() follows the wall clock and can jump.
# NOTE(review): the recorded run scored 7 examples in ~0.03 s, which suggests
# the responses were served from a cache rather than live LM calls -- confirm
# before reading this number as model latency.
start_time = time.perf_counter()
evaluator(lab_result_predictor, metric=contain_result)
end_time = time.perf_counter()
print(f"Time taken: {end_time - start_time} seconds")


No matching key found for: Lipoproteina A (composto)
Available keys: []
Average Metric: 6.00 / 7 (85.7%): 100%|██████████| 7/7 [00:00<00:00, 3642.24it/s] 

2025/07/27 01:42:26 INFO dspy.evaluate.evaluate: Average Metric: 6 / 7 (85.7%)





Unnamed: 0,document_text,example_results,pred_results,contain_result
0,![](_page_0_Picture_0.jpeg) Atendimento ao cliente: Rio de Janeiro...,{'Lipoproteina A (composto)': 'Inferior a 7 nmol/L'},{},
1,![](_page_0_Picture_0.jpeg) | Marcia Regina de Padua | | CPF:<br>2...,"{'Ferritina': '68,4 ng/mL'}","{'Ferritina': '68,4 ng/mL', 'Ácido Fólico': '12,48 ng/mL'}",✔️ [True]
2,![](_page_0_Picture_0.jpeg) | NOME:<br>D.N.:<br>SOLICITANTE:<br>CO...,"{'UREIA': '45,0 mg/dL', 'CREATININA': '0,99 mg/dL'}","{'UREIA': '45,0 mg/dL', 'CREATININA': '0,99 mg/dL', 'GLICEMIA': '1...",✔️ [True]
3,![](_page_0_Picture_0.jpeg) ![](_page_0_Picture_1.jpeg) **Nome :**...,"{'HEMACIAS': '4,63 milhões/mm3', 'HEMOGLOBINA': '13,6 g/dL', 'LEUC...","{'HEMOGRAMA COMPLETO - HEMACIAS': '4,63 milhões/mm3', 'HEMOGRAMA C...",✔️ [True]
4,![](_page_0_Picture_0.jpeg) **Nome :** RAFAEL BATISTA COSTA **RG :...,"{'HEMÁCIAS': '5,62 milhões/mm3', 'HEMOGLOBINA': '16,7 g/dL', 'LEUC...","{'HEMÁCIAS': '5,62 milhões/mm3', 'HEMOGLOBINA': '16,7 g/dL', 'HEMA...",✔️ [True]
5,![](_page_0_Picture_0.jpeg) VIDA IMAGEM BOA VIAGEM Pag: 1 de **Val...,{'GLICOSE': '112.00 mg/dL'},"{'GLICOSE': '112.00 mg/dL', 'Hemoglobina Glicada': '7.07 %', 'Glic...",✔️ [True]
6,![](_page_0_Picture_0.jpeg) ![](_page_0_Picture_1.jpeg) ![](_page_...,"{'DOSAGEM DE GLICOSE EM JEJUM': '100 mg/dL', 'COLESTEROL TOTAL': '...","{'DOSAGEM DE GLICOSE EM JEJUM': '100 mg/dL', 'COLESTEROL TOTAL': '...",✔️ [True]


Time taken: 0.031038999557495117 seconds


In [85]:
# Run the predictor and the metric one example at a time, outside the
# Evaluate harness, printing the metric outcome for each example.
# To debug a single example instead, call the predictor on e.g.
# dataset[1].document_text and pass the result to contain_result.
separator = "-" * 100

for example in dataset:
    pred = lab_result_predictor(document_text=example.document_text)
    is_match = contain_result(example, pred)
    print(is_match)
    print(separator)


No matching key found for: Lipoproteina A (composto)
Available keys: []
False
----------------------------------------------------------------------------------------------------
True
----------------------------------------------------------------------------------------------------
True
----------------------------------------------------------------------------------------------------
True
----------------------------------------------------------------------------------------------------
True
----------------------------------------------------------------------------------------------------
True
----------------------------------------------------------------------------------------------------
True
----------------------------------------------------------------------------------------------------
