In [43]:
import time
import os

import dspy
from dspy.evaluate import Evaluate
from dotenv import load_dotenv

load_dotenv()

lm = dspy.LM(
    model="openrouter/deepseek/deepseek-r1-0528-qwen3-8b",
    # model="openrouter/moonshotai/kimi-dev-72b:free",
    api_base="https://openrouter.ai/api/v1",
    api_key=os.getenv("OPENROUTER_API_KEY",
    cache=False),
)

dspy.configure(lm=lm)


# 1. Define the signature for the task.
class LabResultSignature(dspy.Signature):
    """Extract result information from a lab result document."""

    document_text = dspy.InputField(desc="The full text of a lab result PDF.")
    results: dict[str, str] = dspy.OutputField(
        desc="The results of the lab test. The key is the name of the test and the value is the result. "
        "The result can be something like 'Inferior a 7 nmol/L' or 'Desprezível' or just regular number with units."
    )


TypeError: getenv() got an unexpected keyword argument 'cache'

In [None]:
import unicodedata

def normalize_text(text: str) -> str:
    """Remove accents, convert to lowercase, and normalize superscript numerals."""
    # Remove accents by decomposing and filtering out combining characters
    normalized = unicodedata.normalize('NFD', text)
    without_accents = ''.join(char for char in normalized if unicodedata.category(char) != 'Mn')
    
    # Replace superscript numerals with regular numerals
    without_accents = without_accents.replace('²', '2').replace('³', '3')
    
    return without_accents.lower()

def find_matching_key(target_key: str, available_keys: list[str]) -> str | None:
    """Find a matching key using accent-insensitive substring matching."""
    normalized_target = normalize_text(target_key)
    
    for available_key in available_keys:
        normalized_available = normalize_text(available_key)
        
        # Check if one contains the other
        if normalized_target in normalized_available or normalized_available in normalized_target:
            return available_key
    
    return None

def values_match(expected_value: str, actual_value: str) -> bool:
    """Check if two values match after normalization."""
    normalized_expected = normalize_text(expected_value)
    normalized_actual = normalize_text(actual_value)
    
    # Direct match
    if normalized_expected == normalized_actual:
        return True
    
    # Check if the core measurement matches (allowing for extra text)
    if normalized_expected in normalized_actual or normalized_actual in normalized_expected:
        return True
    
    return False

def contain_result(example, pred, trace=None) -> bool:
    """Check if the prediction contains all the keys from the example's result with correct values."""
    # The LM may fail to return a dictionary.
    if not hasattr(pred, "results") or not isinstance(pred.results, dict):
        raise ValueError("The prediction does not have a 'results' field or is not a dictionary.")

    # The example must have the 'results' field.
    if not hasattr(example, "results") or not isinstance(example.results, dict):
        raise ValueError("The example does not have a 'results' field or is not a dictionary.")

    gold_results = example.results
    pred_results = pred.results
    pred_keys = list(pred_results.keys())

    # Check if all keys from the gold results are in the predicted results with the correct values.
    for key, value in gold_results.items():
        matching_key = find_matching_key(key, pred_keys)
        matching_value = values_match(value, pred_results[matching_key])
        
        if matching_key is None:
            print(f"No matching key found for: {key}")
            print(f"Available keys: {pred_keys}")
            return False
            
        if not matching_value:
            print(f"{key} (matched with {matching_key}): {pred_results[matching_key]} != {value}")
            return False

    return True


In [None]:
# Test normalization functions
print("Testing superscript normalization:")
print(f"mm³ -> {normalize_text('mm³')}")
print(f"mm² -> {normalize_text('mm²')}")
print(f"4,63 milhões/mm³ -> {normalize_text('4,63 milhões/mm³')}")

print("\nTesting value normalization:")
print(f"'68,4 ng/mL' -> '{normalize_text('68,4 ng/mL')}'")
print(f"'68,4 ng/mL Dentro do intervalo de referência' -> '{normalize_text('68,4 ng/mL Dentro do intervalo de referência')}'")

print("\nTesting value matching:")
print(f"values_match('68,4 ng/mL', '68,4 ng/mL Dentro do intervalo de referência'): {values_match('68,4 ng/mL', '68,4 ng/mL Dentro do intervalo de referência')}")
print(f"values_match('4,63 milhões/mm³', '4,63 milhões/mm3'): {values_match('4,63 milhões/mm³', '4,63 milhões/mm3')}")


Testing superscript normalization:
mm³ -> mm3
mm² -> mm2
4,63 milhões/mm³ -> 4,63 milhoes/mm3

Testing value normalization:
'68,4 ng/mL' -> '68,4 ng/ml'
'68,4 ng/mL Dentro do intervalo de referência' -> '68,4 ng/ml dentro do intervalo de referencia'

Testing value matching:
values_match('68,4 ng/mL', '68,4 ng/mL Dentro do intervalo de referência'): True
values_match('4,63 milhões/mm³', '4,63 milhões/mm3'): True


In [None]:
# OCR_USED = "ocr_marker"
OCR_USED = "ocr_mistral"

full_texts = []
for i in range(8):
    if i:
        with open(f'data/text_{OCR_USED}_{i}.md', 'r') as file:
            text = file.read()
            full_texts.append(text)

dataset = [
    dspy.Example(document_text=full_texts[0][:10_000], results={
                        "Lipoproteina A (composto)": "Inferior a 7 nmol/L",
                        }).with_inputs("document_text"),
    dspy.Example(document_text=full_texts[1][:10_000], results={
                        "Ferritina": "68,4 ng/mL",
                        }).with_inputs("document_text"),
    dspy.Example(document_text=full_texts[2][:10_000], results={
                        "UREIA": "45,0 mg/dL",
                        "CREATININA": "0,99 mg/dL",
                        }).with_inputs("document_text"),
    dspy.Example(document_text=full_texts[3][:10_000], results={
                        "HEMACIAS": "4,63 milhões/mm3",
                        "HEMOGLOBINA": "13,6 g/dL",
                        "LEUCOCITOS": "5.780 /mm3",
                        "HEMATOCRITO": "40,2 %",
                        }).with_inputs("document_text"),
    dspy.Example(document_text=full_texts[4][:10_000], results={
                        "HEMÁCIAS": "5,62 milhões/mm3",
                        "HEMOGLOBINA": "16,7 g/dL",
                        "LEUCOCITOS": "8.840 /mm3",
                        "VCM": "84,9 fl",
                        "HCM": "29,7 pg",
                        }).with_inputs("document_text"),
    dspy.Example(document_text=full_texts[5][:10_000], results={
                        "GLICOSE": "112.00 mg/dL",
                        }).with_inputs("document_text"),
    dspy.Example(document_text=full_texts[6][:10_000], results={
                        "DOSAGEM DE GLICOSE EM JEJUM": "100 mg/dL",
                        "COLESTEROL TOTAL": "257 mg/dL",
                        "TRIGLICERÍDEOS": "65 mg/dL",
                        }).with_inputs("document_text"),
]

In [None]:
# Show the first 300 characters of the file
with open('data/text_1.md', 'r') as file:
    text = file.read()
    print(text[:5000])

print("nmol" in text.lower())

![](_page_0_Picture_0.jpeg)

Atendimento ao cliente: Rio de Janeiro(21) 2538 3939

![](_page_0_Figure_2.jpeg)

DASA - Rua Xavier Pinheiro, 439 Quadra 29 - Pq. Duque de Caxias CNES:7402074 - Duque de Caxias, RJ

*Lipoproteina A (composto)*

A interpretação dos resultados deste(s) exame(s) e a conclusão diagnóstica são atos médicos, dependem de análise conjunta dos dados clínicos e demais exames do(a) paciente.

Data da geração: 10/06/2024 • 11:08 Sob a responsabilidade do Dr. Cristovam Scapulatempo Neto - CRM nº 52- 0105890-8 Laudo também disponível ao médico prescritor no [Nav PRO](http://nav.pro.br/)

Laboratório registrado no CRM/RJ sob o número 0111212-0 Licença de funcionamento 52425

Valide seu laudo com o QR Code valida.dasa.com.br Token: **bFdCGM8EC**

![](_page_0_Picture_9.jpeg)

Pág. 1 de 1

NAM - Núcleo de Assessoria Médica: 4020-2446
False


In [None]:
# Create the predictor instance
lab_result_predictor = dspy.Predict(LabResultSignature)


In [None]:
evaluator = Evaluate(devset=dataset, num_threads=7, display_progress=True, display_table=7)


In [None]:
start_time = time.time()
evaluator(lab_result_predictor, metric=contain_result)
end_time = time.time()
print(f"Time taken: {end_time - start_time} seconds")


2025/07/29 01:03:26 ERROR dspy.utils.parallelizer: Error for Example({'document_text': 'Nome: TEREZA CRISTINA APOLIANO HOMSI Médico: ISNADIA COSTA SILVA Convênio: CAFAZ Data Atend.: 21/03/2023\n\nEntrega: INTERNET CPF: 259.762.813-20 RG: 8912002003648\n\nPág.: 1 de 7 Pedido: 1643971-00 Cliente: 92002-00 Nasc.: 03/10/1966\n\n# DOSAGEM DE GLICOSE EM JEJUM:\n\n## RESULTADO: 100 mg/dL\n\nÚltimos resultados em mg/dL: 86[03/03/2023]; 96[16/11/2022]; 95[21/09/2022]; 89[27/07/2022]; 84[10/06/2022];\n\nVALORES DE REFERÊNCIA: $70-99 \\mathrm{mg} / \\mathrm{dL}$ Método: UV Enzimático (Hexoquinase) Material........: Soro ou Plasma. Data da coleta..: 22/03/2023 08:10:23 Liberado por: Dr. Luís Gonzaga Moura Xavier - CRM-CE: 4106\n\n## COLESTEROL TOTAL:\n\n## RESULTADO: $257 \\mathrm{mg} / \\mathrm{dL}$\n\nÚltimos resultados em mg/dL: 217[17/01/2023]; 196[16/11/2022]; 209[21/09/2022]; 185[27/07/2022]; 181[10/06/2022];\n\nVALORES DE REFERÊNCIA:\n\n|   | Com jejum | Sem jejum | Categoria Referêncial  |

Average Metric: 6.00 / 6 (100.0%): 100%|██████████| 7/7 [00:00<00:00, 2395.57it/s]

2025/07/29 01:03:26 INFO dspy.evaluate.evaluate: Average Metric: 6.0 / 7 (85.7%)





Unnamed: 0,document_text,example_results,pred_results,contain_result,results
0,# Lámina Atendimento ao cliente: Rio de Janeiro(21) 2538 3939 **Ma...,{'Lipoproteina A (composto)': 'Inferior a 7 nmol/L'},{'Lipoproteína A': 'Inferior a 7 nmol/L'},✔️ [True],
1,# Lavoisier LABORATÓRIO E IMAGEN Núcleo de Atendimento ao cliente:...,"{'Ferritina': '68,4 ng/mL'}","{'Ferritina': '68,4 ng/mL Dentro do intervalo de referência', 'Áci...",✔️ [True],
2,"# GlPAX Laboratório Praça Afonso Pena, 246 - Centro - São José dos...","{'UREIA': '45,0 mg/dL', 'CREATININA': '0,99 mg/dL'}","{'UREIA': '45,0 mg/dL', 'CREATININA': '0,99 mg/dL', 'GLICEMIA': '1...",✔️ [True],
3,![img-0.jpeg](img-0.jpeg) **Nome** : LUCIANA FERREIRA PINTO DA SIL...,"{'HEMACIAS': '4,63 milhões/mm3', 'HEMOGLOBINA': '13,6 g/dL', 'LEUC...","{'HEMACIAS': '4,63 milhōes/mm³', 'HEMOGLOBINA': '13,6 g/dL', 'HEMA...",✔️ [True],
4,# 40 <br> 405 <br> sabin <br> DIAGNÓSTICO E SAÚDE Nome : RAFAEL BA...,"{'HEMÁCIAS': '5,62 milhões/mm3', 'HEMOGLOBINA': '16,7 g/dL', 'LEUC...","{'HEMÁCIAS': '5,62 milhões/mm3', 'HEMOGLOBINA': '16,7 g/dL', 'HEMA...",✔️ [True],
5,# hapvida # HAPVIDA DIAGNÓSTICO Diagnóstico VIDA IMAGEM BOA VIAGEM...,{'GLICOSE': '112.00 mg/dL'},{'GLICOSE': '112.00 mg/dL'},✔️ [True],
6,Nome: TEREZA CRISTINA APOLIANO HOMSI Médico: ISNADIA COSTA SILVA C...,,,,"{'DOSAGEM DE GLICOSE EM JEJUM': '100 mg/dL', 'COLESTEROL TOTAL': '..."


Time taken: 0.02391982078552246 seconds


In [None]:
# pred = lab_result_predictor(document_text=dataset[1].document_text)
# contain_result(dataset[1], pred)

# for example in dataset:
#     pred = lab_result_predictor(document_text=example.document_text)
#     print(contain_result(example, pred))
#     print("-" * 100)


In [None]:
with open(f'data/text_{OCR_USED}_{7}.md', 'r') as file:
            text = file.read()
            full_texts.append(text)
lab_result_predictor(document_text=full_texts[0])
# OCR_USED 

Prediction(
    results={'Lipoproteína A': 'Inferior a 7 nmol/L'}
)