## **Detección condiciones médicas con LLMs**

Qwen 2.5 3B

In [2]:
#%pip install langchain pymupdf openai openpyxl --quiet

In [1]:
from typing import List
from pydantic import BaseModel, Field, ValidationError
from langchain.output_parsers import PydanticOutputParser
from langchain.prompts import PromptTemplate
from langchain_core.exceptions import OutputParserException
import requests
import json
import re
from openai import OpenAI
import openai
import httpx
import pandas as pd
import numpy as np
import os
import openpyxl

anotation_file = "C:\\Users\\cpalo\\OneDrive - UPV\\ARA\\2425_ARA\\dataset\\Anotadas.xlsx"

## Diabetes

In [7]:
from datetime import datetime
from typing import Dict, Optional
from pydantic import BaseModel, Field, RootModel 
from langchain_core.exceptions import OutputParserException

class DMDiagnosis(BaseModel):
    DM: int
    Fecha_Dx_DM: Optional[datetime] = Field(None, description="Date of DM diagnosis")

# Root model whose single value is Dict[str, Diagnosis]
class ArgumentResponse(RootModel[Dict[str, DMDiagnosis]]):
    pass

pydantic_parser = PydanticOutputParser(pydantic_object=ArgumentResponse)

def extract_json_block(text: str) -> str:
    """Extract a JSON object from LLM output (robust to markdown and extra text)."""
    match = re.search(r"\{.*\}", text, re.DOTALL)
    if not match:
        raise ValueError(f"Could not find a JSON object in output:\n{text}")
    return match.group(0)

example_json = {
    "38945530.pdf": {
        "DM": 0,
        "Fecha_Dx_DM": "2019-05-21"
    }
}


def extract_arguments_json(text: str, condition: str) -> ArgumentResponse:
    format_instructions = pydantic_parser.get_format_instructions()
    prompt = PromptTemplate(
        template=(
            "You are an medical diagnosis assistant reading patient medical histories in pdfs.\n\n"
            "I want to extract if the patient has Diabetes (DM = 1 if not DM=0) and its date of diagnosis (YYYY-MM-DD) if present.\n\n"
            "Extract pdf name, diagnosis and date ONLY in regard to this specific medical condition: \"{condition}\" directly from the text below.\n\n"
            "Text:\n\"\"\"\n{text}\n\"\"\""
            "Return **exactly one JSON object** whose keys are the PDF file names and whose "
            "values follow this example:\n"
            "   json.dumps(example_json, indent=2)\n\n"
            "{format_instructions}"
            "Do not include markdown (```), explanations, or JSON schemas. Just output the raw JSON object without fields like properties or required."
        ),
        input_variables=["text", "condition"],
        partial_variables={"format_instructions": format_instructions},
    )

    final_prompt = prompt.format_prompt(text=text, condition=condition).to_string()

    payload = {
        "model": "qwen2.5:3b",
        "prompt": final_prompt,
        "temperature": 0,
        "stream": False
    }

    response = requests.post("http://localhost:11434/api/generate", json=payload)
    if response.status_code != 200:
        raise Exception(f"Ollama error: {response.text}")

    raw_output = response.json()["response"]

    print("Model Output:", raw_output)
        # ---- try to parse; on error just return None
    try:
        return pydantic_parser.parse(raw_output)
    
    except OutputParserException as err:
        print("⚠️  Parse failed:", err)
        return None

import json
from datetime import datetime

def write_ndjson(pdf_name: str, diag_dict: dict, path: str = "diagnoses_diabetes.txt"):
    # Convert any datetime objects to ISO strings so json.dumps can handle them
    for k, v in diag_dict.items():
        if isinstance(v, datetime):
            diag_dict[k] = v.date().isoformat()   # "YYYY-MM-DD"

    record = {pdf_name: diag_dict}               # wrap with the filename key
    line = json.dumps(record, ensure_ascii=False)

    # Append to the file, adding a newline
    with open(path, "a", encoding="utf-8") as f:
        f.write(line + "\n")

In [7]:
len(df[df['Patologia']=='Diabetes'])

4642

In [8]:
df['Tipo_Ocurrencia'].unique()

array(['E10', 'Diabetes', 'Diabetes Mellitus', 'N19',
       'Insuficiencia Renal', 'E13', 'Insuficiencia Renal Cronica',
       'Hipertension', 'Hipertension esencial', 'Hipertension arterial',
       'DM', 'I10X', 'ERC', 'HTA', 'Enfermedad Renal Cronica',
       'Hipertensión', 'Hipertensión arterial', 'E11',
       'Insuficiencia Renal Crónica'], dtype=object)

In [9]:
df[df['Patologia']=='Diabetes']['Tipo_Ocurrencia'].unique()

array(['E10', 'Diabetes', 'Diabetes Mellitus', 'E13', 'DM', 'E11'],
      dtype=object)

In [10]:
from collections import defaultdict
import difflib                           

pdf_to_snippets: dict[str, list[str]] = defaultdict(list)

for key in x_test.keys():               
    pdf_name, _ = key.rsplit("_", 1)
    if x_test[key]['Patologia'] == 'Diabetes':   
        pdf_to_snippets[pdf_name].append(x_test[key]["Contexto_recuperado"])

def deduplicate(snippets: list[str], similarity=0.85) -> list[str]:
    kept: list[str] = []
    for s in snippets:
        duplicate = False
        for t in kept:
            if s in t or t in s:                  # exact / containment
                duplicate = True
                break
            if difflib.SequenceMatcher(None, s, t).quick_ratio() >= similarity:
                duplicate = True                  # high overlap
                break
        if not duplicate:
            kept.append(s)
    return kept

pdf_to_text: dict[str, str] = {
    pdf: "\n".join(deduplicate(snips))
    for pdf, snips in pdf_to_snippets.items()
}

print("Number of PDFs with diabetes snippets:", len(pdf_to_text))
pdf_to_text

Number of PDFs with diabetes snippets: 85


{'CC38990979.pdf': '2019 00 50 11 01 2019 14 00 Urgencias Hospitalizacion No R104 OTROS DOLORES ABDOMINALES Y LOS NO ESPECIFICADOS 20 03 2019 13 21 20 03 2019 13 36 Ambulatoria E109 DIABETES MELLITUS INSULINODEPENDIE NTE SIN MENCION DE COMPLICACION 21 06 2019 13 15 21 06 2019 13 48 Ambulatoria ENFERMEDAD GENERAL E109 DIABETES MELLITUS INSULINODEPENDIE NTE SIN MENCION DE COMPLICACION\n15 03 2021 09 32 Ambulatoria ENFERMEDAD GENERAL E108 DIABETES MELLITUS INSULINODEPENDIE NTE CON COMPLICACIONES NO ESPECIFICADAS 07 04 2021 16 02 07 04 2021 16 22 Ambulatoria ENFERMEDAD GENERAL E136 DIABETES MELLITUS ESPECIFICADA CON OTRAS COMPLICACIONES ESPECIFICADAS Fecha de ingreso Fecha de egreso Tipo de atención Causa externa Diagnóstico principal Si No 07 07 2021 15 32 07 07 2021\n2021 15 32 07 07 2021 15 47 Ambulatoria ENFERMEDAD GENERAL E136 DIABETES MELLITUS ESPECIFICADA CON OTRAS COMPLICACIONES ESPECIFICADAS 15 07 2021 09 54 15 07 2021 09 55 Ambulatoria E136 DIABETES MELLITUS ESPECIFICADA CON OTRA

In [11]:
for i, (pdf_name, merged_text) in enumerate(pdf_to_text.items(), start=1):
    print(len(merged_text.split()), "tokens in", pdf_name)

1343 tokens in CC38990979.pdf
915 tokens in CC21376431.pdf
732 tokens in CC2392028.pdf
974 tokens in CC66811116.pdf
183 tokens in CC4640234.pdf
366 tokens in CC94281430.pdf
958 tokens in CC31260873.pdf
1037 tokens in CC38440317.pdf
732 tokens in CC29802554.pdf
425 tokens in CC29325288.pdf
61 tokens in CC31252756.pdf
61 tokens in CC41306363.pdf
976 tokens in CC2550458.pdf
61 tokens in CC31219907.pdf
182 tokens in CC38960494.pdf
61 tokens in CC6494067.pdf
610 tokens in CC4589307.pdf
180 tokens in CC29298149.pdf
181 tokens in CC14986444.pdf
671 tokens in CC24268447.pdf
181 tokens in CC26390865.pdf
121 tokens in CC14994724.pdf
240 tokens in CC6256890.pdf
242 tokens in CC31239230.pdf
362 tokens in CC25264743.pdf
182 tokens in CC6086951.pdf
661 tokens in CC31889024.pdf
60 tokens in CC31236581.pdf
120 tokens in CC66975500.pdf
60 tokens in CC29804260.pdf
240 tokens in CC27322738.pdf
60 tokens in CC66846793.pdf
976 tokens in CC31988364.pdf
610 tokens in CC41377296.pdf
121 tokens in CC6085419.pd

In [12]:
condition = "Diabetes Mellitus (DM) or Diabetes or E10 or E13 or E11"
out_path  = "diagnoses_diabetes_qwen2-5_3b_wholetext.txt"

for i, (pdf_name, merged_text) in enumerate(pdf_to_text.items(), start=1):
    if i % 10 == 0:
        print(f"Registro: {i} de {len(pdf_to_text)}")

    prompt_text = f"pdf name: {pdf_name}\n{merged_text}"
    result = extract_arguments_json(prompt_text, condition)

    if result is None or len(result.root)==0:  
        continue

    diag = next(iter(result.root.values()))         
    write_ndjson(
        pdf_name,                                  
        {"DM": diag.DM, "Fecha_Dx_DM": diag.Fecha_Dx_DM},
        path=out_path,
    )

Model Output: {
  "CC38990979.pdf": {
    "DM": 1,
    "Fecha_Dx_DM": null
  }
}
Model Output: {
    "CC21376431.pdf": {
        "DM": 0,
        "Fecha_Dx_DM": null
    }
}
Model Output: {"CC2392028.pdf": {"DM": 1, "Fecha_Dx_DM": null}}
Model Output: {"CC66811116.pdf": {"DM": 0, "Fecha_Dx_DM": null}}
Model Output: {"CC4640234.pdf": {"DM": 1, "Fecha_Dx_DM": null}}
Model Output: {"CC94281430.pdf": {"DM": 0, "Fecha_Dx_DM": null}}
Model Output: {"CC31260873.pdf": {"DM": 0, "Fecha_Dx_DM": null}}
Model Output: {"CC38440317.pdf": {"DM": 1, "Fecha_Dx_DM": "2019-03-28"}}
Model Output: {"CC29802554.pdf": {"DM": 1, "Fecha_Dx_DM": null}, "COXARTROSIS_NO_ESPECIFICADA.pdf": {"DM": 0, "Fecha_Dx_DM": null}}
Registro: 10 de 85
Model Output: {"CC29325288.pdf": {"DM": 0, "Fecha_Dx_DM": null}}
Model Output: {"CC31252756.pdf": {"DM": 0, "Fecha_Dx_DM": null}}
Model Output: {"CC41306363.pdf": {"DM": 0, "Fecha_Dx_DM": null}}
Model Output: {"CC2550458.pdf": {"DM": 1, "Fecha_Dx_DM": null}}
Model Output: {"CC31

### Enfermedad Renal Crónica

In [9]:
from datetime import datetime
from typing import Dict, Optional
from pydantic import BaseModel, Field, RootModel 

class ERCDiagnosis(BaseModel):
    ERC: int
    Fecha_Dx_ERC: Optional[datetime] = Field(None, description="Date of ERC diagnosis")

# Root model whose single value is Dict[str, Diagnosis]
class ArgumentResponse(RootModel[Dict[str, ERCDiagnosis]]):
    pass

pydantic_parser = PydanticOutputParser(pydantic_object=ArgumentResponse)

def extract_json_block(text: str) -> str:
    """Extract a JSON object from LLM output (robust to markdown and extra text)."""
    match = re.search(r"\{.*\}", text, re.DOTALL)
    if not match:
        raise ValueError(f"Could not find a JSON object in output:\n{text}")
    return match.group(0)


def extract_arguments_json(text: str, condition: str) -> ArgumentResponse:
    format_instructions = pydantic_parser.get_format_instructions()
    prompt = PromptTemplate(
        template=(
            "You are an medical diagnosis assistant reading patient medical histories in pdfs.\n\n"
            "I want to extract if the patient has {condition} (ERC = 1 if not ERC=0) and its date of diagnosis (YYYY-MM-DD) if present.\n\n"
            "Extract pdf name, diagnosis and date ONLY in regard to this specific medical condition: \"{condition}\" directly from the text below.\n\n"
            "Text:\n\"\"\"\n{text}\n\"\"\""
            "Return **exactly one JSON object** whose keys are the PDF file names and whose values follow this format: "
            "{format_instructions}"
            "Do not include markdown (```), explanations, or JSON schemas. Just output the raw JSON object without fields like properties or required."
        ),
        input_variables=["text", "condition"],
        partial_variables={"format_instructions": format_instructions},
    )

    final_prompt = prompt.format_prompt(text=text, condition=condition).to_string()

    payload = {
        "model": "qwen2.5:3b",
        "prompt": final_prompt,
        "temperature": 0,
        "stream": False
    }

    response = requests.post("http://localhost:11434/api/generate", json=payload)
    if response.status_code != 200:
        raise Exception(f"Ollama error: {response.text}")

    raw_output = response.json()["response"]
    print("Model Output:", raw_output)
    try:
        return pydantic_parser.parse(raw_output)
    
    except OutputParserException as err:
        print("⚠️  Parse failed:", err)
        return None

import json
from datetime import datetime

def write_ndjson(pdf_name: str, diag_dict: dict, path: str = "diagnoses_ERC.txt"):
    # Convert any datetime objects to ISO strings so json.dumps can handle them
    for k, v in diag_dict.items():
        if isinstance(v, datetime):
            diag_dict[k] = v.date().isoformat()   # "YYYY-MM-DD"

    record = {pdf_name: diag_dict}               # wrap with the filename key
    line = json.dumps(record, ensure_ascii=False)

    # Append to the file, adding a newline
    with open(path, "a", encoding="utf-8") as f:
        f.write(line + "\n")

def deduplicate(snippets: list[str], similarity=0.85) -> list[str]:
    kept: list[str] = []
    for s in snippets:
        duplicate = False
        for t in kept:
            if s in t or t in s:                  # exact / containment
                duplicate = True
                break
            if difflib.SequenceMatcher(None, s, t).quick_ratio() >= similarity:
                duplicate = True                  # high overlap
                break
        if not duplicate:
            kept.append(s)
    return kept

In [11]:
df[df['Patologia']=='Enfermedad Renal Crónica']['Tipo_Ocurrencia'].unique()

array(['N19', 'Insuficiencia Renal', 'Insuficiencia Renal Cronica', 'ERC',
       'Enfermedad Renal Cronica', 'Insuficiencia Renal Crónica'],
      dtype=object)

In [12]:
len(df[df['Patologia']=='Enfermedad Renal Crónica'])

9948

In [10]:
from collections import defaultdict
import difflib                           

pdf_to_snippets: dict[str, list[str]] = defaultdict(list)

for key in x_test.keys():               
    pdf_name, _ = key.rsplit("_", 1)
    if x_test[key]['Patologia'] == 'Enfermedad Renal Crónica':   
        pdf_to_snippets[pdf_name].append(x_test[key]["Contexto_recuperado"])

pdf_to_text: dict[str, str] = {
    pdf: "\n".join(deduplicate(snips))
    for pdf, snips in pdf_to_snippets.items()
}

print("Number of PDFs with ERC snippets:", len(pdf_to_text))


condition = "Enfermedad Renal Cronica (ERC) or Chronic Kidney Disease or Insuficiencia Renal or N19 "
out_path  = "diagnoses_ERC_qwen2-5_3b_wholetext.txt"

list(pdf_to_text.keys())

Number of PDFs with ERC snippets: 100


['CC38990979.pdf',
 'CC4302926.pdf',
 'CC21376431.pdf',
 'CC2392028.pdf',
 'CC66811116.pdf',
 'CC27293711.pdf',
 'CC4640234.pdf',
 'CC94281430.pdf',
 'CC31260873.pdf',
 'CC38440317.pdf',
 'CC29802554.pdf',
 'CC29325288.pdf',
 'CC31252756.pdf',
 'CC29049220.pdf',
 'CC41306363.pdf',
 'CC2550458.pdf',
 'CC31219907.pdf',
 'CC38960494.pdf',
 'CC6494067.pdf',
 'CC4589307.pdf',
 'CC29298149.pdf',
 'CC14986444.pdf',
 'CC6279058.pdf',
 'CC24268447.pdf',
 'CC26390865.pdf',
 'CC14994724.pdf',
 'CC6256890.pdf',
 'CC31239230.pdf',
 'CC25264743.pdf',
 'CC6086951.pdf',
 'CC31889024.pdf',
 'CC31236581.pdf',
 'CC66975500.pdf',
 'CC29804260.pdf',
 'CC27322738.pdf',
 'CC66846793.pdf',
 'CC1232518.pdf',
 'CC31988364.pdf',
 'CC38433864.pdf',
 'CC41377296.pdf',
 'CC29061348.pdf',
 'CC6085419.pdf',
 'CC14438430.pdf',
 'CC27508135.pdf',
 'CC31856748.pdf',
 'CC34532892.pdf',
 'CC16679174.pdf',
 'CC31253879.pdf',
 'CC14870542.pdf',
 'CC29768571.pdf',
 'CC29938688.pdf',
 'CC29408150.pdf',
 'CC29013852.pdf',
 'CC

In [None]:
from collections import defaultdict
import difflib                           

pdf_to_snippets: dict[str, list[str]] = defaultdict(list)

for key in x_test.keys():               
    pdf_name, _ = key.rsplit("_", 1)
    if x_test[key]['Patologia'] == 'Enfermedad Renal Crónica':   
        pdf_to_snippets[pdf_name].append(x_test[key]["Contexto_recuperado"])

pdf_to_text: dict[str, str] = {
    pdf: "\n".join(deduplicate(snips))
    for pdf, snips in pdf_to_snippets.items()
}

print("Number of PDFs with ERC snippets:", len(pdf_to_text))


condition = "Enfermedad Renal Cronica (ERC) or Chronic Kidney Disease or Insuficiencia Renal or N19 "
out_path  = "diagnoses_ERC_qwen2-5_3b_wholetext.txt"

for i, (pdf_name, merged_text) in enumerate(pdf_to_text.items(), start=1):
    if i % 10 == 0:
        print(f"Registro: {i} de {len(pdf_to_text)}")

    prompt_text = f"pdf name: {pdf_name} \n text: {merged_text}"
    result = extract_arguments_json(prompt_text, condition)

    if result is None or len(result.root)==0:  
        continue

    diag = next(iter(result.root.values()))         
    write_ndjson(
        pdf_name,                                  
        {"ERC": diag.ERC, "Fecha_Dx_ERC": diag.Fecha_Dx_ERC},
        path=out_path,
    )

Number of PDFs with ERC snippets: 100
Model Output: {"CC38990979.pdf": {"ERC": 1, "Fecha_Dx_ERC": null}}
Model Output: {"C4B870F1528B469FB3B79396EB9AFCF7": {"ERC": 3, "Fecha_Dx_ERC": null}, "B5F44BF8BA5D4C4FBE1E7F55DF5FE405": {"ERC": 4, "Fecha_Dx_ERC": null}}
Model Output: {"CC21376431.pdf": {"ERC": 0, "Fecha_Dx_ERC": null}}
Model Output: {
  "CC2392028.pdf": {
    "ERCDiagnosis": null
  }
}
⚠️  Parse failed: Failed to parse ArgumentResponse from completion {"CC2392028.pdf": {"ERCDiagnosis": null}}. Got: 1 validation error for ArgumentResponse
`CC2392028.pdf`.ERC
  Field required [type=missing, input_value={'ERCDiagnosis': None}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.11/v/missing
For troubleshooting, visit: https://python.langchain.com/docs/troubleshooting/errors/OUTPUT_PARSING_FAILURE 
Model Output: {"CC66811116.pdf": {"ERC": 0, "Fecha_Dx_ERC": null}}
Model Output: {"CC27293711.pdf": {"ERC": 0, "Fecha_Dx_ERC": null}}
Model Output: {"CC4640234

### Hipertensión Arterial

In [13]:
from datetime import datetime
from typing import Dict, Optional
from pydantic import BaseModel, Field, RootModel 

class HTADiagnosis(BaseModel):
    HTA: int
    Fecha_Dx_HTA: Optional[datetime] = Field(None, description="Date of HTA diagnosis")

# Root model whose single value is Dict[str, Diagnosis]
class ArgumentResponse(RootModel[Dict[str, HTADiagnosis]]):
    pass

pydantic_parser = PydanticOutputParser(pydantic_object=ArgumentResponse)

def extract_json_block(text: str) -> str:
    """Extract a JSON object from LLM output (robust to markdown and extra text)."""
    match = re.search(r"\{.*\}", text, re.DOTALL)
    if not match:
        raise ValueError(f"Could not find a JSON object in output:\n{text}")
    return match.group(0)

example_json = {
    "38945530.pdf": {
        "HTA": 0,
        "Fecha_Dx_HTA": "2019-05-21"
    }
}


def extract_arguments_json(text: str, condition: str) -> ArgumentResponse:
    format_instructions = pydantic_parser.get_format_instructions()
    prompt = PromptTemplate(
        template=(
            "You are an medical diagnosis assistant reading patient medical histories in pdfs.\n\n"
            "I want to extract if the patient has {condition} (HTA = 1 if not HTA=0) and its date of diagnosis (YYYY-MM-DD) if present.\n\n"
            "Extract pdf name, diagnosis and date ONLY in regard to this specific medical condition: \"{condition}\" directly from the text below.\n\n"
            "Text:\n\"\"\"\n{text}\n\"\"\""
            "Return **exactly one JSON object** whose keys are the PDF file names and whose values follow this format: "
            "{format_instructions}"
            "Do not include markdown (```), explanations, or JSON schemas. Just output the raw JSON object without fields like properties or required."
        ),
        input_variables=["text", "condition"],
        partial_variables={"format_instructions": format_instructions},
    )

    final_prompt = prompt.format_prompt(text=text, condition=condition).to_string()

    payload = {
        "model": "qwen2.5:3b",
        "prompt": final_prompt,
        "temperature": 0,
        "stream": False
    }

    response = requests.post("http://localhost:11434/api/generate", json=payload)
    if response.status_code != 200:
        raise Exception(f"Ollama error: {response.text}")

    raw_output = response.json()["response"]

    print("Model Output:", raw_output)
        # ---- try to parse; on error just return None
    try:
        return pydantic_parser.parse(raw_output)
    
    except OutputParserException as err:
        print("⚠️  Parse failed:", err)
        return None

import json
from datetime import datetime

def write_ndjson(pdf_name: str, diag_dict: dict, path: str = "diagnoses_hipertensionarterial.txt"):
    # Convert any datetime objects to ISO strings so json.dumps can handle them
    for k, v in diag_dict.items():
        if isinstance(v, datetime):
            diag_dict[k] = v.date().isoformat()   # "YYYY-MM-DD"

    record = {pdf_name: diag_dict}               # wrap with the filename key
    line = json.dumps(record, ensure_ascii=False)

    # Append to the file, adding a newline
    with open(path, "a", encoding="utf-8") as f:
        f.write(line + "\n")

In [14]:
df[df['Patologia']=='Hipertensión Arterial']['Tipo_Ocurrencia'].unique()

array(['Hipertension', 'Hipertension esencial', 'Hipertension arterial',
       'I10X', 'HTA', 'Hipertensión', 'Hipertensión arterial'],
      dtype=object)

In [15]:
from collections import defaultdict
import difflib                           

pdf_to_snippets: dict[str, list[str]] = defaultdict(list)

for key in x_test.keys():               
    pdf_name, _ = key.rsplit("_", 1)
    if x_test[key]['Patologia'] == 'Hipertensión Arterial':   
        pdf_to_snippets[pdf_name].append(x_test[key]["Contexto_recuperado"])

pdf_to_text: dict[str, str] = {
    pdf: "\n".join(deduplicate(snips))
    for pdf, snips in pdf_to_snippets.items()
}

print("Number of PDFs with HTA snippets:", len(pdf_to_text))


condition = "HiperTension Arterial (HTA) or High Blood Pressure or Hipertension or I10X"
out_path  = "diagnoses_HTA_qwen2-5_3b_wholetext.txt"

for i, (pdf_name, merged_text) in enumerate(pdf_to_text.items(), start=1):
    if i % 10 == 0:
        print(f"Registro: {i} de {len(pdf_to_text)}")

    prompt_text = f"pdf name: {pdf_name} \n text: {merged_text}"
    result = extract_arguments_json(prompt_text, condition)

    if result is None or len(result.root)==0:  
        continue

    diag = next(iter(result.root.values()))         
    write_ndjson(
        pdf_name,                                  
        {"HTA": diag.HTA, "Fecha_Dx_HTA": diag.Fecha_Dx_HTA},
        path=out_path,
    )

Number of PDFs with HTA snippets: 98
Model Output: {"CC38990979.pdf": {"HTA": 1, "Fecha_Dx_HTA": null}}
Model Output: {"CC4302926.pdf": {"HTA": 0, "Fecha_Dx_HTA": null}}
Model Output: {"CC21376431.pdf": {"I10X": 1, "Fecha_Dx_HTA": null}}
⚠️  Parse failed: Failed to parse ArgumentResponse from completion {"CC21376431.pdf": {"I10X": 1, "Fecha_Dx_HTA": null}}. Got: 1 validation error for ArgumentResponse
`CC21376431.pdf`.HTA
  Field required [type=missing, input_value={'I10X': 1, 'Fecha_Dx_HTA': None}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.11/v/missing
For troubleshooting, visit: https://python.langchain.com/docs/troubleshooting/errors/OUTPUT_PARSING_FAILURE 
Model Output: {"CC2392028.pdf": {"HTA": 1, "Fecha_Dx_HTA": null}}
Model Output: {"CC66811116.pdf": {"HTA": 0, "Fecha_Dx_HTA": null}}
Model Output: {"CC27293711.pdf": {"HTA": 0, "Fecha_Dx_HTA": null}}
Model Output: {"CC4640234.pdf": {"HTA": 0, "Fecha_Dx_HTA": null}}
Model Output: {"CC94281430

## Evaluacion resultados

### Diabetes

In [None]:
import json, numpy as np, pandas as pd
from pathlib import Path
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

# ───────────────────────────── 1. read predictions ────────────────────────────
txt_path = Path("../outputs/diagnoses_diabetes_qwen2-5_3b_wholetext.txt")

records = []
with txt_path.open(encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            continue
        outer = json.loads(line)
        full_key, inner = next(iter(outer.items()))
        records.append(
            {
                "Nombre_PDF"   : full_key,
                "DM_pred"     : inner["DM"],
                "Fecha_Dx_DM_pred": inner["Fecha_Dx_DM"],
            }
        )

df_pred = pd.DataFrame(records)
df_pred["DM_pred"] = np.where(df_pred["DM_pred"] > 1, 0, df_pred["DM_pred"])
df_pred["Fecha_Dx_DM_pred"] = pd.to_datetime(df_pred["Fecha_Dx_DM_pred"])

# ───────────────────────────── 2. merge with ground truth ─────────────────────
df_metrics = (
    df_info
    .rename(columns={"DM": "DM_gt", "Fecha_Dx_DM": "Fecha_Dx_DM_gt"})
    .merge(df_pred, on="Nombre_PDF", how="inner")
)

# ───────────────────────────── 3. helper to collect metrics ───────────────────
def metric_table(y_true, y_pred, name):
    y_true = y_true.astype(int)
    y_pred = y_pred.astype(int)
    return pd.DataFrame([{
        "Metric set": name,
        "Accuracy"  : accuracy_score (y_true, y_pred),
        "Recall"    : recall_score   (y_true, y_pred, zero_division=0),
        "Precision" : precision_score(y_true, y_pred, zero_division=0),
        "F1"        : f1_score       (y_true, y_pred, zero_division=0),
    }])

results = []
results.append(metric_table(df_metrics["DM_gt"], df_metrics["DM_pred"], "DM diagnosis"))

# ───────────────────────────── 4. date-match that works for list or scalar ────
def date_match(row) -> int:
    gt   = row["Fecha_Dx_DM_gt"]      # NaT or Timestamp
    pred = row["Fecha_Dx_DM_pred"]    # NaT, Timestamp, or list

    # case 1: both “null”
    if pd.isna(gt):
        return int(pd.isna(pred) or (isinstance(pred, list) and len(pred) == 0))

    # ensure pred is iterable
    if not isinstance(pred, list):
        pred = [pred]
        return int(gt in pred)

df_metrics["date_ok"] = df_metrics.apply(date_match, axis=1)
df_metrics["date_ok_gt"]= np.where(df_metrics["Fecha_Dx_DM_gt"].isna(), 0, 1)
results.append(metric_table(df_metrics["date_ok_gt"], df_metrics["date_ok"], "Diagnosis DM date"))

# ───────────────────────────── 5. show nicely ────────────────────────────────
metrics_df_DM = pd.concat(results, ignore_index=True).round(3)

n_total = len(df_metrics)
n_pos   = int(df_metrics["DM_gt"].sum())
print(f"Evaluado sobre {n_total} historias médicas y {n_pos} pacientes con DM.")

display(
    metrics_df_DM.style
        .format("{:.3f}", subset=["Accuracy", "Recall", "Precision", "F1"])
        .set_properties(**{"text-align": "center"})
        .set_table_styles(
            [{"selector": "th", "props": [("text-align", "center")]}]
        )
)


Evaluado sobre 82 historias médicas y 38 pacientes con DM.


Unnamed: 0,Metric set,Accuracy,Recall,Precision,F1
0,DM diagnosis,0.634,0.289,0.786,0.423
1,Diagnosis DM date,0.012,0.028,0.021,0.024


### Enfermedad Renal Cronica

In [11]:
import json, numpy as np, pandas as pd
from pathlib import Path
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

# ───────────────────────────── 1. read predictions ────────────────────────────
txt_path = Path("../outputs/diagnoses_ERC_qwen2-5_3b_wholetext.txt")

records = []
with txt_path.open(encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            continue
        outer = json.loads(line)
        full_key, inner = next(iter(outer.items()))
        records.append(
            {
                "Nombre_PDF"   : full_key,
                "ERC_pred"     : inner["ERC"],
                "Fecha_Dx_ERC_pred": inner["Fecha_Dx_ERC"],
            }
        )

df_pred = pd.DataFrame(records)
df_pred["ERC_pred"] = np.where(df_pred["ERC_pred"] > 1, 0, df_pred["ERC_pred"])
df_pred["Fecha_Dx_ERC_pred"] = pd.to_datetime(df_pred["Fecha_Dx_ERC_pred"])

# ───────────────────────────── 2. merge with ground truth ─────────────────────
df_metrics = (
    df_info
    .rename(columns={"ERC": "ERC_gt", "Fecha_Dx_ERC": "Fecha_Dx_ERC_gt"})
    .merge(df_pred, on="Nombre_PDF", how="inner")
)

# ───────────────────────────── 3. helper to collect metrics ───────────────────
def metric_table(y_true, y_pred, name):
    y_true = y_true.astype(int)
    y_pred = y_pred.astype(int)
    return pd.DataFrame([{
        "Metric set": name,
        "Accuracy"  : accuracy_score (y_true, y_pred),
        "Recall"    : recall_score   (y_true, y_pred, zero_division=0),
        "Precision" : precision_score(y_true, y_pred, zero_division=0),
        "F1"        : f1_score       (y_true, y_pred, zero_division=0),
    }])

results = []
results.append(metric_table(df_metrics["ERC_gt"], df_metrics["ERC_pred"], "ERC diagnosis"))

# ───────────────────────────── 4. date-match that works for list or scalar ────
def date_match(row) -> int:
    gt   = row["Fecha_Dx_ERC_gt"]      # NaT or Timestamp
    pred = row["Fecha_Dx_ERC_pred"]    # NaT, Timestamp, or list

    # case 1: both “null”
    if pd.isna(gt):
        return int(pd.isna(pred) or (isinstance(pred, list) and len(pred) == 0))

    # ensure pred is iterable
    if not isinstance(pred, list):
        pred = [pred]
        return int(gt in pred)

df_metrics["date_ok"] = df_metrics.apply(date_match, axis=1)
df_metrics["date_ok_gt"]= np.where(df_metrics["Fecha_Dx_ERC_gt"].isna(), 0, 1)
results.append(metric_table(df_metrics["date_ok_gt"], df_metrics["date_ok"], "Diagnosis ERC date"))

# ───────────────────────────── 5. show nicely ────────────────────────────────
metrics_df_ERC = pd.concat(results, ignore_index=True).round(3)

n_total = len(df_metrics)
n_pos   = int(df_metrics["ERC_gt"].sum())
print(f"Evaluado sobre {n_total} historias médicas y {n_pos} pacientes con ERC.")

display(
    metrics_df_ERC.style
        .format("{:.3f}", subset=["Accuracy", "Recall", "Precision", "F1"])
        .set_properties(**{"text-align": "center"})
        .set_table_styles(
            [{"selector": "th", "props": [("text-align", "center")]}]
        )
)


Evaluado sobre 80 historias médicas y 78 pacientes con ERC.


Unnamed: 0,Metric set,Accuracy,Recall,Precision,F1
0,ERC diagnosis,0.338,0.321,1.0,0.485
1,Diagnosis ERC date,0.0,0.0,0.0,0.0


In [9]:
import json, numpy as np, pandas as pd
from pathlib import Path
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

# ───────────────────────────── 1. read predictions ────────────────────────────
txt_path = Path("../outputs/diagnoses_HTA_qwen2-5_3b_wholetext.txt")

records = []
with txt_path.open(encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            continue
        outer = json.loads(line)
        full_key, inner = next(iter(outer.items()))
        records.append(
            {
                "Nombre_PDF"   : full_key,
                "HTA_pred"     : inner["HTA"],
                "Fecha_Dx_HTA_pred": inner["Fecha_Dx_HTA"],
            }
        )

df_pred = pd.DataFrame(records)
df_pred["HTA_pred"] = np.where(df_pred["HTA_pred"] > 1, 0, df_pred["HTA_pred"])
df_pred["Fecha_Dx_HTA_pred"] = pd.to_datetime(df_pred["Fecha_Dx_HTA_pred"])

# ───────────────────────────── 2. merge with ground truth ─────────────────────
df_metrics = (
    df_info
    .rename(columns={"HTA": "HTA_gt", "Fecha_Dx_HTA": "Fecha_Dx_HTA_gt"})
    .merge(df_pred, on="Nombre_PDF", how="inner")
)

# ───────────────────────────── 3. helper to collect metrics ───────────────────
def metric_table(y_true, y_pred, name):
    y_true = y_true.astype(int)
    y_pred = y_pred.astype(int)
    return pd.DataFrame([{
        "Metric set": name,
        "Accuracy"  : accuracy_score (y_true, y_pred),
        "Recall"    : recall_score   (y_true, y_pred, zero_division=0),
        "Precision" : precision_score(y_true, y_pred, zero_division=0),
        "F1"        : f1_score       (y_true, y_pred, zero_division=0),
    }])

results = []
results.append(metric_table(df_metrics["HTA_gt"], df_metrics["HTA_pred"], "HTA diagnosis"))

# ───────────────────────────── 4. date-match that works for list or scalar ────
def date_match(row) -> int:
    gt   = row["Fecha_Dx_HTA_gt"]      # NaT or Timestamp
    pred = row["Fecha_Dx_HTA_pred"]    # NaT, Timestamp, or list

    # case 1: both “null”
    if pd.isna(gt):
        return int(pd.isna(pred) or (isinstance(pred, list) and len(pred) == 0))

    # ensure pred is iterable
    if not isinstance(pred, list):
        pred = [pred]
        return int(gt in pred)

df_metrics["date_ok"] = df_metrics.apply(date_match, axis=1)
df_metrics["date_ok_gt"]= np.where(df_metrics["Fecha_Dx_HTA_gt"].isna(), 0, 1)
results.append(metric_table(df_metrics["date_ok_gt"], df_metrics["date_ok"], "Diagnosis HTA date"))

# ───────────────────────────── 5. show nicely ────────────────────────────────
metrics_df_HTA = pd.concat(results, ignore_index=True).round(3)

n_total = len(df_metrics)
n_pos   = int(df_metrics["HTA_gt"].sum())
print(f"Evaluado sobre {n_total} historias médicas y {n_pos} pacientes con HTA.")

display(
    metrics_df_HTA.style
        .format("{:.3f}", subset=["Accuracy", "Recall", "Precision", "F1"])
        .set_properties(**{"text-align": "center"})
        .set_table_styles(
            [{"selector": "th", "props": [("text-align", "center")]}]
        )
)


Evaluado sobre 94 historias médicas y 93 pacientes con HTA.


Unnamed: 0,Metric set,Accuracy,Recall,Precision,F1
0,HTA diagnosis,0.213,0.204,1.0,0.339
1,Diagnosis HTA date,0.011,0.011,0.5,0.021


In [10]:
final_metrics = pd.concat([metrics_df_DM, metrics_df_ERC, metrics_df_HTA], ignore_index=True)
final_metrics


num_cols = ["Accuracy", "Recall", "Precision", "F1"]
totals = final_metrics[num_cols].mean().to_frame().T          # a 1-row DF
totals.insert(0, "Metric set", "Mean")                     # first column


final_metrics_with_avg = (
    pd.concat([final_metrics, totals], ignore_index=True)
      .round(3)                                               # optional rounding
)

final_metrics_with_avg


Unnamed: 0,Metric set,Accuracy,Recall,Precision,F1
0,DM diagnosis,0.634,0.289,0.786,0.423
1,Diagnosis DM date,0.012,0.028,0.021,0.024
2,ERC diagnosis,0.338,0.321,1.0,0.485
3,Diagnosis ERC date,0.0,0.0,0.0,0.0
4,HTA diagnosis,0.213,0.204,1.0,0.339
5,Diagnosis HTA date,0.011,0.011,0.5,0.021
6,Mean,0.201,0.142,0.551,0.215
