In [None]:
# Sanity check: the sample TEI document is expected to yield 53 references.
# The original condition referenced `xlm` while the failure message used
# `xml`; both now use `xml`.
# NOTE(review): assumes `xml` is the intended variable name — confirm.
assert len(extrai_referencias(xml)) == 53, len(extrai_referencias(xml))

In [8]:
from lxml import etree
from pathlib import Path
from typing import Union, List, Dict, Optional
import re

In [13]:
def _text(node: Optional[etree._Element]) -> Optional[str]:
    """Return the whitespace-normalised text content of *node*.

    All text fragments of the element and its descendants are concatenated
    with single spaces, and every run of whitespace (including newlines and
    indentation coming from pretty-printed XML) is collapsed to one space.

    Returns ``None`` when *node* is ``None`` or contains no non-blank text.
    """
    if node is None:
        return None
    # Fragments are joined with a space so adjacent inline elements do not
    # get glued together; str.split() without arguments then drops the
    # doubled spaces / line breaks that the join and the source markup leave.
    txt = " ".join(" ".join(node.itertext()).split())
    return txt or None

def _first(match_list):
    """Return the first item of *match_list*, or ``None`` if it is empty/falsy."""
    if not match_list:
        return None
    return match_list[0]

def _authors(bibl_struct: etree._Element) -> List[str]:
    """Collect the author names found under *bibl_struct*.

    Authors are looked up first under ``analytic``, then under ``monogr``,
    then anywhere below the element. For each ``author`` node the name is
    built from the first ``forename`` and first ``surname`` of its first
    ``persName``; if that yields nothing, the full text of the ``author``
    node is used instead.

    Returns the names in first-seen order with duplicate strings removed.
    """
    search_paths = (
        ".//*[local-name()='analytic']/*[local-name()='author']",
        ".//*[local-name()='monogr']/*[local-name()='author']",
        ".//*[local-name()='author']",
    )
    collected = []
    for xpath_expr in search_paths:
        for author_node in bibl_struct.xpath(xpath_expr):
            full_name = ""
            pers_names = author_node.xpath(".//*[local-name()='persName']")
            if pers_names:
                first_pers = pers_names[0]
                given = _text(_first(first_pers.xpath(".//*[local-name()='forename']"))) or ""
                family = _text(_first(first_pers.xpath(".//*[local-name()='surname']"))) or ""
                full_name = f"{given} {family}".strip()
            if not full_name:
                # No structured name available: fall back to the raw text.
                full_name = _text(author_node)
            if full_name:
                collected.append(full_name)
    # The same author is reached through several of the paths above;
    # dict keys keep insertion order, so this drops repeats but keeps order.
    return list(dict.fromkeys(collected))

def _year(bibl_struct: etree._Element) -> Optional[str]:
    """Return the publication year taken from the first ``date`` element.

    The first four characters of the ``when`` attribute are used when it is
    present and non-empty; otherwise the element text is searched for a
    four-digit number between 1600 and 2199. Returns ``None`` if there is no
    ``date`` element or no year can be found in it.
    """
    date_node = _first(bibl_struct.xpath(".//*[local-name()='date']"))
    if date_node is None:
        return None

    iso_when = date_node.get("when")
    if iso_when:
        return iso_when[:4]

    date_text = _text(date_node)
    if not date_text:
        return None

    found = re.search(r"\b(1[6-9]\d{2}|20\d{2}|21\d{2})\b", date_text)
    return found.group(0) if found else None

def _title(bibl_struct: etree._Element) -> Optional[str]:
    """Return the title of the reference, or ``None`` if none has text.

    The first ``title`` under ``analytic`` is preferred, then the first one
    under ``monogr``, then the first ``title`` found anywhere below the
    element. A candidate whose text is empty is skipped.
    """
    candidate_paths = (
        ".//*[local-name()='analytic']/*[local-name()='title']",
        ".//*[local-name()='monogr']/*[local-name()='title']",
        ".//*[local-name()='title']",
    )
    for xpath_expr in candidate_paths:
        # _first/_text both pass None through, so a missing node and an
        # empty node are handled by the same truthiness test.
        title_text = _text(_first(bibl_struct.xpath(xpath_expr)))
        if title_text:
            return title_text
    return None

def _container_title(bibl_struct: etree._Element) -> Optional[str]:
    """Return the text of the first ``title`` directly under a ``monogr``.

    Returns ``None`` when no such title exists or when it has no text.
    """
    monogr_titles = bibl_struct.xpath(
        ".//*[local-name()='monogr']/*[local-name()='title']"
    )
    if not monogr_titles:
        return None
    return _text(monogr_titles[0])

def _idno(bibl_struct: etree._Element, idtype: str) -> Optional[str]:
    """Return the text of the first ``idno`` whose ``type`` equals *idtype*.

    Args:
        bibl_struct: Element to search below.
        idtype: Exact (case-sensitive) value of the ``type`` attribute,
            e.g. ``"DOI"``.

    Returns:
        The identifier text, or ``None`` when no matching ``idno`` exists or
        it has no text.
    """
    # The value is bound as an XPath variable rather than interpolated into
    # the expression, so quotes inside *idtype* cannot break the query.
    matches = bibl_struct.xpath(
        ".//*[local-name()='idno' and @type=$idtype]",
        idtype=idtype,
    )
    node = _first(matches)
    return _text(node) if node is not None else None

def _pages(bibl_struct: etree._Element) -> Optional[str]:
    """Return the page range of the reference.

    The first ``biblScope`` that describes pages is used. Both the current
    TEI form (``unit="page"``) and the legacy one (``type="pp"``) are
    accepted, along with the ``unit="pp"`` / ``type="page"`` variants.

    Returns:
        ``"<from>-<to>"`` when both attributes are set; otherwise the element
        text; otherwise whichever of ``from`` / ``to`` is set; ``None`` when
        there is no page scope or it carries no information.
    """
    scope = _first(bibl_struct.xpath(
        ".//*[local-name()='biblScope' and "
        "(@unit='page' or @unit='pp' or @type='pp' or @type='page')]"
    ))
    if scope is None:
        return None

    from_p = scope.get("from")
    to_p = scope.get("to")
    if from_p and to_p:
        return f"{from_p}-{to_p}"

    txt = _text(scope)
    if txt:
        return txt

    # Single page recorded only as an attribute (no text, only one bound).
    return from_p or to_p or None

def _load_tei_tree(xml_input):
    """Turn any input accepted by ``extrai_referencias`` into an lxml tree."""
    if isinstance(xml_input, etree._ElementTree):
        return xml_input

    if isinstance(xml_input, Path):
        # A Path is always a file location; a missing file surfaces as the
        # parser's own I/O error instead of being mistaken for XML content.
        return etree.parse(str(xml_input))

    if isinstance(xml_input, str):
        try:
            is_file = Path(xml_input).exists()
        except (OSError, ValueError):
            # A whole XML document passed as a string is not a usable file
            # name (too long, embedded NUL, ...): treat it as content.
            is_file = False
        if is_file:
            return etree.parse(xml_input)
        payload = xml_input.encode("utf-8")
    else:
        payload = xml_input

    parser = etree.XMLParser(remove_comments=True, recover=True)
    root = etree.fromstring(payload, parser=parser)
    if root is None:
        # In recover mode an unparseable input can yield no root element.
        raise ValueError(
            "xml_input is neither an existing file path nor parseable XML content"
        )
    return etree.ElementTree(root)


def extrai_referencias(xml_input: Union[str, bytes, Path, etree._ElementTree]) -> List[Dict]:
    """Extract every bibliographic reference from a TEI document.

    Only ``biblStruct`` / ``bibl`` elements located inside a ``listBibl`` or
    a ``back`` element are returned, so the ``biblStruct`` that describes
    the paper itself is ignored.

    Args:
        xml_input: An already parsed lxml ``ElementTree``, a path to a TEI
            file (``Path``, or ``str`` naming an existing file), or the XML
            content itself as ``str`` or ``bytes``.

    Returns:
        One dict per reference with the keys ``xml_id``, ``type``, ``title``,
        ``container_title``, ``authors`` (list of names), ``year``,
        ``pages``, ``publisher``, ``pub_place``, ``doi``, ``isbn``, ``issn``,
        ``url`` and ``xml`` (the serialized element). Fields that cannot be
        found are ``None`` (``authors`` is an empty list).

    Raises:
        ValueError: If the content cannot be parsed into any root element.
    """
    tree = _load_tei_tree(xml_input)

    # Only references inside <listBibl> or <back>; an XPath union returns
    # each node once even when it matches several branches.
    bibl_structs = tree.xpath(
        ".//*[local-name()='listBibl']//*[local-name()='biblStruct'] | "
        ".//*[local-name()='listBibl']//*[local-name()='bibl'] | "
        ".//*[local-name()='back']//*[local-name()='biblStruct'] | "
        ".//*[local-name()='back']//*[local-name()='bibl']"
    )

    refs: List[Dict] = []
    for b in bibl_structs:
        # URL preference: visible text of the first <ref target=...>, then
        # its target attribute, then an <idno type="URL">.
        first_ref = _first(b.xpath(".//*[local-name()='ref' and @target]"))
        ref_target = first_ref.get("target") if first_ref is not None else None

        ref = {
            "xml_id": b.get("{http://www.w3.org/XML/1998/namespace}id"),
            "type": b.get("type"),
            "title": _title(b),
            "container_title": _container_title(b),
            "authors": _authors(b),
            "year": _year(b),
            "pages": _pages(b),
            "publisher": _text(_first(b.xpath(".//*[local-name()='publisher']"))),
            "pub_place": _text(_first(b.xpath(".//*[local-name()='pubPlace']"))),
            "doi": _idno(b, "DOI"),
            "isbn": _idno(b, "ISBN"),
            "issn": _idno(b, "ISSN"),
            "url": _text(first_ref) or ref_target or _idno(b, "URL"),
            "xml": etree.tostring(b, encoding="unicode"),
        }
        refs.append(ref)

    return refs

# ---------------------------
# Usage example:
# ---------------------------
if __name__ == "__main__":
    # Put the path of your own TEI file here.
    caminho = "../DataFrames/2025.findings-acl.656.pdf.tei.xml"
    referencias = extrai_referencias(caminho)
    print(f"Total de referências: {len(referencias)}")

    # Preview of the first three references: title, authors, year.
    for r in referencias[:3]:
        autores = ", ".join(r["authors"])
        print(r["title"], "—", autores, r.get("year"))

    # Titles only, skipping references for which no title was extracted.
    titulos = list(filter(None, (ref["title"] for ref in referencias)))

    print(f"Total de referências: {len(titulos)}")
    for t in titulos:
        print(t)



Total de referências: 53
DUnE: Dataset for Unified Editing — Afra Akyürek, Eric Pan, Garry Kuwanto, Derry Wijaya 2023
Bluex: A benchmark based on brazilian leading universities entrance exams — Thales Sales Almeida, Thiago Laitz, Giovana Bonás, Rodrigo Nogueira 2023
Revealing the structure of language model capabilities — Ryan Burnell, Han Hao, R Andrew, Jose Conway, Orallo Hernandez 2023
Total de referências: 53
DUnE: Dataset for Unified Editing
Bluex: A benchmark based on brazilian leading universities entrance exams
Revealing the structure of language model capabilities
Rethink reporting of evaluation results in AI
Stochastic Chameleons: Irrelevant Context Hallucinations Reveal Class-Based (Mis)Generalization in LLMs
MATHWELL: Generating Educational Math Word Problems Using Teacher Annotations
Training verifiers to solve math word problems
Coefficient alpha and the internal structure of tests
Word problems: a review of linguistic and numerical factors contributing to their difficult