In [None]:
import polars as pl
import gzip
from pathlib import Path
from lxml import etree
import json


In [None]:
# Paper metadata is read from a newline-delimited JSON file.
PAPERS_PATH = "../data/papers.jsonl"
papers = pl.read_ndjson(PAPERS_PATH)

In [None]:
# Pick one paper at random and show which of its citations can be followed up.
paper = papers.sample(1).to_dicts()[0]
print(f"Source Paper {paper['arxiv_id']}: {paper['title']}")

# `or []` keeps the cell working if the citation list is null for this paper.
citations = paper['citations'] or []

# Keep only citations that have both an arXiv id and at least one reference context.
arxiv_citations_with_context = [c for c in citations if c['arxiv_id'] and c['reference_contexts']]

print("Referred to:")
for c in arxiv_citations_with_context:
    # Leave 'citation_id' out of the printout by building a filtered copy.
    # Popping the key would mutate the records held in `paper['citations']`
    # and raise KeyError if the key were ever missing.
    printable = {key: value for key, value in c.items() if key != 'citation_id'}
    print(json.dumps(printable, indent=2))
    print()

In [None]:
# --- Load and parse the gzipped TEI XML ---
arxiv_id = paper['arxiv_id']
xml_path = Path(f"../data/xml_docs/{arxiv_id}.xml.gz")
with gzip.open(xml_path, "rb") as f:
    data = f.read()

# recover=True lets lxml parse through malformed markup instead of raising.
parser = etree.XMLParser(recover=True)
root = etree.fromstring(data, parser=parser)
if root is None:
    # A recovering parser can come back with no tree at all; fail here with a
    # clear message rather than with an AttributeError in a later helper.
    raise ValueError(f"Could not parse any XML from {xml_path}")

# Prefix map used by every XPath lookup below.
NS = {"tei": "http://www.tei-c.org/ns/1.0"}


def tei_text(el):
    """Return all text beneath ``el`` with whitespace runs collapsed to single spaces.

    Args:
        el: An element exposing ``itertext()``, or ``None``.

    Returns:
        The normalized text, or ``""`` when ``el`` is ``None``.
    """
    if el is None:
        return ""
    # Join the text fragments with spaces, then split/join to squeeze any
    # run of whitespace (including newlines) down to one space.
    return " ".join(" ".join(el.itertext()).split())


# --- Quick helpers ---
def get_title(root):
    """Return the document title, or ``None`` if no candidate has any text.

    Candidate locations are tried in order: the header's ``titleStmt`` title,
    then the ``analytic`` title marked ``type='main'`` in the source
    description, then the ``monogr`` title. The first one whose normalized
    text is non-empty wins.
    """
    candidate_paths = (
        ".//tei:teiHeader/tei:fileDesc/tei:titleStmt/tei:title",
        ".//tei:fileDesc/tei:sourceDesc//tei:biblStruct/tei:analytic/tei:title[@type='main']",
        ".//tei:fileDesc/tei:sourceDesc//tei:biblStruct/tei:monogr/tei:title",
    )
    for candidate in candidate_paths:
        element = root.find(candidate, NS)
        if element is None:
            continue
        text = tei_text(element)
        # An element that exists but is blank does not count; try the next path.
        if text:
            return text
    return None


def get_abstract(root):
    """Return the normalized abstract text, or ``None`` if no abstract element exists.

    The abstract is looked up under ``text/front`` first and under
    ``profileDesc`` second; the first element found is used.

    Args:
        root: Parsed TEI tree (or any element to search beneath).

    Returns:
        Whitespace-normalized abstract text (possibly ``""`` for an empty
        element), or ``None`` when neither location holds an abstract.
    """
    abstract_paths = (
        ".//tei:text/tei:front/tei:abstract",
        ".//tei:profileDesc/tei:abstract",
    )
    for path in abstract_paths:
        node = root.find(path, NS)
        # Compare against None explicitly: an element's truth value reflects
        # whether it has child elements, so `a or b` would discard an
        # abstract that contains only text.
        if node is not None:
            return tei_text(node)
    return None


def iter_sections(root):
    """Yield one dict per ``<div>`` in the TEI body, in document (pre-order) order.

    Every ``<div>`` nested at any depth under ``text/body`` is emitted exactly
    once, immediately followed by its own sub-divs.

    Args:
        root: Parsed TEI tree (or any element to search beneath).

    Yields:
        dict with keys:
            ``path``  - dotted 1-based position among sibling divs, e.g. ``"2.1"``;
            ``level`` - nesting depth, 1 for divs directly under the body;
            ``title`` - normalized text of the div's direct ``<head>`` (``""`` if absent);
            ``type``  - the div's ``type`` attribute, or ``None``;
            ``text``  - the div's direct, non-empty ``<p>`` children joined by blank lines.

    Yields nothing when the document has no ``text/body`` element.
    """
    body = root.find(".//tei:text/tei:body", NS)
    if body is None:
        return

    def numbered_child_divs(parent, parent_path):
        """Pair each direct child div of ``parent`` with its full numeric path."""
        return [
            (child, parent_path + [position])
            for position, child in enumerate(parent.findall("./tei:div", NS), 1)
        ]

    # Each stack entry is a div that still has to be emitted. Siblings are
    # pushed in reverse so the first one is popped (and yielded) first.
    stack = list(reversed(numbered_child_divs(body, [])))
    while stack:
        div, path = stack.pop()
        paras = [tei_text(p) for p in div.findall("./tei:p", NS)]
        yield {
            "path": ".".join(map(str, path)),
            "level": len(path),
            "title": tei_text(div.find("./tei:head", NS)),
            "type": div.get("type"),
            "text": "\n\n".join(p for p in paras if p),
        }
        # Queue the sub-divs themselves (not their children) so that direct
        # sub-sections are emitted before the next sibling.
        stack.extend(reversed(numbered_child_divs(div, path)))


# --- Extract ---
# Run the helpers against the parsed document.
title = get_title(root)
abstract = get_abstract(root)
sections = [section for section in iter_sections(root)]

# Show the title, the abstract and the list of section headings, one per line.
section_titles = [section['title'] for section in sections]
print(title, abstract, section_titles, sep="\n")