In [None]:
import polars as pl
import gzip
from pathlib import Path
from lxml import etree
import json

In [None]:
# Load the crawled paper records. `published` is parsed into a Date;
# exact=False lets the %Y-%m-%d pattern match anywhere in the string.
papers = (
    pl.read_ndjson("../data/papers.jsonl")
    .with_columns(pl.col("published").str.to_date(format="%Y-%m-%d", exact=False))
)

# Papers per publication year. Only the last expression of a cell is rendered
# automatically, so the chart must go through display() to actually show up.
display(
    papers['published']
    .dt.year()
    .value_counts()
    .sort('published')
    .plot.bar(x='published', y='count')
)

# Number of times each arXiv id appears among the citation entries of all papers.
times_cited = (
    papers[['citations']]
    .explode('citations')
    .unnest('citations')
    .filter(pl.col('arxiv_id').is_not_null())['arxiv_id']
    .value_counts(sort=True)
    .rename({'count': 'times_cited'})
)
# Left join: papers that are never cited keep a null `times_cited`.
papers = papers.join(times_cited, on='arxiv_id', how='left')

# Widen string columns so titles and author lists are not truncated.
with pl.Config(fmt_str_lengths=1000):
    display(
        papers
        .sort(['depth', 'times_cited'], descending=[False, True], nulls_last=True)
        .head(20)
        [['arxiv_id', 'title', 'published', 'authors', 'depth', 'times_cited']]
    )

In [None]:
# The 20 most frequent names across all `authors` lists.
(
    papers['authors']
    .explode()
    .value_counts()
    .sort(by='count', descending=True)
    .head(n=20)
)

In [None]:
# One row per (citing paper, cited arXiv paper, reference context), restricted
# to citations whose target is itself present in `papers`.
train_data = (
    papers
    # The citation struct carries its own arxiv_id/title/authors fields, so the
    # citing paper's columns are renamed first to avoid a clash on unnest.
    .rename({'arxiv_id': 'source_arxiv_id', 'title': 'source_title', 'authors': 'source_authors'})
    .explode('citations').unnest('citations')
    # `&` binds tighter than `>`, so the comparison must be parenthesised;
    # without it the filter evaluates `(is_not_null & num_references) > 0`.
    .filter(pl.col('arxiv_id').is_not_null() & (pl.col('num_references') > 0))
    .explode('reference_contexts')
    .drop('categories', 'pdf_url', 'arxiv_url', 'xml_file_path', 'num_references', 'num_citations', 'num_arxiv_citations', 'depth', 'processing_timestamp')
    # Keep only citations that point at a paper we actually have.
    .join(papers[['arxiv_id']], on='arxiv_id', how='inner')
)
train_data

In [None]:
# Pick one random training row and print the citing paper next to the cited one.
sample = train_data.sample(1).to_dicts()[0]

# Citing paper. After the rename in `train_data`, the un-prefixed
# `arxiv_id`/`authors` belong to the *cited* entry, so the header has to use
# the `source_*` columns to match the title and abstract printed below it.
print(f"[{sample['source_arxiv_id']}] {sample['published']}. {sample['source_authors']}")
print(sample['source_title'])
print('-' * 32)

print(sample['abstract'])
print('-' * 32)
# Text surrounding the in-text reference to the cited paper.
print(sample['reference_contexts'])

print('='*32)
# Cited paper, looked up in `papers` by the citation's arXiv id. The inner join
# in `train_data` guarantees that at least one row matches.
cited_info = papers.filter(pl.col('arxiv_id') == sample['arxiv_id']).to_dicts()[0]
print(f"[{cited_info['arxiv_id']}] {cited_info['published']}. {cited_info['authors']}")
print(cited_info['title'])
print(cited_info['abstract'])

In [None]:
import numpy as np
# Per paper: the distinct arXiv ids it cites and the distinct author names
# appearing on any of its citation entries.
matched_citations = (
    papers[['arxiv_id', 'title', 'authors', 'citations']]
    .with_columns(
        pl.col('citations').list.eval(pl.element().struct['arxiv_id']).list.drop_nulls().list.unique().alias('cited_papers'),
        pl.col('citations').list.eval(pl.element().struct['authors']).list.eval(pl.element().explode().drop_nulls()).list.unique().alias('cited_authors')
    )
)

# (citing author, cited author) -> number of paper rows in which the pair occurs.
# Kept under its own name so `author_citations` is only ever a list of records.
author_pair_counts: dict[tuple[str, str], int] = {}
for citing_authors, cited_authors in matched_citations[['authors', 'cited_authors']].iter_rows():
    # A null list cell comes back as None; treat it as "no authors".
    for citing_author in citing_authors or []:
        for cited_author in cited_authors or []:
            pair = (citing_author, cited_author)
            author_pair_counts[pair] = author_pair_counts.get(pair, 0) + 1

# Flatten to records so the result can be handed straight to pl.DataFrame.
author_citations = [
    {'citing_author': citing_author, 'cited_author': cited_author, 'count': count}
    for (citing_author, cited_author), count in author_pair_counts.items()
]



In [None]:
# Pairs whose citing and cited author names are identical, largest count first.
(
    pl.DataFrame(author_citations)
    .filter(pl.col('cited_author') == pl.col('citing_author'))
    .sort(by='count', descending=True)
)

In [None]:
# Pick one random paper and list those of its citations that have an arXiv id,
# at least one reference context, and a target that is itself in `papers`.
paper = papers.sample(1).to_dicts()[0]
print(f"Source Paper {paper['arxiv_id']}: {paper['title']}")

# A null `citations` cell comes back as None; fall back to an empty list.
citations = paper['citations'] or []

arxiv_citations_with_context = [c for c in citations if c['arxiv_id'] and c['reference_contexts']]

# Build the id lookup once instead of scanning the whole column per citation.
crawled_ids = set(papers['arxiv_id'].to_list())

print("Referred to:")
for c in arxiv_citations_with_context:
    crawled = c['arxiv_id'] in crawled_ids
    # Drop the internal id from the printed record; tolerate entries without it.
    c.pop('citation_id', None)
    c['crawled'] = crawled
    if crawled:
        print(json.dumps(c, indent=2))
        print()

In [None]:
# --- Read one paper's gzip-compressed TEI XML and parse it into a tree ---
arxiv_id = "2212.09748"
xml_path = Path("../data/xml_docs") / f"{arxiv_id}.xml.gz"
with gzip.open(xml_path, mode="rb") as f:
    data = f.read()

# recover=True asks lxml to keep going on malformed markup instead of raising.
parser = etree.XMLParser(recover=True)
root = etree.fromstring(data, parser=parser)

# Namespace prefix map used by every XPath lookup below.
NS = {"tei": "http://www.tei-c.org/ns/1.0"}


def tei_text(el):
    """Return the text content of ``el`` with whitespace runs collapsed.

    All text nodes beneath the element are joined with single spaces and any
    run of whitespace is reduced to one space. ``None`` yields ``""``.
    """
    if el is None:
        return ""
    joined = " ".join(el.itertext())
    return " ".join(joined.split())


# --- Quick helpers ---
def get_title(root):
    """Return the document title, or None if no candidate location has text.

    The candidate XPaths are tried in order and the first one that resolves
    to an element with non-empty normalised text wins.
    """
    candidate_paths = (
        ".//tei:teiHeader/tei:fileDesc/tei:titleStmt/tei:title",
        ".//tei:fileDesc/tei:sourceDesc//tei:biblStruct/tei:analytic/tei:title[@type='main']",
        ".//tei:fileDesc/tei:sourceDesc//tei:biblStruct/tei:monogr/tei:title",
    )
    for candidate in candidate_paths:
        match = root.find(candidate, NS)
        if match is None:
            continue
        text = tei_text(match)
        if text:
            return text
    return None


def get_abstract(root):
    """Return the normalised abstract text, or None if no abstract element exists.

    Looks under ``text/front`` first and falls back to ``profileDesc``.
    """
    node = root.find(".//tei:text/tei:front/tei:abstract", NS)
    # Compare against None explicitly: an lxml element with no child elements
    # is falsy, so `find(a) or find(b)` would discard a text-only abstract.
    if node is None:
        node = root.find(".//tei:profileDesc/tei:abstract", NS)
    return tei_text(node) if node is not None else None


def iter_sections(root):
    """Yield one dict per ``<div>`` under the TEI body, in document order.

    Each dict has the keys:
      - ``path``:  dotted 1-based position among siblings at each level, e.g. "2.1"
      - ``level``: nesting depth, 1 for divs directly under the body
      - ``title``: normalised text of the div's direct ``<head>`` ("" if absent)
      - ``type``:  the div's ``type`` attribute (None if absent)
      - ``text``:  the div's direct non-empty ``<p>`` children joined by blank lines

    Nothing is yielded when the document has no body.
    """
    body = root.find(".//tei:text/tei:body", NS)
    if body is None:
        return

    # Stack of (div, level, path) still to be emitted. Entries are pushed in
    # reverse so that pop() returns siblings in document order (pre-order walk).
    pending = [
        (div, 1, [i])
        for i, div in enumerate(body.findall("./tei:div", NS), 1)
    ]
    pending.reverse()

    while pending:
        div, level, path = pending.pop()
        paras = [tei_text(p) for p in div.findall("./tei:p", NS)]
        yield {
            "path": ".".join(map(str, path)),
            "level": level,
            "title": tei_text(div.find("./tei:head", NS)),
            "type": div.get("type"),
            "text": "\n\n".join(p for p in paras if p),
        }
        # Queue this div's own sub-divs so that each is emitted itself, with a
        # path that extends the parent's.
        children = [
            (child, level + 1, path + [j])
            for j, child in enumerate(div.findall("./tei:div", NS), 1)
        ]
        pending.extend(reversed(children))


# --- Run the extractors on the parsed document and show what they found ---
title, abstract = get_title(root), get_abstract(root)
sections = [section for section in iter_sections(root)]

print(title, abstract, sep="\n")
print([section['title'] for section in sections])