In [18]:
import os
import tempfile
from collections import OrderedDict
from urllib import request

from langchain_community.document_loaders.generic import GenericLoader
from langchain_community.document_loaders.parsers import GrobidParser


def parse_arxiv_paper(
    arxiv_url: str,
    grobid_server: str = "http://grobid:8070/api/processFulltextDocument",
) -> list:
    """Download an arXiv PDF and group its GROBID-parsed paragraphs by section.

    The PDF is fetched into a private temporary directory that is removed
    before the function returns, so only the requested paper is parsed and
    no files accumulate between calls.

    Args:
        arxiv_url: URL of the PDF to download. Its last path component is
            used to name the temporary file.
        grobid_server: URL of the GROBID endpoint handed to ``GrobidParser``.

    Returns:
        A list with one dict per section, in order of first appearance:
        ``{"section_title": <title>, "paragraphs": [<text>, ...]}``.

    Raises:
        KeyError: if a parsed document has no ``"section_title"`` metadata.
        Errors raised by ``urlretrieve`` or by the loader/parser propagate
        unchanged.
    """
    # rstrip so a trailing slash does not produce an empty identifier.
    paper_id = arxiv_url.rstrip("/").split("/")[-1]

    # A fresh directory per call: globbing a shared location such as /tmp
    # would also pick up PDFs left behind by earlier downloads and mix their
    # content into this paper's result.
    with tempfile.TemporaryDirectory(prefix="arxiv_") as workdir:
        pdf_path = os.path.join(workdir, f"arxiv_{paper_id}.pdf")
        request.urlretrieve(arxiv_url, pdf_path)

        loader = GenericLoader.from_filesystem(
            workdir,
            glob="arxiv_*",
            suffixes=[".pdf"],
            parser=GrobidParser(
                segment_sentences=False,
                grobid_server=grobid_server,
            ),
        )
        # load() must run inside the `with`: the PDF is deleted on exit.
        docs = loader.load()

    # Group paragraphs under their section title, keeping first-seen order.
    sections = OrderedDict()
    for doc in docs:
        section_title = doc.metadata["section_title"]
        section = sections.setdefault(
            section_title,
            {"section_title": section_title, "paragraphs": []},
        )
        section["paragraphs"].append(doc.page_content)

    return list(sections.values())

In [19]:
# Example run: download and parse one arXiv paper. This needs network access
# and the GROBID server that parse_arxiv_paper is configured to call.
arxiv_url = "https://arxiv.org/pdf/2004.07606"
d = parse_arxiv_paper(arxiv_url)

In [20]:
# Display the parsed result: one entry per section with its paragraphs.
d

[{'section_title': 'I. INTRODUCTION',
  'paragraphs': ['Rapid globalization of supply chains has led to serious problems, particularly with respect to traceability.The Organisation for Economic Co-operation and Development (OECD) reported that counterfeit products in international trade totaled 509 billion USD in 2016, up from 461 billion USD in 2013 [1].Furthermore, due to the complexity of the supply chain, ingredients contaminated with Escherichia coli could not be tracked, resulting in the 2015 E. coli outbreak at Chipotle Mexican Grill [2].',
   'To remedy these problems, supply chain systems based on blockchain have been proposed [3]- [5].These systems store distribution information of products in a public blockchain (PBC).Note that the distribution information in these systems are the records of ownership transfers.Smart contracts prevent the registration of fraudulent distribution information by setting appropriate conditions.Once information has been stored in the blockchain, 