In [1]:
import re

from langchain_community.document_loaders import PyPDFLoader


def remove_images(content):
    pattern = r'(?s)\n?!".+Fig.+\n?'
    return re.sub(pattern, "\n", content)


def process_text(text):
    # 이미지 제거
    text = remove_images(text)
    return text


arxiv_url = "https://arxiv.org/pdf/2004.07606"
docs = PyPDFLoader(arxiv_url).load()
doc = docs[0]
content = "".join([process_text(doc.page_content) for doc in docs])

In [2]:
sentences = content.split("\n")
clean_content = sentences[0]
for idx in range(1, len(sentences)):
    last_char = sentences[idx - 1][-1]
    first_char = sentences[idx][0]

    if (last_char == "." or last_char.isupper()) and (
        first_char.isupper() or first_char.isnumeric()
    ):
        clean_content += "\n" + sentences[idx]

print(clean_content)

arXiv:2004.07606v1  [cs.CR]  16 Apr 2020Short Paper: Design and Evaluation of
I. I NTRODUCTION
Rapid globalization of supply chains has led to serious
Furthermore, due to the complexity of the supply chain,
To remedy these problems, supply chain systems based on
However, there are privacy issues with these systems. Dis-
As a result, the privacy of distribution information is not
In this paper, we propose a method for preserving privacy
To evaluate the proposed method, we implement it on the
The rest of this paper is structured as follows. Section II
II. R ELATED WORK
Several blockchain-based systems have been proposed for
POMS [3] is a system for managing product ownership using
III. P ROPOSED METHOD
We propose a method for preserving the privacy of dis-
The proposed method consists of ManufacturerManager-
A. Preparation for distribution
Other information such as the name and phone number of the
These registration processes can be executed only by a
B. Products registration
PMC records

In [3]:
def split_text_to_sections(text):
    # 섹션을 구분하는 정규표현식 패턴
    section_pattern = r"(?:^|\n)(Abstract|[IVX\d]+\.\s+.+\n|REFERENCES)"

    # 텍스트를 섹션으로 분할
    sections = re.split(section_pattern, text)

    # 첫 번째 요소를 제거
    sections = sections[1:]

    # 결과를 저장할 리스트
    result = []

    # 섹션 제목과 내용을 쌍으로 처리
    for i in range(0, len(sections), 2):
        title = sections[i].strip()
        content = sections[i + 1].strip() if i + 1 < len(sections) else ""
        result.append(
            {"title": title, "content": content, "paragraphs": content.split("\n")}
        )

    # Reference 제거
    result.pop()

    return result

In [4]:
sections = split_text_to_sections(clean_content)

In [5]:
sections

[{'title': 'I. I NTRODUCTION',
  'content': 'Rapid globalization of supply chains has led to serious\nFurthermore, due to the complexity of the supply chain,\nTo remedy these problems, supply chain systems based on\nHowever, there are privacy issues with these systems. Dis-\nAs a result, the privacy of distribution information is not\nIn this paper, we propose a method for preserving privacy\nTo evaluate the proposed method, we implement it on the\nThe rest of this paper is structured as follows. Section II',
  'paragraphs': ['Rapid globalization of supply chains has led to serious',
   'Furthermore, due to the complexity of the supply chain,',
   'To remedy these problems, supply chain systems based on',
   'However, there are privacy issues with these systems. Dis-',
   'As a result, the privacy of distribution information is not',
   'In this paper, we propose a method for preserving privacy',
   'To evaluate the proposed method, we implement it on the',
   'The rest of this paper i

In [None]:
for section in sections:
    print(section["title"])

In [None]:
print(section["content"])

In [None]:
for paragraph in section["paragraphs"]:
    print(paragraph)
    print()