In [2]:
import fitz  # PyMuPDF
import re

def pdf_to_text(pdf_path):
    """Extract the text of every page of a PDF and return it as one string.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``fitz.open``.

    Returns:
        The results of ``page.get_text()`` for each page, concatenated in
        page order with no separator added between pages.
    """
    # The context manager closes the document once all pages have been read.
    with fitz.open(pdf_path) as document:
        page_texts = [
            document[index].get_text()
            for index in range(document.page_count)
        ]

    return "".join(page_texts)

def clean_text(text):
    """Remove page markers and non-ASCII characters from extracted text.

    Whitespace runs inside the text are left as-is; only leading and
    trailing whitespace is stripped from the final result.

    Args:
        text: Raw text to clean.

    Returns:
        The text with every ``Page <digits>`` marker and form-feed character
        removed, each run of non-ASCII characters replaced by a single space,
        and surrounding whitespace stripped.
    """
    # Drop "Page N" markers and form feeds.
    without_page_marks = re.sub(r'Page \d+|\f', '', text)
    # Collapse each run of non-ASCII characters into one space.
    ascii_only = re.sub(r'[^\x00-\x7F]+', ' ', without_page_marks)
    return ascii_only.strip()

def split_text_by_sentences(text, max_len=512):
    """Split text into sentence-aligned chunks of at most ``max_len`` characters.

    The text is split at a whitespace character that follows ``.``, ``?`` or
    ``!``, except where the lookbehinds detect an abbreviation-like pattern
    (e.g. ``e.g.`` or a capitalised two-letter form such as ``Mr.``).
    Consecutive sentences are then packed greedily, joined by a single space.

    Note that the limit is measured in characters (``len``), not in model
    tokens. A single sentence longer than ``max_len`` is never broken up; it
    is emitted as its own, over-long chunk.

    Args:
        text: The text to split.
        max_len: Maximum chunk length in characters.

    Returns:
        A list of non-empty chunk strings. Empty or whitespace-only input
        yields an empty list.
    """
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s', text)
    chunks = []
    current_chunk = ""

    for sentence in sentences:
        # Skip empty fragments (e.g. from trailing whitespace after the last
        # sentence) so they can never produce an empty chunk.
        if not sentence.strip():
            continue

        # Measure the chunk as it would actually look after adding the
        # sentence, so the limit applies uniformly to every chunk.
        candidate = f"{current_chunk} {sentence}" if current_chunk else sentence
        if len(candidate) <= max_len:
            current_chunk = candidate
        else:
            # Only flush when something has accumulated; otherwise an
            # over-long first sentence would emit an empty chunk.
            if current_chunk:
                chunks.append(current_chunk.strip())
            current_chunk = sentence

    if current_chunk:
        chunks.append(current_chunk.strip())

    return chunks

def pdf_to_clean_text(pdf_path, output_txt_path, max_len=512):
    """Extract text from a PDF, clean it, save it, and return it in chunks.

    The full cleaned text is written to ``output_txt_path`` (UTF-8,
    overwriting any existing file) and a confirmation message is printed.
    The cleaned text is also split with ``split_text_by_sentences`` and the
    resulting chunks are returned, so the split is no longer computed and
    thrown away.

    Args:
        pdf_path: Path of the PDF file to read.
        output_txt_path: Path of the text file to write. ``open`` is called
            in ``'w'`` mode, so the parent directory must already exist.
        max_len: Maximum chunk length forwarded to
            ``split_text_by_sentences``.

    Returns:
        The list of chunks produced by ``split_text_by_sentences``.
    """
    # PDF -> raw text
    raw_text = pdf_to_text(pdf_path)
    # Raw text -> cleaned text
    cleaned_text = clean_text(raw_text)
    # Keep the chunks instead of discarding them.
    chunks = split_text_by_sentences(cleaned_text, max_len=max_len)
    # Persist the cleaned (unsplit) text.
    with open(output_txt_path, 'w', encoding='utf-8') as f:
        f.write(cleaned_text)
    print(f"Cleaned text saved to {output_txt_path}")
    return chunks




In [3]:
# Input PDF and destination text file (relative paths).
# NOTE(review): the ../output directory presumably has to exist before this cell runs — confirm.
pdf_path = '../data/AML_Sustainability_Report_FY2023.pdf'
output_txt_path = '../output/1030_split.txt'
# Run the extract -> clean -> write pipeline on the report.
pdf_to_clean_text(pdf_path, output_txt_path)

Cleaned text saved to ../output/1030_split.txt
