In [1]:
import os
import fitz  
import pytesseract
from PIL import Image
import io
import spacy
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Point pytesseract at the Tesseract executable.
# The TESSERACT_CMD environment variable overrides the default install location below,
# so the notebook is not tied to one machine's drive layout. The path is only applied
# when the file exists; otherwise pytesseract keeps its own default command.
tesseract_cmd = os.environ.get("TESSERACT_CMD", r"D:\Program Files (x86)\tesseract.exe")
if os.path.isfile(tesseract_cmd):
    pytesseract.pytesseract.tesseract_cmd = tesseract_cmd




In [2]:
# Load the NLP models: `nlp` is the spaCy pipeline used for sentence splitting,
# `bert_model` produces the sentence embeddings used for paragraph segmentation.
nlp = spacy.load("en_core_web_sm")
bert_model = SentenceTransformer("all-MiniLM-L6-v2")  # pretrained Transformer

In [3]:
# Directory that is scanned for input PDF files
pdf_folder = "Raw_file_folder"
output_folder = "Chunk_file_folder"  # where the processed text and table files are written

In [4]:
def extract_text_from_pdf(pdf_path):
    """Extract the plain-text layer of a PDF with PyMuPDF (fitz).

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        str: The text of every page joined with newlines. Pages whose
        extracted text is empty are skipped.
    """
    # The context manager closes the document (and its file handle) even if
    # a page fails to parse; the original left it open for every PDF processed.
    with fitz.open(pdf_path) as doc:
        page_texts = [page.get_text("text") for page in doc]
    return "\n".join(filter(None, page_texts))


def extract_tables_from_pdf(pdf_path):
    """Collect every table pdfplumber finds in the PDF.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        list: One entry per table, in page order, exactly as returned by
        ``page.extract_tables()`` for each page.
    """
    import pdfplumber

    with pdfplumber.open(pdf_path) as pdf:
        return [
            table
            for page in pdf.pages
            for table in page.extract_tables()
        ]


def extract_text_from_images(pdf_path):
    """Run OCR on every image embedded in a PDF.

    Images are pulled out with PyMuPDF (fitz), decoded with Pillow and passed
    to Tesseract through pytesseract.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        str: The stripped OCR text of each image, joined with newlines.
        Images for which OCR yields no text are skipped, so the result does
        not contain runs of blank lines.
    """
    extracted_texts = []

    # Close the document when done instead of leaking the file handle.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            for img in page.get_images(full=True):
                xref = img[0]  # first item of the image entry is its xref
                image_bytes = doc.extract_image(xref)["image"]
                # Release the decoded image as soon as OCR is finished.
                with Image.open(io.BytesIO(image_bytes)) as image:
                    text = pytesseract.image_to_string(image).strip()
                if text:
                    extracted_texts.append(text)

    return "\n".join(extracted_texts)


def split_into_sentences(text):
    """Split text into sentences with the spaCy pipeline.

    Args:
        text: The text to split.

    Returns:
        list[str]: The text of each detected sentence with surrounding
        whitespace stripped.
    """
    stripped_sentences = []
    for span in nlp(text).sents:
        stripped_sentences.append(span.text.strip())
    return stripped_sentences


def segment_paragraphs(text, similarity_threshold=0.6, min_sentences=5):
    """Group consecutive sentences into paragraphs by semantic similarity.

    The text is split into sentences, each sentence is embedded with
    ``bert_model``, and the cosine similarity of every adjacent sentence pair
    decides where a paragraph boundary falls.

    Args:
        text: The text to segment.
        similarity_threshold: A new paragraph starts when the cosine
            similarity between a sentence and the previous one is below this
            value.
        min_sentences: Despite its name this is an upper bound: a paragraph
            is closed as soon as it already holds this many sentences. The
            name is kept for backward compatibility with existing callers.

    Returns:
        list[str]: The paragraphs, each being its sentences joined by a
        single space. Empty when the text contains no non-empty sentence.
    """
    # Nothing to segment; also avoids running the models on blank input.
    if not text or not text.strip():
        return []

    # Drop whitespace-only sentences so they neither become empty chunks nor
    # count towards the per-paragraph sentence limit.
    sentences = [s for s in split_into_sentences(text) if s]
    if not sentences:
        return []

    embeddings = np.asarray(bert_model.encode(sentences, convert_to_numpy=True))

    # Cosine similarity of each adjacent pair, computed in one vectorised pass
    # instead of one library call per pair. Zero vectors keep a norm of 1 so
    # the division is safe and their similarity comes out as 0.
    norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
    norms[norms == 0] = 1.0
    unit = embeddings / norms
    similarities = np.einsum("ij,ij->i", unit[:-1], unit[1:])

    paragraphs = []
    current_paragraph = [sentences[0]]

    # similarities[k] compares sentences[k] with sentences[k + 1].
    for sentence, similarity in zip(sentences[1:], similarities):
        if similarity < similarity_threshold or len(current_paragraph) >= min_sentences:
            paragraphs.append(" ".join(current_paragraph))
            current_paragraph = [sentence]
        else:
            current_paragraph.append(sentence)

    paragraphs.append(" ".join(current_paragraph))
    return paragraphs


In [5]:
# Process every PDF file in the Raw_file_folder directory.
# The extension is matched case-insensitively (so "X.PDF" is not skipped) and
# the list is sorted because os.listdir returns entries in arbitrary order.
pdf_files = sorted(f for f in os.listdir(pdf_folder) if f.lower().endswith(".pdf"))

# Create the output directory up front; open(..., "w") does not create it.
os.makedirs(output_folder, exist_ok=True)

for pdf_file in pdf_files:
    pdf_path = os.path.join(pdf_folder, pdf_file)
    print(f"processing：{pdf_file}")

    # Extract the text layer, the OCR text of embedded images, and the tables
    pdf_text = extract_text_from_pdf(pdf_path)
    ocr_text = extract_text_from_images(pdf_path)
    tables = extract_tables_from_pdf(pdf_path)

    # Merge both text sources and split them into semantic paragraphs
    full_text = pdf_text + "\n" + ocr_text
    segmented_paragraphs = segment_paragraphs(full_text, similarity_threshold=0.6, min_sentences=5)

    # Build the output paths. splitext removes only the trailing extension,
    # whereas str.replace would rewrite every ".pdf" occurring in the name.
    base_name = os.path.splitext(pdf_file)[0]
    output_text_path = os.path.join(output_folder, base_name + "_text.txt")
    output_table_path = os.path.join(output_folder, base_name + "_tables.txt")

    # Write the text chunks
    with open(output_text_path, "w", encoding="utf-8") as f:
        for i, para in enumerate(segmented_paragraphs, 1):
            f.write(f"chunk {i} (Number of characters: {len(para)}):\n{para}\n\n")
            f.write("-" * 50 + "\n\n")

    # Write the tables, one tab-separated row per line
    with open(output_table_path, "w", encoding="utf-8") as f:
        for table in tables:
            for row in table:
                # Empty cells arrive as None; write them as empty fields
                # rather than the literal text "None".
                f.write("\t".join("" if cell is None else str(cell) for cell in row) + "\n")
            f.write("\n" + "=" * 50 + "\n\n")

    print(f"Processing done! 文本存储在 {output_text_path}，表格存储在 {output_table_path}\n")

processing：Complex PTSD_ From Surviving to Thriving.pdf
Processing done! 文本存储在 Chunk_file_folder\Complex PTSD_ From Surviving to Thriving_text.txt，表格存储在 Chunk_file_folder\Complex PTSD_ From Surviving to Thriving_tables.txt

processing：GPMHSC-Suicide-prevention-and-first-aid-resource-for-GPs.pdf
Processing done! 文本存储在 Chunk_file_folder\GPMHSC-Suicide-prevention-and-first-aid-resource-for-GPs_text.txt，表格存储在 Chunk_file_folder\GPMHSC-Suicide-prevention-and-first-aid-resource-for-GPs_tables.txt

processing：therapists_guide_to_brief_cbtmanual.pdf
Processing done! 文本存储在 Chunk_file_folder\therapists_guide_to_brief_cbtmanual_text.txt，表格存储在 Chunk_file_folder\therapists_guide_to_brief_cbtmanual_tables.txt

