## 企业文档管理的三大痛点

企业内部文档散落在各处:技术文档是PDF,产品手册是Word,会议记录是Markdown,知识沉淀在网页中。

本文展示如何使用代码构建统一文档处理流水线。


In [None]:

import os

# 设置文档目录
DOCS_DIR = "./files/enterprise_docs"
OUTPUT_DIR = "./files/processed_docs"

os.makedirs(OUTPUT_DIR, exist_ok=True)


Caching the list of root modules, please wait!
(This will only be done once - type '%rehashx' to reset cache!)



In [4]:
# 2:实现多格式文档加载器
from typing import List
from langchain_classic.schema import Document
from langchain_community.document_loaders import (
    PyPDFLoader,
    UnstructuredMarkdownLoader,
    WebBaseLoader
)

class MultiFormatDocumentLoader:
    """多格式文档统一加载器"""
    
    def __init__(self, docs_dir: str):
        self.docs_dir = docs_dir
        self.supported_formats = {
            '.pdf': self._load_pdf,
            '.md': self._load_markdown,
            '.html': self._load_web
        }
    
    def _load_pdf(self, file_path: str) -> List[Document]:
        """加载PDF文档"""
        loader = PyPDFLoader(file_path)
        documents = loader.load()
        # 添加元数据
        for doc in documents:
            doc.metadata['source_type'] = 'pdf'
            doc.metadata['file_path'] = file_path
        return documents
    
    def _load_markdown(self, file_path: str) -> List[Document]:
        """加载Markdown文档"""
        loader = UnstructuredMarkdownLoader(file_path)
        documents = loader.load()
        for doc in documents:
            doc.metadata['source_type'] = 'markdown'
            doc.metadata['file_path'] = file_path
        return documents
    
    def _load_web(self, url: str) -> List[Document]:
        """加载网页内容"""
        loader = WebBaseLoader(url)
        documents = loader.load()
        for doc in documents:
            doc.metadata['source_type'] = 'web'
            doc.metadata['url'] = url
        return documents
    
    def load_all(self) -> List[Document]:
        """加载目录下所有支持格式的文档"""
        all_documents = []
        
        for filename in os.listdir(self.docs_dir):
            file_path = os.path.join(self.docs_dir, filename)
            file_ext = os.path.splitext(filename)[1].lower()
            
            if file_ext in self.supported_formats:
                try:
                    loader_func = self.supported_formats[file_ext]
                    documents = loader_func(file_path)
                    all_documents.extend(documents)
                    print(f"✅ 已加载: {filename} ({len(documents)}个文档片段)")
                except Exception as e:
                    print(f"❌ 加载失败 {filename}: {str(e)}")
        
        return all_documents
    
    def load_from_urls(self, urls: List[str]) -> List[Document]:
        """从URL列表加载网页"""
        documents = []
        for url in urls:
            try:
                docs = self._load_web(url)
                documents.extend(docs)
                print(f"✅ 已加载网页: {url}")
            except Exception as e:
                print(f"❌ 加载网页失败 {url}: {str(e)}")
        return documents

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [5]:
# 步骤3: 文本清洗
import re
from typing import List
from langchain_classic.schema import Document

class TextCleaner:
    """文本清洗工具类"""

    @staticmethod
    def remove_extra_whitespace(text: str) -> str:
        """去除多余空白字符"""
        # 多个空格→一个空格
        text = re.sub(r' +', ' ', text)
        # 多个连续换行→两个换行
        text = re.sub(r'\n+', '\n\n', text)
        # 去除每行行首尾空格
        text = '\n'.join(line.strip() for line in text.split('\n'))
        return text.strip()

    @staticmethod
    def remove_special_chars(text: str) -> str:
        """去除特殊字符(保留中文、英文、数字和常用标点)"""
        # ⚠ 方括号需转义 \[ \]
        # ⚠ 单双引号用 \" \' 避免破坏字符串
        pattern = (
            r'[^\u4e00-\u9fa5a-zA-Z0-9\s'
            r'\.\,\!\?\;\:\(\)\[\]\{\}《》\"\'、。]'
        )
        return re.sub(pattern, '', text)

    @staticmethod
    def normalize_chinese_punctuation(text: str) -> str:
        """统一中文标点符号（可按需扩展）"""
        replacements = {
            ',': '、',
            '(': '（',
            ')': '）',
            '[': '【',
            ']': '】'
        }
        for a, b in replacements.items():
            text = text.replace(a, b)
        return text

    @classmethod
    def clean(cls, text: str, normalize_punctuation: bool = True) -> str:
        """执行完整清洗流程"""
        text = cls.remove_extra_whitespace(text)
        text = cls.remove_special_chars(text)
        if normalize_punctuation:
            text = cls.normalize_chinese_punctuation(text)
        return text


def clean_documents(documents: List[Document]) -> List[Document]:
    """批量清洗文档"""
    cleaned_docs = []
    for doc in documents:
        cleaned = TextCleaner.clean(doc.page_content)
        if len(cleaned.strip()) > 50:  # 至少保留50字符
            cleaned_docs.append(Document(
                page_content=cleaned,
                metadata=doc.metadata
            ))
    print(f"✅ 清洗完成: {len(documents)}篇 → {len(cleaned_docs)}篇有效文档")
    return cleaned_docs


In [7]:
# 步骤4: 语义分块
from langchain_classic.text_splitter import RecursiveCharacterTextSplitter
from langchain_classic.schema import Document
from typing import List

class SemanticTextSplitter:
    """语义文本分块器"""
    
    def __init__(
        self,
        chunk_size: int = 1000,
        chunk_overlap: int = 200
    ):
        """
        初始化分块器

        Args:
            chunk_size: 每个文本块的目标大小(字符数)
            chunk_overlap: 相邻块之间的重叠部分
        """
        self.splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
            separators=[
                "\n\n",  # 段落
                "\n",    # 换行
                "。",     # 中文句号
                ".",      # 英文句号
                ";",      # 分号
                ",",      # 逗号
                " ",      # 空格
                ""        # 字符
            ]
        )
    
    def split_documents(self, documents: List[Document]) -> List[Document]:
        """分块文档列表"""
        chunks = self.splitter.split_documents(documents)
        
        # 为每个 chunk 添加块索引和大小
        for i, chunk in enumerate(chunks):
            chunk.metadata['chunk_id'] = i
            chunk.metadata['chunk_size'] = len(chunk.page_content)
        
        print(f"✅ 分块完成: {len(documents)}篇文档 → {len(chunks)}个文本块")
        return chunks
    
    def get_statistics(self, chunks: List[Document]) -> dict:
        """获取分块统计信息"""
        sizes = [len(chunk.page_content) for chunk in chunks]
        return {
            'total_chunks': len(chunks),
            'avg_size': sum(sizes) / len(sizes) if sizes else 0,
            'min_size': min(sizes) if sizes else 0,
            'max_size': max(sizes) if sizes else 0
        }


In [10]:
def process_enterprise_documents(
    docs_dir: str,
    web_urls: List[str] = None,
    chunk_size: int = 1000,
    chunk_overlap: int = 200
) -> List[Document]:
    """
    企业文档处理完整流程
    """
    print("=" * 60)
    print("企业知识库文档处理流程")
    print("=" * 60)
    
    # 1. 加载文档
    print("\n[步骤1] 加载文档...")
    loader = MultiFormatDocumentLoader(docs_dir)
    documents = loader.load_all()
    
    if web_urls:
        web_docs = loader.load_from_urls(web_urls)
        documents.extend(web_docs)
    
    print(f"共加载 {len(documents)} 个文档片段")
    
    # 2. 清洗文档
    print("\n[步骤2] 清洗文档...")
    cleaned_docs = clean_documents(documents)
    
    # 3. 分块文档
    print("\n[步骤3] 分块文档...")
    splitter = SemanticTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap
    )
    chunks = splitter.split_documents(cleaned_docs)
    
    # 4. 统计信息
    stats = splitter.get_statistics(chunks)
    print("\n[统计信息]")
    print(f"  - 总文本块数: {stats['total_chunks']}")
    print(f"  - 平均块大小: {stats['avg_size']:.0f} 字符")
    print(f"  - 最小块大小: {stats['min_size']} 字符")
    print(f"  - 最大块大小: {stats['max_size']} 字符")
    
    return chunks


# ======================================================
#                    示例运行部分
# ======================================================
if __name__ == "__main__":

    web_urls = [
        "https://cn.lipsum.com/",
    ]
    
    chunks = process_enterprise_documents(
        docs_dir="./files/enterprise_docs",
        web_urls=web_urls,
        chunk_size=1000,
        chunk_overlap=200
    )
    
    print("\n[示例文本块]")
    print("来源:", chunks[0].metadata.get("source_type", "unknown"))
    print("内容预览:", chunks[0].page_content[:200], "...")

企业知识库文档处理流程

[步骤1] 加载文档...
✅ 已加载网页: https://cn.lipsum.com/
共加载 1 个文档片段

[步骤2] 清洗文档...
✅ 清洗完成: 1篇 → 1篇有效文档

[步骤3] 分块文档...
✅ 分块完成: 1篇文档 → 9个文本块

[统计信息]
  - 总文本块数: 9
  - 平均块大小: 726 字符
  - 最小块大小: 65 字符
  - 最大块大小: 998 字符

[示例文本块]
来源: web
内容预览: Lorem Ipsum  All the facts  Lipsum generator

 Shqip   Catal 中文简体 Hrvatski esky Dansk Nederlands English Eesti Filipino Suomi Franais  Deutsch    Magyar Indonesia Italiano Latviski Lietuvikai  Melayu  ...
