In [1]:
import os
from dotenv import load_dotenv, find_dotenv
from datetime import datetime


# SECURITY: the ZhipuAI API key must not be committed to source control.
# It is read from the ZHIPUAI_API_KEY environment variable instead; export it
# (or populate the environment from a .env file, e.g. with the
# load_dotenv/find_dotenv helpers imported above) before running this script.
# NOTE(review): the key that used to be hard-coded here should be treated as
# leaked and rotated.
ZHIPUAI_API_KEY = os.environ.get("ZHIPUAI_API_KEY", "")
if not ZHIPUAI_API_KEY:
    import warnings

    # Surface the misconfiguration early instead of leaving only a later
    # failure inside the embedding client to explain it.
    warnings.warn("ZHIPUAI_API_KEY is not set in the environment.")

# Recursively walk folder_path and record the path of every file found
# in file_paths.
folder_path = '../data_base/psychology_book'
file_paths = [
    os.path.join(dirpath, filename)
    for dirpath, _subdirs, filenames in os.walk(folder_path)
    for filename in filenames
]
# print(file_paths[:3])

from langchain.document_loaders.pdf import PyMuPDFLoader
from langchain.document_loaders.markdown import UnstructuredMarkdownLoader

# Instantiate a loader for every supported file and collect them in `loaders`.
# The extension is taken with os.path.splitext and compared case-insensitively,
# so a file such as "book.PDF" is picked up as well; files of any other type
# are skipped.
loader_by_extension = {
    '.pdf': PyMuPDFLoader,
    '.md': UnstructuredMarkdownLoader,
}
loaders = []

for file_path in file_paths:
    file_type = os.path.splitext(file_path)[1].lower()
    loader_cls = loader_by_extension.get(file_type)
    if loader_cls is not None:
        loaders.append(loader_cls(file_path))

# Run every loader and gather the documents it returns in `texts`.
texts = []

for loader in loaders:
    texts.extend(loader.load())

# Preview the second loaded document. The index is guarded so that a corpus
# yielding fewer than two documents does not abort the script with an
# IndexError before the vector store is built.
if len(texts) > 1:
    text = texts[1]
    print(f"每一个元素的类型：{type(text)}.",
        f"该文档的描述性数据：{text.metadata}",
        sep="\n------\n")

from langchain.text_splitter import RecursiveCharacterTextSplitter

# Split the loaded documents into chunks (chunk_size=500, chunk_overlap=50).
splitter_options = {"chunk_size": 500, "chunk_overlap": 50}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

split_docs = text_splitter.split_documents(texts)

from zhipuai_embedding import ZhipuAIEmbeddings

# Directory passed to Chroma as its persistence location.
persist_directory = '../data_base/knowledge_db'
# Embedding client handed to the vector store, configured with the ZhipuAI key.
embedding = ZhipuAIEmbeddings(zhipuai_api_key=ZHIPUAI_API_KEY)

from langchain.vectorstores.chroma import Chroma

# Create the vector store on the first run; on later runs open the existing
# store and only embed files that are not recorded in it yet.
if os.path.exists(persist_directory):
    # Open the existing database.
    vectordb = Chroma(
        persist_directory=persist_directory,
        embedding_function=embedding
    )

    # Collect the files that were already ingested, identified by the
    # 'source' entry of each stored chunk's metadata. Only metadata is
    # requested so the text of every stored chunk is not loaded into memory.
    collection_data = vectordb.get(include=['metadatas'])
    existing_files = {
        metadata['source']
        for metadata in (collection_data['metadatas'] or [])
        # Skip missing/empty metadata entries instead of failing on them.
        if metadata and 'source' in metadata
    }

    # Files on disk that the store has not seen yet.
    new_files = [p for p in file_paths if p not in existing_files]

    # Create a loader per new file. The extension is compared
    # case-insensitively (so "book.PDF" is handled too); unsupported file
    # types are skipped.
    loader_by_extension = {
        '.pdf': PyMuPDFLoader,
        '.md': UnstructuredMarkdownLoader,
    }
    new_loaders = []
    for file_path in new_files:
        file_type = os.path.splitext(file_path)[1].lower()
        loader_cls = loader_by_extension.get(file_type)
        if loader_cls is not None:
            new_loaders.append(loader_cls(file_path))

    # Load and chunk the new documents.
    new_texts = []
    for loader in new_loaders:
        new_texts.extend(loader.load())
    new_split_docs = text_splitter.split_documents(new_texts)

    # Append the new chunks to the database; nothing to do when there are none.
    if new_split_docs:
        vectordb.add_documents(new_split_docs)
        vectordb.persist()
else:
    # First run: build the database from all chunks.
    vectordb = Chroma.from_documents(
        documents=split_docs,
        embedding=embedding,
        persist_directory=persist_directory
    )
    vectordb.persist()



# Report how many entries the underlying Chroma collection holds.
stored_count = vectordb._collection.count()
print(f"向量库中存储的数量：{stored_count}")


每一个元素的类型：<class 'langchain_core.documents.base.Document'>.
------
该文档的描述性数据：{'source': '../data_base/psychology_book/日常生活的心理分析.pdf', 'file_path': '../data_base/psychology_book/日常生活的心理分析.pdf', 'page': 1, 'total_pages': 174, 'format': 'PDF 1.5', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'creator': '', 'producer': 'Pdftools SDK', 'creationDate': '', 'modDate': 'D:20250318170037Z', 'trapped': ''}
向量库中存储的数量：14336


  vectordb.persist()
