In [1]:
import sys
# Add the parent directory to the module search path; presumably this is what
# makes the `QA_Project` package importable in the import cell further down.
sys.path.append("../")
import logging
# Configure the root logger to emit only records at ERROR level or above.
# NOTE(review): the recorded output of the vector-store cell still shows INFO
# records from the `qianfan` logger, so that library apparently manages its own
# log level — confirm if those messages should be silenced as well.
logging.basicConfig(level=logging.ERROR)

In [2]:
# 首先实现基本配置
from langchain.vectorstores import Chroma
from langchain.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import UnstructuredMarkdownLoader
from langchain.document_loaders import UnstructuredFileLoader

from langchain_community.embeddings import QianfanEmbeddingsEndpoint
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from QA_Project.project.embedding.zhipuai_embedding import ZhipuAIEmbeddings

from langchain_community.llms import QianfanLLMEndpoint
from langchain.llms import HuggingFacePipeline

# Before running, put your own API keys into environment variables (e.g. in ../QA_Project/.env)
import os
import sys

from dotenv import load_dotenv, find_dotenv

# Load the project's .env file first so that everything defined there (API keys
# and, optionally, proxy settings) is visible to the statements below.
# load_dotenv does not override variables that are already set in the process
# environment, so values exported in the shell still take precedence.
# Alternative that searches parent directories for a .env file:
# _ = load_dotenv(find_dotenv())
_ = load_dotenv('../QA_Project/.env')

# Default outbound proxy for this machine. `setdefault` is used so that a proxy
# already configured in the shell or in .env is respected instead of being
# silently overwritten by this hard-coded LAN address.
os.environ.setdefault('HTTPS_PROXY', 'http://192.168.8.94:10809')
os.environ.setdefault('HTTP_PROXY', 'http://192.168.8.94:10809')

# Read the API credentials from the environment.
# A missing variable raises KeyError here, before any remote call is attempted.
wenxin_api_key = os.environ["wenxin_api_key"]
wenxin_secret_key = os.environ["wenxin_secret_key"]
zhipuai_api_key = os.environ["ZHIPUAI_API_KEY"]

In [3]:
# PDF source: the pumpkin_book PDF (machine learning).
pdf_path = "../QA_Project/data_base/knowledge_db/pumpkin_book/pumpkin_book.pdf"
loaders = [PyMuPDFLoader(pdf_path)]

# Start the shared `docs` list from scratch with everything the PDF loaders return.
docs = [item for loader in loaders for item in loader.load()]

# Drop the items at indices 2-12, which hold the table of contents.
docs[2:13] = []

In [4]:
# Markdown source: load every Markdown file in the prompt-engineering folder.
# NOTE: this cell appends to the existing `docs` list, so re-running it without
# re-running the PDF cell (which resets `docs`) adds the same documents again.
folder_path = "../QA_Project/data_base/knowledge_db/prompt_engineering/"

# os.listdir returns entries in arbitrary order and also lists sub-directories
# and unrelated files (e.g. .ipynb_checkpoints, .DS_Store). Keep only regular
# Markdown files and sort them so the document order is reproducible.
files = sorted(
    entry
    for entry in os.listdir(folder_path)
    if entry.lower().endswith((".md", ".markdown"))
    and os.path.isfile(os.path.join(folder_path, entry))
)

# One loader per Markdown file.
loaders = [
    UnstructuredMarkdownLoader(os.path.join(folder_path, one_file))
    for one_file in files
]
for loader in loaders:
    docs.extend(loader.load())


In [5]:
# Text source (presumably a transcript produced from an mp4 — confirm):
# the reinforcement-learning introductory guide, per the file name.
txt_path = "../QA_Project/data_base/knowledge_db/easy__rl/强化学习入门指南.txt"
loaders = [UnstructuredFileLoader(txt_path)]

# Append whatever each loader returns to the shared `docs` list.
for loader in loaders:
    docs += loader.load()


In [6]:
# Embedding model: Qianfan embeddings endpoint, authenticated with the
# Wenxin API key / secret key pair.
qianfan_embedding = QianfanEmbeddingsEndpoint(
    qianfan_ak=wenxin_api_key,
    qianfan_sk=wenxin_secret_key,
)

In [7]:
# Split the loaded documents into chunks of 250 with an overlap of 100.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=250,
    chunk_overlap=100,
)
split_docs = text_splitter.split_documents(docs)

# Directory in which the Chroma store is persisted; create it if it is missing.
persist_directory = '../QA_Project/data_base/vector_db/chroma'
os.makedirs(persist_directory, exist_ok=True)

# Build the vector store: embed every chunk with the Qianfan embedding model
# and keep the result under `persist_directory` so it can be saved to disk.
# NOTE(review): if the directory already holds a collection, this presumably
# adds to it rather than replacing it — confirm before re-running.
chroma_options = {
    "documents": split_docs,
    "embedding": qianfan_embedding,
    "persist_directory": persist_directory,
}
vectordb = Chroma.from_documents(**chroma_options)


[INFO] [03-05 18:10:20] openapi_requestor.py:316 [t:140040502433600]: requesting llm api endpoint: /embeddings/embedding-v1
INFO:qianfan:requesting llm api endpoint: /embeddings/embedding-v1
[INFO] [03-05 18:10:20] oauth.py:207 [t:140040502433600]: trying to refresh access_token for ak `NSwW3Z***`
INFO:qianfan:trying to refresh access_token for ak `NSwW3Z***`
[INFO] [03-05 18:10:21] oauth.py:220 [t:140040502433600]: sucessfully refresh access_token
INFO:qianfan:sucessfully refresh access_token
[INFO] [03-05 18:10:21] openapi_requestor.py:316 [t:140040502433600]: requesting llm api endpoint: /embeddings/embedding-v1
INFO:qianfan:requesting llm api endpoint: /embeddings/embedding-v1
[INFO] [03-05 18:10:22] openapi_requestor.py:316 [t:140040502433600]: requesting llm api endpoint: /embeddings/embedding-v1
INFO:qianfan:requesting llm api endpoint: /embeddings/embedding-v1
[INFO] [03-05 18:10:23] openapi_requestor.py:316 [t:140040502433600]: requesting llm api endpoint: /embeddings/embeddin

In [8]:
# Explicitly ask the vector store to persist itself (the store was created with
# a `persist_directory`).
# NOTE(review): newer Chroma releases reportedly persist automatically and
# deprecate this call — confirm against the installed version.
vectordb.persist()

In [9]:
# Report how many entries the underlying collection holds.
# `_collection` is a private attribute of the store object.
stored_count = vectordb._collection.count()
print(f"向量库中存储的数量：{stored_count}")

向量库中存储的数量：2196
