In [None]:
!pip install pinecone-client qianfan tiktoken langchain pymupdf

### pinecone初始化

初始化pinecone，并创建pinecone索引（index）

In [29]:
import getpass

# Ask for the Pinecone credentials interactively instead of hard-coding them.
# The API key is read through getpass; the environment name is read with a
# plain input() prompt.
PINECONE_API_KEY = getpass.getpass(prompt="input your pinecone api key:")
PINECONE_ENV = input("input your env name")

In [17]:
import pinecone

# Authenticate the Pinecone client with the credentials collected earlier.
pinecone.init(
    api_key=PINECONE_API_KEY,
    environment=PINECONE_ENV
)

index_name = "qianfan-vdb"

# Vector size of the index. It must equal the length of the vectors produced by
# the embedding model that fills the index. This notebook embeds with Qianfan
# "Embedding-V1" (384 dimensions), not OpenAI text-embedding-ada-002 (1536).
EMBEDDING_DIM = 384

# Create the index only when it is missing, so re-running this cell is harmless.
if index_name not in pinecone.list_indexes():
    pinecone.create_index(
        name=index_name,
        # NOTE(review): the retriever built later filters on a relevance-score
        # threshold; confirm that scores from a euclidean index are on the scale
        # LangChain's Pinecone wrapper expects (it may assume cosine).
        metric='euclidean',
        dimension=EMBEDDING_DIM,
    )

使用千帆SDK前需要进行初始化鉴权

In [6]:
import os

# Export the Qianfan application credentials as environment variables.
# Both values are read through getpass prompts rather than written into the notebook.
os.environ["QIANFAN_AK"] = getpass.getpass(prompt="input your qianfan app ak:")
os.environ["QIANFAN_SK"] = getpass.getpass(prompt="input your qianfan app sk:")

# NOTE(review): for IAM-based authentication, os.environ["QIANFAN_ACCESS_KEY"]
# is presumably set instead of the AK/SK pair above -- confirm against the
# Qianfan SDK documentation.


嵌入（Embedding）准备：将文本（text）转换为高维向量表示，这里的 Embeddings 对象基于千帆（qianfan）实现。

In [30]:
from langchain.embeddings import QianfanEmbeddingsEndpoint

# Embedding client backed by the Qianfan "Embedding-V1" model; it is handed to
# the vector store to turn text into vectors.
embeddings = QianfanEmbeddingsEndpoint(
    model="Embedding-V1",
)


使用 PyMuPDFLoader 加载 PDF 文件并切分为文本块，然后用这些文本块初始化 vectorstore。

In [31]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Pinecone
from langchain.document_loaders import PyMuPDFLoader

# Load the example PDF into LangChain documents.
loader = PyMuPDFLoader("./example_data/ai-paper.pdf")
documents = loader.load()

# Split the documents into chunks of at most 384 characters with no overlap.
# The splitter tries the separators in list order, and the empty string always
# matches, so "" has to be the last entry. With "" listed before the Chinese
# punctuation, "。" and "，" could never be selected as split points.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=384,
    chunk_overlap=0,
    separators=["\n\n", "\n", "。", "，", " ", ""],
)
# The name `docs_spilts` is kept as-is because other cells may refer to it.
docs_spilts = text_splitter.split_documents(documents)
print(f"{len(docs_spilts)} documents block loaded")

# Embed every chunk and write it into the Pinecone index created earlier.
pinecone_vdb = Pinecone.from_documents(docs_spilts, embeddings, index_name=index_name)

33 documents block loaded


In [None]:
# Create a retriever on top of the Pinecone vector store. It is configured for
# "similarity_score_threshold" search with a score threshold of 0.5 and k=3.
retriever = pinecone_vdb.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={"score_threshold": 0.5, "k": 3},
)

# Sample lookup to check the retriever.
matched_docs = retriever.get_relevant_documents("DeepMind")



In [33]:
# Create the chat LLM object used to generate answers.
from langchain.chat_models import QianfanChatEndpoint

llm = QianfanChatEndpoint(
    model="ERNIE-Bot-4",
)

In [27]:
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

# Prompt for the answering step. {question} receives the user query and
# {context} receives the text the model is asked to read.
CUSTOM_PROMPT = """
现在你是一个阅读理解机器人，你会阅读并深度理解我给你的文本内容并据此回答我所提出的问题。注意，我给出的问题是：{question} 你需要阅读理解的文本是：{context}
"""


# Combine the LLM and the retriever into a question-answering chain that uses
# the custom prompt and also returns the source documents.
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=retriever,
    chain_type_kwargs={
        "prompt": PromptTemplate.from_template(CUSTOM_PROMPT),
    },
    return_source_documents=True,
)

In [None]:

# Ask the chain a question; the bare call on the last line makes the notebook
# display the returned result.
query = "ERNIE1.0是什么时候发布的？"
qa_chain({"query": query})