In [159]:
import os
import openai
from langchain.document_loaders import UnstructuredPDFLoader, OnlinePDFLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import chroma, Pinecone
import pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
from dotenv import load_dotenv, find_dotenv
from langchain.llms import OpenAI
from langchain.chains.question_answering import load_qa_chain
from IPython.display import display, Markdown
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.chains import LLMChain


# Locate the nearest .env file and load its entries into the environment.
dotenv_path = find_dotenv()
_ = load_dotenv(dotenv_path)

# Give the OpenAI client its API key, keep the configured model name as an
# attribute on the openai module, and echo that name as a sanity check.
openai.api_key = os.environ["OPENAI_API_KEY"]
openai.model_name = os.environ["FAST_LLM_MODEL"]
print(openai.model_name)

gpt-3.5-turbo-0613


### 加载PDF并做切片

In [None]:
# PDF to analyse, given relative to the notebook's working directory.
pdf_path = "../data/8B3608368D6AA693562457E20559FE6C.pdf"
loader = PyPDFLoader(pdf_path)

In [None]:
# Read the PDF through the loader; `data` is what the text splitter consumes next.
data = loader.load()

In [None]:
# Break the loaded documents into chunks (chunk_size=2000, no overlap between
# neighbouring chunks); `texts` holds the resulting chunk documents.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=2000,
    chunk_overlap=0,
)
texts = text_splitter.split_documents(data)

In [None]:
# Embedding client with default settings; it is handed to the vector store
# when the chunks are indexed.
embeddings = OpenAIEmbeddings()

### 用 Pinecone 来做向量存储：

In [None]:
# Connect to Pinecone with credentials taken from the environment.
pinecone.init(
    api_key=os.environ['PINECONE_API_KEY'],
    environment=os.environ['PINECONE_ENV'])

# Name of the index that the vector-store cell writes to and queries.
index_name = "pdftest"

# Create exactly the index named above. The cell used to create a different
# index ("pdftest2"), so the index it created was never the one being used.
# The existence check keeps the cell re-runnable: create_index fails when the
# index is already there.
# NOTE(review): dimension=1536 assumes the default OpenAI embedding model - confirm
# if the embedding model is ever changed.
if index_name not in pinecone.list_indexes():
    pinecone.create_index(index_name, dimension=1536, metric="euclidean")

In [None]:
# Embed the chunks and upsert them into the Pinecone index. from_documents
# forwards each chunk's text together with its metadata, whereas unpacking
# page_content into from_texts by hand threw the metadata away.
docsearch = Pinecone.from_documents(texts, embeddings, index_name=index_name)

In [None]:
# The question (Chinese): find every kind of equipment this company produces
# and list them one by one.
query = "请找出这家公司的生产的每一种设备，一一罗列出来"

# Retrieve the chunks most similar to the question and show the best match.
docs = docsearch.similarity_search(query)
top_hit = docs[0]
print(top_hit)

### 本地向量存储：

In [None]:
from langchain.vectorstores import DocArrayInMemorySearch
from langchain.indexes import VectorstoreIndexCreator

In [None]:
# Build a local index straight from the PDF loader, backed by the
# DocArrayInMemorySearch vector store class instead of Pinecone.
index_creator = VectorstoreIndexCreator(vectorstore_cls=DocArrayInMemorySearch)
index = index_creator.from_loaders([loader])

In [None]:
# Prompt for answering a question from prospectus excerpts.
# Placeholders: {docs} is the retrieved text, {query} is the user's question.
# Adjacent string literals are used instead of backslash continuations so that
# source indentation does not leak into the prompt, and the spelling and
# grammar mistakes of the earlier wording are corrected.
template = ChatPromptTemplate.from_template(
    "This is a listing prospectus. Suppose you are a stock investor. "
    "Based on the information below:\n"
    "{docs}\n"
    "find the detailed information needed to answer the question: {query}\n"
    "Do not summarize the information you found; list its details instead. "
    "Finally, output the answer in Chinese."
)
print(template)

In [None]:
# gpt-3.5-turbo-0613 is a chat model, so it is built with the chat wrapper
# (ChatOpenAI) rather than the completion-style OpenAI class.
# temperature=0.0 keeps the output as deterministic as the API allows.
llm = ChatOpenAI(model_name="gpt-3.5-turbo-0613", temperature=0.0)

# "stuff" chain: all supplied documents are placed into a single prompt.
chain = load_qa_chain(llm, chain_type="stuff")

In [None]:
# Run the custom prompt against the retrieved chunks.
# Passing `template=` to the QA chain's run() had no effect: the stuff chain
# only forwards inputs that its own prompt declares, so the custom
# instructions never reached the model. Here the template is driven through
# an LLMChain and its two placeholders ({docs}, {query}) are filled explicitly.
docs_text = "\n\n".join(doc.page_content for doc in docs)
response = LLMChain(llm=llm, prompt=template).run(docs=docs_text, query=query)
display(Markdown(response))