In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyMuPDFLoader
from langchain import ElasticVectorSearch
from langchain.embeddings import OpenAIEmbeddings
from elasticsearch import Elasticsearch
import yaml

# Load service settings (OpenAI key, Elasticsearch URL) from the local YAML
# config file instead of writing them into the notebook.
with open("./config.yml", mode="r") as config_file:
    cfg = yaml.safe_load(config_file)

# Credentials / endpoint consumed by the cells below.
OPENAI_API_KEY = cfg["openai"]["OPENAI_API_KEY"]
ELASTICSEARCH_URL = cfg["es"]["elasticsearch_url"]

# Embedding client that is handed to the vector store when chunks are indexed.
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)


In [69]:
# Source PDF to ingest. NOTE: this is an absolute path on the author's machine;
# point it at your own copy of the file before running.
pdf_path = "/Users/zhou/Dev_Work/ask_pdf/mmc2.pdf"
loader = PyMuPDFLoader(pdf_path)

In [70]:
# Read the PDF into a list of Document objects.
# NOTE(review): presumably one Document per PDF page - confirm against the loader docs.
data = loader.load()

In [71]:
# Sanity-check the load: how many Document objects came back, and how much
# text the first one holds (the second message refers to data[0] only).
document_count = len(data)
print(f"You have {document_count} document(s) in your data")
first_document_chars = len(data[0].page_content)
print(f"There are {first_document_chars} characters in your document")

You have 452 document(s) in your data
There are 3395 characters in your document


In [49]:
# Display one loaded Document (index 20) to eyeball the extracted text.
data[20]

Document(page_content='Residue\nModification\nIndividual 1\nIndividual 2\nQ346\nDeamidation\n2/16\n0/8\nN353\nDeamidation\n0/16\n2/8\nExtended Data Table 4\nCompatibility of ALS-associated TARDBP mutations \nwith the double-spiral fold\nMutation\nCompatible?\nNotes\nG287S\nYes\nSurface exposed\nG290A\nYes\nSurface exposed\nS292N\nYes\nWith hydrogen bonding to G294\nG294A\nYes\nSurface exposed\nG294V\nYes\nSurface exposed\nG295S\nYes\nSurface exposed\nG295R\nYes\nSurface exposed\nG298S\nNo*\nSteric clash\nM311V\nYes\n−\nA315T\nYes\nSurface exposed\nA315E\nYes\nSurface exposed\nA321G\nYes\n−\nQ331K\nNo*\nUncompensated charged group in interior\nS332N\nNo\nSteric clash\nG335D\nNo*\nUncompensated charged group in interior\nM337V\nYes\n−\nQ343R\nNo*\nUncompensated charged group in interior\nN345K\nNo*\nUncompensated charged group in interior\nG348C\nYes\nSurface exposed\nG348V\nYes\nSurface exposed\nG348R\nYes\nSurface exposed\nN352S\nYes\nSurface exposed\nG357R\nYes\nSurface exposed**\nG35

In [72]:
# Split the loaded documents into smaller, overlapping chunks before they are
# embedded (chunk_size=1000 with chunk_overlap=200 between neighbours).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
texts = text_splitter.split_documents(data)

In [51]:
# Report how many chunks the splitter produced, then print the whole list.
# NOTE: print(texts) emits every chunk, so the output can be very long.
chunk_count = len(texts)
print(f"Now you have {chunk_count} documents")
print(texts)

Now you have 86 documents
[Document(page_content="Structure of pathological TDP-43 filaments from ALS with FTLD\nDiana Arseni1, Masato Hasegawa2, Alexey G. Murzin1, Fuyuki Kametani2, Makoto Arai3, \nMari Yoshida4, Benjamin Ryskeldi-Falcon1,*\n1MRC Laboratory of Molecular Biology, Cambridge, UK\n2Department of Brain and Neurosciences, Tokyo Metropolitan Institute of Medical Science, Tokyo, \nJapan\n3Department of Psychiatry and Behavioural Sciences, Tokyo Metropolitan Institute of Medical \nScience, Tokyo, Japan\n4Institute for Medical Science of Aging, Aichi Medical University, Aichi, Japan\nSummary\nThe abnormal aggregation of transactive response DNA-binding protein of 43 kDa (TDP-43) in \nneurons and glia is the defining pathological hallmark of neurodegenerative diseases amyotrophic \nlateral sclerosis (ALS) and multiple forms of frontotemporal lobar degeneration (FTLD) 1,2 . It is \nalso common in other diseases, including Alzheimer's and Parkinson's. No disease-modifying", metada

In [73]:
# Embed every chunk and store it, together with its metadata, in the
# Elasticsearch index named 'spark'. The returned store is what the
# similarity searches below query.
docsearch = ElasticVectorSearch.from_texts(
    texts=[chunk.page_content for chunk in texts],
    embedding=embeddings,
    index_name='spark',
    metadatas=[chunk.metadata for chunk in texts],
    elasticsearch_url=ELASTICSEARCH_URL,
)

In [None]:
# Retrieve the chunks most similar to the question from the vector store.
#
# `docsearch` was created with index_name='spark', and the search runs against
# the index the store was built with. No index/namespace keyword is passed
# here: `include_metadata`, `index_name` and `namespace` are not parameters of
# ElasticVectorSearch.similarity_search, and naming a different index (e.g.
# "huangwei") would only suggest a search target that is never used.
# NOTE(review): confirm against the installed langchain version; to query a
# different index, build a separate ElasticVectorSearch for it instead.
query = "How much fraud are we talking about, really?"
docs = docsearch.similarity_search(query)
docs

In [57]:
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI
from langchain.chains.question_answering import load_qa_chain

In [84]:
# Chat model that will answer questions over the retrieved chunks.
llm = ChatOpenAI(
    temperature=0.7,
    openai_api_key=OPENAI_API_KEY,
)

# Question-answering chain using the "stuff" chain type.
# TODO(review): it is an open question whether `return_source_documents=True`
# can be passed here - verify against the load_qa_chain documentation first.
chain = load_qa_chain(llm, chain_type="stuff")

In [None]:
# Retrieve the chunks most similar to the question; these become the context
# passed to the QA chain.
#
# `docsearch` was created with index_name='spark' and is queried as-is.
# `include_metadata` and `namespace` are not parameters of
# ElasticVectorSearch.similarity_search, so they are not passed.
# NOTE(review): confirm against the installed langchain version.
query = "gpt4如何与教育应用场景结合？"
docs = docsearch.similarity_search(query)
docs

In [87]:
# Ask the QA chain the question, supplying the retrieved chunks as its input
# documents; the cell displays the value returned by chain.run.
chain.run(input_documents=docs, question=query)

'GPT-4及其后继版本可以在教育领域中提供巨大的价值，它们可以引入新的效率和能力。例如，GPT-4可以被用于自然语言处理方面，可以使得机器自动回复问题，从而解放教师的时间，让他们有更多时间关注学生的学习。此外，GPT-4也可以被用于学生作文的自动评分，或者帮助学生对文本进行自动摘要等等。当然，还需要考虑到这些应用可能存在的问题，比如数据隐私和教育公平性等方面的问题。总之，GPT-4及其后继版本在教育应用场景中具有广阔的应用前景。'