In [None]:
import openai
import os
import langchain

In [None]:
# The embedding engine that will convert our text to vectors
from langchain.embeddings.openai import OpenAIEmbeddings


#llm = OpenAI(temperature=0, openai_api_key=openai_api_key)

In [None]:
# The vectorstore we'll be using
from langchain.vectorstores import FAISS

# The LangChain component we'll use to get the documents
from langchain.chains import RetrievalQA
from langchain.text_splitter import RecursiveCharacterTextSplitter

from langchain.llms import OpenAI

In [None]:
from langchain.document_loaders import UnstructuredPDFLoader, OnlinePDFLoader

# Read the OpenAI key from the environment rather than pasting it into the
# notebook, so the secret never ends up in a saved or shared copy. When the
# variable is unset this falls back to an empty string, which is the
# placeholder value this cell used before.
openai_api_key = os.environ.get("OPENAI_API_KEY", "")

# Fetch the sample PDF from its URL and load it. `doc` is indexable and has a
# length (a single entry for this file).
pdf_loader = OnlinePDFLoader("https://example-files.online-convert.com/document/pdf/example_complex.pdf")
doc = pdf_loader.load()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [None]:
# Sanity check on what was loaded: number of entries in `doc`, and the length
# of the first entry's text.
print(
    f"You have {len(doc)} document",
    f"You have {len(doc[0].page_content)} characters in that document",
    sep="\n",
)

You have 1 document
You have 3833 characters in that document


Now let's split our document into small pieces

In [None]:
# Configure the splitter: chunks of up to 1000 characters, with a
# 200-character overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)

# Break the loaded document into the smaller pieces used from here on.
docs = text_splitter.split_documents(doc)

In [None]:
# Add up the text length of every chunk; the print below divides this by the
# chunk count to report the average chunk size.
num_total_characters = sum(len(x.page_content) for x in docs)
print(f"Now you have {len(docs)} documents that have an average of {num_total_characters / len(docs):,.0f} characters (smaller pieces)")

Now you have 5 documents that have an average of 906 characters (smaller pieces)


In [None]:
# Set up the embeddings engine with the configured API key.
embeddings = OpenAIEmbeddings(
    openai_api_key=openai_api_key,
)

# Embed the chunks and store them alongside their raw text in a FAISS index
# (a pseudo database). Note: this makes an API call to OpenAI.
docsearch = FAISS.from_documents(
    docs,
    embeddings,
)

Create your retrieval chain

In [None]:
# Build the question-answering chain: an OpenAI LLM at temperature 0, the
# "stuff" chain type, and a retriever backed by the FAISS index built above.
qa = RetrievalQA.from_chain_type(
    llm=OpenAI(temperature=0, openai_api_key=openai_api_key),
    chain_type="stuff",
    retriever=docsearch.as_retriever(),
)

Now, it's time to ask questions. The retriever will go get the similar documents and combine them with your question for the LLM to reason through.

**Note: It might not seem like much, but the magic here is that we didn't have to pass in our original full document**

In [None]:
# Ask about content that appears in the body of the PDF; the cell's last
# expression displays the chain's answer.
query = "How has the name John Doe been referenced in popular culture?"
qa.run(query)

' The name John Doe has been referenced in popular culture in the Frank Capra film Meet John Doe and the 2002 American television series of the same name.'

In [None]:
# Ask which file type the document talks about; the cell's last expression
# displays the chain's answer.
query = "What is the document file type being referenced?"
qa.run(query)

' PDF'

In [None]:
# Ask for the document version mentioned in the PDF; the cell's last
# expression displays the chain's answer.
# The question previously repeated "being referenced" twice; the duplicated
# phrase has been removed so the prompt sent to the LLM reads cleanly.
query = "What is the version of the document being referenced?"
qa.run(query)

' 1.0'