In [6]:
from langchain.document_loaders import UnstructuredPDFLoader, OnlinePDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

### Load your data

In [7]:
# Local copy of the book, addressed by a relative path.
pdf_path = "../BookFinal2023.pdf"
loader = UnstructuredPDFLoader(pdf_path)
# To read a PDF from a URL instead, swap in the online loader:
# loader = OnlinePDFLoader("https://wolfpaulus.com/wp-content/uploads/2017/05/field-guide-to-data-science.pdf")

In [8]:
# Run the loader; the result is a sequence of documents whose text lives in
# `.page_content` (it is len()'d and indexed in the next cell).
data = loader.load()

detectron2 is not installed. Cannot use the hi_res partitioning strategy. Falling back to partitioning with the fast strategy.


In [9]:
# Sanity-check the load: number of documents, then the size of the first one.
doc_count = len(data)
print(f'You have {doc_count} document(s) in your data')
first_doc_chars = len(data[0].page_content)
print(f'There are {first_doc_chars} characters in your document')

You have 1 document(s) in your data
There are 652306 characters in your document


### Chunk your data up into smaller documents

In [10]:
# Split the loaded document(s) into smaller ones: target size of 1000
# characters per chunk, with no overlap between consecutive chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=0,
)
texts = text_splitter.split_documents(data)

In [11]:
# Report how many chunks the splitter produced.
chunk_count = len(texts)
print(f'Now you have {chunk_count} documents')

Now you have 892 documents


### Create embeddings of your documents to get ready for semantic search

In [12]:
from langchain.vectorstores import Chroma, Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
import pinecone

  from tqdm.autonotebook import tqdm


In [13]:
import os
from dotenv import load_dotenv

# Let python-dotenv populate the environment before the keys are looked up.
load_dotenv()

# Credentials come from the environment rather than being hardcoded here.
# Each lookup yields None when the variable is not set.
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')
PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY')
PINECONE_API_ENV = os.environ.get('PINECONE_API_ENV')

In [14]:
# Embedding client, authenticated with the OpenAI key read from the environment.
# It is handed to the Pinecone vector store when the chunks are indexed.
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

In [15]:
# Connect the Pinecone client. The API key is listed at app.pinecone.io and the
# environment name appears next to the key in the console.
pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_API_ENV)

# Name of the Pinecone index to use -- replace with the name of your own index.
index_name = "langchaindk"

In [16]:
# Embed every chunk and store it in the Pinecone index named by `index_name`.
# The Document objects are passed as-is (instead of only their page_content
# strings) so that each chunk's metadata is stored alongside its vector and is
# available on the documents returned by later similarity searches.
docsearch = Pinecone.from_documents(texts, embeddings, index_name=index_name)

In [17]:
# Free-text question used for the semantic search. Spelling matters here: the
# string is what gets matched against the stored chunks.
query = "tell me an anecdote involving algorithms and sundials"
# Retrieve the chunks most similar to the question.
docs = docsearch.similarity_search(query, include_metadata=True)

In [18]:
# Preview the top search hit: only the first 250 characters of its text.
first_hit = docs[0]
first_hit.page_content[:250]

"While Thales' anecdotes discuss the cosmological cycles that govern society, the next fragment identifies an actual instrument of time and society control, the sundial. It is a fragment of a Roman playwright attributed to Plautus and quoted in Aulus "

### Query those docs to get your answer back

In [19]:
from langchain.llms import OpenAI
from langchain.chains.question_answering import load_qa_chain

In [20]:
# Completion model used to answer questions, configured with temperature 0.
llm = OpenAI(
    temperature=0,
    openai_api_key=OPENAI_API_KEY,
)
# Question-answering chain built on that model.
# NOTE(review): "stuff" presumably places all supplied documents into a single
# prompt -- confirm against the LangChain docs.
chain = load_qa_chain(llm, chain_type="stuff")

In [21]:
# Follow-up question. The same string is used both to retrieve context and as
# the question given to the QA chain, so it is spelled out carefully.
query = "describe the sundial anecdote as told by Plautus"
# Retrieve the chunks most similar to the question to use as context.
docs = docsearch.similarity_search(query, include_metadata=True)

In [22]:
# Ask the QA chain the question, handing it the retrieved chunks as input
# documents; the answer is the cell's displayed value.
chain.run(input_documents=docs, question=query)

' Plautus\' anecdote describes a city full of sundials that have turned its citizens into "hungry parasites" by creating a false scarcity of food. The sundials divide the day into arbitrary parts (hours) and deprive citizens of the pleasure of eating when they are hungry, as they are only allowed to eat when it suits the sundial.'