In [12]:
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS

In [2]:
# Location of the source PDF, relative to the notebook's working directory.
pdf_path = "../data/pdf_data/epedagogy-guide.pdf"

# Build the loader, then load the PDF and split it into a list of documents.
loader = PyPDFLoader(pdf_path)
pages = loader.load_and_split()

In [3]:
import tiktoken

# Tokenizer that tiktoken associates with the named model; swap the model name
# to count tokens for a different model.
encoding_model = "gpt-3.5-turbo"
enc = tiktoken.encoding_for_model(encoding_model)

In [4]:
# Rough size/cost check before paying for embeddings: count whitespace-separated
# words and tokenizer tokens across every loaded document.
page_texts = [doc.page_content for doc in pages]
est_total_word_count = sum(len(text.split()) for text in page_texts)
est_total_token_count = sum(len(enc.encode(text)) for text in page_texts)

# Assumed price in USD per 1,000 tokens.
# NOTE(review): confirm this matches the current rate of the embedding model
# actually used further down; chat and embedding models are priced differently.
cost_per_1k_tokens = 0.002

print(f"\nTotal word count: {est_total_word_count}")
print(f"\nEstimated tokens: {est_total_token_count}")
print(f"\nEstimated cost of embedding: ${est_total_token_count * cost_per_1k_tokens / 1000}")


Total word count: 11487

Estimated tokens: 16982

Estimated cost of embedding: $0.033964


In [6]:
# Embed every loaded document with the OpenAI embeddings client and build a
# FAISS index from the resulting vectors.
embeddings = OpenAIEmbeddings()
vector_store = FAISS.from_documents(documents=pages, embedding=embeddings)

In [7]:
# Look up the indexed documents most similar to the query, together with their
# scores. The hits are stored in `search_result`; nothing is displayed here.
query = "who is this pdf for?"
search_result = vector_store.similarity_search_with_score(query)

In [14]:
# Persist the FAISS index to disk so it can be reloaded without re-embedding.
# The target sits under the same "../data" directory the PDF is read from;
# a leading "...data" would instead create a stray folder literally named
# "...data" inside the current working directory.
vector_store.save_local("../data/vector_data")