In [56]:
import langchain

In [94]:
from dotenv import load_dotenv
import os

from langchain.chains import VectorDBQA
from langchain.llms import OpenAI
from langchain.document_loaders import PyMuPDFLoader
from langchain.indexes import VectorstoreIndexCreator
from langchain.embeddings import OpenAIEmbeddings
from langchain.chains import LLMChain, HypotheticalDocumentEmbedder
from langchain.prompts import PromptTemplate
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma

In [58]:
load_dotenv("../.env")
api_key = os.environ.get('OPENAI-API-KEY')

# Quickstart

In [59]:
fpath = "../data/powers2017.pdf"

In [60]:
loader = PyMuPDFLoader(fpath)

In [64]:
# this prevents the metadata from being None which causes errors with the vectorstore
def sanitize_metadata(data):
    for item in data:
        meta = item.metadata
        for key, value in meta.items():
            if value is None:
                meta[key] = ""
    return data

In [65]:
data = sanitize_metadata(loader.load())

# Embeddings


[Docs](https://langchain.readthedocs.io/en/latest/modules/indexes/examples/embeddings.html)

In [69]:
embeddings = OpenAIEmbeddings()

In [71]:
text = "This is a test document."
query_result = embeddings.embed_query(text)

In [98]:
# get page content from the doc loaded earlier
pages = [item.page_content for item in data]
# embed the pages
doc_result = embeddings.embed_documents(pages)
# check the length of the result
len(doc_result)

13

[-0.03442302346229553,
 0.0066263023763895035,
 0.0307107362896204,
 -0.024700986221432686,
 -0.02612878754734993,
 0.01421312615275383,
 -0.0025375946424901485,
 -0.019288314506411552,
 -0.003926457371562719,
 -0.03748630732297897,
 -0.02886757254600525,
 0.035824865102767944,
 -0.02245544083416462,
 0.013064393773674965,
 -0.0044099632650613785,
 0.006204451434314251,
 0.05441226065158844,
 0.0103905089199543,
 -0.0088653564453125,
 -0.014109285548329353,
 -0.01092268992215395,
 0.015563048422336578,
 -0.017613891512155533,
 -0.015952449291944504,
 -0.010546269826591015,
 0.0025359720457345247,
 0.026998449116945267,
 -0.03865450993180275,
 -0.015796689316630363,
 -0.009832368232309818,
 0.001992433564737439,
 0.0007662270218133926,
 -0.023636624217033386,
 -0.012123342603445053,
 0.013486244715750217,
 -0.028192611411213875,
 -0.002709579886868596,
 0.016770191490650177,
 0.03281350061297417,
 -0.0017506807344034314,
 -0.0033066610340029,
 0.003725266782566905,
 -0.01452464703470468

# Hypothetical Doc Embeddings

In [85]:
# config
base_embeddings = OpenAIEmbeddings()
llm = OpenAI()

In [86]:
embeddings = HypotheticalDocumentEmbedder.from_llm(llm, base_embeddings, "web_search")
query_embedding = embeddings.embed_query("Where is the Taj Mahal?")

In [92]:
prompt_template = """Please answer the user's question about the most recent state of the union address
Question: {question}
Answer:"""

prompt = PromptTemplate(input_variables=["question"], template=prompt_template)
llm_chain = LLMChain(llm=llm, prompt=prompt)

embeddings = HypotheticalDocumentEmbedder(llm_chain=llm_chain, base_embeddings=base_embeddings)

In [None]:
result = embeddings.embed_query("What did the president say about Ketanji Brown Jackson")
print(result)

Doing a semantic search

In [95]:
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
full_doc_text = " ".join(pages)
texts = text_splitter.split_text(full_doc_text)

In [97]:
docsearch = Chroma.from_texts(texts, embeddings)

query = "What did the president say about Ketanji Brown Jackson"
docs = docsearch.similarity_search(query)

Running Chroma using direct local API.
Using DuckDB in-memory for database. Data will be transient.


InvalidRequestError: This model's maximum context length is 8191 tokens, however you requested 10738 tokens (10738 in your prompt; 0 for the completion). Please reduce your prompt; or completion length.