In [1]:
from dotenv import load_dotenv
from langchain.chat_models import init_chat_model
import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter


load_dotenv()

True

In [2]:
# Embedding client reused by the vector store below; the bare name on the last
# line echoes its configuration as the cell output.
# NOTE(review): presumably picks up the OpenAI API key from the environment
# populated by load_dotenv() — confirm.
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
embeddings

OpenAIEmbeddings(client=<openai.resources.embeddings.Embeddings object at 0x113ebbcb0>, async_client=<openai.resources.embeddings.AsyncEmbeddings object at 0x1146d06e0>, model='text-embedding-3-large', dimensions=None, deployment='text-embedding-ada-002', openai_api_version=None, openai_api_base=None, openai_api_type=None, openai_proxy=None, embedding_ctx_length=8191, openai_api_key=SecretStr('**********'), openai_organization=None, allowed_special=None, disallowed_special=None, chunk_size=1000, max_retries=2, request_timeout=None, headers=None, tiktoken_enabled=True, tiktoken_model_name=None, show_progress_bar=False, model_kwargs={}, skip_empty=False, default_headers=None, default_query=None, retry_min_seconds=4, retry_max_seconds=20, http_client=None, http_async_client=None, check_embedding_ctx_length=True)

In [3]:
# Embed a throwaway string and take the vector's length, so the FAISS index
# can be sized from the model itself instead of a hard-coded dimension.
# NOTE(review): this presumably issues one real embedding request — confirm.
embedding_dim = len(embeddings.embed_query("hello world"))
embedding_dim 

3072

In [4]:
# Flat L2 index whose width matches the embedding dimension measured earlier.
index = faiss.IndexFlatL2(embedding_dim)

# Wrap the (still empty) index in a LangChain vector store: documents live in
# an in-memory docstore and the index-position -> document-id map starts empty.
vector_store = FAISS(
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
    embedding_function=embeddings,
)
vector_store

<langchain_community.vectorstores.faiss.FAISS at 0x1146d1be0>

In [5]:
# Load the paper from disk. The path is relative, so the PDF is expected to
# sit in the notebook's working directory.
loader = PyPDFLoader(file_path="rlm.pdf")
docs = loader.load()

In [6]:
# Chunking configuration: chunk_size=1000 with chunk_overlap=200 between
# neighbouring chunks (presumably measured in characters — confirm), and
# add_start_index=True so each chunk carries its offset in the source document.
text_splitter = RecursiveCharacterTextSplitter(
    add_start_index=True,
    chunk_overlap=200,
    chunk_size=1000,
)

all_splits = text_splitter.split_documents(docs)
# NOTE(review): the message says "blog post" although the input is a PDF paper.
print(f"Split blog post into {len(all_splits)} sub-documents.")


Split blog post into 130 sub-documents.


In [7]:
# Embed every chunk and insert it into the vector store; the call returns the
# ids assigned to the stored documents.
# NOTE(review): re-running this cell without rebuilding `vector_store` will
# presumably append the same chunks again under new ids — confirm.
document_ids = vector_store.add_documents(documents=all_splits)

# Show only the first few ids as a sanity check.
print(document_ids[:3])

['2148b813-6a8c-4ab1-9339-35d0b8723d7b', '8c198806-00ea-4749-911b-3e58c70e6ee6', '1589bb17-c57c-4f24-bfaa-711e0b1ed142']


This completes the Indexing portion of the pipeline.

RAG STARTS HERE

In [13]:
from langchain.tools import tool
'''
"Filters are used when we have hard constraints that aren't semantic in nature.
For example, if a user asks 'What was our revenue last quarter?', semantic search alone might return revenue data from any year. 
But with a date filter, we ensure we only search within Q3 2024 documents. 
Filters handle the structured part of the query, while embeddings handle the unstructured/semantic part."
'''
# Expose `retrieve` to the agent as a tool. The function returns a pair, so the
# tool is declared with response_format="content_and_artifact": the first item
# becomes the text the model reads and the second is attached to the tool
# message as its artifact. With a bare @tool the whole tuple (string plus the
# Document list) would be handed to the model as content.
# The docstring doubles as the tool description, so it is kept short and
# unchanged.
@tool(response_format="content_and_artifact")
def retrieve(query: str):
    """ Retrieve information to help answer a query. """
    # Two nearest chunks from the module-level vector store.
    retrieved_docs = vector_store.similarity_search(query, k=2)

    # One "source / Content" entry per chunk, separated by blank lines.
    serialized = "\n\n".join(
        f"source: {doc.metadata}\n Content: {doc.page_content}"
        for doc in retrieved_docs
    )

    return serialized, retrieved_docs


In [14]:
from langchain.agents import create_agent

# The agent's toolbox: `retrieve` is the only tool it may call.
tools = [retrieve]
model = init_chat_model("gpt-4o")

# Adjacent string literals are concatenated verbatim, so the first piece ends
# with a space to keep the two sentences apart in the final system prompt.
prompt = (
    "You have access to a tool that returns context from the provided research paper. "
    "Use the tool to help answer user queries for this research paper."
)

agent = create_agent(model, tools, system_prompt=prompt)


In [None]:
# Question sent to the agent. Only one query is streamed per run; to ask about
# authorship instead, replace the text with "Who wrote this research paper??".
query = "what is this research paper about ?"

# Stream the run and print the newest message of each event as it arrives.
# NOTE(review): stream_mode="values" presumably yields the full state after
# each step, which is why only the last message is printed — confirm.
for event in agent.stream(
    {"messages": [{"role": "user", "content": query}]},
    stream_mode="values",
):
    event["messages"][-1].pretty_print()


In [None]:
'''
OCR is a technology that extracts text from images.
It converts images containing text (scanned documents, 
photos, screenshots, PDFs that are just images) into machine-readable text that you can search, edit, or process.
'''

OCR is a technology that extracts text from images.
It converts images containing text (scanned documents, photos, screenshots, PDFs that are just images) into machine-readable text that you can search, edit, or process.