In [28]:
import getpass
import os

# Enable LangSmith tracing for this kernel session.
os.environ["LANGSMITH_TRACING"] = "true"

# Never store API keys in the notebook itself. Use the key already present in
# the environment; otherwise prompt for it without echoing it to the output.
# NOTE(security): a literal LangSmith key used to be hardcoded on this line.
# Treat that key as leaked and revoke/rotate it.
if not os.environ.get("LANGSMITH_API_KEY"):
    os.environ["LANGSMITH_API_KEY"] = getpass.getpass("LangSmith API key: ")

In [29]:
from langchain_openai import OpenAIEmbeddings

# Embedding client configured for the "text-embedding-3-large" model.
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

In [30]:
from langchain_core.vectorstores import InMemoryVectorStore

# Vector store constructed around the `embeddings` client.
vector_store = InMemoryVectorStore(embeddings)

In [31]:

from langchain.chat_models import init_chat_model

# Fail fast: this lookup raises KeyError if OPENAI_API_KEY is not set.
# NOTE(review): `openai_api_key` is not passed to anything in the visible
# cells; presumably the OpenAI integrations read the environment variable
# themselves -- confirm. OpenAIEmbeddings is constructed in an earlier cell,
# i.e. before this check runs.
openai_api_key = os.environ["OPENAI_API_KEY"]

# Chat model used by `generate`: "gpt-4o-mini" via the "openai" provider.
llm = init_chat_model("gpt-4o-mini", model_provider="openai")

In [32]:
import bs4
from langchain import hub
from langchain_community.document_loaders import WebBaseLoader
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langgraph.graph import START, StateGraph

In [33]:
# Load the blog post. The SoupStrainer limits parsing to elements carrying the
# "post-content", "post-title" or "post-header" CSS classes.
loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
    bs_kwargs={
        "parse_only": bs4.SoupStrainer(
            class_=("post-content", "post-title", "post-header"),
        ),
    },
)
docs = loader.load()

In [34]:
# Split the loaded documents into chunks (chunk_size=1000, chunk_overlap=200).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
all_splits = text_splitter.split_documents(docs)

In [35]:
# Index chunks: add every split to the vector store and keep the returned IDs.
# NOTE(review): re-running this cell calls add_documents again on the same
# store; presumably that stores the chunks a second time -- confirm.
document_ids = vector_store.add_documents(documents=all_splits)

In [36]:
# Display the first ID returned by add_documents.
document_ids[0]

'e70b4e93-409e-4e09-bae6-aff49f565af4'

In [None]:
# Pull the "rlm/rag-prompt" prompt through `hub`
# (presumably a remote fetch from the LangChain Hub -- confirm).
prompt = hub.pull("rlm/rag-prompt")

In [None]:
# Display the pulled prompt object.
prompt

In [None]:
from typing_extensions import List, TypedDict

# Define state for application
class State(TypedDict):
    """State dictionary passed between the graph steps.

    Keys:
        question: the input question; read by ``retrieve`` and ``generate``.
        context: documents written by ``retrieve`` and read by ``generate``.
        answer: the model's response content, written by ``generate``.
    """

    question: str
    context: List[Document]
    answer: str


# Define application steps
def retrieve(state: State):
    """Search the vector store for documents similar to the question.

    Reads ``state["question"]``, passes it to
    ``vector_store.similarity_search`` and returns the hits as a partial
    state update of the form ``{"context": [...]}``.
    """
    question = state["question"]
    return {"context": vector_store.similarity_search(question)}

def generate(state: State):
    """Answer the question from the retrieved context.

    Joins the ``page_content`` of every document in ``state["context"]`` with
    blank lines, fills the prompt with that text and the question, sends the
    result to ``llm`` and returns ``{"answer": <response content>}``.
    """
    context_text = "\n\n".join([doc.page_content for doc in state["context"]])
    prompt_input = {"question": state["question"], "context": context_text}
    reply = llm.invoke(prompt.invoke(prompt_input))
    return {"answer": reply.content}

In [None]:
# Build the graph: register `retrieve` then `generate` as a sequence, make
# "retrieve" the entry point reached from START, and compile the result.
graph_builder = StateGraph(State)
graph_builder.add_sequence([retrieve, generate])
graph_builder.add_edge(START, "retrieve")
graph = graph_builder.compile()

In [None]:
# Run the compiled graph on a sample question and print the "answer" entry
# of the returned state.
response = graph.invoke({"question": "What is Task Decomposition?"})
print(response["answer"])

In [None]:
# Debug: report what is currently held in document_ids, the vector store and
# all_splits.
print("=== DEBUGGING VECTOR STORE ===")


def vector_store_entries(store):
    """Return the ``store`` mapping of an InMemoryVectorStore-like object.

    The previous version of this cell probed a ``_embeddings`` attribute with
    a ``{}`` default, so it always reported zero entries. Per the
    langchain_core API reference, InMemoryVectorStore keeps its records in a
    dict attribute named ``store``, keyed by document ID.

    Args:
        store: the object to inspect.

    Returns:
        The dict found on ``store.store``, or ``None`` when the object has no
        dict-valued ``store`` attribute.
    """
    entries = getattr(store, "store", None)
    return entries if isinstance(entries, dict) else None


# At cell top level globals() is the notebook namespace, so these membership
# tests tell us whether the earlier cells have been run.

# Check if document_ids exists
if 'document_ids' in globals():
    print(f"document_ids exists: {document_ids}")
    print(f"document_ids length: {len(document_ids)}")
    print(f"First few IDs: {document_ids[:3]}")
else:
    print("document_ids is NOT defined in current scope")

# Check vector store contents. Explicit checks replace the former bare
# `except:`, which hid every failure, including a missing vector_store.
if 'vector_store' in globals():
    # Name kept from the earlier version of this cell; it holds the store's
    # id -> record mapping (or None when it cannot be read).
    internal_embeddings = vector_store_entries(vector_store)
    if internal_embeddings is None:
        print("\nCould not access vector store internal state")
    else:
        print(f"\nVector store internal embeddings count: {len(internal_embeddings)}")
        print(f"Available document IDs in vector store: {list(internal_embeddings.keys())[:5]}")
else:
    print("\nvector_store is NOT defined in current scope")

# Check if all_splits exists
if 'all_splits' in globals():
    print(f"\nall_splits exists with {len(all_splits)} documents")
    print(f"First document preview: {all_splits[0].page_content[:100]}...")
else:
    print("\nall_splits is NOT defined")


In [None]:
# Rebuild the vector store from all_splits and verify that the first returned
# ID can be looked up again.
print("=== RE-ADDING DOCUMENTS ===")

if 'all_splits' in locals():
    # Replace the existing store with a fresh one so the chunks are indexed
    # into an empty store.
    vector_store = InMemoryVectorStore(embeddings)

    # Index the chunks again, keeping the IDs that come back.
    document_ids = vector_store.add_documents(documents=all_splits)

    print(f"Successfully added {len(document_ids)} documents")
    print(f"Document IDs: {document_ids[:5]}")

    if not document_ids:
        print("ERROR: No document IDs returned")
    else:
        # Round-trip check: fetch the first document by its ID.
        doc = vector_store.get_by_ids([document_ids[0]])
        print(f"\nRetrieved document 0: {len(doc)} documents")
        if doc:
            print(f"Content preview: {doc[0].page_content[:200]}...")
else:
    # Nothing to index until the splitting cell has run.
    print("ERROR: all_splits not found. Please run the text splitting cell first.")
