# Indexing

### Preface: chunking

We don't explicitly cover document chunking/splitting here.

For an excellent review of document chunking, see this video from Greg Kamradt:

https://www.youtube.com/watch?v=8OJC21T2SL4

Paper:
https://arxiv.org/abs/2312.06648

In [1]:
import os

from dotenv import load_dotenv

load_dotenv()

# Configure LangSmith tracing.
#
# Environment values must be strings, so writing the result of os.getenv()
# into os.environ raises TypeError when LANGCHAIN_API_KEY is missing. The key,
# when present, is already in os.environ (that is where os.getenv reads it
# from), so it never needs to be written back; we only check that it exists.
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"

if os.getenv("LANGCHAIN_API_KEY"):
    os.environ["LANGCHAIN_TRACING_V2"] = "true"
else:
    # No key available: turn tracing off instead of crashing the notebook.
    os.environ["LANGCHAIN_TRACING_V2"] = "false"
    print("LANGCHAIN_API_KEY is not set; LangSmith tracing is disabled.")

## Part 12: Multi-representation Indexing

In [None]:
from langchain_community.document_loaders import WebBaseLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Blog posts to index. The blog is hosted at lilianweng.github.io
# (note the spelling: "lilianweng", not "liliangweng").
post_urls = [
    "https://lilianweng.github.io/posts/2023-06-23-agent/",
    "https://lilianweng.github.io/posts/2024-02-05-human-data-quality/",
]

# Fetch each page in turn and collect every loaded document in one list.
docs = []
for url in post_urls:
    loader = WebBaseLoader(url)
    docs.extend(loader.load())

In [None]:
import uuid

from langchain_core.documents import Document
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_google_genai import ChatGoogleGenerativeAI

# Prompt that asks the model to summarize the text bound to {doc}.
summary_prompt = ChatPromptTemplate.from_template("Summarize the following document:\n\n{doc}")

# Gemini chat model used to write the summaries.
summary_llm = ChatGoogleGenerativeAI(
    model="gemini-1.5-flash",
    google_api_key=os.getenv("GEMINI_API_KEY"),
    temperature=0,
)

# Pipeline: Document -> {"doc": page_content} -> prompt -> model -> plain string.
chain = (
    {"doc": lambda document: document.page_content}
    | summary_prompt
    | summary_llm
    | StrOutputParser()
)

# Summarize every loaded document; max_concurrency caps the parallel calls.
summaries = chain.batch(docs, config={"max_concurrency": 5})

In [None]:
from langchain.storage import InMemoryByteStore
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.retrievers.multi_vector import MultiVectorRetriever

# The vectorstore that indexes the document summaries.
# GoogleGenerativeAIEmbeddings requires an explicit `model`; any embedding
# model available to your API key can be substituted here. The key is read
# from GEMINI_API_KEY (the variable the chat model uses), falling back to
# GOOGLE_API_KEY.
vectorstore = Chroma(
    collection_name="summaries",
    embedding_function=GoogleGenerativeAIEmbeddings(
        model="models/gemini-embedding-001",
        google_api_key=os.getenv("GEMINI_API_KEY") or os.getenv("GOOGLE_API_KEY"),
    ),
)

# The storage layer for the parent (full) documents
store = InMemoryByteStore()

# Metadata key that links each summary to its parent document
id_key = "doc_id"

# The retriever: searches summaries in `vectorstore`, returns parents from `store`
retriever = MultiVectorRetriever(
    vectorstore=vectorstore,
    byte_store=store,
    id_key=id_key,
)
# One id per parent document, shared by its summary and the stored original.
doc_ids = [str(uuid.uuid4()) for _ in docs]

# Docs linked to summaries: each summary string becomes a Document whose
# metadata carries the id of the parent it was generated from.
summary_docs = [
    Document(page_content=summary, metadata={id_key: doc_id})
    for doc_id, summary in zip(doc_ids, summaries)
]

# Index the summaries so they can be found by similarity search.
retriever.vectorstore.add_documents(summary_docs)

# Store the full documents under the same ids so the retriever can return them.
retriever.docstore.mset(list(zip(doc_ids, docs)))

In [None]:
# Text to search for.
query = "Memory in agents"

# Search the vector store directly (not through `retriever`); k=1 requests a single result.
sub_docs = vectorstore.similarity_search(query, k=1)