# MultiVector Retriever
- https://python.langchain.com/docs/modules/data_connection/retrievers/multi_vector

In [3]:
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.document_loaders import TextLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.storage import InMemoryStore
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma

In [4]:
# One loader per source text file.
loaders = [
    TextLoader(path)
    for path in ("paul_graham_essay.txt", "state_of_the_union.txt")
]
# Load every file and flatten the results into a single list of documents.
docs = [loaded for loader in loaders for loaded in loader.load()]
# Re-split the loaded documents into the large "parent" chunks
# (chunk_size=10000) that the rest of the notebook works with.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000)
docs = text_splitter.split_documents(docs)

In [6]:
# Report how many parent chunks `docs` holds after splitting.
print(len(docs))

12


# 1. Parent Doc key 와 함께 MultiVectorRetriever 생성

In [27]:
# Metadata key under which each child chunk records its parent's ID.
id_key = "doc_id"
# Docstore that will hold the parent documents.
store = InMemoryStore()
# Vectorstore that will index the child chunks.
vectorstore = Chroma(
    embedding_function=OpenAIEmbeddings(),
    collection_name="full_documents",
)
# Multi-vector retriever wired to both stores; nothing has been added yet.
retriever = MultiVectorRetriever(
    id_key=id_key,
    docstore=store,
    vectorstore=vectorstore,
)


## Parent Doc ID 생성

In [None]:
import uuid

# One random UUID4 string per entry in `docs`; doc_ids[i] serves as the
# ID of docs[i] in the cells that follow.
doc_ids = [str(uuid.uuid4()) for _ in docs]
# Bare expression so the notebook displays the generated IDs.
doc_ids

## Parent 당 Child Chunk 생성 
- Child Chunk 는 Parent Doc ID 를 가지고 있음

In [10]:
# Splitter for the small child chunks: chunk_size=400, versus 10000 for the
# parent chunks in `docs`.
child_text_splitter = RecursiveCharacterTextSplitter(chunk_size=400)

In [11]:
# Build the flat list of child chunks, tagging each one with the ID of the
# parent document it was cut from.
sub_docs = []
for position, parent in enumerate(docs):
    parent_id = doc_ids[position]
    children = child_text_splitter.split_documents([parent])
    # Record the parent's ID in every child's metadata under `id_key`.
    for child in children:
        child.metadata[id_key] = parent_id
    sub_docs += children

In [13]:
# Display the first two child chunks; each should carry a doc_id in its metadata.
sub_docs[0:2]

[Document(page_content="What I Worked On\n\nFebruary 2021\n\nBefore college the two main things I worked on, outside of school, were writing and programming. I didn't write essays. I wrote what beginning writers were supposed to write then, and probably still are: short stories. My stories were awful. They had hardly any plot, just characters with strong feelings, which I imagined made them deep.", metadata={'source': 'paul_graham_essay.txt', 'doc_id': 'a5eb2588-d499-4476-898a-9837747c0002'}),
 Document(page_content='The first programs I tried writing were on the IBM 1401 that our school district used for what was then called "data processing." This was in 9th grade, so I was 13 or 14. The school district\'s 1401 happened to be in the basement of our junior high school, and my friend Rich Draves and I got permission to use it. It was like a mini Bond villain\'s lair down there, with all these alien-looking', metadata={'source': 'paul_graham_essay.txt', 'doc_id': 'a5eb2588-d499-4476-898

## Child Chunks 를 Vector Store 에 추가

In [14]:
# Add the child chunks to the retriever's vectorstore; each one carries its
# parent's ID in metadata[id_key].
retriever.vectorstore.add_documents(sub_docs)


## Parent Doc 을 Doc ID 와 함께 Doc Store 에 저장
- If you also want to add the original chunks to the vectorstore, use the snippet below instead of the docstore:
    - https://stackoverflow.com/questions/77325854/is-there-a-way-to-set-the-vectorstore-as-the-docstore-when-setting-up-a-langchai
```
for i, doc in enumerate(docs):
    doc.metadata[id_key] = doc_ids[i]
retriever.vectorstore.add_documents(docs)
```

In [None]:
# Store the parent documents in the docstore as (id, document) pairs,
# pairing doc_ids[i] with docs[i].
retriever.docstore.mset(list(zip(doc_ids, docs)))

## Child Chunk 에 대한 유사 검색

In [15]:
# Vectorstore alone retrieves the small chunks.
# Bare expression so the notebook displays the first hit for the query.
retriever.vectorstore.similarity_search("justice breyer")[0]

Document(page_content='Tonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \n\nOne of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court.', metadata={'doc_id': 'e8d6ba33-4b1c-4840-8fef-5b9d16a6270a', 'source': 'state_of_the_union.txt'})

## Child Chunk 의 유사 검색 결과에 연결된 Parent Chunk 제공

In [16]:
# Retriever returns larger chunks.
# Show the character count of the first result's page_content so it can be
# compared with the 400-sized child chunks.
len(retriever.get_relevant_documents("justice breyer")[0].page_content)

9874

# 2. Summary
- Doc 을 요약하고, 요약한 것을 임베딩하여 검색에 사용하며, 검색 결과로는 Parent Doc 을 제공

In [17]:
import uuid

from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.schema.document import Document
from langchain.schema.output_parser import StrOutputParser

In [18]:
# Prompt template with a single {doc} slot for the text to summarize.
summary_prompt = ChatPromptTemplate.from_template(
    "Summarize the following document:\n\n{doc}"
)
# Pipeline: take page_content from the input document -> fill the prompt
# -> chat model (max_retries=0) -> StrOutputParser.
chain = (
    {"doc": lambda document: document.page_content}
    | summary_prompt
    | ChatOpenAI(max_retries=0)
    | StrOutputParser()
)

In [19]:
# Run the summary chain over every document in `docs`, passing
# max_concurrency=5 in the batch config.
summaries = chain.batch(docs, {"max_concurrency": 5})

In [31]:
# Print both counts so they can be compared (one summary is expected per document).
print(len(docs))
print(len(summaries))


12
12


In [20]:
# Metadata key linking each summary to its original document.
id_key = "doc_id"
# Fresh docstore for the original documents.
store = InMemoryStore()
# Separate Chroma collection that indexes the summaries.
vectorstore = Chroma(
    embedding_function=OpenAIEmbeddings(),
    collection_name="summaries",
)
# New retriever over the two stores above; both start empty.
retriever = MultiVectorRetriever(
    id_key=id_key,
    docstore=store,
    vectorstore=vectorstore,
)
# Fresh random ID per document; doc_ids[i] belongs to docs[i].
doc_ids = [str(uuid.uuid4()) for _ in docs]

In [21]:
# Wrap each summary string in a Document whose metadata carries the ID of
# the document it summarizes (summaries[i] pairs with doc_ids[i]).
summary_docs = [
    Document(page_content=summary, metadata={id_key: doc_ids[index]})
    for index, summary in enumerate(summaries)
]

In [29]:
# Display the first two summary documents.
summary_docs[0:2]

[Document(page_content='The author discusses their experiences and interests in writing and programming before college. They recall their early experiences with programming on the IBM 1401 and later with microcomputers. They also mention their initial plan to study philosophy in college but eventually switched to studying artificial intelligence (AI). They describe their fascination with AI and their decision to focus on Lisp programming. The author reflects on the limitations of AI at the time and their realization that the traditional approach to AI was not effective. They then discuss their decision to write a book about Lisp hacking and their preference for building lasting things rather than working on systems that would become obsolete. Finally, they share their visit to the Carnegie Institute and their realization that creating paintings could be a lasting and independent career path.', metadata={'doc_id': 'e382bd52-c1c8-43ff-9ad8-35dccaf8baf0'}),
 Document(page_content='The aut

In [22]:
# Add the summary documents to the vectorstore; these are what similarity
# search will match against.
retriever.vectorstore.add_documents(summary_docs)
# Store the original documents in the docstore under the same IDs the
# summaries carry in their metadata.
retriever.docstore.mset(list(zip(doc_ids, docs)))

In [23]:
# Query the summaries vectorstore directly.
# NOTE: this rebinds `sub_docs`, which earlier held the child chunks.
sub_docs = vectorstore.similarity_search("justice breyer")

In [24]:
# Display the first hit returned by the vectorstore search.
sub_docs[0]

Document(page_content='The document is a summary of a State of the Union address given by the President of the United States. The President discusses various issues, including the nomination of a Supreme Court Justice, securing the border and fixing the immigration system, protecting the rights of women, supporting LGBTQ+ Americans, passing bipartisan legislation, addressing the opioid epidemic, improving mental health services, supporting veterans, and ending cancer. The President expresses optimism about the future of America and calls for unity and resilience in the face of challenges.', metadata={'doc_id': '29979f2e-24cd-427c-9e3c-1ba0ffbdc7c1'})

In [25]:
# Same query, this time through the MultiVectorRetriever rather than the
# raw vectorstore.
retrieved_docs = retriever.get_relevant_documents("justice breyer")

In [26]:
# Character count of the first retrieved document's page_content.
len(retrieved_docs[0].page_content)

9194