# Parent Document Retriever 를 OpenSearch 로 구현

### Ref:  MultiVector Retriever
- https://python.langchain.com/docs/modules/data_connection/retrievers/multi_vector

# 1. Bedrock Client 생성

In [3]:
%load_ext autoreload
%autoreload 2

import sys, os
# module_path = "../../../utils"
# sys.path.append(os.path.abspath(module_path))
# print(os.path.abspath(module_path))

# Put the parent directory on sys.path so the `utils` package imported by the
# following cells can be resolved (presumably it lives one level up — verify).
module_path = "../"
resolved_module_path = os.path.abspath(module_path)
sys.path.append(resolved_module_path)
print(resolved_module_path)

/root/aws-ai-ml-workshop-kr/genai/aws-gen-ai-kr/20_applications/02_qa_chatbot/01_preprocess_docs


In [4]:
import json
import boto3
from pprint import pprint
from termcolor import colored
from utils import bedrock, print_ww
from utils.bedrock import bedrock_info

# ---- ⚠️ Un-comment and edit the below lines as needed for your AWS setup ⚠️ ----

# os.environ["AWS_DEFAULT_REGION"] = "<REGION_NAME>"  # E.g. "us-east-1"
# os.environ["AWS_PROFILE"] = "<YOUR_PROFILE>"
# os.environ["BEDROCK_ASSUME_ROLE"] = "<YOUR_ROLE_ARN>"  # E.g. "arn:aws:..."
# os.environ["BEDROCK_ENDPOINT_URL"] = "<YOUR_ENDPOINT_URL>"  # E.g. "https://..."


# Create the Bedrock client used by the rest of the notebook. Every setting
# comes from an optional environment variable; os.environ.get() yields None
# when the variable is not set.
boto3_bedrock = bedrock.get_bedrock_client(
    assumed_role=os.environ.get("BEDROCK_ASSUME_ROLE"),
    endpoint_url=os.environ.get("BEDROCK_ENDPOINT_URL"),
    region=os.environ.get("AWS_DEFAULT_REGION"),
)

# Print a green heading, then the foundation-model listing from bedrock_info.
print(colored("\n== FM lists ==", "green"))
pprint(bedrock_info.get_list_fm_models())

Create new client
  Using region: us-east-1
  Using profile: None
boto3 Bedrock client successfully created!
bedrock-runtime(https://bedrock-runtime.us-east-1.amazonaws.com)

== FM lists ==
{'Claude-Instant-V1': 'anthropic.claude-instant-v1',
 'Claude-V1': 'anthropic.claude-v1',
 'Claude-V2': 'anthropic.claude-v2',
 'Command': 'cohere.command-text-v14',
 'Jurassic-2-Mid': 'ai21.j2-mid-v1',
 'Jurassic-2-Ultra': 'ai21.j2-ultra-v1',
 'Llama2-13b-Chat': 'meta.llama2-13b-chat-v1',
 'Titan-Embeddings-G1': 'amazon.titan-embed-text-v1',
 'Titan-Text-G1': 'TBD'}


# 2. Embedding 모델 로딩

In [5]:
# Embedding model selection flags. The loading cell tests the Titan flag
# first, so Titan wins if both flags are True.
Use_Titan_Embedding = True
Use_Cohere_English_Embedding = False

In [6]:
# We will be using the Titan Embeddings Model to generate our Embeddings.
from langchain.embeddings import BedrockEmbeddings
from langchain.llms.bedrock import Bedrock

# Instantiate the Bedrock embedding model chosen by the selection flags and
# set the matching `dimension` value (presumably the embedding vector length;
# it is not used elsewhere in the visible cells).
if Use_Titan_Embedding:
    llm_emb = BedrockEmbeddings(client=boto3_bedrock, model_id="amazon.titan-embed-text-v1")
    dimension = 1536
elif Use_Cohere_English_Embedding:
    llm_emb = BedrockEmbeddings(client=boto3_bedrock, model_id="cohere.embed-english-v3")
    dimension = 1024
else:
    # No embedding model selected: bind `llm_emb` (and `dimension`) to None so
    # the display line below does not raise NameError.
    llm_emb = None
    dimension = None

# Bare expression so the notebook displays the selected embedding object.
llm_emb

BedrockEmbeddings(client=<botocore.client.BedrockRuntime object at 0x7f7bf7b31f90>, region_name=None, credentials_profile_name=None, model_id='amazon.titan-embed-text-v1', model_kwargs=None, endpoint_url=None)

# 3. Data Loading

In [7]:
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.document_loaders import TextLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.storage import InMemoryStore
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma

In [8]:
# Text files to ingest; toggle the commented entry to load a different corpus.
loaders = [
    TextLoader("../backup/Parent_Document_Retriever/paul_graham_essay.txt"),
    # TextLoader("../backup/Parent_Document_Retriever/state_of_the_union.txt"),
]

# Load every file and flatten the per-loader results into a single list.
loaded_docs = [document for loader in loaders for document in loader.load()]

# Split with chunk_size=10000; the resulting chunks are the "parent" documents
# indexed later in the notebook.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000)
docs = text_splitter.split_documents(loaded_docs)

In [9]:
print(len(docs))

8


# 4. Index 정의

In [10]:
index_name = "genai-poc-knox-parent-child"



# 5. LangChain OpenSearch VectorStore 생성 


In [11]:
from utils.proc_docs import get_parameter

In [12]:
import boto3
# SSM client (region pinned to us-east-1) handed to the project helper
# get_parameter, presumably to read values from SSM Parameter Store.
ssm = boto3.client("ssm", region_name="us-east-1")

# Look up the OpenSearch endpoint, user id and password, in that order.
opensearch_domain_endpoint, opensearch_user_id, opensearch_user_password = (
    get_parameter(boto3_client=ssm, parameter_name=parameter_name)
    for parameter_name in (
        "knox_opensearch_domain_endpoint",
        "knox_opensearch_userid",
        "knox_opensearch_password",
    )
)


In [13]:
# Credentials for the OpenSearch domain, taken from the values looked up via
# get_parameter in the previous cell. (The endpoint variable is already bound
# there, so it is not re-assigned here.)
rag_user_name = opensearch_user_id
rag_user_password = opensearch_user_password

# (user, password) pair passed to both the OpenSearch client and LangChain.
http_auth = (rag_user_name, rag_user_password)  # Master username, Master password

## OpenSearch Client 생성

In [14]:
from utils.opensearch import opensearch_utils

In [15]:
# Region handed to the client factory; None when AWS_DEFAULT_REGION is unset.
aws_region = os.environ.get("AWS_DEFAULT_REGION")

# Low-level OpenSearch client used for index management and direct lookups.
os_client = opensearch_utils.create_aws_opensearch_client(
    aws_region, opensearch_domain_endpoint, http_auth
)

## 오픈 서치 인덱스 생성 
- 오픈 서치에 해당 인덱스가 존재하면, 삭제 합니다. 

In [16]:
from utils.opensearch import opensearch_utils

In [18]:

# If the index already exists (e.g. from a previous run), delete it so the
# notebook starts from an empty index.
index_exists = opensearch_utils.check_if_index_exists(os_client, index_name)
if index_exists:
    opensearch_utils.delete_index(os_client, index_name)


index_name=genai-poc-knox-parent-child, exists=False


## 랭체인 인덱스 연결 오브젝트 생성

- [langchain.vectorstores.opensearch_vector_search.OpenSearchVectorSearch](https://api.python.langchain.com/en/latest/vectorstores/langchain.vectorstores.opensearch_vector_search.OpenSearchVectorSearch.html)

In [19]:
from langchain.vectorstores import OpenSearchVectorSearch

In [20]:
# LangChain vector store bound to the OpenSearch index, using llm_emb as its
# embedding function.
# NOTE(review): depending on the LangChain version, engine / space_type /
# bulk_size may only be honoured when passed to add_documents()/add_texts()
# (where the index is created) — confirm these constructor values take effect.
vector_db = OpenSearchVectorSearch(
    opensearch_url=opensearch_domain_endpoint,
    index_name=index_name,
    embedding_function=llm_emb,
    http_auth=http_auth,
    is_aoss=False,
    engine="faiss",
    space_type="l2",
    bulk_size=100000,
    timeout=60,
)
vector_db

<langchain.vectorstores.opensearch_vector_search.OpenSearchVectorSearch at 0x7f7bec0028f0>

# 6. Parent Doc key 와 함께 MultiVectorRetriever 생성

## Parent Doc 을 Doc ID 와 함께 Vector Store 에 저장
- The original (parent) chunks can also be added to the vectorstore itself, so that the vectorstore is used in place of a separate docstore.
    - https://stackoverflow.com/questions/77325854/is-there-a-way-to-set-the-vectorstore-as-the-docstore-when-setting-up-a-langchai
```
for i, doc in enumerate(docs):
    doc.metadata[id_key] = doc_ids[i]
retriever.vectorstore.add_documents(docs)
```

In [22]:
# Index the parent chunks first and keep the returned ids; they are used later
# as the parent key attached to every child chunk.
parent_ids = vector_db.add_documents(
    documents=docs,
    vector_field="vector_field",
    bulk_size=1000000,
)


In [23]:
parent_ids

['2dc76df5-454a-4027-b0d0-5b104029c000',
 '0d47066b-f5da-4a57-847d-471482493dd9',
 '04b51e61-b183-471a-8bb0-c00e1d371bf0',
 '81708bd9-2034-484d-92ad-4b703ca2af7a',
 '0eaa92b4-0af7-4865-8ba0-dd0fd95e05eb',
 'c5398811-1c86-43d9-bc99-5a11ca642289',
 'c67d319f-5666-4c1c-bc84-d060c6ddabf4',
 'd735d96a-80d9-4eb2-8f21-fe9a586a6379']

In [24]:
total_count_docs = os_client.count(index = index_name)
print("total count docs: ", total_count_docs)


total count docs:  {'count': 8, '_shards': {'total': 5, 'successful': 5, 'skipped': 0, 'failed': 0}}


# 7. Child Chunk 생성 및 OpenSearch 저장

## Parent 당 Child Chunk 생성 
- Child Chunk 는 Parent Doc ID 를 가지고 있음

In [25]:
# The splitter to use to create smaller chunks
child_text_splitter = RecursiveCharacterTextSplitter(chunk_size=400)

In [26]:
# Metadata field under which each child chunk stores its parent's id.
id_key = "doc_id"

# Split every parent into child chunks and tag each child with the id that
# was returned for that parent when it was indexed (same position in
# parent_ids as the parent has in docs).
sub_docs = []
for position, parent_doc in enumerate(docs):
    parent_id = parent_ids[position]
    child_chunks = child_text_splitter.split_documents([parent_doc])
    for child_chunk in child_chunks:
        child_chunk.metadata[id_key] = parent_id
    sub_docs.extend(child_chunks)

In [27]:
sub_docs[0:1]

[Document(page_content="What I Worked On\n\nFebruary 2021\n\nBefore college the two main things I worked on, outside of school, were writing and programming. I didn't write essays. I wrote what beginning writers were supposed to write then, and probably still are: short stories. My stories were awful. They had hardly any plot, just characters with strong feelings, which I imagined made them deep.", metadata={'source': '../backup/Parent_Document_Retriever/paul_graham_essay.txt', 'doc_id': '2dc76df5-454a-4027-b0d0-5b104029c000'})]

## Child Chunks 를 Vector Store 에 추가

In [28]:
%%time

# Add the child chunks to the same index that already holds the parents.
child_doc_ids = vector_db.add_documents(
    documents=sub_docs,
    vector_field="vector_field",
    bulk_size=1000000,
)


CPU times: user 631 ms, sys: 44.7 ms, total: 675 ms
Wall time: 29.2 s


In [29]:
total_count_docs = os_client.count(index = index_name)
print("total count docs: ", total_count_docs)


total count docs:  {'count': 310, '_shards': {'total': 5, 'successful': 5, 'skipped': 0, 'failed': 0}}


# 8. 벡터 스토어 검색 

## MultiVectorRetriever 정의

In [30]:
# In-memory docstore that will hold the parent documents.
store = InMemoryStore()

# Retriever over the existing vector store. The docstore starts out empty, and
# id_key names the metadata field that links a child chunk to its parent.
retriever = MultiVectorRetriever(vectorstore=vector_db, docstore=store, id_key=id_key)


## Child Chunk 에 대한 유사 검색 및 Parent Doc 검색

In [31]:
# Query the vector store directly and take the top hit.
# response = retriever.vectorstore.similarity_search("justice breyer")[0]
matches = retriever.vectorstore.similarity_search("Robert Morris")
response = matches[0]
print(response)

# The hit's metadata carries the id of the parent it was split from.
# NOTE(review): parents were added to the same index, apparently without a
# "doc_id" field; if the top hit were a parent this lookup would raise
# KeyError — confirm.
parent_doc_id = response.metadata["doc_id"]
print("\nParent doc id: ", parent_doc_id)

page_content='One day in 2010, when he was visiting California for interviews, Robert Morris did something astonishing: he offered me unsolicited advice. I can only remember him doing that once before. One day at Viaweb, when I was bent over double from a kidney stone, he suggested that it would be a good idea for him to take me to the hospital. That was what it took for Rtm to offer unsolicited advice. So I' metadata={'source': '../backup/Parent_Document_Retriever/paul_graham_essay.txt', 'doc_id': 'c67d319f-5666-4c1c-bc84-d060c6ddabf4'}

Parent doc id:  c67d319f-5666-4c1c-bc84-d060c6ddabf4


In [32]:
# Fetch the parent document straight from OpenSearch by its id and print the
# id and text fields of the response.
response = opensearch_utils.get_document(
    os_client,
    doc_id=parent_doc_id,
    index_name=index_name,
)
parent_source = response["_source"]
print("\nparent document id:", response["_id"])
print("parent document text: \n", parent_source["text"])




parent document id: c67d319f-5666-4c1c-bc84-d060c6ddabf4
parent document text: 
 One day in 2010, when he was visiting California for interviews, Robert Morris did something astonishing: he offered me unsolicited advice. I can only remember him doing that once before. One day at Viaweb, when I was bent over double from a kidney stone, he suggested that it would be a good idea for him to take me to the hospital. That was what it took for Rtm to offer unsolicited advice. So I remember his exact words very clearly. "You know," he said, "you should make sure Y Combinator isn't the last cool thing you do."

At the time I didn't understand what he meant, but gradually it dawned on me that he was saying I should quit. This seemed strange advice, because YC was doing great. But if there was one thing rarer than Rtm offering advice, it was Rtm being wrong. So this set me thinking. It was true that on my current trajectory, YC would be the last thing I did, because it was only taking up more of

## Child Chunk 의 유사 검색 결과 연결된 Parent Chunk 제공

In [33]:
# Store each parent document in the retriever's docstore under the id that
# was returned when it was indexed, pairing parent_ids with docs by position.
retriever.docstore.mset(list(zip(parent_ids, docs)))

In [34]:
total_count_docs = os_client.count(index = index_name)
print("total count docs: ", total_count_docs)


total count docs:  {'count': 310, '_shards': {'total': 5, 'successful': 5, 'skipped': 0, 'failed': 0}}


In [35]:
# Retriever returns larger chunks
retriever.get_relevant_documents("Robert Morris")[0].page_content

'One day in 2010, when he was visiting California for interviews, Robert Morris did something astonishing: he offered me unsolicited advice. I can only remember him doing that once before. One day at Viaweb, when I was bent over double from a kidney stone, he suggested that it would be a good idea for him to take me to the hospital. That was what it took for Rtm to offer unsolicited advice. So I remember his exact words very clearly. "You know," he said, "you should make sure Y Combinator isn\'t the last cool thing you do."\n\nAt the time I didn\'t understand what he meant, but gradually it dawned on me that he was saying I should quit. This seemed strange advice, because YC was doing great. But if there was one thing rarer than Rtm offering advice, it was Rtm being wrong. So this set me thinking. It was true that on my current trajectory, YC would be the last thing I did, because it was only taking up more of my attention. It had already eaten Arc, and was in the process of eating ess