# Amazon OpenSearch Serverless와 LangChain으로 빠르게 대화형 검색 구현하기

In [None]:
%pip install -U boto3
%pip install --upgrade --quiet  opensearch-py langchain-community

In [12]:
from langchain_community.document_loaders import TextLoader
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import OpenSearchVectorSearch
from langchain_text_splitters import CharacterTextSplitter
import textwrap
import json

In [2]:
import boto3

# One shared boto3 session; later cells reuse it to fetch credentials.
session = boto3.Session()

# Control-plane client for OpenSearch Serverless (collections and policies).
aoss_client = session.client("opensearchserverless")

In [3]:
# Name of the OpenSearch Serverless collection used throughout this notebook.
collection_name = "opensearch-workshop-serverless"
# AWS region used to build the collection endpoint URL.
region = "us-west-2"

In [13]:
# Equivalent AWS CLI call, kept for reference:
# aws opensearchserverless create-security-policy \
#   --name logs-policy \
#   --type encryption --policy "{\"Rules\":[{\"ResourceType\":\"collection\",\"Resource\":[\"collection\/logs-application\"]}],\"AWSOwnedKey\":true}"

# Look up encryption policies that already cover this collection.
lsp_response = aoss_client.list_security_policies(
    resource=[
        "collection/" + collection_name,
    ],
    type="encryption",
)

# Inspect the listing we just fetched; reading a different variable here would
# raise NameError on a fresh kernel or act on a stale result from another cell.
if not lsp_response["securityPolicySummaries"]:
    encryption_policy = {
        "Rules": [
            {
                "ResourceType": "collection",
                "Resource": ["collection/" + collection_name],
            }
        ],
        "AWSOwnedKey": True,
    }

    response = aoss_client.create_security_policy(
        name="encrypt-policy", policy=json.dumps(encryption_policy), type="encryption"
    )
else:
    print("Encryption policy already exists")

Encryption policy already exists


In [14]:
# Equivalent AWS CLI call, kept for reference:
# aws opensearchserverless create-security-policy \
#   --name logs-policy \
#   --type network --policy "[{\"Description\":\"Public access for logs collection\",\"Rules\":[{\"ResourceType\":\"dashboard\",\"Resource\":[\"collection\/logs-application\"]},{\"ResourceType\":\"collection\",\"Resource\":[\"collection\/logs-application\"]}],\"AllowFromPublic\":true}]"

collection_resource = "collection/" + collection_name

# Look up network policies that already cover this collection.
lsp_response = aoss_client.list_security_policies(
    resource=[
        collection_resource,
    ],
    type="network",
)

# Inspect the listing we just fetched; reading a different variable here would
# raise NameError on a fresh kernel or act on a stale result from another cell.
if not lsp_response["securityPolicySummaries"]:
    # Build the policy from collection_name so it always matches the
    # collection this notebook actually creates.
    network_policy = json.dumps(
        [
            {
                "Description": "Public access for logs collection",
                "Rules": [
                    {"ResourceType": "dashboard", "Resource": [collection_resource]},
                    {"ResourceType": "collection", "Resource": [collection_resource]},
                ],
                "AllowFromPublic": True,
            }
        ]
    )

    response = aoss_client.create_security_policy(
        name="network-policy", policy=network_policy, type="network"
    )
else:
    print("Network policy already exists")

Network policy already exists


In [15]:
import time

# Create the vector-search collection only when it does not exist yet.
response = aoss_client.list_collections(
    collectionFilters={
        "name": collection_name,
    }
)

if not response["collectionSummaries"]:
    response = aoss_client.create_collection(
        name=collection_name, type="VECTORSEARCH"
    )
    # The create call returns the new collection under "createCollectionDetail",
    # not under "collectionSummaries" like the list call does.
    collection_id = response["createCollectionDetail"]["id"]
else:
    print(f"collection {collection_name} already exists")
    collection_id = response["collectionSummaries"][0]["id"]

# Endpoint URL built from the collection id and region.
aoss_host = f"https://{collection_id}.{region}.aoss.amazonaws.com"
aoss_host

collection opensearch-workshop-serverless already exists


'https://ifhauku7omebeig375z9.us-west-2.aoss.amazonaws.com'

In [None]:
# Define the data access policy.
# The principal list must contain IAM/STS ARNs, so grant access to the identity
# that is running this notebook.
# NOTE(review): under an assumed role this is the session ARN; confirm that is
# the principal you want, or substitute the IAM role ARN.
caller_arn = session.client("sts").get_caller_identity()["Arn"]

# Serialize with json.dumps so the document is always valid JSON.
access_policy = json.dumps(
    [
        {
            "Rules": [
                {
                    "ResourceType": "collection",
                    "Resource": ["collection/" + collection_name],
                    "Permission": ["aoss:*"],
                },
                {
                    "ResourceType": "index",
                    # Index resources are addressed as index/<collection>/<index>.
                    "Resource": ["index/" + collection_name + "/*"],
                    "Permission": ["aoss:*"],
                },
            ],
            "Principal": [caller_arn],
        }
    ]
)

# Create the access policy; tolerate reruns the same way the other policy cells do.
try:
    response = aoss_client.create_access_policy(
        name="full-data-access", type="data", policy=access_policy
    )
except aoss_client.exceptions.ConflictException:
    print("Data access policy already exists")
else:
    # Print the response
    print(response)

In [None]:
%%time

# Poll the collection status every 5 seconds until it becomes ACTIVE.
while True:
    response = aoss_client.list_collections(
        collectionFilters={
            "name": collection_name,
        }
    )

    # The listing can be empty; treat that as "not ready yet" instead of
    # failing with IndexError.
    summaries = response["collectionSummaries"]
    status = summaries[0]["status"] if summaries else None

    if status == 'ACTIVE':
        print()
        print('OpenSearch Collection is created Successfully.')
        break
    if status == 'FAILED':
        # A failed collection never becomes ACTIVE; stop instead of looping forever.
        raise RuntimeError(f"OpenSearch collection {collection_name} is in FAILED state")

    print('-', end='')
    time.sleep(5)

In [None]:
# Fetch the credentials of the shared boto3 session; they are used below to
# sign requests sent to the collection endpoint.
credentials = session.get_credentials()
# Display the credentials object as the cell output.
credentials

In [None]:
from requests_aws4auth import AWS4Auth

# SigV4 signer for requests sent to the collection endpoint.
service = "aoss"  # must set the service as 'aoss'
awsauth = AWS4Auth(
    credentials.access_key,
    credentials.secret_key,
    region,  # sign for the same region the endpoint URL is built from
    service,
    session_token=credentials.token,
)

In [None]:
from langchain_community.document_loaders import CSVLoader
from langchain_community.embeddings import BedrockEmbeddings
from opensearchpy import RequestsHttpConnection

# Read the movie catalogue from CSV into LangChain documents.
loader = CSVLoader(file_path="./data/movies.csv")
documents = loader.load()

# Split the loaded documents into chunks of up to 1000 characters, no overlap.
text_splitter = CharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=0,
)
docs = text_splitter.split_documents(documents)

# Show how many chunks were produced.
len(docs)

In [None]:
# Embedding model served by Amazon Bedrock; use the notebook-wide region so
# every service call targets the same region.
embeddings = BedrockEmbeddings(
    model_id="amazon.titan-embed-text-v2:0",
    region_name=region,
)

# Smoke-test the embedding model and show the vector dimensionality.
vector = embeddings.embed_query("This is a content of the document")
len(vector)

In [None]:
from opensearchpy import OpenSearch, RequestsHttpConnection, AWSV4SignerAuth
import json

# Data-plane client for the collection. The host is derived from the collection
# id and region resolved earlier, so the cell works for any account and is not
# tied to one specific collection endpoint.
aos_client = OpenSearch(
    hosts=[{"host": f"{collection_id}.{region}.aoss.amazonaws.com", "port": 443}],
    http_auth=awsauth,
    use_ssl=True,
    verify_certs=True,
    connection_class=RequestsHttpConnection,
)

# Sample Korean sentence to run through the "nori" analyzer.
request_body = {"analyzer": "nori", "text": "OpenSearch 워크샵에 오신 고객 여러분 환영합니다."}

# Send the request to the _analyze endpoint
response = aos_client.indices.analyze(body=request_body)

# Print the response
print(json.dumps(response, indent=4, ensure_ascii=False))

In [None]:
%%time

# Connection and index options forwarded to the vector store.
index_settings = {
    "opensearch_url": aoss_host,
    "http_auth": awsauth,
    "timeout": 60,
    "connection_class": RequestsHttpConnection,
    "index_name": "top_movies",
    "engine": "faiss",
    "bulk_size": 20000,
}

# Embed the document chunks and index them into the "top_movies" index.
docsearch = OpenSearchVectorSearch.from_documents(docs, embeddings, **index_settings)

In [None]:
# Ask the vector store for the 10 closest matches using script scoring.
# NOTE: this rebinds `docs`, which previously held the split document chunks.
docs = docsearch.similarity_search(
    query="건축학개론 줄거리를 알려줘",
    search_type="script_scoring",
    k=10,
)

In [None]:
# Display the search results returned by the previous cell.
docs

In [None]:
# Expose the vector store as a retriever for the conversational chain below.
retriever = docsearch.as_retriever()

In [None]:
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferWindowMemory
from langchain_aws import ChatBedrock

# Anthropic-specific inference parameters passed through to the model.
model_kwargs = dict(
    anthropic_version="bedrock-2023-05-31",
    max_tokens=2048,
    temperature=0,
)

# Claude chat model on Bedrock, with streaming enabled.
llm = ChatBedrock(
    region_name=region,
    model_id="anthropic.claude-3-sonnet-20240229-v1:0",  # foundation model to use
    model_kwargs=model_kwargs,
    streaming=True,
)

# Windowed conversation memory (k=10) exposed to the chain as "chat_history".
memory = ConversationBufferWindowMemory(
    k=10,
    memory_key="chat_history",
    return_messages=True,
)

In [None]:
# Import from the prompts module; the root-level `from langchain import
# PromptTemplate` shortcut is deprecated.
from langchain.prompts import PromptTemplate

# Answer prompt: the retrieved movie documents are injected as {context} and
# the (condensed) user question as {question}.
prompt_template = """


H: Here is the list of movies, inside <movies></movies> XML tags.

<movies>
{context}
</movies>

Only using the context as above, answer the following question with the rules as below:
    - Don't insert XML tag such as <context> and </context> when answering.
    - Write as much as you can
    - Be courteous and polite
    - Only answer the question if you can find the answer in the context with certainty.
    - Answered in list format
    - Always put a short and concise explanation on why you are recommending this movies.

You are a best movie reviewer in Korea. Please explain a movies from the list above.

Question:
{question}

If the answer is not in the context, just say "추천해드릴만한 영화가 없습니다."


A:"""

PROMPT = PromptTemplate(template=prompt_template, input_variables=["context", "question"])

In [None]:
# Prompt that rewrites a follow-up question into a standalone question using
# the chat history. The opening and closing <instructions> tags must match so
# the model sees one well-formed instruction section.
condense_template = """
Generate one standalone question based on the instructions.

<instructions>
- You will be given the following conversation between <chat-history> and </chat-history>
- You will be given the following follow up question between <follow-up-question> and </follow-up-question>
- Standalone question should have summary of the previous questions and answers.
</instructions>

<chat-history>
{chat_history}
</chat-history>

<follow-up-question>
{question}
</follow-up-question>

standalone question:
"""

CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(condense_template)

In [None]:
# Start from an empty conversation history.
memory.clear()

# Wire the LLM, retriever, memory and both prompts into one conversational chain.
conversation_with_retrieval = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    condense_question_prompt=CONDENSE_QUESTION_PROMPT,
    combine_docs_chain_kwargs={"prompt": PROMPT},
    memory=memory,
)

In [None]:
# First turn of the conversation.
first_question = "영화 건축학개론의 줄거리가 뭐야?"
chat_response = conversation_with_retrieval.invoke({"question": first_question})

# Wrap the answer at 80 columns for readability.
print(textwrap.fill(chat_response["answer"], width=80))

In [None]:
# Second turn: a follow-up question that relies on the chat history.
second_question = "그 영화 평점은?"
chat_response = conversation_with_retrieval.invoke({"question": second_question})

# Wrap the answer at 80 columns for readability.
print(textwrap.fill(chat_response["answer"], width=80))

In [None]:
# Third turn: another follow-up question.
third_question = "비슷한 장르의 다른 영화는?"
# The chain expects a dict keyed by "question"; `{third_question}` would be a
# set literal and the chain would not receive its required input key.
chat_response = conversation_with_retrieval.invoke({"question": third_question})

print(textwrap.fill(chat_response["answer"], 80))