초기 설정을 수행합니다.

In [11]:
%pip install boto3 botocore
%pip install opensearch-py
%pip install tqdm
%pip install ipywidgets

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [12]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


1-1. 실행 권한 체크

In [13]:
from sagemaker import get_execution_role

strSageMakerRoleName = get_execution_role().rsplit('/', 1)[-1]
print (f"SageMaker Execution Role Name: {strSageMakerRoleName}")

ModuleNotFoundError: No module named 'sagemaker'

1-2. Bedrock Client 를 생성합니다.

In [15]:
import json
import boto3
import botocore
retry_config = botocore.config.Config(
    retries={"max_attempts": 10, "mode": "standard"}
)
bedrock_client = boto3.Session(
    region_name='us-west-2',
    
).client("bedrock-runtime", config=retry_config)

1-2. OpenSearch 클라이언트를 생성합니다.

In [16]:
import os
from utilities import get_parameter

In [17]:
import boto3
region = boto3.Session(region_name='us-west-2').region_name
ssm = boto3.client('ssm', region)

opensearch_domain_endpoint = get_parameter(
    boto3_clinet = ssm,
    parameter_name = 'opensearch_domain_endpoint',
)

opensearch_user_id = get_parameter(
    boto3_clinet = ssm,
    parameter_name = 'opensearch_user_id',
)

opensearch_user_password = get_parameter(
    boto3_clinet = ssm,
    parameter_name = 'opensearch_user_password',
)

In [18]:
from opensearchpy import OpenSearch, RequestsHttpConnection

http_auth = (opensearch_user_id, opensearch_user_password) # Master username, Master password
aws_region = os.environ.get("AWS_DEFAULT_REGION", None)
os_client = OpenSearch(
    hosts=[
        {
            'host': opensearch_domain_endpoint.replace("https://", ""),
            'port': 443
        }
    ],
    http_auth=http_auth,
    use_ssl=True,
    verify_certs=True,
    connection_class=RequestsHttpConnection
)

2. 문서 전처리

- 문서 Chunk를 생성합니다. 사용할 문서는 'data/ks.json' 파일입니다.
- 모든 문서를 한번에 Context 추출에 사용하면 너무 오랜 시간이 걸리기 때문에, Lab에서는 임의로 20개의 Chunk를 하나의 Document로 설정합니다.

In [82]:
import json
import glob
import math
# folder_path = 'data/*.json'
folder_path = 'data/ks.json'

json_files = glob.glob(folder_path)
chunk_size = 20

doc_json_list = []
for filename in json_files:
    with open(filename) as f:
        data = json.load(f)
        
        # chunk_size 단위로 문서 분할
        total_chunks = len(data)
        num_documents = math.ceil(total_chunks / chunk_size)
        
        for doc_id in range(num_documents):
            # 현재 문서에 포함될 청크 계산
            start = doc_id * chunk_size
            end = min((doc_id + 1) * chunk_size, total_chunks)
            current_chunks = data[start:end]
            
            # 현재 문서의 전체 내용 생성
            doc_content = '\n'.join([item.get('content', '') for item in current_chunks])
            
            # 문서 객체 생성
            doc = {
                "content": doc_content,
                "chunks": current_chunks,
                "file_source": filename,
                "doc_index": doc_id,
                "chunk_range": f"{start}-{end-1}"
            }
            
            doc_json_list.append(doc)

print("json files loaded", len(doc_json_list))


json files loaded 5


- 전체 문서에 대한 Context를 바탕으로, 주어진 Chunk의 내용을 5줄 이내의 문장으로 설명하도록 합니다.
- 추가된 문서의 Context는 content 필드에 추가되어 저장됩니다.

In [83]:
# 170개 Chunk 기준 2분 가량의 시간이 필요합니다.
import time
from tqdm.notebook import tqdm

sys_prompt = """
You're an expert at providing a succinct context, targeted for specific text chunks.

<instruction>
- Offer 1-5 short sentences that explain what specific information this chunk provides within the document.
- Focus on the unique content of this chunk, avoiding general statements about the overall document.
- Clarify how this chunk's content relates to other parts of the document and its role in the document.
- If there's essential information in the document that backs up this chunk's key points, mention the details.
</instruction>
"""
for doc_index, document in enumerate(doc_json_list):
    doc_content = document['content']
   
    for chunk in tqdm(document['chunks']):
        # 재시도에 대한 멱등성을 확보하기 위해, 이미 Situate 작업이 완료된 Chunk는 Skip 합니다.
        if chunk['content'].startswith('Context.'):
            continue
            
        document_context_prompt = f"""
        <document>
        {doc_content}
        </document>
        """

        chunk_content = chunk['content']
        chunk_context_prompt = f"""
        Here is the chunk we want to situate within the whole document:

        <chunk>
        {chunk_content}
        </chunk>

        Skip the preamble and only provide the consise context.
        """
        usr_prompt = [{
                "role": "user", 
                "content": [
                    {"text": document_context_prompt},
                    {"text": chunk_context_prompt}
                ]
            }]
        
        try:
            response = bedrock_client.converse(
                modelId='us.anthropic.claude-3-haiku-20240307-v1:0',
                messages=usr_prompt,
                system=[{'text': sys_prompt}],
                inferenceConfig={
                    'maxTokens': 4096
                }
            )

            situated_context = response['output']['message']['content'][0]['text'].strip()
            chunk['content'] = f"Context:\n{situated_context}\n\nChunk:\n{chunk['content']}"
            time.sleep(2)

        except Exception as e:
            print(f"Error generating context for chunk: {e}")



  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

In [84]:
doc_json_list[0]['file_source']

'data/ks.json'

## 오픈 서치 인덱스 생성 
- 오픈 서치에 해당 인덱스가 존재하면, 삭제 합니다. 

In [85]:
index_name = 'idx-genai-contextual-retriever'
delete_index_if_exists = True

index_body = {
    "settings": {
        "index.knn": True,
        "index.knn.algo_param.ef_search": 512
    },
    "mappings": {
        "properties": {
            "metadata": {
                "properties": {
                    'source' : {'type': 'keyword'},
                    'parent_id' : {'type': 'keyword'},  
                    'family_tree': {'type': 'keyword'},
                    'last_updated': {'type': 'date'},
                    'project': {'type': 'keyword'},
                    'seq_num': {'type': 'long'},
                    'title': {'type': 'text'},  # For full-text search
                    'url': {'type': 'text'},  # For full-text search
                }
            },
            "text": {
                "type": "text",
                "analyzer": "standard"
            },
            "vector_field": {
                "type": "knn_vector",
                "dimension": 1024,
                "method": {
                    "engine": "faiss",
                    "name": "hnsw",
                    "parameters": {
                        "ef_construction": 512,
                        "m": 16
                    },
                    "space_type": "l2"
                }
            }
        }
    }
}



In [86]:
if delete_index_if_exists:
    if os_client.indices.exists(index=index_name):
        os_client.indices.delete(index=index_name)
    

os_client.indices.create(index=index_name, body=index_body)


{'acknowledged': True,
 'shards_acknowledged': True,
 'index': 'idx-genai-contextual-retriever'}

Context가 추가된 Chunk를 Embedding 하고, Opensearch Index에 저장합니다.

In [87]:
for doc_id, document in enumerate(doc_json_list):
    context = chunk['content']
    response = bedrock_client.invoke_model(
        modelId="amazon.titan-embed-text-v2:0",
        contentType="application/json",
        accept="application/json",
        body=json.dumps({"inputText": context})
    )
    chunk_embedding = json.loads(response['body'].read())['embedding']
    parent_chunk = {
        'metadata': {
            'family_tree': 'parent',
            'source': document['file_source'],
            'parent_id': None,
            'title': None,
            'url': None,
            'last_updated': None,
            'seq_num': doc_id
        },
        'text': document['content'],
        "vector_field": chunk_embedding
    }
    parent_id = os_client.index(
        index=index_name,
        body=parent_chunk #embedded_chunk
    )['_id']

    for chunk_id, chunk in enumerate(tqdm(document['chunks'])):
        context = chunk['content']
        
        response = bedrock_client.invoke_model(
            modelId="amazon.titan-embed-text-v2:0",
            contentType="application/json",
            accept="application/json",
            body=json.dumps({"inputText": context})
        )
        chunk_embedding = json.loads(response['body'].read())['embedding']

        if chunk_embedding:
            embedded_chunk = {
                "metadata": {
                    "family_tree": "child",
                    "source": document['file_source'],
                    "project": chunk['project'], 
                    "title": chunk['title'],
                    "url": chunk['url'],
                    "parent_id": parent_id,
                    "seq_num": chunk_id,
                },
                "text": chunk['content'],
                "vector_field": chunk_embedding
            }

        os_client.index(
            index=index_name,
            body=embedded_chunk
        )

        
print(f"Successfully embedded and stored documents in index '{index_name}'")

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

Successfully embedded and stored documents in index 'idx-genai-contextual-retriever'


검색 테스트

Lexical Search (BM-25), Semantic Search (knn)

In [88]:
from typing import List, Dict

def _format_search_result(hit: Dict, search_method: str) -> Dict:
    print (hit)
    
    return {
        "text": hit['_source']["text"],
        "score": hit['_score'],
        "metadata": hit['_source']['metadata'],
        "search_method": search_method
    }


def search_by_knn(os_client, vector: List[float], index_name: str, top_n: int = 80) -> List[Dict]:
    query = {
        "size": top_n,
        "_source": ["text", "metadata"],
        "query": {
            "knn": {
                "vector_field": {
                    "vector": vector,
                    "k": top_n
                }
            }
        }
    }


    response = os_client.search(index=index_name, body=query)
    return [_format_search_result(hit, 'knn') 
           for hit in response['hits']['hits']]

def search_by_bm25(os_client, query_text: str, index_name: str, top_n: int = 80) -> List[Dict]:
    query = {
        "size": top_n,
        "_source": ["text", "metadata"],
        "query": {
            "match": {
                "text": {
                    "query": query_text,
                    "operator": "or"
                }
            }
        }
    }

    response = os_client.search(index=index_name, body=query)
    return [_format_search_result(hit, 'bm25') 
           for hit in response['hits']['hits']]
    

In [89]:
def rag(question, index_name, os_client, bedrock_client, emb_model_id="amazon.titan-embed-text-v2:0", search_model_id='anthropic.claude-3-haiku-20240307-v1:0'):
    embedding_response = bedrock_client.invoke_model(
        modelId=emb_model_id,
        contentType="application/json",
        accept="application/json",
        body=json.dumps({"inputText": question})
    )
    embedding = json.loads(embedding_response['body'].read())['embedding']
    search_results = search_by_knn(os_client, embedding, index_name, 5)
    
    docs = ""
    for result in search_results:
        docs += f"- {result['text']}\n\n"
    
    messages = [{
        'role': 'user',
        'content': [{'text': f"{question}\n\nAdditional Information:\n{docs}"}]
    }]
    
    system_prompt = "You are a helpful AI assistant that provides accurate and concise information about given context."
    
    response = bedrock_client.converse(
        modelId=search_model_id,
        messages=messages,
        system=[{'text': system_prompt}],
        inferenceConfig={
            'maxTokens': 4096
        }
    )
    return response['output']['message']['content'][0]['text']

In [90]:
index_name = 'idx-genai-contextual-retriever'

In [91]:
generated = rag('what is knox?', index_name, os_client, bedrock_client)
generated

{'_index': 'idx-genai-contextual-retriever', '_id': '4VZGx5UBNux_l59BRqqF', '_score': 0.44962555, '_source': {'metadata': {'parent_id': '4FZGx5UBNux_l59BRqoh', 'family_tree': 'child', 'project': 'KS', 'source': 'data/ks.json', 'title': 'FAQ :  What is Knox Suite?', 'seq_num': 0, 'url': 'https://docs.samsungknox.com/admin/knox-suite/faq/#what-is-knox-suite'}, 'text': 'Context:\nThis chunk provides a high-level overview of what Knox Suite is, describing it as a bundled offering from Samsung that includes several individual enterprise mobility management (EMM) services. It highlights the key capabilities of Knox Suite, such as secure device management, automatic device enrollment, comprehensive device oversight, and advanced device analytics.\n\nChunk:\ntitle: FAQ :  What is Knox Suite?title: FAQ :  What is Knox Suite?\nKnox Suite is a bundled offering designed to help enterprise IT admins better manage your fleet of devices. It includes individual services such as Knox Platform for Enter

"Based on the information provided in the chunks, here's a summary of what Knox is:\n\nKnox is a suite of enterprise mobility management (EMM) solutions offered by Samsung. The key points about Knox Suite are:\n\n1. Knox Suite is a bundled offering that includes several individual Knox products/services for enterprises:\n   - Knox Platform for Enterprise\n   - Knox Mobile Enrollment\n   - Knox Manage\n   - Knox E-FOTA\n   - Knox Asset Intelligence\n   - Knox Remote Support\n   - Knox Capture\n   - Knox Authentication Manager\n\n2. Knox Suite provides a comprehensive set of features and capabilities to help enterprises securely deploy, manage, and analyze their fleet of mobile devices (Samsung Android phones, tablets, and Tizen watches).\n\n3. Key benefits of Knox Suite include:\n   - An all-in-one solution covering the entire enterprise mobility lifecycle\n   - Simplified licensing with a single license key for all included products\n   - Streamlined onboarding and seamless IT admin ex

In [92]:
# TODO: Hybrid Search + Reranker를 통한 Rank Fusion 구현하기
from reranker_service import RerankerService
def hybrid_rag():
    pass

전체 Document를 가져와서 Q&A 데이터 만들기
- Tool Use를 통해 Q&A 데이터를 생성합니다.

In [93]:
sys_template = {
    "complex": """
        You are an expert at generating practical questions based on given documentation.
        Your task is to generate complex, reasoning questions and answers.

        Follow these rules:
        1. Generate questions that reflect real user information needs related to the document's subject matter (e.g., technical docs : feature availability, implementation details)
        2. Ensure questions are relevant, concise, preferably under 25 words, and fully answerable with the provided information
        3. Focus on extracting key information that users are likely to seek, while avoiding narrow or less important questions.
        4. When provided with code blocks, focus on understanding the overall functionality rather than the specific syntax or variables. Feel free to request examples of how to use key APIs or features.
        5. Do not use phrases like 'based on the provided context' or 'according to the context'.
    """,
    "simple": """
        You are an expert at generating practical questions based on given documentation.
        Your task is to create simple, directly answerable questions from the given context.

        Follow these rules:
        1. Generate questions that reflect real user information needs related to the document's subject matter (e.g., technical docs : feature availability, implementation details)
        2. Ensure questions are relevant, concise, preferably under 10 words, and fully answerable with the provided information
        3. Focus on extracting key information that users are likely to seek, while avoiding narrow or less important questions.
        4. When provided with code blocks, focus on understanding the overall functionality rather than the specific syntax or variables. Feel free to request examples of how to use key APIs or features.
        5. Do not use phrases like 'based on the provided context' or 'according to the context'.
    """
}

In [94]:
tool_config = {
    "tools": [
        {
            "toolSpec": {
                "name": "QuestionAnswerGenerator",
                "description": "Generates questions and answers based on the given context.",
                "inputSchema": {
                    "json": {
                        "type": "object",
                        "properties": {
                            "question": {
                                "type": "string",
                                "description": "The generated question"
                            },
                            "answer": {
                                "type": "string",
                                "description": "The answer to the generated question"
                            }
                        },
                        "required": ["question", "answer"]
                    }
                }
            }
        }
    ]
}

In [95]:
import random
import json
import uuid
from tqdm.notebook import tqdm

context = '\n\n'.join([doc.get('content', '') for doc in doc_json_list])
qa_dataset = []

generated_question = {"simple": [], "complex": []}

for i in tqdm(range(4)):
    if i % 2 == 0:
        question_type = "complex"
    else:
        question_type = "simple"

    user_template = f"""
    Generate a {question_type} question and its answer based on the following context:

    Context: {context}

    Use the QuestionAnswerGenerator tool to provide the output.
    """

    sys_prompt = [{"text": sys_template[question_type]}]
    user_prompt = [{"role": "user", "content": [{"text": user_template}]}]
    temperature = 0.0
    top_p = 0.5
    inference_config = {"temperature": temperature, "topP": top_p}

    response = bedrock_client.converse(
        modelId='anthropic.claude-3-5-sonnet-20240620-v1:0',
        messages=user_prompt,
        toolConfig=tool_config,
        system=sys_prompt,
        inferenceConfig={
            'maxTokens': 4096
        }
    )

    stop_reason = response['stopReason']

    if stop_reason == 'tool_use':
        tool_requests = response['output']['message']['content']

        for tool_request in [x for x in tool_requests if 'toolUse' in x]:
            if tool_request['toolUse']['name'] == 'QuestionAnswerGenerator':
                res = tool_request['toolUse']['input']

                qa_item = {
                    "question": tool_request['toolUse']['input']['question'],
                    "ground_truth": tool_request['toolUse']['input']['answer'],
                    "question_type": question_type,
                    # "context": context
                }
                
                qa_dataset.append(qa_item)


  0%|          | 0/4 [00:00<?, ?it/s]

AccessDeniedException: An error occurred (AccessDeniedException) when calling the Converse operation: You don't have access to the model with the specified model ID.

In [None]:
qa_dataset

In [None]:
answer = rag(qa_dataset[0]['question'], index_name, os_client, bedrock_client)
answer

테스트 Q&A 데이터 셋을 활용하여, Contextual RAG의 답변의 유사도를 평가합니다.

In [None]:
evaluate_system_prompt = """
Evaluate the correctness of the generation on a continuous scale from 0 to 1. A generation can be considered correct (Score: 1) if it includes all the key facts from the ground truth and if every fact presented in the generation is factually supported by the ground truth or common sense.
Example:
Query: Can eating carrots improve your vision?
Answer: Yes, eating carrots significantly improves your vision, especially at night. This is why people who eat lots of carrots never need glasses. Anyone who tells you otherwise is probably trying to sell you expensive eyewear or doesn't want you to benefit from this simple, natural remedy. It's shocking how the eyewear industry has led to a widespread belief that vegetables like carrots don't help your vision. People are so gullible to fall for these money-making schemes.
Ground truth: Well, yes and no. Carrots won’t improve your visual acuity if you have less than perfect vision. A diet of carrots won’t give a blind person 20/20 vision. But, the vitamins found in the vegetable can help promote overall eye health. Carrots contain beta-carotene, a substance that the body converts to vitamin A, an important nutrient for eye health. An extreme lack of vitamin A can cause blindness. Vitamin A can prevent the formation of cataracts and macular degeneration, the world’s leading cause of blindness. However, if your vision problems aren’t related to vitamin A, your vision won’t change no matter how many carrots you eat.
Score: 0.1
Reasoning: While the generation mentions that carrots can improve vision, it fails to outline the reason for this phenomenon and the circumstances under which this is the case. The rest of the response contains misinformation and exaggerations regarding the benefits of eating carrots for vision improvement. It deviates significantly from the more accurate and nuanced explanation provided in the ground truth.
"""

eval_tools = {
    "tools": [
        {
            "toolSpec": {
                "name": "CorrectressGrader",
                "description": "Evaluate the correctness of the answer on a continuous scale from 0 to 1, and reasoning why the score is. A generation can be considered correct (Score: 1) if it includes all the key facts from the ground truth and if every fact presented in the generation is factually supported by the ground truth.",
                "inputSchema": {
                    "json": {
                        "type": "object",
                        "properties": {
                            "score": {
                                "type": "number",
                                "description": "The correctress score [0.0, 1.0]"
                            },
                            "reason": {
                                "type": "string",
                                "description": "The reason about the score"
                            }
                        },
                        "required": ["score", "reason"]
                    }
                }
            }
        }
    ]
}


In [None]:
from tqdm.notebook import tqdm

results = []

for question_data in tqdm(qa_dataset):
    question = question_data['question']
    ground_truth = question_data['ground_truth']

    generated = rag(question=question, index_name=index_name, os_client=os_client, bedrock_client=bedrock_client)
    
    evaluate_user_template = f"""
    Query: {question}
    Answer: {generated}
    Ground Truth: {ground_truth}
    """

    sys_prompt = [{"text": evaluate_system_prompt}]
    user_prompt = [{"role": "user", "content": [{"text": evaluate_user_template}]}]
    temperature = 0.0
    top_p = 0.5
    inference_config = {"temperature": temperature, "topP": top_p}

    response = bedrock_client.converse(
        modelId='anthropic.claude-3-5-sonnet-20240620-v1:0',
        messages=user_prompt,
        toolConfig=eval_tools,
        system=sys_prompt,
        inferenceConfig={
            'maxTokens': 4096
        }
    )
    
    stop_reason = response['stopReason']

    if stop_reason == 'tool_use':
        tool_requests = response['output']['message']['content']
        

        for tool_request in [x for x in tool_requests if 'toolUse' in x]:
            if tool_request['toolUse']['name'] == 'CorrectressGrader':
                res = tool_request['toolUse']['input']

                result = {
                     "question": question,
                     "question_type": question_data['question_type'],
                     "generated_answer": generated,
                     "ground_truth": ground_truth,
                     "score": res['score']
                }

                results.append(result)

results