# Amazon Bedrock Knowledge Bases와 통합하기
BDA 프로젝트로 오디오와 비디오 파일을 처리한 후에는 Bedrock KB와 통합할 차례입니다.
## 통합 단계
- Amazon Bedrock Data Automation을 파서로 사용하도록 Knowledge Base를 설정합니다.
- 처리된 데이터를 Knowledge Base에 적재해 검색과 응답 생성을 수행합니다.

<div class="alert alert-block alert-info">
<b>참고:</b> 이 노트북은 01_data_prep_using_bda.ipynb를 완료한 후 실행하세요. "Run All Cells" 옵션 대신 셀을 하나씩 실행하시기 바랍니다.
</div>

# 노트북 및 boto3 클라이언트 설정

이 단계에서는 노트북 전반에서 사용할 필수 라이브러리를 임포트합니다. Amazon Bedrock Data Automation(BDA)을 boto3와 함께 사용하려면 최신 AWS SDK for Python(boto3) 버전이 필요합니다. Boto3 1.35.96 이상 버전이 요구됩니다.

참고: 퍼블릭 프리뷰 출시 시점에는 BDA가 us-west-2 리전에서만 사용 가능합니다.

In [None]:
# Optional one-time setup: uncomment the lines below to upgrade pip and to
# install the dependencies listed in ../requirements.txt.
# %pip install --upgrade pip --quiet
# %pip install -r ../requirements.txt --no-deps --quiet
# %pip install -r ../requirements.txt --upgrade --quiet

In [None]:
# Restart the kernel so that freshly installed package versions are picked up.
# NOTE(review): the injected script relies on the browser-side `Jupyter`
# JavaScript object — presumably only present in the classic Notebook front
# end; in other front ends restart the kernel from the menu instead. Confirm.
from IPython.display import HTML  # public display API, same path as the later video cell

# Must stay the last expression of the cell so the notebook renders the script.
HTML("<script>Jupyter.notebook.kernel.restart()</script>")

In [None]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell runs and edits to them take effect without a
# kernel restart.
%load_ext autoreload
%autoreload 2

import warnings
# Suppress all Python warnings from here on to keep the cell output short.
warnings.filterwarnings('ignore')

In [None]:
# Standard-library, AWS SDK and notebook-display imports for this notebook.
import boto3
from botocore.exceptions import ClientError
from datetime import datetime
import time
from time import sleep
import pprint
import random
from tqdm import tqdm
from pathlib import Path
from IPython.display import JSON, IFrame, Audio, display, clear_output
# NOTE(review): the next import rebinds the name `display` from the function
# imported on the previous line to the IPython.display module, so a later
# `display(obj)` call would raise TypeError — confirm which binding is wanted.
import IPython.display as display
import logging
import sys
from pathlib import Path  # NOTE(review): duplicate of the pathlib import above
import base64

# Make the shared helper package importable: `utils` is expected two directory
# levels above the notebook's working directory.
current_path = Path.cwd().resolve()
parent_path = current_path.parent.parent

# Register that directory on the import path only once, so re-running the cell
# does not append duplicate entries.
repo_root = str(parent_path)
if repo_root not in sys.path:
    sys.path.append(repo_root)

# Wrapper class used below to create the Knowledge Base; resolved through the
# path entry added above.
from utils.knowledge_base import BedrockKnowledgeBase

In [None]:
# Session-wide settings and AWS service clients.

# Random three-digit suffix (200..899) used when building resource names.
suffix = random.randint(200, 899)

# Account ID of the credentials the notebook is running with.
sts_client = boto3.client('sts')
account_id = sts_client.get_caller_identity()["Account"]

# Region taken from the boto3 session configuration.
session = boto3.session.Session()
region_name = session.region_name

# Initial bucket name for the Knowledge Base data.
# NOTE: the bucket-creation cell that follows assigns `suffix` and
# `bucket_name_kb` again, so the value set here is replaced there.
bucket_name_kb = f'bedrock-kb-{suffix}-1'

# S3 and Bedrock Data Automation clients are pinned to `region_name`.
s3_client = boto3.client('s3', region_name=region_name)
bda_client = boto3.client('bedrock-data-automation', region_name=region_name)
bda_runtime_client = boto3.client('bedrock-data-automation-runtime', region_name=region_name)

# Bedrock Agent clients are created without an explicit region argument.
bedrock_agent_client = boto3.client('bedrock-agent')
bedrock_agent_runtime_client = boto3.client('bedrock-agent-runtime')

# Configure logging after the clients exist, then create this notebook's logger.
logging.basicConfig(
    format='[%(asctime)s] p%(process)s {%(filename)s:%(lineno)d} %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Last expression of the cell: echoes the resolved region and account ID.
region_name, account_id

In [None]:
# Stage the locally stored BDA result files in an S3 bucket for the Knowledge
# Base integration.
from utils.knowledge_base_operators import bucket_exists

# A new random suffix is drawn every time this cell runs, so the bucket name
# differs between runs.
suffix = random.randint(200, 899)
bucket_name_kb = f'bedrock-bda-kb-{suffix}-1'

# Create the bucket only when the helper reports that it is missing.
if bucket_exists(bucket_name_kb):
    print(f"Bucket '{bucket_name_kb}' already exists.")
else:
    print(f"Bucket '{bucket_name_kb}' does not exist. Creating it now...")
    # For us-east-1 no LocationConstraint is sent; every other region passes
    # its own name as the constraint.
    create_args = {'Bucket': bucket_name_kb}
    if region_name != "us-east-1":
        create_args['CreateBucketConfiguration'] = {'LocationConstraint': region_name}
    s3_client.create_bucket(**create_args)
    print(f"Bucket '{bucket_name_kb}' created successfully.")

# Upload the audio and video result files from the working directory under the
# bda/dataset/ prefix.
obj_audio = 'bda/dataset/result_aud.json'
obj_video = 'bda/dataset/result_vid.json'
for local_file, object_key in (('result_aud.json', obj_audio), ('result_vid.json', obj_video)):
    s3_client.upload_file(local_file, bucket_name_kb, object_key)


In [None]:
# Derive a short time-based suffix so the Knowledge Base name differs between runs.
current_time = time.time()

# "%Y%m%d%H%M%S" renders local time as a digit string; only its last seven
# characters are kept (final digit of the day followed by HHMMSS).
full_stamp = time.strftime("%Y%m%d%H%M%S", time.localtime(current_time))
timestamp_str = full_stamp[-7:]
suffix = timestamp_str

knowledge_base_name = "bedrock-multi-modal-kb-" + suffix
knowledge_base_description = "Multi-modal RAG knowledge base."

# Foundation model ID; the query cells assign this name again before using it.
foundation_model = "anthropic.claude-3-sonnet-20240229-v1:0"

### Knowledge Base 생성 시작

이 노트북에서는 `utils` 폴더의 knowledge_base.py 파일에 있는 래퍼 함수를 사용해 KB 생성 과정을 단순화합니다. 데이터 소스 생성, KB 생성, 임베딩 인덱스 생성, 인덱스를 벡터 데이터 스토어에 저장하는 전체 과정을 이 함수로 간단히 처리할 수 있습니다. 

In [None]:
## Data sources for the Knowledge Base. Uncomment the entries you want to add
## and replace the placeholder values.

# Variant that restricts the S3 source to the uploaded BDA output prefix:
# data = [{"type": "S3", "bucket_name": bucket_name_kb, "inclusionPrefixes": ["bda/dataset/"]}]
data = [
    {"type": "S3", "bucket_name": bucket_name_kb},
]

# Example SharePoint entry that could be added to `data`:
# {"type": "SHAREPOINT", "tenantId": "888d0b57-69f1-4fb8-957f-e1f0bedf64de", "domain": "yourdomain",
#  "authType": "OAUTH2_CLIENT_CREDENTIALS",
#  "credentialsSecretArn": f"arn:aws:secretsmanager:{region_name}:{account_id}:secret:<<your_secret_name>>",
#  "siteUrls": ["https://yourdomain.sharepoint.com/sites/mysite"]
# },

# Pretty-printer for inspecting nested structures in later cells.
pp = pprint.PrettyPrinter(indent=2)

### 1단계 - 멀티모달 Knowledge Base 생성

In [None]:
# Create the multi-modal Knowledge Base: multi_modal=True is passed together
# with the parser to use, which here is Bedrock Data Automation. Chunking is
# set to FIXED_SIZE, and the resource suffix is the run suffix plus "-f".
knowledge_base = BedrockKnowledgeBase(
    kb_name=knowledge_base_name,
    kb_description=knowledge_base_description,
    data_sources=data,
    multi_modal=True,
    parser='BEDROCK_DATA_AUTOMATION',
    chunking_strategy="FIXED_SIZE",
    suffix=f'{suffix}-f',
)

### 2단계 - KB에 대한 데이터 ingestion 작업 시작

KB와 데이터 소스를 생성했으면 각 데이터 소스에 대해 ingestion 작업을 시작할 수 있습니다. Ingestion 동안 KB는 데이터 소스에서 문서를 가져와 텍스트를 추출하고, 지정한 청킹 크기에 따라 분할하며, 각 청크의 임베딩을 생성한 뒤 벡터 데이터베이스(이 경우 OpenSearch Serverless, OSS)에 저장합니다.

참고: 현재는 한 번에 하나의 ingestion 작업만 실행할 수 있습니다.

In [None]:
# Pause for 30 seconds to give the newly created Knowledge Base time to become
# available, then start the ingestion (sync) job.
sleep(30)
knowledge_base.start_ingestion_job()

In [None]:
# Keep the Knowledge Base ID; the retrieve-and-generate requests below pass it
# as `knowledgeBaseId`.
kb_id = knowledge_base.get_knowledge_base_id()
# Persist kb_id with IPython's %store magic so it can be restored later.
%store kb_id

### 3단계 - Knowledge Base 테스트
Knowledge Base가 준비되었으므로 [**retrieve**](https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/bedrock-agent-runtime/client/retrieve.html)와 [**retrieve_and_generate**](https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/bedrock-agent-runtime/client/retrieve_and_generate.html) 함수를 사용해 확인할 수 있습니다. 

#### Retrieve and Generate API로 Knowledge Base 테스트

먼저 retrieve and generate API로 Knowledge Base를 시험해 보겠습니다. 이 API를 호출하면 Bedrock이 Knowledge Base에서 필요한 참조를 검색한 뒤 Foundation Model로 최종 답변을 생성합니다.

`query = "Give me the summary of the AWS Rethink podcast hosted by Nolan Chen and Malini Chatterjee?"`

이 질의에 대한 올바른 응답은 Knowledge Base에 적재한 오디오 전사(Transcript)에서 가져와야 합니다.

In [None]:
# Question about the podcast; the answer is expected to come from the ingested
# audio transcript.
query = "Give me the summary of the AWS Rethink podcast hosted by Nolan Chen and Malini Chatterjee?"

In [None]:
# Model whose ARN is passed to the retrieve-and-generate request.
foundation_model = "anthropic.claude-3-sonnet-20240229-v1:0"
# foundation_model = "amazon.nova-micro-v1:0"

# Query the Knowledge Base, asking for 5 vector-search results, and let the
# selected model generate the answer.
response = bedrock_agent_runtime_client.retrieve_and_generate(
    input={"text": query},
    retrieveAndGenerateConfiguration={
        "type": "KNOWLEDGE_BASE",
        "knowledgeBaseConfiguration": {
            "knowledgeBaseId": kb_id,
            "modelArn": f"arn:aws:bedrock:{region_name}::foundation-model/{foundation_model}",
            "retrievalConfiguration": {
                "vectorSearchConfiguration": {"numberOfResults": 5},
            },
        },
    },
)

# Print the generated answer, terminated by two newlines.
print(response["output"]["text"], end="\n\n")

### 4단계 - 응답에서 오디오 파일을 직접 재생하기

In [None]:
# Extract the audio path and timestamps from the retrieve-and-generate response.
# NOTE(review): presumably audio_s3_info identifies the S3 object of the source
# audio — confirm against utils.knowledge_base_operators.
from utils.knowledge_base_operators import extract_audio_path_and_timestamps
audio_s3_info, timestamps = extract_audio_path_and_timestamps(response)

In [None]:
# Play the audio file referenced by the generated answer.
from utils.knowledge_base_operators import play_audio_segment

# Start the player only when an audio location was extracted from the
# response; without this guard the helper would be called with an empty value.
if audio_s3_info:
    print("\nAudio Player (click to play):")
    print("Note: Please use the time ranges above as reference points in the audio.")
    # NOTE(review): the second argument (0) is presumably the start offset — confirm.
    play_audio_segment(audio_s3_info, 0)
else:
    print("No audio source was found in the response; nothing to play.")


### 5단계 - 비디오 질의하기

In [None]:
# Question for the video step; replaces the earlier audio query.
query = "Can you find a promotional video containing BDA key features?"

In [None]:
# Model whose ARN is passed to the retrieve-and-generate request.
foundation_model = "anthropic.claude-3-sonnet-20240229-v1:0"

# Run the video query against the Knowledge Base, asking for 5 vector-search
# results; the response is parsed in the following cell.
response = bedrock_agent_runtime_client.retrieve_and_generate(
    input={"text": query},
    retrieveAndGenerateConfiguration={
        "type": "KNOWLEDGE_BASE",
        "knowledgeBaseConfiguration": {
            "knowledgeBaseId": kb_id,
            "modelArn": f"arn:aws:bedrock:{region_name}::foundation-model/{foundation_model}",
            "retrievalConfiguration": {
                "vectorSearchConfiguration": {"numberOfResults": 5},
            },
        },
    },
)

In [None]:
# Locate the source video behind the answer, list its matching segments and
# summary, then load the video.
from IPython.display import HTML
from utils.knowledge_base_operators import (
    get_video_from_metadata,
    parse_response_and_get_s3_info,
)

# Any failure while parsing or loading is reported as a single message instead
# of a traceback.
try:
    video_info = parse_response_and_get_s3_info(response)

    if not (video_info and video_info['s3_uri']):
        print("Could not find video information in response")
    else:
        # Per-shot timing details, when present.
        segments = video_info['timestamps']
        if segments:
            print(f"\nFound {len(segments)} Video Segments:")
            for segment in segments:
                print(f"\nShot {segment['shot_index']}:")
                print(f"Time Range: {segment['start_timecode']} - {segment['end_timecode']}")
                # The duration is divided by 1000 before being shown in seconds.
                print(f"Duration: {segment['duration']/1000:.2f} seconds")

        summary = video_info['summary']
        if summary:
            print("\nVideo Summary:")
            print(summary)

        # Fetch the video identified by the bucket/key pair of the S3 URI.
        print("\nLoading video player...")
        s3_location = video_info['s3_uri']
        local_video_path = get_video_from_metadata(
            s3_location['bucket'],
            s3_location['key'],
        )

except Exception as err:
    print(f"Error in main execution: {err}")

### 정리
아래 셀을 실행하면 이 노트북에서 생성한 Knowledge Base와 S3 버킷, IAM 역할 및 정책이 모두 삭제됩니다. 리소스가 더 이상 필요하지 않을 때만 실행하세요.

In [None]:
# Delete the Knowledge Base created above; both flags are set to True, so
# removal of the S3 bucket and of the IAM roles and policies is requested too.
print("===============================Deleting Knowledge Base and associated resources==============================\n")
knowledge_base.delete_kb(delete_s3_bucket=True, delete_iam_roles_and_policies=True)

# 결론

이 가이드를 따라 Amazon Bedrock의 기능을 효과적으로 활용하면 필요에 맞는 강력한 멀티모달 RAG 애플리케이션을 구축할 수 있습니다.