# Import

In [1]:
import boto3
import sys
import json
import pprint
from botocore.client import Config
from botocore.exceptions import NoCredentialsError, PartialCredentialsError, ClientError
import os
import random
from retrying import retry
import time
from utility import *

print('Running boto3 version:', boto3.__version__)

Running boto3 version: 1.35.34


# Testing connection

In [2]:
test_aws_connection()

Connected successfully! Buckets:
  dev-managinglife-aws-bedrock
  dev-managinglife-db-bkp
  logs-bucket-managinglife
  managemypain-archives
  managinglife-business-data
  managinglife-business-data-humana
  managinglife-config-s3
  managinglife-documents
  managinglife-drupal-db-prod
  managinglife-es-bkp
  managinglife-guardduty-s3
  managinglife-images
  managinglife-solr-bkp
  managinglife-userdata-ca
  managinglife-userdata-ca-dev
  managinglife-userdata-ca-preprod
  managinglifebucket1
  s3-logs-bucket-managinglife
  shared-managinglife-developer-s3
  terraform-managinglife


# Converse API

In [3]:
region = 'ca-central-1'

bedrock = boto3.client(
    service_name = 'bedrock-runtime',
    region_name = region,
    )

MODEL_IDS = [
    "anthropic.claude-3-sonnet-20240229-v1:0",
    ]

In [4]:
prompt = ("What is the capital of Italy?")
print(f'Prompt: {prompt}\n')

for i in MODEL_IDS:
    response = invoke_bedrock_model(bedrock, i, prompt)
    print(f'Model: {i}\n{response}')

Prompt: What is the capital of Italy?

Model: anthropic.claude-3-sonnet-20240229-v1:0
The capital of Italy is Rome.
--- Latency: 478ms - Input tokens:14 - Output tokens:10 ---



# Multiple lines Converse API

In [5]:
bedrock_client = boto3.client('bedrock-runtime',region_name='ca-central-1')

In [6]:
messages=[{ "role":'user', "content":[{'type':'text','text': "What is quantum mechanics? "}]},\
         { "role":'assistant', "content":[{'type':'text','text': "It is a branch of physics that \
         describes how matter and energy interact with discrete energy values "}]},\
         { "role":'user', "content":[{'type':'text','text': "Can you explain a bit more about discrete energies?"}]}]

generate_message(bedrock_client, model_id = 'anthropic.claude-3-sonnet-20240229-v1:0',messages=messages,max_tokens=512,temp=0.5,top_p=0.9)

{'id': 'msg_bdrk_017SqWmsWifEn3Nh1git298j',
 'type': 'message',
 'role': 'assistant',
 'model': 'claude-3-sonnet-20240229',
 'content': [{'type': 'text',
   'text': 'Sure, the concept of discrete or quantized energies is a key principle of quantum mechanics. It states that the energy of particles or systems can only take on certain specific values, rather than varying continuously.\n\nSome key points about discrete energies:\n\n- Particles like electrons can only exist in specific, discrete energy levels around the nucleus rather than any arbitrary energy level.\n\n- When an electron transitions between allowed energy levels, it absorbs or emits a quantum of energy with very specific values corresponding to the energy difference between the levels.\n\n- This quantization of energy is observed at the atomic and subatomic scales and contrasts with classical physics, which views energy as continuous.\n\n- The allowed discrete energy levels depend on the specific system, like an atom, mole

## Messages with System (Personas)

In [7]:
messages = [{ "role":'user', "content":[{'type':'text','text': "What is quantum mechanics?"}]}]

system = "Respond in a way a caveman would understand"
generate_message(bedrock_client, model_id = "anthropic.claude-3-sonnet-20240229-v1:0",messages=messages,max_tokens=512,temp=0.5,top_p=0.9,system=system)

{'id': 'msg_bdrk_01FRPiHhqQWCfqzdY161WW8p',
 'type': 'message',
 'role': 'assistant',
 'model': 'claude-3-sonnet-20240229',
 'content': [{'type': 'text',
   'text': "Here's how I'd explain quantum mechanics in caveman terms:\n\nQuantum is tiny tiny world. Too small for caveman eyes to see. In tiny world, things not act like big world caveman knows. \n\nIn big world, rock is rock. Always rock. In tiny quantum world, thing can be rock AND not-rock at same time! Crazy, but true.\n\nQuantum things also not stay still. Always jumping, dancing around. Never know where thing is exactly. Only know maybe-here, maybe-there probabilities.\n\nQuantum world full of weird. Things connected, even far apart. Do one thing, affect 'other thing' somehow. Spooky action!\n\nCaveman brain get confused trying understand quantum. But quantum real. Quantum why technology work - computers, lasers, nuclear. Quantum is strange tiny universe controlling big universe we see."}],
 'stop_reason': 'end_turn',
 'stop_s

# Knowledge base

In [10]:
def retrieveAndGenerate(user_input, document_s3_uri, region, sourceType= "S3", model_id = "anthropic.claude-3-sonnet-20240229-v1:0"):
    model_arn = f'arn:aws:bedrock:{region}::foundation-model/{model_id}'
    print (document_s3_uri)
    return bedrock_agent_client.retrieve_and_generate(
        input={
            'text': user_input
        },
        retrieveAndGenerateConfiguration={
            'type': 'EXTERNAL_SOURCES',
            'externalSourcesConfiguration': {
                'modelArn': model_arn,
                "sources": [
                    {
                        "sourceType": sourceType,
                        "s3Location": {
                            "uri": document_s3_uri
                        }
                    }
                ]
            }
        }
    )

In [11]:

pp = pprint.PrettyPrinter(indent=2)
session = boto3.session.Session()
region = session.region_name or 'ca-central-1' 
bedrock_config = Config(connect_timeout=120, read_timeout=120, retries={'max_attempts': 0})

bedrock_agent_client = boto3.client("bedrock-agent-runtime",
                              region_name='ca-central-1',
                              config=bedrock_config,
                                    )
model_id = "anthropic.claude-3-sonnet-20240229-v1:0"

In [12]:
document_s3_uri = "s3://dev-managinglife-aws-bedrock/1. User Guide/Common Questions.md"
query = "Summarize the document"
response = retrieveAndGenerate(query, document_s3_uri, region)
generated_text = response['output']['text']
pp.pprint(generated_text)

s3://dev-managinglife-aws-bedrock/1. User Guide/Common Questions.md
('The document provides information for users of the Manage My Pain app. It '
 'covers:\n'
 '\n'
 '- Getting started and personalizing the app without logging in\n'
 '- Creating an account and protecting user data privacy\n'
 '- Adding pain records, daily reflections, medications, and other custom '
 'values\n'
 '- Generating reports to share with healthcare providers, including paid '
 'options for more comprehensive reports\n'
 '- Managing account information, subscription plans, and in-app purchases\n'
 '- Setting reminders for recording pain and reflections\n'
 '- Contacting customer support')


In [13]:
citations = response["citations"]
contexts = []
for citation in citations:
    retrievedReferences = citation["retrievedReferences"]
    for reference in retrievedReferences:
         contexts.append(reference["content"]["text"])

pp.pprint(contexts)

[ '# Getting Started ## 1. Personalize the Application ### Can I personalize '
  'the application without logging in to the app? Yes, you can personalize the '
  'application without logging in to the app.  ### Do I need to accept the End '
  'User License Agreement? Yes, accepting the End User License Agreement is '
  "required to use Manage My Pain.  ### Why can't I find my pain condition? "
  'Our users have added thousands of conditions, but we just show the most '
  "common.  If you can't find one that suits your situation, simply add it!  "
  '### Do I have to add any pain conditions? No, all information entered into '
  'Manage My Pain is voluntary.  Keep in mind, the more information that is '
  'entered, the more powerful the reports can be.  ### Why are you asking me '
  'this? This information allows us to configure the application to better '
  'suit your situation, and ensures that the reports generated from Manage My '
  'Pain are a better communication tool.  ### Do I ha

# Knowledge base template
This notebook provides sample code for building an empty OpenSearch Serverless (OSS) index,Knowledge bases for Amazon Bedrock and ingest documents into the index from various data sources (S3, Confluence, Sharepoint, Salesforce, and Web). 

A data pipeline that ingests documents (typically stored in multiple data sources) into a knowledge base i.e. a vector database such as Amazon OpenSearch Service Serverless (AOSS) so that it is available for lookup when a question is received.

- Create an empty OpenSearch serverless index.
- Create knowledge base
- Create data source(s) within knowledge base
- For each data source, start ingestion jobs using KB APIs which will read data from the data source, chunk it, convert chunks into embeddings using Amazon Titan Embeddings model and then store these embeddings in AOSS. All of this without having to build, deploy and manage the data pipeline.

In [14]:
suffix = random.randrange(200, 900)

sts_client = boto3.client('sts')
boto3_session = boto3.session.Session()
region_name = boto3_session.region_name or 'ca-central-1'

bedrock_agent_client = boto3.client('bedrock-agent', region_name=region_name)
bedrock_agent_runtime_client = boto3.client('bedrock-agent-runtime', region_name=region_name)

service = 'aoss'
s3_client = boto3.client('s3')
account_id = sts_client.get_caller_identity()["Account"]
s3_suffix = f"{region_name}-{account_id}"

In [15]:
bucket_name = 'dev-managinglife-aws-bedrock'

## Below is a list of data sources including, 1 S3 buckets, 1 confluence, 1 Sharepoint, 1 Salesforce connectors

data_sources=[
                {"type": "S3", "bucket_name": bucket_name}, 
                
                # {"type": "CONFLUENCE", "hostUrl": "https://example.atlassian.net", "authType": "BASIC",
                #  "credentialsSecretArn": f"arn:aws::secretsmanager:{region_name}:secret:<<your_secret_name>>"},

                # {"type": "SHAREPOINT", "tenantId": "888d0b57-69f1-4fb8-957f-e1f0bedf64de", "domain": "yourdomain",
                #   "authType": "OAUTH2_CLIENT_CREDENTIALS",
                #  "credentialsSecretArn": f"arn:aws::secretsmanager:{region_name}:secret:<<your_secret_name>>",
                #  "siteUrls": ["https://yourdomain.sharepoint.com/sites/mysite"]
                # },

                # {"type": "SALESFORCE", "hostUrl": "https://company.salesforce.com/", "authType": "OAUTH2_CLIENT_CREDENTIALS",
                #  "credentialsSecretArn": f"arn:aws::secretsmanager:{region_name}:secret:<<your_secret_name>>"
                # },

                # {"type": "WEB", "seedUrls": [{ "url": "https://www.examplesite.com"}],
                #  "inclusionFilters": ["https://www\.examplesite\.com/.*\.html"],
                #  "exclusionFilters": ["https://www\.examplesite\.com/contact-us\.html"]
                # }
            ]
                
pp = pprint.PrettyPrinter(indent=2)

In [16]:
# For S3 data source, check if S3 bucket exists, and if not create S3 bucket for knowledge base data source

for ds in [d for d in data_sources if d['type']== 'S3']:
    bucket_name = ds['bucket_name']
    s3_client.head_bucket(Bucket=bucket_name)
    print(f'Bucket {bucket_name} Exists')
   

Bucket dev-managinglife-aws-bedrock Exists


## Create a vector store - Open Search Serverless Index

In [17]:
vector_store_name = f'bedrock-sample-rag-{suffix}'
index_name = f"bedrock-sample-rag-index-{suffix}"
aoss_client = boto3_session.client('opensearchserverless', region_name = 'ca-central-1')

# Personal ARN 
bedrock_kb_execution_role_arn = 'arn:aws:iam::442186832995:role/service-role/AmazonBedrockExecutionRoleForKnowledgeBase_nmhn9'

In [18]:
# create security, network and data access policies within OSS
encryption_policy, network_policy, access_policy = create_policies_in_oss(vector_store_name=vector_store_name,
                       aoss_client=aoss_client,
                       bedrock_kb_execution_role_arn=bedrock_kb_execution_role_arn)
collection = aoss_client.create_collection(name=vector_store_name,type='VECTORSEARCH')

In [19]:
pp.pprint(collection)

{ 'ResponseMetadata': { 'HTTPHeaders': { 'connection': 'keep-alive',
                                         'content-length': '317',
                                         'content-type': 'application/x-amz-json-1.0',
                                         'date': 'Mon, 07 Oct 2024 15:49:01 '
                                                 'GMT',
                                         'x-amzn-requestid': '84dac4f9-283f-4265-a38e-91965ed5bc31'},
                        'HTTPStatusCode': 200,
                        'RequestId': '84dac4f9-283f-4265-a38e-91965ed5bc31',
                        'RetryAttempts': 0},
  'createCollectionDetail': { 'arn': 'arn:aws:aoss:ca-central-1:442186832995:collection/iju9hsanwdefvqjeucdc',
                              'createdDate': 1728316141498,
                              'id': 'iju9hsanwdefvqjeucdc',
                              'kmsKeyArn': 'auto',
                              'lastModifiedDate': 1728316141498,
                          

In [20]:
# Get the OpenSearch serverless collection URL
collection_id = collection['createCollectionDetail']['id']
host = collection_id + '.' + region_name + '.aoss.amazonaws.com'
print(host)

iju9hsanwdefvqjeucdc.ca-central-1.aoss.amazonaws.com


In [21]:
# Wait for collection creation
response = aoss_client.batch_get_collection(names=[vector_store_name])
# Periodically check collection status
while (response['collectionDetails'][0]['status']) == 'CREATING':
    print('Creating collection...')
    interactive_sleep(30)
    response = aoss_client.batch_get_collection(names=[vector_store_name])
print('\nCollection successfully created:')
pp.pprint(response["collectionDetails"])


Collection successfully created:
[ { 'arn': 'arn:aws:aoss:ca-central-1:442186832995:collection/iju9hsanwdefvqjeucdc',
    'collectionEndpoint': 'https://iju9hsanwdefvqjeucdc.ca-central-1.aoss.amazonaws.com',
    'createdDate': 1728316141498,
    'dashboardEndpoint': 'https://iju9hsanwdefvqjeucdc.ca-central-1.aoss.amazonaws.com/_dashboards',
    'id': 'iju9hsanwdefvqjeucdc',
    'kmsKeyArn': 'auto',
    'lastModifiedDate': 1728316164606,
    'name': 'bedrock-sample-rag-825',
    'standbyReplicas': 'ENABLED',
    'status': 'ACTIVE',
    'type': 'VECTORSEARCH'}]


In [22]:
# create opensearch serverless access policy and attach it to Bedrock execution role
try:
    create_oss_policy_attach_bedrock_execution_role(collection_id=collection_id,
                                                    bedrock_kb_execution_role=bedrock_kb_execution_role)
    # It can take up to a minute for data access rules to be enforced
    interactive_sleep(60)
except Exception as e:
    print("Policy already exists")
    pp.pprint(e)

Policy already exists
NameError("name 'bedrock_kb_execution_role' is not defined")


## Create vector index

In [23]:
# Create the vector index in Opensearch serverless, with the knn_vector field index mapping, specifying the dimension size, name and engine.
from opensearchpy import OpenSearch, RequestsHttpConnection, AWSV4SignerAuth, RequestError
credentials = boto3.Session().get_credentials()
awsauth = auth = AWSV4SignerAuth(credentials, region_name, service)

index_name = f"bedrock-sample-index-{suffix}"
body_json = {
   "settings": {
      "index.knn": "true",
       "number_of_shards": 1,
       "knn.algo_param.ef_search": 512,
       "number_of_replicas": 0,
   },
   "mappings": {
      "properties": {
         "vector": {
            "type": "knn_vector",
            "dimension": 1024,
             "method": {
                 "name": "hnsw",
                 "engine": "faiss",
                 "space_type": "l2"
             },
         },
         "text": {
            "type": "text"
         },
         "text-metadata": {
            "type": "text"         }
      }
   }
}

# Build the OpenSearch client
oss_client = OpenSearch(
    hosts=[{'host': host, 'port': 443}],
    http_auth=awsauth,
    use_ssl=True,
    verify_certs=True,
    connection_class=RequestsHttpConnection,
    timeout=300
)


In [24]:
# Create index
try:
    response = oss_client.indices.create(index=index_name, body=json.dumps(body_json))
    print('\nCreating index:')
    pp.pprint(response)

    # index creation can take up to a minute
    interactive_sleep(60)
except RequestError as e:
    # you can delete the index if its already exists
    # oss_client.indices.delete(index=index_name)
    print(f'Error while trying to create the index, with error {e.error}\nyou may unmark the delete above to delete, and recreate the index')
    


Creating index:
{ 'acknowledged': True,
  'index': 'bedrock-sample-index-825',
  'shards_acknowledged': True}
............................................................

## Create KB

In [25]:
opensearchServerlessConfiguration = {
            "collectionArn": collection["createCollectionDetail"]['arn'],
            "vectorIndexName": index_name,
            "fieldMapping": {
                "vectorField": "vector",
                "textField": "text",
                "metadataField": "text-metadata"
            }
        }

# The embedding model used by Bedrock to embed ingested documents, and realtime prompts
embeddingModelArn = f"arn:aws:bedrock:{region_name}::foundation-model/amazon.titan-embed-text-v2:0"
                    
name = f"bedrock-sample-knowledge-base-{suffix}"
description = "Amazon shareholder letter knowledge base."
roleArn = bedrock_kb_execution_role_arn


In [26]:
# Create a KnowledgeBase
from retrying import retry

@retry(wait_random_min=1000, wait_random_max=2000,stop_max_attempt_number=7)
def create_knowledge_base_func():
    create_kb_response = bedrock_agent_client.create_knowledge_base(
        name = name,
        description = description,
        roleArn = roleArn,
        knowledgeBaseConfiguration = {
            "type": "VECTOR",
            "vectorKnowledgeBaseConfiguration": {
                "embeddingModelArn": embeddingModelArn
            }
        },
        storageConfiguration = {
            "type": "OPENSEARCH_SERVERLESS",
            "opensearchServerlessConfiguration":opensearchServerlessConfiguration
        }
    )
    return create_kb_response["knowledgeBase"]

In [27]:
try:
    kb = create_knowledge_base_func()
except Exception as err:
    print(f"{err=}, {type(err)=}")

pp.pprint(kb)

{ 'createdAt': datetime.datetime(2024, 10, 7, 15, 58, 1, 786250, tzinfo=tzutc()),
  'description': 'Amazon shareholder letter knowledge base.',
  'knowledgeBaseArn': 'arn:aws:bedrock:ca-central-1:442186832995:knowledge-base/LSGWC8EJRL',
  'knowledgeBaseConfiguration': { 'type': 'VECTOR',
                                  'vectorKnowledgeBaseConfiguration': { 'embeddingModelArn': 'arn:aws:bedrock:ca-central-1::foundation-model/amazon.titan-embed-text-v2:0'}},
  'knowledgeBaseId': 'LSGWC8EJRL',
  'name': 'bedrock-sample-knowledge-base-825',
  'roleArn': 'arn:aws:iam::442186832995:role/service-role/AmazonBedrockExecutionRoleForKnowledgeBase_nmhn9',
  'status': 'CREATING',
  'storageConfiguration': { 'opensearchServerlessConfiguration': { 'collectionArn': 'arn:aws:aoss:ca-central-1:442186832995:collection/iju9hsanwdefvqjeucdc',
                                                                   'fieldMapping': { 'metadataField': 'text-metadata',
                                             

In [28]:
# Get KnowledgeBase 
get_kb_response = bedrock_agent_client.get_knowledge_base(knowledgeBaseId = kb['knowledgeBaseId'])

## Create data sources

- chunkingStrategyConfiguration can modify the parameters

NOTE: In the current sample, we'll use FIXED_SIZE chunking Strategy but you can also use other chunking chunking strategies like HIERARCHICAL, SEMANTIC or NONE. For more details on the chunking startegies please refer to the [AWS documentation page](https://docs.aws.amazon.com/bedrock/latest/APIReference/API_agent_ChunkingConfiguration.html)

In [33]:
# Function to create KB
def create_ds(data_sources):
    ds_list=[]
    for idx, ds in enumerate(data_sources):
        # Ingest strategy - How to ingest data from the data source
        chunkingStrategyConfiguration = {
            "chunkingStrategy": "FIXED_SIZE", 
            "fixedSizeChunkingConfiguration": {
                "maxTokens": 512,
                "overlapPercentage": 20
            }
        }
        
        # The data source to ingest documents from, into the OpenSearch serverless knowledge base index
        
        s3DataSourceConfiguration = {
                "type": "S3",
                "s3Configuration":{
                    "bucketArn": "",
                    # "inclusionPrefixes":["*.*"] # you can use this if you want to create a KB using data within s3 prefixes.
                    }
            }
        
        confluenceDataSourceConfiguration = {
            "confluenceConfiguration": {
                "sourceConfiguration": {
                    "hostUrl": "",
                    "hostType": "SAAS",
                    "authType": "", # BASIC | OAUTH2_CLIENT_CREDENTIALS
                    "credentialsSecretArn": ""
                    
                },
                "crawlerConfiguration": {
                    "filterConfiguration": {
                        "type": "PATTERN",
                        "patternObjectFilter": {
                            "filters": [
                                {
                                    "objectType": "Attachment",
                                    "inclusionFilters": [
                                        ".*\\.pdf"
                                    ],
                                    "exclusionFilters": [
                                        ".*private.*\\.pdf"
                                    ]
                                }
                            ]
                        }
                    }
                }
            },
            "type": "CONFLUENCE"
        }

        sharepointDataSourceConfiguration = {
            "sharePointConfiguration": {
                "sourceConfiguration": {
                    "tenantId": "",
                    "hostType": "ONLINE",
                    "domain": "domain",
                    "siteUrls": [],
                    "authType": "", # BASIC | OAUTH2_CLIENT_CREDENTIALS
                    "credentialsSecretArn": ""
                    
                },
                "crawlerConfiguration": {
                    "filterConfiguration": {
                        "type": "PATTERN",
                        "patternObjectFilter": {
                            "filters": [
                                {
                                    "objectType": "Attachment",
                                    "inclusionFilters": [
                                        ".*\\.pdf"
                                    ],
                                    "exclusionFilters": [
                                        ".*private.*\\.pdf"
                                    ]
                                }
                            ]
                        }
                    }
                }
            },
            "type": "SHAREPOINT"
        }


        salesforceDataSourceConfiguration = {
            "salesforceConfiguration": {
                "sourceConfiguration": {
                    "hostUrl": "",
                    "authType": "", # BASIC | OAUTH2_CLIENT_CREDENTIALS
                    "credentialsSecretArn": ""
                },
                "crawlerConfiguration": {
                    "filterConfiguration": {
                        "type": "PATTERN",
                        "patternObjectFilter": {
                            "filters": [
                                {
                                    "objectType": "Attachment",
                                    "inclusionFilters": [
                                        ".*\\.pdf"
                                    ],
                                    "exclusionFilters": [
                                        ".*private.*\\.pdf"
                                    ]
                                }
                            ]
                        }
                    }
                }
            },
            "type": "SALESFORCE"
        }

        webcrawlerDataSourceConfiguration = {
            "webConfiguration": {
                "sourceConfiguration": {
                    "urlConfiguration": {
                        "seedUrls": []
                    }
                },
                "crawlerConfiguration": {
                    "crawlerLimits": {
                        "rateLimit": 50
                    },
                    "scope": "HOST_ONLY",
                    "inclusionFilters": [],
                    "exclusionFilters": []
                }
            },
            "type": "WEB"
        }

        # Set the data source configuration based on the Data source type

        if ds['type'] == "S3":
            print(f'{idx +1 } data source: S3')
            ds_name = f'{name}-{bucket_name}'
            s3DataSourceConfiguration["s3Configuration"]["bucketArn"] = f'arn:aws:s3:::{ds["bucket_name"]}'
            # print(s3DataSourceConfiguration)
            data_source_configuration = s3DataSourceConfiguration
        
        if ds['type'] == "CONFLUENCE":
            print(f'{idx +1 } data source: CONFLUENCE')
            ds_name = f'{name}-confluence'
            confluenceDataSourceConfiguration['confluenceConfiguration']['sourceConfiguration']['hostUrl'] = ds['hostUrl']
            confluenceDataSourceConfiguration['confluenceConfiguration']['sourceConfiguration']['authType'] = ds['authType']
            confluenceDataSourceConfiguration['confluenceConfiguration']['sourceConfiguration']['credentialsSecretArn'] = ds['credentialsSecretArn']
            # print(confluenceDataSourceConfiguration)
            data_source_configuration = confluenceDataSourceConfiguration

        if ds['type'] == "SHAREPOINT":
            print(f'{idx +1 } data source: SHAREPOINT')
            ds_name = f'{name}-sharepoint'
            sharepointDataSourceConfiguration['sharePointConfiguration']['sourceConfiguration']['tenantId'] = ds['tenantId']
            sharepointDataSourceConfiguration['sharePointConfiguration']['sourceConfiguration']['domain'] = ds['domain']
            sharepointDataSourceConfiguration['sharePointConfiguration']['sourceConfiguration']['authType'] = ds['authType']
            sharepointDataSourceConfiguration['sharePointConfiguration']['sourceConfiguration']['siteUrls'] = ds["siteUrls"]
            sharepointDataSourceConfiguration['sharePointConfiguration']['sourceConfiguration']['credentialsSecretArn'] = ds['credentialsSecretArn']
            # print(sharepointDataSourceConfiguration)
            data_source_configuration = sharepointDataSourceConfiguration


        if ds['type'] == "SALESFORCE":
            print(f'{idx +1 } data source: SALESFORCE')
            ds_name = f'{name}-salesforce'
            salesforceDataSourceConfiguration['salesforceConfiguration']['sourceConfiguration']['hostUrl'] = ds['hostUrl']
            salesforceDataSourceConfiguration['salesforceConfiguration']['sourceConfiguration']['authType'] = ds['authType']
            salesforceDataSourceConfiguration['salesforceConfiguration']['sourceConfiguration']['credentialsSecretArn'] = ds['credentialsSecretArn']
            # print(salesforceDataSourceConfiguration)
            data_source_configuration = salesforceDataSourceConfiguration

        if ds['type'] == "WEB":
            print(f'{idx +1 } data source: WEB')
            ds_name = f'{name}-web'
            webcrawlerDataSourceConfiguration['webConfiguration']['sourceConfiguration']['urlConfiguration']['seedUrls'] = ds['seedUrls']
            webcrawlerDataSourceConfiguration['webConfiguration']['crawlerConfiguration']['inclusionFilters'] = ds['inclusionFilters']
            webcrawlerDataSourceConfiguration['webConfiguration']['crawlerConfiguration']['exclusionFilters'] = ds['exclusionFilters']
            # print(webcrawlerDataSourceConfiguration)
            data_source_configuration = webcrawlerDataSourceConfiguration
            

        # Create a DataSource in KnowledgeBase 
        create_ds_response = bedrock_agent_client.create_data_source(
            name = ds_name,
            description = description,
            knowledgeBaseId = kb['knowledgeBaseId'],
            dataSourceConfiguration = data_source_configuration,
            vectorIngestionConfiguration = {
                "chunkingConfiguration": chunkingStrategyConfiguration
            }
        )
        ds = create_ds_response["dataSource"]
        pp.pprint(ds)
        ds_list.append(ds)
    return ds_list

In [32]:
data_sources_list = create_ds(data_sources)

1 data source: S3
{'type': 'S3', 's3Configuration': {'bucketArn': 'arn:aws:s3:::dev-managinglife-aws-bedrock'}}


ConflictException: An error occurred (ConflictException) when calling the CreateDataSource operation: DataSource with name bedrock-sample-knowledge-base-825-dev-managinglife-aws-bedrock already exists.

In [34]:
data_sources_list

[{'createdAt': datetime.datetime(2024, 10, 7, 16, 0, 41, 110404, tzinfo=tzutc()),
  'dataDeletionPolicy': 'DELETE',
  'dataSourceConfiguration': {'s3Configuration': {'bucketArn': 'arn:aws:s3:::dev-managinglife-aws-bedrock'},
   'type': 'S3'},
  'dataSourceId': 'CNUAPTNKUR',
  'description': 'Amazon shareholder letter knowledge base.',
  'knowledgeBaseId': 'LSGWC8EJRL',
  'name': 'bedrock-sample-knowledge-base-825-dev-managinglife-aws-bedrock',
  'status': 'AVAILABLE',
  'updatedAt': datetime.datetime(2024, 10, 7, 16, 0, 41, 110404, tzinfo=tzutc()),
  'vectorIngestionConfiguration': {'chunkingConfiguration': {'chunkingStrategy': 'FIXED_SIZE',
    'fixedSizeChunkingConfiguration': {'maxTokens': 512,
     'overlapPercentage': 20}}}}]

In [35]:
# Get DataSource 
for idx, ds in enumerate(data_sources_list):
    print(bedrock_agent_client.get_data_source(knowledgeBaseId = kb['knowledgeBaseId'], dataSourceId = ds["dataSourceId"]))
    print(" ")

{'ResponseMetadata': {'RequestId': 'ad5a23af-271c-457b-80ec-9e9b16c301b8', 'HTTPStatusCode': 200, 'HTTPHeaders': {'date': 'Mon, 07 Oct 2024 16:01:36 GMT', 'content-type': 'application/json', 'content-length': '627', 'connection': 'keep-alive', 'x-amzn-requestid': 'ad5a23af-271c-457b-80ec-9e9b16c301b8', 'x-amz-apigw-id': 'fSXbDGDW4osEe1w=', 'x-amzn-trace-id': 'Root=1-670405e0-0c47ecc64a0280c938f76585'}, 'RetryAttempts': 0}, 'dataSource': {'createdAt': datetime.datetime(2024, 10, 7, 16, 0, 41, 110404, tzinfo=tzutc()), 'dataDeletionPolicy': 'DELETE', 'dataSourceConfiguration': {'s3Configuration': {'bucketArn': 'arn:aws:s3:::dev-managinglife-aws-bedrock'}, 'type': 'S3'}, 'dataSourceId': 'CNUAPTNKUR', 'description': 'Amazon shareholder letter knowledge base.', 'knowledgeBaseId': 'LSGWC8EJRL', 'name': 'bedrock-sample-knowledge-base-825-dev-managinglife-aws-bedrock', 'status': 'AVAILABLE', 'updatedAt': datetime.datetime(2024, 10, 7, 16, 0, 41, 110404, tzinfo=tzutc()), 'vectorIngestionConfigur

# Start Ingestions

In [36]:
interactive_sleep(30)
ingest_jobs=[]
# Start an ingestion job
for idx, ds in enumerate(data_sources_list):
    try:
        start_job_response = bedrock_agent_client.start_ingestion_job(knowledgeBaseId = kb['knowledgeBaseId'], dataSourceId = ds["dataSourceId"])
        job = start_job_response["ingestionJob"]
        print(f"job {idx} started successfully\n")
    
        while job['status'] not in ["COMPLETE", "FAILED", "STOPPED"]:
            get_job_response = bedrock_agent_client.get_ingestion_job(
              knowledgeBaseId = kb['knowledgeBaseId'],
                dataSourceId = ds["dataSourceId"],
                ingestionJobId = job["ingestionJobId"]
          )
            job = get_job_response["ingestionJob"]
        pp.pprint(job)
        interactive_sleep(40)

        ingest_jobs.append(job)
    except Exception as e:
        print(f"Couldn't start {idx} job.\n")
        print(e)
    
    

job 0 started successfully....

{ 'dataSourceId': 'CNUAPTNKUR',
  'ingestionJobId': 'OJ7MK6PFD6',
  'knowledgeBaseId': 'LSGWC8EJRL',
  'startedAt': datetime.datetime(2024, 10, 7, 16, 2, 40, 28550, tzinfo=tzutc()),
  'statistics': { 'numberOfDocumentsDeleted': 0,
                  'numberOfDocumentsFailed': 0,
                  'numberOfDocumentsScanned': 13,
                  'numberOfMetadataDocumentsModified': 0,
                  'numberOfMetadataDocumentsScanned': 0,
                  'numberOfModifiedDocumentsIndexed': 0,
                  'numberOfNewDocumentsIndexed': 13},
  'status': 'COMPLETE',
  'updatedAt': datetime.datetime(2024, 10, 7, 16, 3, 0, 955326, tzinfo=tzutc())}
........................................

In [37]:
# Print the knowledge base Id in bedrock, that corresponds to the Opensearch index in the collection we created before, we will use it for the invocation later
kb_id = kb["knowledgeBaseId"]
pp.pprint(kb_id)

'LSGWC8EJRL'


In [38]:
# keep the kb_id for invocation later in the invoke request
%store kb_id

Stored 'kb_id' (str)


## Testing the Knowledge Base

Retrieve and Generate API

In [40]:
query = "How do I change my email?"

foundation_model = "anthropic.claude-3-sonnet-20240229-v1:0"

response = bedrock_agent_runtime_client.retrieve_and_generate(
    input={
        "text": query
    },
    retrieveAndGenerateConfiguration={
        "type": "KNOWLEDGE_BASE",
        "knowledgeBaseConfiguration": {
            'knowledgeBaseId': kb_id,
            "modelArn": "arn:aws:bedrock:{}::foundation-model/{}".format(region_name, foundation_model),
            "retrievalConfiguration": {
                "vectorSearchConfiguration": {
                    "numberOfResults":5
                } 
            }
        }
    }
)

print(response['output']['text'],end='\n'*2)

To change your email address in the Manage My Pain app:

1. Open the app and go to the "My Profile" section.
2. Expand the "Account Information" section and press the pencil icon next to your email address.
3. Enter your new email address and current password, then press "CHANGE EMAIL ADDRESS".
4. Check your new email inbox for a confirmation email and click the link to confirm the email change.
5. You can now log in with your new email address.



In [43]:
citations = response["citations"]
contexts = []
for citation in citations:
    retrievedReferences = citation["retrievedReferences"]
    for reference in retrievedReferences:
         contexts.append(reference["content"]["text"])

pp.pprint(contexts)

[ '# My Profile ## 1. Account Information ### a. Change Email Address 1. Go to '
  'Account Information section Open the app and click the "My Profile" '
  'button.  On the "My Profile" screen, expand the "Account Information" '
  'section.  Press the "pencil" icon on the right-hand side of the email '
  'address field.  You will be taken to "Change Email Address" screen. 2. '
  'Request to change email address On the "Change Email Address" screen, enter '
  'a new valid email address and your current password.  Press "CHANGE EMAIL '
  'ADDRESS" to confirm the change. 3. Check email request After changed email '
  'address, a message will pop-up at the bottom of the screen that says '
  '"Change email request is sent successfully, check your new email box". 4. '
  'Confirm change email request You will receive an email to update your email '
  'address. Press "Click here to confirm your email address update". 5. Log in '
  'to your account After you have confirmed your email address up

## Testing the Knowledge Base

Retrieve and Chunk API

In [45]:
response_ret = bedrock_agent_runtime_client.retrieve(
    knowledgeBaseId=kb_id, 
    nextToken='string',
    retrievalConfiguration={
        "vectorSearchConfiguration": {
            "numberOfResults":5,
        } 
    },
    retrievalQuery={
        "text": "How do I change my email?"
    }
)

def response_print(retrieve_resp):
#structure 'retrievalResults': list of contents. Each list has content, location, score, metadata
    for num,chunk in enumerate(response_ret['retrievalResults'],1):
        print(f'Chunk {num}: ',chunk['content']['text'],end='\n'*2)
        print(f'Chunk {num} Location: ',chunk['location'],end='\n'*2)
        print(f'Chunk {num} Score: ',chunk['score'],end='\n'*2)
        print(f'Chunk {num} Metadata: ',chunk['metadata'],end='\n'*2)

response_print(response_ret)

Chunk 1:  # My Profile ## 1. Account Information ### a. Change Email Address 1. Go to Account Information section Open the app and click the "My Profile" button.  On the "My Profile" screen, expand the "Account Information" section.  Press the "pencil" icon on the right-hand side of the email address field.  You will be taken to "Change Email Address" screen. 2. Request to change email address On the "Change Email Address" screen, enter a new valid email address and your current password.  Press "CHANGE EMAIL ADDRESS" to confirm the change. 3. Check email request After changed email address, a message will pop-up at the bottom of the screen that says "Change email request is sent successfully, check your new email box". 4. Confirm change email request You will receive an email to update your email address. Press "Click here to confirm your email address update". 5. Log in to your account After you have confirmed your email address update, you can log in using your new email address and

## Clean up

In [46]:

# Delete KnowledgeBase
for idx, ds in enumerate(data_sources_list):
    bedrock_agent_client.delete_data_source(dataSourceId = ds["dataSourceId"], knowledgeBaseId=kb['knowledgeBaseId'])
bedrock_agent_client.delete_knowledge_base(knowledgeBaseId=kb['knowledgeBaseId'])
oss_client.indices.delete(index=index_name)
aoss_client.delete_collection(id=collection_id)
aoss_client.delete_access_policy(type="data", name=access_policy['accessPolicyDetail']['name'])
aoss_client.delete_security_policy(type="network", name=network_policy['securityPolicyDetail']['name'])
aoss_client.delete_security_policy(type="encryption", name=encryption_policy['securityPolicyDetail']['name'])

{'ResponseMetadata': {'RequestId': 'bb3c505c-3680-4000-a941-083a8d3b16cd',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'bb3c505c-3680-4000-a941-083a8d3b16cd',
   'date': 'Mon, 07 Oct 2024 16:10:02 GMT',
   'content-type': 'application/x-amz-json-1.0',
   'content-length': '2',
   'connection': 'keep-alive'},
  'RetryAttempts': 0}}