# 2. Creating a managed PGVector knowledge base

This notebook walks you through the creation and usage of managed RAG (Retrieval Augmented Generation) using [Amazon Bedrock Knowledge Bases](https://docs.aws.amazon.com/bedrock/latest/userguide/knowledge-base.html). For this sample, we use PGVector in Amazon RDS Aurora PostgreSQL, with setup details described [here](https://docs.aws.amazon.com/bedrock/latest/userguide/knowledge-base-setup.html). 

At the time of writing, Bedrock Knowledge Bases use Amazon S3 as the ingestion source, so we will configure it to ingest the text files written to S3 in notebook 1. Additionally, for each product, we have written one text file and used the ASIN (uuid) as the file name. This will be important in RAG later for reliable source attribution during the search process.

We will also be selecting `No chunking` for ingestion, so that each file corresponds to one ASIN. This means that the text in each file has to adhere to the chunk size limit described [here](https://docs.aws.amazon.com/bedrock/latest/userguide/quotas.html#quotas-kb).

2.0. [Set up](#2.0)

2.1. [Configure RDS Aurora to permit access from Bedrock](#2.1)

2.2. [Create and configure a table for Bedrock knowledge base](#2.2)

2.3. [Manually set up a Bedrock knowledge base](#2.3)

2.4. [Create and track data source sync job](#2.4)

2.5. [Query the knowledge base](#2.5)

## 2.0 <a id="2.0">Set up</a>

In [None]:
# run this cell to upgrade to the latest version of boto3 if required, and restart the kernel
!pip install --upgrade --force --quiet botocore boto3

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import boto3
import sagemaker

import pandas as pd
import json
from time import sleep

<div class="alert alert-block alert-warning">

IMPORTANT! Please copy and paste the required information for your <b>RDS Aurora PostgreSQL database</b> into the cell below.
    
</div>

In [None]:
# SageMaker session: supplies the default S3 bucket, region and account ID read below.
sess = sagemaker.Session()
bucket = sess.default_bucket()
region = sess.boto_region_name
accountid = sess.account_id()
# S3 key prefixes inside `bucket`.
# NOTE(review): product_db_data_path is not referenced by the visible cells of this
# notebook -- presumably shared with another notebook in the series; confirm.
product_db_data_path = 'amazon-reviews-fashion-metadata'
bedrock_kb_data_path = 'bedrock-kb-data'
# S3 URI printed later as the "S3 Data URI" to use for the knowledge base data source.
bedrock_kb_datasource_uri = f's3://{bucket}/{bedrock_kb_data_path}/'

# Replace each '<TODO>' with the details of your Aurora PostgreSQL cluster.
database_identifier='<TODO>'  # passed as DBClusterIdentifier when the IAM role is attached
database_arn='<TODO>'  # used in the IAM policy and as resourceArn for the RDS Data API
database_secret_arn='<TODO>'  # passed as secretArn to the RDS Data API
database_name='<TODO>'  # passed as database to the RDS Data API

In [None]:
%mkdir -p util

In [None]:
# OPTIONAL: preview the pipe-delimited product file; column names are supplied explicitly.
item_columns = ['asin', 'title', 'brand', 'price', 'description', 'image']
items = pd.read_csv('items.txt', sep='|', names=item_columns, index_col=False)
items

## 2.1 <a id="2.1">Configure RDS Aurora to permit access from Bedrock</a>

Create an IAM policy and an IAM role, then attach the role to the Aurora cluster so that RDS and Bedrock can interact and the cluster can be used as a vector database.

In [None]:
# Trust policy: allows the RDS service principal (rds.amazonaws.com) to assume the role.
rds_trust_policy = f'''{{
    "Version": "2012-10-17",
    "Statement": [
        {{
            "Sid": "",
            "Effect": "Allow",
            "Principal": {{
                "Service": "rds.amazonaws.com"
            }},
            "Action": "sts:AssumeRole"
        }}
    ]
}}'''
# FeatureName passed to add_role_to_db_cluster when the role is attached to the cluster.
featurename = 'Bedrock'
# Names and descriptions of the IAM role and policy created in this cell.
rds_bedrock_role_name = 'AuroraML'
rds_bedrock_role_description = 'IAM role for RDS to interact with Bedrock'
rds_bedrock_policy_name = 'rds-bedrock-policy'
rds_bedrock_policy_description = 'IAM policy for RDS to interact with Bedrock'
# Permissions policy with three statements: invoke Bedrock models (provisioned models in
# this account and foundation models), describe the DB cluster, and execute statements
# through the RDS Data API. The last two are scoped to database_arn.
rds_bedrock_policy_document = f'''{{
    "Version": "2012-10-17",
    "Statement": [
        {{
            "Sid": "VisualEditor0",
            "Effect": "Allow",
            "Action": "bedrock:InvokeModel",
            "Resource": [
                "arn:aws:bedrock:*:{accountid}:provisioned-model/*",
                "arn:aws:bedrock:*::foundation-model/*"
            ]
        }},
        {{
            "Sid": "RdsDescribeStatementID",
            "Effect": "Allow",
            "Action": [
                "rds:DescribeDBClusters"
            ],
            "Resource": [
                "{database_arn}"
            ]
        }},
        {{
            "Sid": "DataAPIStatementID",
            "Effect": "Allow",
            "Action": [
                "rds-data:BatchExecuteStatement",
                "rds-data:ExecuteStatement"
            ],
            "Resource": [
                "{database_arn}"
            ]
        }}
    ]
}}'''
    
client = boto3.client('iam')

# Each step below is best-effort: a failure (for example because the resource
# already exists from an earlier run) is reported and the cell carries on.
try:
    response_policy = client.create_policy(
        PolicyName=rds_bedrock_policy_name,
        PolicyDocument=rds_bedrock_policy_document,
        Description=rds_bedrock_policy_description,
    )
    print('Policy created')
except Exception as e:
    print('Failed to create policy.', e)

try:
    response_role = client.create_role(
        RoleName=rds_bedrock_role_name,
        AssumeRolePolicyDocument=rds_trust_policy,
        Description=rds_bedrock_role_description,
    )
    print('Role created')
except Exception as e:
    print('Failed to create role.', e)

try:
    add_policy = client.attach_role_policy(
        RoleName=rds_bedrock_role_name,
        # The ARN is built from the account ID and policy name rather than read
        # from response_policy, so it is available even when create_policy failed.
        PolicyArn=f'arn:aws:iam::{accountid}:policy/{rds_bedrock_policy_name}'
    )
    print('Policy added to Role')
    sleep(5)  # Give the new attachment time to propagate before the role is used
except Exception as e:
    print('Failed to attach policy to role.', e)

rdsclient = boto3.client('rds')
try:
    # Look the role up by name and hand its ARN to the cluster for the given feature.
    response_role = client.get_role(RoleName=rds_bedrock_role_name)
    role_arn = response_role['Role']['Arn']
    response_db_role = rdsclient.add_role_to_db_cluster(
        RoleArn=role_arn,
        FeatureName=featurename,
        DBClusterIdentifier=database_identifier,
    )
except Exception as e:
    print('Failed to attach role to DB.', e)
else:
    print('Role attached to DB')

## 2.2 <a id="2.2">Create and configure a table for Bedrock Knowledge Base</a>

Refer to the [prerequisites](https://docs.aws.amazon.com/bedrock/latest/userguide/knowledge-base-setup.html)

In [None]:
# Dimension of the `embedding` vector column; pick the line that matches the
# embedding model selected for the knowledge base.
# embedding_size=1536 #titan embedding model
embedding_size = 1024  # cohere embedding models

# Every statement is written to be idempotent so the cell can be re-run safely.
sql_queries = [
    "CREATE EXTENSION IF NOT EXISTS vector;",
    "CREATE SCHEMA IF NOT EXISTS bedrock_integration;",
    f"""
CREATE TABLE IF NOT EXISTS bedrock_integration.bedrock_kb (
  id uuid PRIMARY KEY,
  embedding vector({embedding_size}),
  chunks text,
  metadata json
);
""",
    # IF NOT EXISTS requires an explicit index name; without it every re-run of
    # this cell would add another HNSW index on the same column.
    """
CREATE INDEX IF NOT EXISTS bedrock_kb_embedding_idx
  ON bedrock_integration.bedrock_kb
  USING hnsw (embedding vector_cosine_ops);
""",
]

rdsdata = boto3.client('rds-data')

# Run the statements in order through the RDS Data API and echo each response.
for query in sql_queries:
    response = rdsdata.batch_execute_statement(
        resourceArn=database_arn,
        secretArn=database_secret_arn,
        sql=query,
        database=database_name,
    )
    print(response)

## 2.3 <a id="2.3">Manually set up Amazon Bedrock Knowledge Base</a>

Refer to this [release blog](https://aws.amazon.com/blogs/database/build-generative-ai-applications-with-amazon-aurora-and-knowledge-bases-for-amazon-bedrock/) for screenshots.

1. Use your admin user in your Aurora PostgreSQL cluster to create a new user: bedrock_user OR select an existing app user for Amazon Bedrock to use to access your cluster. Grant the user permissions to interact with the table created earlier.
    ```
    CREATE ROLE bedrock_user WITH PASSWORD '<password>' LOGIN;
    GRANT ALL ON SCHEMA bedrock_integration to bedrock_user;
    GRANT ALL ON ALL TABLES IN SCHEMA bedrock_integration to bedrock_user;
    SET SESSION AUTHORIZATION bedrock_user;
    ```
<br>

2. Create a secret in AWS Secrets Manager to store the DB credentials for bedrock_user

   Go to [AWS Secrets Manager](https://console.aws.amazon.com/secretsmanager/)

3. Create Bedrock Knowledge Base using the AWS console

- Instructions to create Bedrock Knowledge Base:
    - For Amazon Aurora DB Cluster ARN, enter the ARN you saved when creating your Aurora cluster.
    - For Database name, enter postgres.
    - For Table name, enter bedrock_integration.bedrock_kb.
    - For Secret ARN, enter the ARN you saved when creating the secret for bedrock_user.
    - For Vector field, enter embedding.
    - For Text field, enter chunks.
    - For Bedrock-managed metadata field, enter metadata.
    - For Primary key, enter id.
- Instructions to create data source:
    - Create a new data source
    - Browse to the S3 path containing the output txt file(s)
    - Open advanced settings and set chunkingStrategy as NONE
    - Create the data source and click on sync to begin a new sync job

In [None]:
# Echo the values needed when filling in the knowledge base form in the console.
print(
    f"S3 Data URI: {bedrock_kb_datasource_uri} \n"
    f"Database ARN: {database_arn}\n"
    f"Database name: {database_name}"
)

<div class="alert alert-block alert-warning">

IMPORTANT! For the knowledge base that you have just created, please copy and paste the <b>Bedrock Knowledge Base ID</b>, the <b>Data Source ID</b> and <b>the secret ARN for bedrock_user</b> into the cell below.
    
</div>

In [None]:
# Replace each '<TODO>' with the values from the knowledge base created in the console.
bedrock_kb_id = '<TODO>'  # passed as knowledgeBaseId to the Bedrock agent calls below
data_source_id = '<TODO>'  # passed as dataSourceId to the Bedrock agent calls below
bedrock_user_secret_arn = '<TODO>'  # NOTE(review): not referenced by the visible cells of this notebook

In [None]:
# # [OPTIONAL] Update S3 Policy for Bedrock Knowledge Base if you get permission errors
# bedrock_agent_client = boto3.client('bedrock-agent')

# s3_role = bedrock_agent_client.get_knowledge_base(knowledgeBaseId=bedrock_kb_id)['knowledgeBase']['roleArn'].split('/')[-1]

# for policy in client.list_attached_role_policies(RoleName=s3_role)['AttachedPolicies']:
#     if 'AmazonBedrockS3PolicyForKnowledgeBase' in policy['PolicyName']:
#         s3_policy_arn = policy['PolicyArn']

# s3_policy_document = f'''{{
#     "Version": "2012-10-17",
#     "Statement": [
#         {{
#             "Sid": "S3ListBucketStatement",
#             "Effect": "Allow",
#             "Action": [
#                 "s3:ListBucket"
#             ],
#             "Resource": [
#                 "arn:aws:s3:::{bucket}"
#             ],
#             "Condition": {{
#                 "StringEquals": {{
#                     "aws:ResourceAccount": "{accountid}"
#                 }}
#             }}
#         }},
#         {{
#             "Sid": "S3GetObjectStatement",
#             "Effect": "Allow",
#             "Action": [
#                 "s3:GetObject"
#             ],
#             "Resource": [
#                 "arn:aws:s3:::{bucket}/{bedrock_kb_data_path}/*"
#             ],
#             "Condition": {{
#                 "StringEquals": {{
#                     "aws:ResourceAccount": "{accountid}"
#                 }}
#             }}
#         }}
#     ]
# }}'''

# s3_policy_response = client.create_policy_version(
#     PolicyArn= s3_policy_arn,
#     PolicyDocument= s3_policy_document,
#     SetAsDefault= True
# )


# print(f'Updated {s3_policy_arn}\n {s3_policy_response}')

## 2.4 <a id="2.4">Create and track data source sync job</a>

In [None]:
bedrock_agent_client = boto3.client('bedrock-agent')

# Wait for the data source to become AVAILABLE (up to 20 checks, 30 s apart),
# then start an ingestion (sync) job and keep its ID in response_sync_source.
max_status_checks = 20
for _ in range(max_status_checks):
    data_store_status = bedrock_agent_client.get_data_source(
        knowledgeBaseId=bedrock_kb_id,
        dataSourceId=data_source_id,
    )

    if data_store_status['dataSource']['status'] != 'AVAILABLE':
        sleep(30)
        continue

    try:
        response_sync_source = bedrock_agent_client.start_ingestion_job(
            knowledgeBaseId=bedrock_kb_id,
            dataSourceId=data_source_id,
        )['ingestionJob']['ingestionJobId']
        print(response_sync_source)
    except Exception as e:
        # Starting a job failed (presumably because one is already running):
        # report why, then fall back to tracking the most recently started job.
        print(e)
        ingestion_job_summaries = bedrock_agent_client.list_ingestion_jobs(
            knowledgeBaseId=bedrock_kb_id,
            dataSourceId=data_source_id,
            sortBy={'attribute': 'STARTED_AT', 'order': 'DESCENDING'},
        )['ingestionJobSummaries']
        if not ingestion_job_summaries:
            # Nothing to fall back to, so surface the original failure.
            raise
        response_sync_source = ingestion_job_summaries[0]['ingestionJobId']
    break
else:
    # Fail here with a clear message instead of leaving response_sync_source
    # undefined (or stale) for the cell that tracks the job.
    raise RuntimeError(
        f'Data source {data_source_id} did not become AVAILABLE after '
        f'{max_status_checks} checks; no ingestion job was started.'
    )

In [None]:
ingestion_job_id = response_sync_source
# ingestion_job_id = 'IUI8RBF2TW'

# get_ingestion_job returns a dict, so the job state has to be read from
# ['ingestionJob']['status']; comparing the whole response to a string never
# matches and would skip the wait entirely.
unfinished_statuses = ('STARTING', 'IN_PROGRESS', 'STOPPING')
while True:
    ingestion_job_response = bedrock_agent_client.get_ingestion_job(
        knowledgeBaseId=bedrock_kb_id,
        dataSourceId=data_source_id,
        ingestionJobId=ingestion_job_id,
    )
    if ingestion_job_response['ingestionJob']['status'] not in unfinished_statuses:
        break
    sleep(60)

# Display the final job description (status, statistics, failure reasons if any).
ingestion_job_response

## 2.5 <a id="2.5">Query the knowledge base</a>

In [None]:
%%writefile util/bedrockkb.py
import boto3

def bedrock_kb_retrieve(bedrock_kb_id, search_query, no_kb_results):
    """Run a vector search against a Bedrock knowledge base.

    Args:
        bedrock_kb_id: ID of the knowledge base to query.
        search_query: Free-text query sent as the retrieval query.
        no_kb_results: Number of results requested from the vector search.

    Returns:
        The list found under "retrievalResults" in the service response, or
        None when that list is empty.
    """
    client = boto3.client('bedrock-agent-runtime')

    response_kb = client.retrieve(
        knowledgeBaseId=bedrock_kb_id,
        retrievalConfiguration={
            'vectorSearchConfiguration': {
                'numberOfResults': no_kb_results,
            }
        },
        retrievalQuery={
            'text': search_query
        }
    )

    results = response_kb["retrievalResults"]

    # The results are already a plain list, so a single hit is returned as a
    # one-element list like any other; calling .tolist() on it raised
    # AttributeError. An empty result set is still reported as None.
    return results or None

In [None]:
from util.bedrockkb import bedrock_kb_retrieve

# Smoke test: ask the knowledge base for the top matches for a sample query.
search_query = 'sports shoes'
no_kb_results = 3
bedrock_kb_retrieve(
    bedrock_kb_id=bedrock_kb_id,
    search_query=search_query,
    no_kb_results=no_kb_results,
)