# File Name: simple_knwl_bases_building.ipynb
### Location: Chapter 7
### Purpose: 
#####             1. Create collection on serverless opensearch
#####             2. Create a network policy for collection
#####             3. Create a security policy for encryption using an AWS-owned key
#####             4. Create an access policy for collection to define permissions for the collection and index
#####             5. Call the create_access_policy method to define permissions for the collection and index
#####             6. Create a vector search collection in OpenSearch Serverless
#####             7. Collection will take some time to be "ACTIVE". So, checking when the collection is "ACTIVE" for the next steps
#####             8. Index Creation on the collection
#####             9. Create the Amazon Knowledge Bases
#####             10. Create a DataSource in KnowledgeBase 
#####             11. Ingest data into the Amazon Knowledge Bases.  
#####             12. Test the Amazon Knowledge Bases using RetrieveAndGenerate API and Retrieve API

##### Dependency: simple-sageMaker-bedrock.ipynb at Chapter 3 should work properly. 
# <ins>-----------------------------------------------------------------------------------</ins>


# <ins>Amazon SageMaker Classic</ins>
#### If you are new to Amazon SageMaker Classic, follow this link for details: https://docs.aws.amazon.com/sagemaker/latest/dg/studio.html

# <ins>Environment setup of Kernel</ins>
##### Fill "Image" as "Data Science"
##### Fill "Kernel" as "Python 3"
##### Fill "Instance type" as "ml.t3.medium"
##### Fill "Start-up script" as "No Scripts"
##### Click "Select"

###### Refer https://docs.aws.amazon.com/sagemaker/latest/dg/notebooks-create-open.html for details.

# <ins>Mandatory installation on the kernel through pip</ins>

##### This lab works with the software versions listed below. If you use newer versions of boto3, awscli, or botocore, this code may fail, and you might need to update the corresponding API calls.

##### You will see pip dependency errors. You can safely ignore these errors and continue executing the rest of the cells.

In [None]:
%pip install --no-build-isolation --force-reinstall -q \
    "boto3>=1.34.84" \
    "opensearch-py>=2.7.1" \
    "retrying>=1.3.4" \
    "ragas" \
    "ipywidgets>=7.6.5" \
    "iprogress>=0.4" \
    "langchain>=0.2.16" \
    "langchain_community>=0.2.17" \
    "awscli>=1.32.84" \
    "botocore>=1.34.84" \
    "langchain-aws>=0.1.7"    

# <ins>Disclaimer</ins>

##### You will see pip dependency errors. you can safely ignore these errors and continue executing rest of the cell.

# <ins>Restart the kernel</ins>

In [None]:
# Restart the kernel so the packages installed above are picked up.
# The HTML object injects a <script> that calls Jupyter.notebook.kernel.restart().
# NOTE(review): this relies on a `Jupyter` JavaScript global in the notebook
# front end -- confirm it exists in the environment you run this in; otherwise
# restart the kernel from the menu.
from IPython.core.display import HTML
HTML("<script>Jupyter.notebook.kernel.restart()</script>")

# <ins>Python package import</ins>

##### boto3 offers various clients for Amazon Bedrock to execute various actions.
##### botocore is a low-level interface to AWS tools, while boto3 is built on top of botocore and provides additional features

In [None]:
import json
import os
import boto3
import botocore
import pprint
import random
from retrying import retry
import warnings
import time
from opensearchpy import OpenSearch, RequestsHttpConnection, AWSV4SignerAuth, RequestError
from botocore.exceptions import NoCredentialsError, PartialCredentialsError
import pprint as pp
from botocore.exceptions import BotoCoreError, ClientError

### Ignore warning 

In [None]:
# Suppress every Python warning from here on (this also hides deprecation notices).
warnings.filterwarnings('ignore')

## Define important environment variable

In [None]:
%%time

# Try-except block to handle potential errors
# Build the AWS clients and derive every resource name used by the later cells.
# Any failure is printed and swallowed, so names assigned after the failing
# statement stay undefined and later cells will raise NameError for them.
try:
    # Create a new Boto3 session to interact with AWS services
    boto3_session_name = boto3.session.Session()

    # Retrieve the current AWS region from the session
    aws_region_name = boto3_session_name.region_name

    # Create a Bedrock Agent client using the current session and region
    bedrock_agent_client = boto3_session_name.client('bedrock-agent', region_name=aws_region_name)

    # Service name used when signing requests to Amazon OpenSearch Serverless (AOSS)
    opensearch_service_name = 'aoss'

    # Create an S3 client to interact with Amazon S3
    s3_client = boto3.client('s3')

    # Create a single STS client and ask for the caller identity once;
    # both the account ID and the caller ARN come from the same response.
    sts_client = boto3.client('sts')
    caller_identity = sts_client.get_caller_identity()
    aws_account_id = caller_identity["Account"]
    identity_arn = caller_identity.get('Arn')

    # Generate a suffix using the region and account ID for the S3 bucket name
    s3_suffix = f"{aws_region_name}-{aws_account_id}"

    # Random suffix in the range 200..899 (randrange excludes the upper bound)
    random_suffix = random.randrange(200, 900)

    # Define the name of the S3 bucket (you can replace this with your actual bucket name)
    s3_bucket_name = f'bedrock-kb-{s3_suffix}-{random_suffix}'

    # PrettyPrinter instance for formatted output
    pretty_printer = pprint.PrettyPrinter(indent=2)

    # Create an OpenSearch Serverless (AOSS) client using the current session
    aoss_client = boto3_session_name.client('opensearchserverless')

    # Generate names for the vector store and index based on the random suffix
    vector_store_name = f'bedrock-sample-rag-{random_suffix}'
    index_name = f"bedrock-sample-rag-index-{random_suffix}"

    # Create an IAM client to interact with Identity and Access Management (IAM) service
    iam_client = boto3_session_name.client('iam')

    # Policy names for the AOSS collection (encryption, network, data access)
    security_policy_name = f'aoss-collection-sec-policy-{random_suffix}'
    network_policy_name = f'aoss-collection-net-policy-{random_suffix}'
    access_policy_name = f'aoss-collection-acs-policy-{random_suffix}'

    # Embedding model ARN for Bedrock
    embeddingModelArn = f"arn:aws:bedrock:{aws_region_name}::foundation-model/amazon.titan-embed-text-v1"

    # Amazon Bedrock Knowledge Bases variables
    bedrock_knowledge_bases_name = f"bedrock-sample-knowledge-bases-{random_suffix}"
    description = "Amazon shareholder letter knowledge base."

    # Collect all variables in a dictionary so they can be printed in one pass
    variables_store = {
        "aws_region_name": aws_region_name,
        "bedrock_agent_client": bedrock_agent_client,
        "opensearch_service_name": opensearch_service_name,
        "s3_client": s3_client,
        "sts_client": sts_client,
        "aws_account_id": aws_account_id,
        "s3_suffix": s3_suffix,
        "s3_bucket_name": s3_bucket_name,
        "random_suffix": random_suffix,
        "aoss_client": aoss_client,
        "vector_store_name": vector_store_name,
        "index_name": index_name,
        "iam_client": iam_client,
        "identity_arn": identity_arn,
        "security_policy_name": security_policy_name,
        "network_policy_name": network_policy_name,
        "access_policy_name": access_policy_name,
        "embeddingModelArn": embeddingModelArn,
        "bedrock_knowledge_bases_name": bedrock_knowledge_bases_name,
        "description": description,
    }

    # Print all variables
    for var_name, value in variables_store.items():
        print(f"{var_name}: {value}")

except Exception as e:
    print(f"An unexpected error occurred: {e}")


### ---------------
##### The provided code snippet uses the AWS Boto3 library to manage an Amazon S3 bucket for a knowledge base data source. It begins by creating an S3 client and defines a bucket name, s3_bucket_name. A function, check_bucket_exists, checks whether the specified bucket exists by attempting to retrieve its metadata using the head_bucket method. If the bucket exists, a message is printed confirming its existence. If it does not exist (error code '404'), the function returns False. If the bucket is missing, the script proceeds to create it using the create_bucket method, ensuring the data source bucket is always available.

In [None]:
%%time

# Check if s3 bucket exists, and if not create S3 bucket for knowledge base data source

# Try-except block to handle potential errors
# Make sure the knowledge-base data-source bucket exists, creating it if needed.
try:
    # Create an S3 client to interact with Amazon S3
    s3_client = boto3.client('s3')

    # Define the bucket name (you can replace this with your actual bucket name)
    bucket_name = s3_bucket_name

    def check_bucket_exists(bucket_name):
        """Return True if head_bucket succeeds, False if S3 reports the bucket as missing.

        Any other ClientError (for example an access-denied response) is re-raised.
        """
        try:
            s3_client.head_bucket(Bucket=bucket_name)
        except botocore.exceptions.ClientError as e:
            if e.response['Error']['Code'] in ('404', 'NoSuchBucket'):
                print(f"Bucket '{bucket_name}' does not exist.")
                return False
            raise
        print(f"Bucket '{bucket_name}' already exists.")
        return True

    # If the bucket doesn't exist, create it
    if not check_bucket_exists(bucket_name):
        create_bucket_kwargs = {'Bucket': bucket_name}

        # Outside us-east-1, S3 requires an explicit LocationConstraint;
        # us-east-1 itself must NOT be passed as a constraint.
        bucket_region = s3_client.meta.region_name
        if bucket_region not in (None, 'us-east-1', 'aws-global'):
            create_bucket_kwargs['CreateBucketConfiguration'] = {
                'LocationConstraint': bucket_region
            }

        s3_client.create_bucket(**create_bucket_kwargs)
        print(f"Bucket '{bucket_name}' created successfully.")

# ClientError does not derive from BotoCoreError, so both are listed explicitly
except (botocore.exceptions.BotoCoreError, botocore.exceptions.ClientError) as boto_error:
    print(f"An error occurred with Boto3: {boto_error}")

except Exception as e:
    print(f"An unexpected error occurred: {e}")


### %store magic command to store the variable for use in other notebook cells

In [None]:
%store bucket_name aws_region_name opensearch_service_name embeddingModelArn description
%store aws_account_id s3_suffix s3_bucket_name random_suffix bedrock_knowledge_bases_name
%store vector_store_name index_name identity_arn security_policy_name network_policy_name access_policy_name

# Create collection on serverless opensearch

### Create a network policy for collection

##### This code creates a network security policy for an Amazon OpenSearch Serverless (AOSS) collection using the aoss_client from the AWS Boto3 library. The policy is named network_policy_name and specifies access rules in JSON format, targeting a resource identified as collection/<vector_store_name>. The policy type is set to network, with an option (AllowFromPublic) to allow public access, customizable based on the use case.

In [None]:
%%time

# Create a network policy for collection

# Register the network policy that governs how the collection can be reached.
try:
    # A single rule set targeting this notebook's collection
    network_rules = [
        {
            'Resource': [f'collection/{vector_store_name}'],
            'ResourceType': 'collection',
        }
    ]
    network_policy_document = [
        {
            'Rules': network_rules,
            # Public access is enabled here; tighten this for your own use case
            'AllowFromPublic': True,
        }
    ]

    network_policy_name_res = aoss_client.create_security_policy(
        name=network_policy_name,
        policy=json.dumps(network_policy_document),
        type='network',
    )
    print(f"Security policy '{network_policy_name}' created successfully.")

except aoss_client.exceptions.ConflictException:
    # A policy with this name is already registered
    print(f"Security policy '{network_policy_name}' already exists.")

except aoss_client.exceptions.ValidationException as e:
    # The service rejected the policy document or its parameters
    print(f"Validation error when creating security policy: {str(e)}")

except Exception as e:
    # Anything else is reported and swallowed so the notebook keeps running
    print(f"An error occurred while creating the security policy: {str(e)}")


### Create a security policy for encryption using an AWS-owned key

##### This code snippet creates an encryption security policy for an Amazon OpenSearch Serverless (AOSS) collection using the AWS Boto3 aoss_client. The policy, named security_policy_name, is configured to use an AWS-owned key for encryption (AWSOwnedKey: True). The target resource is specified as collection/<vector_store_name>, ensuring encryption rules apply specifically to the intended collection.

In [None]:
%%time

# Create a security policy for encryption using an AWS-owned key
# Register the encryption policy (AWS-owned key) for the collection.
try:
    encryption_policy_document = {
        'Rules': [
            {
                'Resource': [f'collection/{vector_store_name}'],
                'ResourceType': 'collection',
            }
        ],
        'AWSOwnedKey': True,
    }

    security_policy_response = aoss_client.create_security_policy(
        name=security_policy_name,
        policy=json.dumps(encryption_policy_document),
        type='encryption',
    )
    print(f"Security policy '{security_policy_name}' created successfully.")

except aoss_client.exceptions.ConflictException:
    # A policy with this name is already registered
    print(f"Security policy '{security_policy_name}' already exists.")

except aoss_client.exceptions.ValidationException as e:
    # The service rejected the policy document or its parameters
    print(f"Validation error when creating security policy: {str(e)}")

except Exception as e:
    # Anything else is reported and swallowed so the notebook keeps running
    print(f"An error occurred while creating security policy: {str(e)}")

### Create an access policy for collection to define permissions for the collection and index

##### This code defines a function, find_iam_role_by_name_substring, to locate an IAM role and retrieve its ARN based on a specified substring within the role name. Using the AWS Boto3 iam_client, it lists the IAM roles and filters them for names containing the substring "GenAIBookBedrockSageMakerExecutionRole".

In [None]:
%%time

# Find out IAM role and ARN for this session

def find_iam_role_by_name_substring(substring, client=None):
    """Return the ARN of an IAM role whose name contains ``substring``.

    All pages of ``list_roles`` are scanned, so roles beyond the first page
    are considered. Every match is printed; when several roles match, the ARN
    of the last one encountered is returned.

    :param substring: Text that must appear in the role name
    :param client: Optional IAM client; defaults to the notebook-level ``iam_client``
    :return: The matching role ARN, or None when nothing matches or the lookup fails
    """
    try:
        iam = iam_client if client is None else client
        matched_arn = None

        # Paginate: a single list_roles call returns only one page of roles
        for page in iam.get_paginator('list_roles').paginate():
            for role in page['Roles']:
                if substring in role['RoleName']:
                    print(f"Found Role: {role['RoleName']} | ARN: {role['Arn']}")
                    matched_arn = role['Arn']

        if matched_arn is None:
            print(f"No roles found with name containing '{substring}'.")

        return matched_arn

    except Exception as e:
        # Best effort: report the failure and hand back None
        print(f"An error occurred: {str(e)}")
        return None

# Call the function with the desired substring
genaibookedbedrocksagemakerexecutionrolearn = find_iam_role_by_name_substring("GenAIBookBedrockSageMakerExecutionRole")

### %store magic command to store the variable for use in other notebook cells

In [None]:
%store genaibookedbedrocksagemakerexecutionrolearn

### Call the create_access_policy method to define permissions for the collection and index

##### This code creates an access policy for managing permissions on an Amazon OpenSearch Serverless (AOSS) collection and index using the AWS Boto3 aoss_client. The policy, named access_policy_name, defines detailed rules for both the collection and index resources, specifying actions allowed for each.
##### Policy Rules:
#####               a) For collection resources (collection/<vector_store_name>)
#####               b) For index resources (index/<vector_store_name>/*)

In [None]:
%%time

# Register the data access policy granting the listed principals permissions
# on the collection and on every index inside it.
try:
    # Collection-level permissions
    collection_rule = {
        'Resource': [f'collection/{vector_store_name}'],
        'Permission': [
            'aoss:CreateCollectionItems',
            'aoss:DeleteCollectionItems',
            'aoss:UpdateCollectionItems',
            'aoss:DescribeCollectionItems',
        ],
        'ResourceType': 'collection',
    }

    # Index-level permissions, applied to all indexes of the collection
    index_rule = {
        'Resource': [f'index/{vector_store_name}/*'],
        'Permission': [
            'aoss:CreateIndex',
            'aoss:DeleteIndex',
            'aoss:UpdateIndex',
            'aoss:DescribeIndex',
            'aoss:ReadDocument',
            'aoss:WriteDocument',
        ],
        'ResourceType': 'index',
    }

    access_policy_document = [
        {
            'Rules': [collection_rule, index_rule],
            # The caller identity plus the execution role looked up earlier
            'Principal': [
                identity_arn,
                genaibookedbedrocksagemakerexecutionrolearn,
            ],
            'Description': 'Easy data policy',
        }
    ]

    access_policy_res = aoss_client.create_access_policy(
        name=access_policy_name,
        policy=json.dumps(access_policy_document),
        type='data',
    )
    print(f"Access policy '{access_policy_name}' created successfully.")

except aoss_client.exceptions.ConflictException:
    # A policy with this name is already registered
    print(f"Access policy '{access_policy_name}' already exists.")

except aoss_client.exceptions.ValidationException as e:
    # The service rejected the policy document or its parameters
    print(f"Validation error when creating access policy: {str(e)}")

except Exception as e:
    # Anything else is reported and swallowed so the notebook keeps running
    print(f"An error occurred while creating access policy: {str(e)}")

### Create a vector search collection in OpenSearch Serverless

##### This code attempts to create a vector search collection in Amazon OpenSearch Serverless using the AWS Boto3 aoss_client. The collection, named vector_store_name, is configured for VECTORSEARCH, a specialized type of collection used for vector-based information retrieval.

##### The create_collection method is called with the specified name and type (VECTORSEARCH) and followed by extracting collection details. 

In [None]:
%%time

# Try to create a vector search collection in OpenSearch Serverless
# Create the vector search collection. If a collection with this name already
# exists, look it up instead, so that aoss_collection_host / aoss_collectionarn
# are still defined for the %store cell and the later steps.
try:
    collection_detail = None
    try:
        response = aoss_client.create_collection(
            name=vector_store_name,
            type='VECTORSEARCH'
        )
        print(f"Collection '{vector_store_name}' creation is in progress.")
        print("Response:", response)
        collection_detail = response['createCollectionDetail']
        detail_note = "creation is in progress"
    except aoss_client.exceptions.ConflictException:
        print(f"Collection '{vector_store_name}' already exists.")
        # Re-running the cell: recover id/arn from the existing collection
        existing = aoss_client.batch_get_collection(names=[vector_store_name])['collectionDetails']
        if existing:
            collection_detail = existing[0]
            detail_note = "reused from the existing collection"
        else:
            print(f"Could not look up details for collection '{vector_store_name}'.")

    if collection_detail is not None:
        # The host name is built from the collection id and the region
        aoss_collection_host = collection_detail['id'] + '.' + aws_region_name + '.aoss.amazonaws.com'
        aoss_collectionarn = collection_detail['arn']

        print(f"aoss_collection_host '{aoss_collection_host}' {detail_note}.")
        print(f"aoss_collectionarn '{aoss_collectionarn}' {detail_note}.")

except aoss_client.exceptions.ValidationException as e:
    print(f"Validation error: {str(e)}")
except aoss_client.exceptions.ServiceQuotaExceededException as e:
    print(f"Service quota exceeded: {str(e)}")
except aoss_client.exceptions.OcuLimitExceededException as e:
    print(f"OCU limit exceeded: {str(e)}")
except aoss_client.exceptions.InternalServerException as e:
    print(f"Internal server error: {str(e)}")
except aoss_client.exceptions.ResourceNotFoundException as e:
    print(f"Resource not found: {str(e)}")
except Exception as e:
    print(f"An error occurred: {str(e)}")

### %store magic command to store the variable for use in other notebook cells

In [None]:
%store aoss_collection_host aoss_collectionarn

### Collection will take some time to be "ACTIVE". So, checking when the collection is "ACTIVE" for the next steps.

##### This code provides a mechanism to wait for an Amazon OpenSearch Serverless vector search collection to transition to the "ACTIVE" state, which is necessary before proceeding with subsequent operations.

In [None]:
%%time

# Collection will take some time to be "ACTIVE". So, checking when the collection is "ACTIVE" for the next steps.

def interactive_sleep(seconds):
    """A simple sleep function that could be replaced with more complex logic."""
    time.sleep(seconds)

def wait_for_collection_creation(aoss_client, vector_store_name):
    """Poll the collection every 30 seconds until it leaves the CREATING state.

    Success is reported only when the final status is ACTIVE; any other final
    status (for example FAILED) is reported as such instead of being announced
    as a successful creation. Errors are printed, not raised.

    :param aoss_client: OpenSearch Serverless client exposing batch_get_collection
    :param vector_store_name: Name of the collection to wait for
    """
    try:
        while True:
            response = aoss_client.batch_get_collection(names=[vector_store_name])
            status = response['collectionDetails'][0]['status']
            if status != 'CREATING':
                break
            print('Creating collection...')
            interactive_sleep(30)

        if status == 'ACTIVE':
            print(f'\nCollection successfully created: {vector_store_name}')
        else:
            print(f"\nCollection '{vector_store_name}' is not ACTIVE (status: {status}). "
                  "Resolve this before running the next steps.")

    except ClientError as e:
        print(f"An error occurred: {e.response['Error']['Message']}")
    except IndexError:
        # batch_get_collection returned an empty collectionDetails list
        print("No collection details found. Please check the collection name.")
    except Exception as e:
        print(f"An unexpected error occurred: {str(e)}")

# Block here until the collection has left the CREATING state
wait_for_collection_creation(aoss_client=aoss_client, vector_store_name=vector_store_name)

# Index Creation on the collection

##### This script defines a modular approach to create a KNN (k-Nearest Neighbor) vector index in an Amazon OpenSearch Serverless collection using Python and Boto3. The code provides functions for authentication, OpenSearch client creation, and index management while handling errors gracefully.

##### 1. AWS Authentication (get_aws_auth): Retrieves AWS credentials using boto3.Session() and constructs an AWSV4SignerAuth object.
##### 2. OpenSearch Client Creation (create_opensearch_client): Establishes a connection to OpenSearch using the provided host and AWS authentication.
##### 3. Vector Index Creation (create_vector_index): Checks if the specified index already exists using client.indices.exists().

In [None]:
%%time

def get_aws_auth(region_name, service):
    """Build an AWSV4SignerAuth from the credentials of a fresh boto3 session.

    :param region_name: AWS region passed to the signer
    :param service: Service name passed to the signer
    :return: The AWSV4SignerAuth instance
    :raises NoCredentialsError, PartialCredentialsError: printed, then re-raised
    """
    try:
        session_credentials = boto3.Session().get_credentials()
        return AWSV4SignerAuth(session_credentials, region_name, service)
    except (NoCredentialsError, PartialCredentialsError) as e:
        print(f"Error retrieving AWS credentials: {e}")
        raise

def create_opensearch_client(host, awsauth):
    """Return an OpenSearch client for ``host`` on port 443 with certificate checks on.

    :param host: Host name of the OpenSearch endpoint
    :param awsauth: Authentication object passed as ``http_auth``
    :return: The configured OpenSearch client
    :raises Exception: any construction error is printed, then re-raised
    """
    try:
        connection_settings = {
            'hosts': [{'host': host, 'port': 443}],
            'http_auth': awsauth,
            'use_ssl': True,
            'verify_certs': True,
            'connection_class': RequestsHttpConnection,
            'timeout': 300,
        }
        return OpenSearch(**connection_settings)
    except Exception as e:
        print(f"Error creating OpenSearch client: {e}")
        raise

def create_vector_index(client, index_name, index_body):
    """Create ``index_name`` with ``index_body`` unless the index already exists.

    :param client: OpenSearch client exposing ``indices.exists`` / ``indices.create``
    :param index_name: Name of the index
    :param index_body: Index definition; serialised with json.dumps before sending
    :raises RequestError: printed, then re-raised
    """
    try:
        if client.indices.exists(index=index_name):
            print(f"Index '{index_name}' already exists.")
            return

        serialized_body = json.dumps(index_body)
        client.indices.create(index=index_name, body=serialized_body)
        print(f"Index '{index_name}' created successfully.")
    except RequestError as e:
        print(f"Error creating index '{index_name}': {e}")
        raise

def create_index():
    """Create the k-NN vector index on the collection using the notebook's globals.

    Reads aws_region_name, opensearch_service_name, aoss_collection_host and
    index_name from the notebook scope. Failures are printed, not raised.
    """
    # Index-level settings: k-NN enabled, one shard, no replicas
    knn_settings = {
        "index.knn": "true",
        "number_of_shards": 1,
        "knn.algo_param.ef_search": 512,
        "number_of_replicas": 0,
    }

    # Field layout: the embedding vector plus the chunk text and its metadata.
    # NOTE(review): dimension 1536 presumably matches the embedding model
    # configured elsewhere in this notebook -- confirm if the model changes.
    field_mappings = {
        "vector": {
            "type": "knn_vector",
            "dimension": 1536,
            "method": {
                "name": "hnsw",
                "engine": "faiss",
                "space_type": "l2",
            },
        },
        "text": {"type": "text"},
        "text-metadata": {"type": "text"},
    }

    index_body = {
        "settings": knn_settings,
        "mappings": {"properties": field_mappings},
    }

    try:
        # Sign requests for the collection endpoint and connect to it
        signer = get_aws_auth(aws_region_name, opensearch_service_name)
        oss_client = create_opensearch_client(aoss_collection_host, signer)

        try:
            oss_client.indices.create(index=index_name, body=json.dumps(index_body))
        except RequestError as e:
            # If the index already exists you can delete it first with:
            # oss_client.indices.delete(index=index_name)
            print(f'Error while trying to create the index, with error {e.error}\nyou may unmark the delete above to delete, and recreate the index')
        else:
            print('\nCreating index:')
            # index creation can take up to a minute
            interactive_sleep(60)

    except Exception as e:
        print(f"An error occurred during the process: {e}")

create_index()

# Download and prepare dataset

In [None]:
import os

# Resolve the dataset folder relative to wherever the notebook is running:
# <current working directory>/data/rag_use_cases
current_directory = os.getcwd()
data_directory = os.path.join(current_directory, 'data', 'rag_use_cases')

# Show both paths so a wrong location is easy to spot
print(f"Current working directory: {current_directory}")
print(f"Data directory path: {data_directory}")

# Disclaimer
##### Make sure that data_directory points to the right path and that the data files are present. Otherwise, you need to change the code above.

In [None]:
%%time

def upload_directory_to_s3(directory_path, bucket_name, s3_prefix='', *, s3=None):
    """
    Uploads all files from a local directory (recursively) to an S3 bucket.

    Object keys always use '/' as the separator, whatever the local OS uses,
    and a missing directory is reported instead of silently uploading nothing.
    Errors are printed, not raised.

    :param directory_path: Local path to the directory
    :param bucket_name: The name of the target S3 bucket
    :param s3_prefix: The S3 folder (prefix) where the files will be uploaded (optional)
    :param s3: Optional S3 client; defaults to the notebook-level ``s3_client``
    """
    # os.walk yields nothing for a missing path, so check explicitly
    if not os.path.isdir(directory_path):
        print(f"Error: The directory {directory_path} does not exist")
        return

    try:
        client = s3_client if s3 is None else s3
        key_prefix = s3_prefix.rstrip('/')

        # Walk through the directory and upload files to S3
        for root, _dirs, files in os.walk(directory_path):
            for file_name in files:
                # Construct the full local path
                local_file_path = os.path.join(root, file_name)

                # Build the key from the path relative to the upload root,
                # normalising OS-specific separators to '/'
                relative_key = os.path.relpath(local_file_path, directory_path).replace(os.sep, '/')
                s3_file_path = f"{key_prefix}/{relative_key}" if key_prefix else relative_key

                # Upload file to S3
                client.upload_file(local_file_path, bucket_name, s3_file_path)
                print(f"Uploaded {local_file_path} to s3://{bucket_name}/{s3_file_path}")

    except FileNotFoundError as e:
        print(f"Error: The directory {directory_path} does not exist: {e}")
    except NoCredentialsError:
        print("Error: AWS credentials not found.")
    except PartialCredentialsError:
        print("Error: Incomplete AWS credentials.")
    except Exception as e:
        print(f"An error occurred: {str(e)}")


# Push the local dataset to the bucket, placing every object under the "data" prefix
upload_directory_to_s3(data_directory, s3_bucket_name, s3_prefix='data')


# Create the Amazon Knowledge Bases

##### This script is designed to create a knowledge base in Amazon Bedrock with OpenSearch Serverless as storage, using embeddings for vector-based search. It includes configurations for chunking document ingestion, retry mechanisms, and functions to handle the creation and retrieval of the knowledge base.

##### Chunking Strategy (chunkingStrategyConfiguration): Defines how documents are chunked before ingestion:
##### Fixed Size: The documents are chunked into parts, each having a maximum size (maxTokens: 512).
##### Overlap Percentage: 20% overlap between chunks to ensure that context is maintained across chunks.

In [None]:
%%time

# Global configuration
# Vector store settings: which collection and index back the knowledge base,
# and which index fields hold the vector, the chunk text and its metadata.
opensearchServerlessConfiguration = dict(
    collectionArn=aoss_collectionarn,
    vectorIndexName=index_name,
    fieldMapping=dict(
        vectorField="vector",
        textField="text",
        metadataField="text-metadata",
    ),
)

# Fixed-size chunking: at most 512 tokens per chunk, 20% overlap between chunks
chunkingStrategyConfiguration = dict(
    chunkingStrategy="FIXED_SIZE",
    fixedSizeChunkingConfiguration=dict(maxTokens=512, overlapPercentage=20),
)

# Data source location: the bucket created earlier in this notebook.
# An "inclusionPrefixes" entry can be added here to restrict ingestion if needed.
s3Configuration = dict(bucketArn=f"arn:aws:s3:::{s3_bucket_name}")

# Retry mechanism for creating the knowledge base
# Retry Mechanism: The @retry decorator retries the creation process with a random delay to handle transient failures.
@retry(wait_random_min=1000, wait_random_max=2000, stop_max_attempt_number=7)
def create_knowledge_base_func(bedrock_client, name, description, roleArn, opensearch_config, embedding_arn):
    """
    Create a vector knowledge base backed by OpenSearch Serverless.

    The @retry decorator re-invokes this function when it raises, waiting a
    random 1000-2000 ms between attempts and stopping after 7 attempts.

    :param bedrock_client: Client exposing ``create_knowledge_base``
    :param name: Name of the knowledge base
    :param description: Description of the knowledge base
    :param roleArn: IAM Role ARN for execution
    :param opensearch_config: OpenSearch Serverless storage configuration
    :param embedding_arn: ARN of the embedding model
    :return: The ``knowledgeBase`` entry of the service response
    :raises Exception: any error is printed and re-raised (which triggers a retry)
    """
    vector_kb_settings = {
        "type": "VECTOR",
        "vectorKnowledgeBaseConfiguration": {
            "embeddingModelArn": embedding_arn
        }
    }
    storage_settings = {
        "type": "OPENSEARCH_SERVERLESS",
        "opensearchServerlessConfiguration": opensearch_config
    }

    try:
        return bedrock_client.create_knowledge_base(
            name=name,
            description=description,
            roleArn=roleArn,
            knowledgeBaseConfiguration=vector_kb_settings,
            storageConfiguration=storage_settings,
        )["knowledgeBase"]
    except Exception as err:
        print(f"Error occurred while creating knowledge base: {err}, {type(err)}")
        raise

# Function to get knowledge base details
def get_knowledge_base(bedrock_client, knowledge_base_id):
    """
    Fetch a knowledge base by ID.

    :param bedrock_client: Client exposing ``get_knowledge_base``
    :param knowledge_base_id: ID of the knowledge base to look up
    :return: The service response, unchanged
    :raises Exception: any error is printed, then re-raised
    """
    try:
        return bedrock_client.get_knowledge_base(knowledgeBaseId=knowledge_base_id)
    except Exception as err:
        print(f"Error retrieving knowledge base: {err}, {type(err)}")
        raise


# Main section to execute the creation and retrieval process

# Create knowledge base
# Create the knowledge base, then read it back to show its details.
# Failures are printed and swallowed; `kb` stays undefined if creation fails.
try:
    kb = create_knowledge_base_func(
        bedrock_client=bedrock_agent_client,
        name=bedrock_knowledge_bases_name,
        description=description,
        roleArn=genaibookedbedrocksagemakerexecutionrolearn,
        opensearch_config=opensearchServerlessConfiguration,
        embedding_arn=embeddingModelArn,
    )
    print("Knowledge base created successfully.")
    print(kb)

    # Read the knowledge base back by its ID
    kb_details = get_knowledge_base(bedrock_agent_client, kb['knowledgeBaseId'])
    print("Knowledge base details:")
    print(kb_details)

except Exception as e:
    print(f"Failed to create or retrieve knowledge base: {e}")


# Create a DataSource in KnowledgeBase 

##### This script provides functions to create and retrieve data sources for a knowledge base in Amazon Bedrock. It uses S3 as the data source and supports vector ingestion with a specified chunking strategy.

In [None]:
%%time

# Function to create a data source in a knowledge base
def create_data_source(bedrock_agent_client, kb, name, description, s3_configuration, chunking_strategy_configuration):
    """
    Register an S3-backed data source on a knowledge base.

    Parameters:
    - bedrock_agent_client: Client object exposing ``create_data_source``.
    - kb: Dictionary with the knowledge base details; its 'knowledgeBaseId' key is read.
    - name: Name of the data source.
    - description: Description of the data source.
    - s3_configuration: Passed through as the "s3Configuration" of the request.
    - chunking_strategy_configuration: Passed through as the "chunkingConfiguration" of the request.

    Returns:
    - The "dataSource" entry of the create response, or None when the call
      raises a ClientError (the error is printed).
    """
    try:
        # Assemble the request first so the call itself stays short.
        request = {
            "name": name,
            "description": description,
            "knowledgeBaseId": kb['knowledgeBaseId'],
            "dataSourceConfiguration": {
                "type": "S3",
                "s3Configuration": s3_configuration,
            },
            "vectorIngestionConfiguration": {
                "chunkingConfiguration": chunking_strategy_configuration,
            },
        }
        created = bedrock_agent_client.create_data_source(**request)["dataSource"]
    except ClientError as e:
        # Client-side/service errors are reported and signalled with None.
        print(f"Error creating data source: {e}")
        return None

    print(created)
    return created


# Function to get details of a data source in the knowledge base
def get_data_source(bedrock_agent_client, kb, data_source_id):
    """
    Look up one data source of a knowledge base and print the response.

    Parameters:
    - bedrock_agent_client: Client object exposing ``get_data_source``.
    - kb: Dictionary with the knowledge base details; its 'knowledgeBaseId' key is read.
    - data_source_id: The ID of the data source.

    Returns:
    - The unmodified response of the lookup, or None when the call raises a
      ClientError (the error is printed).
    """
    try:
        details = bedrock_agent_client.get_data_source(
            knowledgeBaseId=kb['knowledgeBaseId'],
            dataSourceId=data_source_id,
        )
    except ClientError as e:
        # Client-side/service errors are reported and signalled with None.
        print(f"Error retrieving data source: {e}")
        return None

    print(details)
    return details


# Example usage:
# Assumes bedrock_agent_client is already initialized,
# kb is the dictionary returned when the knowledge base was created, and
# s3Configuration / chunkingStrategyConfiguration are dictionaries defined in
# an earlier cell.

# Keep the knowledge base ID in its own variable; later cells use it directly.
knowledgeBaseId = kb['knowledgeBaseId']

# Name and description for the Amazon Bedrock Knowledge Bases data source
bedrock_kb_datasouces_name = f"bedrock-sample-kb-ds-{random_suffix}"
bedrock_kb_datasouces_des = "A description of my data source."

# Create the data source (create_data_source returns None on a ClientError)
data_source = create_data_source(bedrock_agent_client, kb, bedrock_kb_datasouces_name, bedrock_kb_datasouces_des, s3Configuration, chunkingStrategyConfiguration)

# Get details of the data source if it was successfully created
if data_source:
    get_data_source(bedrock_agent_client, kb, data_source["dataSourceId"])


### Use the %store magic command to persist these variables so that other notebooks can load them

In [None]:
# Keep the data source ID in its own variable for the cells below.
# NOTE(review): if the data source creation above failed, data_source is None
# and this line raises a TypeError.
dataSourceId = data_source["dataSourceId"]

%store dataSourceId kb bedrock_kb_datasouces_name bedrock_kb_datasouces_des s3Configuration chunkingStrategyConfiguration knowledgeBaseId

##### The provided function check_data_source_status is designed to check the status of a data source within an Amazon Bedrock knowledge base. It polls for the status of the data source in a loop, printing the current status and retrying if the data source creation is still in progress.

In [None]:
%%time

def check_data_source_status(client, knowledge_base_id, data_source_id, max_attempts=10, delay=30):
    """
    Poll a data source until it is ready, unusable, or the attempts run out.

    The status is read with ``client.get_data_source``. The knowledge base
    response does not list the data sources attached to it, so the data source
    has to be queried directly.

    Parameters:
    - client: Client object exposing ``get_data_source``.
    - knowledge_base_id: ID of the knowledge base that owns the data source.
    - data_source_id: ID of the data source to check.
    - max_attempts: Maximum number of status checks before giving up.
    - delay: Seconds to wait between two checks.

    Returns:
    - True when the data source reports 'AVAILABLE'.
    - False when it reports a failure/deletion status, when the lookup raises
      (including "not found" errors), or when max_attempts is exhausted.
    """
    ready_statuses = {'AVAILABLE'}
    # 'DELETING' / 'DELETE_UNSUCCESSFUL' are the documented non-ready statuses;
    # 'FAILED' / 'ERROR' are kept from the previous implementation as a
    # defensive fallback.
    failed_statuses = {'DELETING', 'DELETE_UNSUCCESSFUL', 'FAILED', 'ERROR'}

    for attempt in range(1, max_attempts + 1):
        try:
            response = client.get_data_source(
                knowledgeBaseId=knowledge_base_id,
                dataSourceId=data_source_id,
            )
            status = response['dataSource']['status']
        except Exception as e:
            print(f"Error checking data source status: {e}")
            return False

        print(f"Data source {data_source_id} status: {status}")

        if status in ready_statuses:
            print(f"Data source {data_source_id} has been successfully created.")
            return True
        if status in failed_statuses:
            print(f"Data source {data_source_id} creation failed with status: {status}")
            return False

        # Only wait when another check will actually follow.
        if attempt < max_attempts:
            print(f"Waiting for {delay} seconds before checking again...")
            time.sleep(delay)

    print(f"Max attempts reached. Data source {data_source_id} creation not confirmed.")
    return False

# Call the function to check the data source status in a loop.
# The return value is not used here; the outcome is reported through the
# messages the function prints.
check_data_source_status(bedrock_agent_client, knowledgeBaseId, dataSourceId)

### Wait until both the Amazon Bedrock Knowledge Base and the Data Source are ready.

In [None]:
%%time

# Poll every 30 seconds until the knowledge base is ACTIVE and the data source
# is AVAILABLE. The loop also stops when either resource reaches a status it
# cannot recover from, or when a status lookup raises; without that check a
# failed resource would keep this cell polling forever.
while True:
    try:
        # Fetch knowledge base status
        response = bedrock_agent_client.get_knowledge_base(knowledgeBaseId=knowledgeBaseId)
        kb_status = response['knowledgeBase']['status']

        # Fetch data source status
        response = bedrock_agent_client.get_data_source(knowledgeBaseId=knowledgeBaseId, dataSourceId=dataSourceId)
        data_source_status = response['dataSource']['status']

        # Check if both statuses are as desired
        if kb_status == 'ACTIVE' and data_source_status == 'AVAILABLE':
            print("Both Knowledge Base and Data Source are ready.")
            break

        # Stop on failure/deletion statuses: waiting longer cannot make them ready.
        if (kb_status in ('FAILED', 'DELETING', 'DELETE_UNSUCCESSFUL')
                or data_source_status in ('DELETING', 'DELETE_UNSUCCESSFUL')):
            print(f"Stopping wait: Knowledge Base status is {kb_status}, "
                  f"Data Source status is {data_source_status}.")
            break

        # Report progress, then wait before the next check
        print(f"Knowledge Base status: {kb_status}, Data Source status: {data_source_status}. Waiting...")
        time.sleep(30)  # Wait for 30 seconds before checking again

    except Exception as e:
        print(f"Error occurred: {e}")
        break  # Exit the loop if there's an error


# Start an ingestion job

##### The provided code outlines the ingestion job process for Amazon Bedrock, including starting the ingestion job, checking its status, and polling until it completes.

##### start_ingestion_job: This function initiates an ingestion job for a specified data source and knowledge base.

In [None]:
%%time

# Function to start the ingestion job
def start_ingestion_job(bedrock_client, knowledge_base_id, data_source_id):
    """
    Kick off an ingestion job for one data source of a knowledge base.

    Parameters:
    - bedrock_client: Client object exposing ``start_ingestion_job``.
    - knowledge_base_id: ID of the knowledge base.
    - data_source_id: ID of the data source to ingest.

    Returns:
    - The "ingestionJob" entry of the start response (also printed), or None
      when anything raises (the error is printed).
    """
    try:
        print("Starting ingestion job...")
        started = bedrock_client.start_ingestion_job(
            knowledgeBaseId=knowledge_base_id,
            dataSourceId=data_source_id,
        )["ingestionJob"]
        print(started)
    except Exception as e:
        print(f"Error while starting ingestion job: {str(e)}")
        return None
    return started

# Function to get the status of the ingestion job
def get_ingestion_job_status(bedrock_client, knowledge_base_id, data_source_id, ingestion_job_id):
    """
    Fetch the current description of an ingestion job.

    Parameters:
    - bedrock_client: Client object exposing ``get_ingestion_job``.
    - knowledge_base_id: ID of the knowledge base.
    - data_source_id: ID of the data source being ingested.
    - ingestion_job_id: ID of the ingestion job.

    Returns:
    - The "ingestionJob" entry of the response, or None when anything raises
      (the error is printed).
    """
    try:
        return bedrock_client.get_ingestion_job(
            knowledgeBaseId=knowledge_base_id,
            dataSourceId=data_source_id,
            ingestionJobId=ingestion_job_id,
        )["ingestionJob"]
    except Exception as e:
        print(f"Error while fetching ingestion job status: {str(e)}")
        return None

# Function to poll the ingestion job status until complete
def poll_ingestion_job_completion(bedrock_client, knowledge_base_id, data_source_id, ingestion_job_id, poll_interval=30):
    """
    Poll an ingestion job until it reaches a terminal status.

    Polling stops on 'COMPLETE', 'FAILED' or 'STOPPED'. Waiting only for
    'COMPLETE' would never return for a job that failed or was stopped.

    Parameters:
    - bedrock_client: Client passed through to get_ingestion_job_status.
    - knowledge_base_id: ID of the knowledge base.
    - data_source_id: ID of the data source being ingested.
    - ingestion_job_id: ID of the ingestion job to watch.
    - poll_interval: Value passed to interactive_sleep between two checks.

    Returns:
    - None. The outcome is reported through printed messages; any exception
      raised while polling is printed and not propagated.
    """
    terminal_statuses = ('COMPLETE', 'FAILED', 'STOPPED')
    try:
        # Poll until the job reaches a terminal status or a lookup fails (job is None)
        job = get_ingestion_job_status(bedrock_client, knowledge_base_id, data_source_id, ingestion_job_id)
        while job and job['status'] not in terminal_statuses:
            print(f"Job status: {job['status']}. Waiting for completion...")
            interactive_sleep(poll_interval)
            job = get_ingestion_job_status(bedrock_client, knowledge_base_id, data_source_id, ingestion_job_id)

        if not job:
            print("Failed to retrieve final job status.")
        elif job['status'] == 'COMPLETE':
            print("Ingestion job completed:")
            print(job)
        else:
            # FAILED / STOPPED: show the job description so the cause is visible.
            print(f"Ingestion job ended with status {job['status']}:")
            print(job)
    except Exception as e:
        print(f"Error during job polling: {str(e)}")

# Main function to orchestrate the job ingestion process
def execute_ingestion_process(bedrock_client, knowledge_base_id, data_source_id):
    """
    Start an ingestion job and wait for it by polling.

    Parameters:
    - bedrock_client: Client passed through to the start and poll helpers.
    - knowledge_base_id: ID of the knowledge base.
    - data_source_id: ID of the data source to ingest.

    Returns:
    - None. Progress and errors are reported through printed messages; any
      exception is printed and not propagated.
    """
    try:
        started_job = start_ingestion_job(bedrock_client, knowledge_base_id, data_source_id)

        # Nothing to poll when the job could not be started.
        if not started_job:
            print("Failed to start ingestion job.")
            return

        poll_ingestion_job_completion(
            bedrock_client,
            knowledge_base_id,
            data_source_id,
            started_job["ingestionJobId"],
        )
    except Exception as e:
        print(f"Error in the ingestion process: {str(e)}")


# Execute the ingestion process
# Runs in this cell until the polling inside execute_ingestion_process returns.
execute_ingestion_process(bedrock_agent_client, knowledgeBaseId , dataSourceId)

# Test the Amazon Knowledge Bases 

#### Using RetrieveAndGenerate API 
#### Refer: https://docs.aws.amazon.com/bedrock/latest/APIReference/API_agent-runtime_RetrieveAndGenerate.html 
####        https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/bedrock-agent-runtime/client/retrieve_and_generate.html#
###### Purpose: Queries a knowledge base and generates responses based on the retrieved results and using the specified foundation model or inference profile. 
######          The response only cites sources that are relevant to the query.

In [None]:
%%time

# Function to query the knowledge base and generate a response
def ask_bedrock_llm_with_knowledge_base(client, query: str, model_arn: str, kb_id: str) -> dict:
    """
    Send a query through ``retrieve_and_generate`` against one knowledge base.

    Parameters:
    - client: Client object exposing ``retrieve_and_generate``.
    - query: Text of the question.
    - model_arn: ARN of the model used for generation.
    - kb_id: ID of the knowledge base to query.

    Returns:
    - The unmodified response of ``retrieve_and_generate``.

    Raises:
    - BotoCoreError / ClientError: printed and then re-raised.
    """
    kb_settings = {
        'type': 'KNOWLEDGE_BASE',
        'knowledgeBaseConfiguration': {
            'knowledgeBaseId': kb_id,
            'modelArn': model_arn,
        },
    }
    try:
        return client.retrieve_and_generate(
            input={'text': query},
            retrieveAndGenerateConfiguration=kb_settings,
        )
    except (BotoCoreError, ClientError) as e:
        print(f"Error during API call to retrieve and generate response: {str(e)}")
        raise

# Function to process and print the generated response and its citations
def process_and_print_response(model_name: str, response: dict):
    """
    Print the generated answer followed by the text of every cited passage.

    Parameters:
    - model_name: Label used in the printed headers.
    - response: Response dictionary; response['output']['text'] is required,
      "citations" and each citation's "retrievedReferences" are optional.

    Raises:
    - KeyError: when a required key is missing (printed, then re-raised).
    """
    try:
        answer = response['output']['text']

        # Flatten citations -> retrieved references -> passage text.
        passages = [
            ref["content"]["text"]
            for cite in response.get("citations", [])
            for ref in cite.get("retrievedReferences", [])
        ]

        print(f"---------- Generated using {model_name}:")
        print(answer, end="\n\n\n")
        print(f"---------- Citations for the response generated by {model_name}:")
        print(passages, end="\n\n")

    except KeyError as e:
        print(f"Key error while processing response: {str(e)}")
        raise

# Main execution section: ask the knowledge base one question with each model
# listed below and print the answer together with its citations.
prompt = "What is Amazon doing and cashflow?"

# List of Bedrock models; each entry is [display name, model ID]
bedrock_model_ids = [
        ["Anthropic Claude Haiku", "anthropic.claude-3-haiku-20240307-v1:0"]
    ]

# Initialize the Bedrock agent runtime client and run the queries
try:
    
    boto3_bedrock_agent_runtime_client = boto3.client("bedrock-agent-runtime", region_name=aws_region_name)

    # Loop through each listed model to generate and print results
    for model_id in bedrock_model_ids:
        model_name, model_code = model_id
        # Build the foundation-model ARN from the region and the model ID
        model_arn = f'arn:aws:bedrock:{aws_region_name}::foundation-model/{model_code}'
            
        try:
            # Query the knowledge base with the specified model
            response = ask_bedrock_llm_with_knowledge_base(boto3_bedrock_agent_runtime_client, prompt, model_arn, knowledgeBaseId)
            # Process and display the generated results
            process_and_print_response(model_name, response)
            
        except Exception as e:
                # A failure for one model is reported and the loop moves on to the next
                print(f"An error occurred with model {model_name}: {str(e)}")

except Exception as e:
    print(f"An error occurred while initializing the client or querying models: {str(e)}")

#### Using Retrieve API
#### Refer: https://docs.aws.amazon.com/bedrock/latest/APIReference/API_agent-runtime_Retrieve.html
####        https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/bedrock-agent-runtime/client/retrieve.html#retrieve#
###### Purpose: Queries a knowledge base and retrieves information from it.

In [None]:
%%time

# Function to retrieve relevant documents from the knowledge base
def retrieve_relevant_documents(client, query: str, kb_id: str, num_results: int = 3) -> dict:
    """
    Run a ``retrieve`` query against a knowledge base.

    Parameters:
    - client: Client object exposing ``retrieve``.
    - query: Text of the retrieval query.
    - kb_id: ID of the knowledge base to search.
    - num_results: Value sent as 'numberOfResults' of the vector search configuration.

    Returns:
    - The unmodified response of ``retrieve``.

    Raises:
    - BotoCoreError / ClientError: printed and then re-raised.
    """
    # Ask for the top `num_results` matches.
    search_settings = {
        'vectorSearchConfiguration': {
            'numberOfResults': num_results,
        },
    }
    try:
        return client.retrieve(
            retrievalQuery={'text': query},
            knowledgeBaseId=kb_id,
            retrievalConfiguration=search_settings,
        )
    except (BotoCoreError, ClientError) as e:
        print(f"Error retrieving relevant documents: {str(e)}")
        raise

# Function to process and print the retrieved documents
def process_and_print_retrieved_documents(response: dict):
    """
    Print the "retrievalResults" of a retrieve response.

    Parameters:
    - response: Response dictionary; a missing or empty "retrievalResults"
      entry is reported as "No relevant documents found."

    Raises:
    - KeyError: printed and then re-raised if one occurs while reading the response.
    """
    try:
        documents = response.get("retrievalResults", [])
        if documents:
            print("---------- Relevant Documents Retrieved:")
            print(documents, end="\n\n")
        else:
            print("No relevant documents found.")

    except KeyError as e:
        print(f"Key error while processing retrieved documents: {str(e)}")
        raise

# Main execution section: run the same question through the Retrieve API,
# which returns matching passages without generating an answer.
prompt = "What is Amazon doing and cashflow?"


# Initialize the Bedrock agent runtime client and run the retrieval
try:
        
    boto3_bedrock_agent_runtime_client = boto3.client("bedrock-agent-runtime", region_name=aws_region_name)

    # Retrieve relevant documents based on the query
    response = retrieve_relevant_documents(boto3_bedrock_agent_runtime_client, prompt, knowledgeBaseId, num_results=3)

    # Process and display the retrieved documents
    process_and_print_retrieved_documents(response)

except Exception as e:
        # Any failure (client creation, retrieval, printing) is reported, not re-raised
        print(f"An error occurred during retrieval: {str(e)}")

# End of Notebook

#### <ins>Step 1</ins> 

##### Please ensure that you close the kernel after using this notebook to avoid any potential charges to your account.

##### Process: Open the "Kernel" menu at the top and choose "Shut Down Kernel".
##### Refer https://docs.aws.amazon.com/sagemaker/latest/dg/studio-ui.html


#### <ins>Step 2</ins> 

#### If you are not going to run any further labs in Chapter 7
##### Execute the simple_knwl_bases_clean_up.ipynb to delete all the instances to avoid any potential charges to your account.