## Implicit (auto-generated metadata filtering)

Notebook original en [AWS Samples](https://github.com/aws-samples/amazon-bedrock-samples/blob/main/rag/knowledge-bases/features-examples/02-optimizing-accuracy-retrieved-results/autogenerated-metadata-filters.ipynb)

Documentación de feature [aquí](https://docs.aws.amazon.com/bedrock/latest/userguide/kb-test-config.html)


##### IMPORTANTE: Vamos a borrar todos los data sources de nuestro KB
---

In [None]:
import boto3
import pprint
import requests
import json
import re
from botocore.exceptions import ClientError

### List Knowledge Bases

In [None]:
# Create a boto3 client for the Bedrock Agent API ('bedrock-agent' service)
bedrock = boto3.client(service_name='bedrock-agent')

# List all knowledge bases
response = bedrock.list_knowledge_bases()

# Leave the response as the last expression of the cell so the notebook displays it
response

### Añadimos nuevo Data Source al Knowledge Base (KB) que ya tenemos

#### Importante: 
> Ahora creamos Data Source con "dataDeletionPolicy" == DELETE. Borra datos convertidos a embeddings, no el vector store en sí

In [None]:
def create_s3_data_source(kb_id,
                          kb_data_source_name,
                          kb_s3_bucket_name_arn,
                          kb_s3_data_source_path,
                          kb_s3_bucket_account_id,
                          vector_ingestion_configuration):
    """Create an S3 data source on an existing Bedrock knowledge base.

    The data source is always created with ``dataDeletionPolicy='DELETE'``
    and a fixed description string.

    Args:
        kb_id: ID of the knowledge base the data source is attached to.
        kb_data_source_name: Name given to the new data source.
        kb_s3_bucket_name_arn: ARN of the S3 bucket holding the documents.
        kb_s3_data_source_path: Key prefix inside the bucket; sent as the
            single entry of ``inclusionPrefixes``.
        kb_s3_bucket_account_id: AWS account ID that owns the bucket.
        vector_ingestion_configuration: Dict forwarded unchanged as
            ``vectorIngestionConfiguration`` (e.g. the chunking settings).

    Returns:
        The response of the ``bedrock-agent`` ``create_data_source`` call.
    """
    # Where the documents live
    s3_source_settings = {
        'bucketArn': kb_s3_bucket_name_arn,
        'bucketOwnerAccountId': kb_s3_bucket_account_id,
        'inclusionPrefixes': [kb_s3_data_source_path],
    }

    # Full request, assembled up front and expanded into keyword arguments
    request = {
        'dataDeletionPolicy': 'DELETE',
        'dataSourceConfiguration': {
            's3Configuration': s3_source_settings,
            'type': 'S3',
        },
        'description': 'S3 data source with different chunking strategy for testing purposes',
        'knowledgeBaseId': kb_id,
        'name': kb_data_source_name,
        'vectorIngestionConfiguration': vector_ingestion_configuration,
    }

    return boto3.client('bedrock-agent').create_data_source(**request)

---
## Chunking Strategy: SEMANTIC
> ¡IMPORTANTE! Cambia los valores de la celda siguiente (ID del KB, ARN del bucket e ID de la cuenta)

In [None]:
# CHANGE ME!!
# Chunking strategy for the new data source; it is also embedded in the data
# source name and in the S3 prefix built below.
kb_chunking_strategy = "SEMANTIC"

# Knowledge Base and New Data Source details:
# - Note: Account ID can be fetched using sts_client.get_caller_identity()["Account"]
kb_id = "RHIVVP2KFL"
kb_s3_bucket_name_arn = "arn:aws:s3:::genai-carlos-contreras-bucket-data-quarks-labs-oregon-01"
kb_s3_bucket_account_id = "992382616037"

# No need to change the following values:
# - bucket name: the part of the ARN after ":::"
# - data source name and S3 prefix: both derived from the chunking strategy
kb_s3_bucket_name = kb_s3_bucket_name_arn.split(":::")[-1]
kb_data_source_name = f"virtual-assistant-amzn-reports-s3-{kb_chunking_strategy}"
kb_s3_data_source_path = f"datasets/demo_kb/knowledge-base-amzn-report-s3-001/{kb_data_source_name}/"

In [None]:
# Define Lab:
# Vector ingestion settings for the new data source. This dict is passed
# unchanged to create_s3_data_source as `vector_ingestion_configuration`.
vectorIngestionConfiguration={
    'chunkingConfiguration': {
        'chunkingStrategy': kb_chunking_strategy,
        # Settings for the SEMANTIC strategy (see the Bedrock CreateDataSource
        # API reference for the meaning of each field)
        'semanticChunkingConfiguration': {
                'breakpointPercentileThreshold': 50,
                'bufferSize': 1,
                'maxTokens': 500
            }
    }
}

### Creamos Data Source

In [None]:
# Create the data source using the configuration values defined in the previous cells
response = create_s3_data_source(kb_id=kb_id,
                                 kb_data_source_name=kb_data_source_name,
                                 kb_s3_bucket_name_arn=kb_s3_bucket_name_arn,
                                 kb_s3_bucket_account_id=kb_s3_bucket_account_id,
                                 kb_s3_data_source_path=kb_s3_data_source_path,
                                 vector_ingestion_configuration=vectorIngestionConfiguration)

# Get Data Source ID, so we can delete it after this lab
# (it is also needed to start and monitor the ingestion job)
data_source_id = response['dataSource']['dataSourceId']
print(f"New Data Source ID: {data_source_id}")

#### Damos ahora permisos a nuestro KB, para invocar todos los modelos y nueva ruta en S3


In [None]:
def _find_local_policy_arn(iam_client, policy_name):
    """Return the ARN of the customer-managed policy named ``policy_name``, or None."""
    try:
        # list_policies is paginated; walk every page so a policy that is not
        # on the first page is still found.
        paginator = iam_client.get_paginator('list_policies')
        for page in paginator.paginate(Scope='Local', PathPrefix='/'):
            for policy in page['Policies']:
                if policy['PolicyName'] == policy_name:
                    policy_arn = policy['Arn']
                    print(f"Found existing policy ARN: {policy_arn}")
                    return policy_arn
    except ClientError as e2:
        print(f"Error retrieving existing policy ARN: {e2}")
        return None

    # Make the "not found" outcome explicit instead of silently returning None
    print(f"Policy {policy_name} exists but its ARN could not be found")
    return None


def create_bedrock_policy(policy_name,
                          s3_resource_arn="arn:aws:s3:::genai-carlos-contreras-bucket-data-quarks-labs-oregon-01/datasets/demo_kb/*"):
    """
    Creates an IAM policy for AWS Bedrock access or returns existing policy ARN

    The policy allows listing foundation/custom models, invoking any model,
    and ``s3:GetObject`` on ``s3_resource_arn``.

    Args:
        policy_name (str): Name of the IAM policy to create
        s3_resource_arn (str): Resource ARN (may contain wildcards) that the
            ``s3:GetObject`` statement applies to. Defaults to the demo
            bucket prefix used throughout this notebook.

    Returns:
        str: Policy ARN if successful or if policy exists, None if failed
    """
    # Create IAM client
    iam_client = boto3.client('iam')

    # Define the policy
    policy_document = {
        "Version": "2012-10-17",
        "Statement": [
            {
                "Effect": "Allow",
                "Action": [
                    "bedrock:ListFoundationModels",
                    "bedrock:ListCustomModels"
                ],
                "Resource": "*"
            },
            {
                "Effect": "Allow",
                "Action": [
                    "bedrock:InvokeModel"
                ],
                "Resource": "*"
            },
            {
                "Effect": "Allow",
                "Action": [
                    "s3:GetObject"
                ],
                "Resource": s3_resource_arn
            }
        ]
    }

    try:
        # Create the IAM policy
        response = iam_client.create_policy(
            PolicyName=policy_name,
            PolicyDocument=json.dumps(policy_document),
            Description='Policy for AWS Bedrock access'
        )
    except ClientError as e:
        if e.response['Error']['Code'] != 'EntityAlreadyExists':
            print(f"Error creating policy: {e}")
            return None

        # The policy is already there: look up its ARN instead of failing
        print(f"Policy {policy_name} already exists")
        return _find_local_policy_arn(iam_client, policy_name)

    policy_arn = response['Policy']['Arn']
    print(f"Successfully created policy: {policy_name}")
    print(f"Policy ARN: {policy_arn}")
    return policy_arn

# Example usage
# NOTE: inside a notebook kernel `__name__` is "__main__", so this block runs
# when the cell is executed. `policy_arn` is left at module level because the
# "attach policy to role" cell below reads it.
if __name__ == "__main__":
    policy_name = "BedrockAccessPolicyForKnowledgeBaseIamRole"
    policy_arn = create_bedrock_policy(policy_name)
    
    if policy_arn:
        print("Policy created or found successfully")
        print(f"Policy ARN: {policy_arn}")
    else:
        print("Failed to create or find policy")


#### Asignamos política IAM a role de nuestro KB

In [None]:
# Client init
iam_client = boto3.client('iam')
bedrock_agent_client = boto3.client('bedrock-agent')

# Get KB Details
# The role name is taken as the last "/"-separated segment of the role ARN
get_kb_response = bedrock_agent_client.get_knowledge_base(knowledgeBaseId = kb_id)
kb_role_arn = get_kb_response['knowledgeBase']['roleArn']
kb_role_name = kb_role_arn.split('/')[-1]

# Attach the policy created above (`policy_arn`) to the knowledge base role
response = iam_client.attach_role_policy(
    RoleName=kb_role_name,
    PolicyArn=policy_arn
    )

print(response)

### Subimos manualmente los datos

In [None]:
# Path of the local directory used for the downloaded Amazon reports
# (the directory itself is created in the next cell)
local_dir_amzn_reports = "data/pdf-amzn-report"

In [None]:
import os

def create_directory(directory_name):
    """Create ``directory_name`` (with any missing parents) unless the path exists.

    Prints a message saying whether the directory was created or was
    already present.
    """
    # Guard clause: nothing to do when the path is already there
    if os.path.exists(directory_name):
        print(f"Directory '{directory_name}' already exists.")
        return

    os.makedirs(directory_name)
    print(f"Directory '{directory_name}' created successfully.")

# Call the function to create the local directory for the reports
create_directory(local_dir_amzn_reports)

In [None]:
def download_file(url, filename, timeout=60):
    """Download ``url`` and save the response body to ``filename``.

    Args:
        url (str): URL to fetch with an HTTP GET request.
        filename (str): Local path the response body is written to
            (opened in binary mode, overwriting any existing file).
        timeout (float | tuple): Seconds to wait for the server, forwarded
            to ``requests.get``. Without a timeout the request can block
            indefinitely if the server stops responding.

    Returns:
        None. A message is printed on success, or when the status code is
        not 200 (in which case no file is written). Exceptions raised by
        ``requests`` (connection errors, timeouts) propagate to the caller.
    """
    # Send a GET request to the URL, bounded by the timeout
    response = requests.get(url, timeout=timeout)

    # Anything other than 200 is reported and nothing is written to disk
    if response.status_code != 200:
        print(f"Failed to download file. Status code: {response.status_code}")
        return

    # Write the content of the response to the file
    with open(filename, 'wb') as file:
        file.write(response.content)

    print(f"File downloaded successfully: {filename}")

# Annual-report PDFs to download
urls = [
    "https://s2.q4cdn.com/299287126/files/doc_financials/2024/ar/Amazon-com-Inc-2023-Annual-Report.pdf",
    "https://s2.q4cdn.com/299287126/files/doc_financials/2023/ar/Amazon-2022-Annual-Report.pdf",
    "https://s2.q4cdn.com/299287126/files/doc_financials/2022/ar/Amazon-2021-Annual-Report.pdf",
    "https://s2.q4cdn.com/299287126/files/doc_financials/2021/ar/Amazon-2020-Annual-Report.pdf",
    "https://s2.q4cdn.com/299287126/files/doc_financials/2020/ar/2019-Annual-Report.pdf",
]

for url in urls:
    # The last path segment of the URL becomes the local file name
    filename = url.rsplit('/', 1)[-1]

    # Save the file inside the local reports directory
    filepath = f"./{local_dir_amzn_reports}/{filename}"
    download_file(url, filepath)

In [None]:
def upload_directory(path, bucket_name, s3_key_dir):
    """Upload every file found under ``path`` to an S3 bucket.

    Files are discovered recursively with ``os.walk``; names starting with
    ``.DS_Store`` are skipped. Each object key is ``s3_key_dir`` immediately
    followed by the bare file name, so ``s3_key_dir`` should end with ``/``
    and files with the same name in different sub-directories map to the
    same key.

    Args:
        path (str): Local directory to walk.
        bucket_name (str): Name of the destination S3 bucket.
        s3_key_dir (str): Key prefix prepended to each file name.
    """
    # Init the S3 client once and reuse it for every upload
    s3_client = boto3.client('s3')

    for root, _dirs, files in os.walk(path):
        for file in files:
            if file.startswith('.DS_Store'):
                continue

            # Choose which files to upload
            file_to_upload = os.path.join(root, file)
            print(f"uploading file {file_to_upload} to {bucket_name}")

            s3_key = f"{s3_key_dir}{file}"
            s3_client.upload_file(file_to_upload, bucket_name, s3_key)

#### Creamos Metadata Filters para Documents PDF

In [None]:
def generate_matadata(data_dir):
    """Write a ``<name>.pdf.metadata.json`` sidecar for every PDF in ``data_dir``.

    Each sidecar has the form ``{"metadataAttributes": {...}}`` with the
    attributes ``company`` ("Amazon"), ``ticker`` ("AMZN") and ``year``.
    ``year`` is the first run of digits in the file name, stored as an
    integer so that it matches the ``NUMBER`` type declared for this
    attribute in the implicit-filter configuration of the queries.

    Only files ending in ``.pdf`` (case-insensitive) are processed, so
    previously generated sidecars and unrelated files are never overwritten
    or given metadata of their own. Re-running the function simply rewrites
    the same sidecars.

    Args:
        data_dir (str): Directory containing the PDF files.
    """
    # Loop through all PDF files in the directory
    for name in os.listdir(data_dir):
        if not name.lower().endswith('.pdf'):
            continue

        filename = f'{data_dir}/{name}'
        print(filename)

        # Create metadata
        metadata = {"company": "Amazon", "ticker": "AMZN"}

        # The year is taken from the file name only, never from the directory part
        year_match = re.search(r'\d+', name)
        if year_match:
            metadata["year"] = int(year_match.group(0))
        else:
            print(f"No year found in file name, 'year' attribute omitted: {filename}")

        # Create a JSON object
        json_data = {"metadataAttributes": metadata}

        # Append the suffix instead of str.replace, which would also rewrite
        # any '.pdf' occurring elsewhere in the path
        with open(f"{filename}.metadata.json", "w") as f:
            json.dump(json_data, f)

In [None]:
# Write the metadata JSON files next to the downloaded reports
generate_matadata(local_dir_amzn_reports)

In [None]:
# Upload everything in the local directory (reports and their metadata
# files) to the data source prefix in S3
upload_directory(local_dir_amzn_reports, kb_s3_bucket_name, kb_s3_data_source_path)

### Sync KB

In [None]:
# Sync the KB: start an ingestion job for the new data source
bedrock_agent_client = boto3.client('bedrock-agent')
response = bedrock_agent_client.start_ingestion_job(
    dataSourceId=data_source_id,
    description='Ingesting PDF data for Amazon Reports',
    knowledgeBaseId=kb_id
)
print(response)

In [None]:
# Keep the job ID from the start_ingestion_job response; it is needed to poll the job status
ingestion_job = response['ingestionJob']['ingestionJobId']
print(f'Ingestion Job ID: {ingestion_job}')

In [None]:
# Check ingestion status (re-run this cell to fetch the job details again)
response = bedrock_agent_client.get_ingestion_job(
    dataSourceId=data_source_id,
    ingestionJobId=ingestion_job,
    knowledgeBaseId=kb_id
)

# Show status: pretty-print the 'ingestionJob' part of the response
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(response['ingestionJob'])

#### (Por si no lo hiciste al comienzo del curso) - Habilita Cloudwatch + S3 logs on AWS Console

Pasos:

0. Confirma la creación del CloudWatch Log group "/aws/bedrock/invokemodel", en el paso anterior
1. Crea directorio en bucket; e.g. bedrock-logs/
2. AWS Bedrock --> Settings
3. Activa el toggle (ON) y selecciona ambos destinos: S3 y CloudWatch Logs
4. Cambia ruta a s3://bucket/bedrock-logs/
5. Log group name: /aws/bedrock/invokemodel
6. Elige "Create and use a new role"
7. Elige nombre de IAM Role; e.g. bedrockCloudWatchAuditingDemoIamRole
8. Deja en blanco Large Data Delivery

In [None]:
#### (In case you did not do it at the start of the course) - Enable CloudWatch + S3 logs on AWS Console

def create_cloudwatch_log_group_for_bedrock(log_group_name):
    """Create the CloudWatch log group ``log_group_name``.

    Returns:
        bool: True when the log group was created or already existed,
        False when the creation failed with any other client error.
    """
    cw_logs = boto3.client('logs')

    try:
        cw_logs.create_log_group(logGroupName=log_group_name)
    except ClientError as err:
        # An already-existing group counts as success; anything else is a failure
        already_exists = err.response['Error']['Code'] == 'ResourceAlreadyExistsException'
        if already_exists:
            print(f"Log group {log_group_name} already exists.")
        else:
            print(f"Error creating log group: {err}")
        return already_exists

    print(f"Successfully created CloudWatch log group: {log_group_name}")
    return True

# Create CloudWatch Log Group
# (same name as the one listed in the manual console steps above)
cloudwatch_log_group_name = "/aws/bedrock/invokemodel"
create_cloudwatch_log_group_for_bedrock(cloudwatch_log_group_name)

### Query Knowledge Base

In [58]:
# Set KB Details
# - foundation_model: model ID used to build the answer-generation model ARN
# - region: region of the default boto3 session, also used to build model ARNs
foundation_model = "anthropic.claude-3-5-sonnet-20240620-v1:0"
bedrock_agent_runtime_client = boto3.client('bedrock-agent-runtime') 
region = boto3.session.Session().region_name

In [None]:
# Question sent to the knowledge base by the next cell
query = "What was the YoY growth of Amazon?"

In [60]:
# Ask the knowledge base for an answer to `query`. The implicit filter
# configuration describes the metadata attributes of the documents and names
# the model used for the filter step.
response = bedrock_agent_runtime_client.retrieve_and_generate(
    input={"text": query},
    retrieveAndGenerateConfiguration={
        "type": "KNOWLEDGE_BASE",
        "knowledgeBaseConfiguration": {
            "knowledgeBaseId": kb_id,
            # ARN built from the session region and `foundation_model`
            "modelArn": "arn:aws:bedrock:{}::foundation-model/{}".format(region, foundation_model),
            "retrievalConfiguration": {
                "vectorSearchConfiguration": {
                    "numberOfResults": 10,
                    "implicitFilterConfiguration": {
                        "metadataAttributes": [
                            {
                                "key": "company",
                                "type": "STRING",
                                "description": "The company name the document is describing. Possible values include ['Amazon']",
                            },
                            {
                                "key": "year",
                                "type": "NUMBER",
                                "description": "The year in which the document is about.",
                            },
                        ],
                        # Model ID is spelled out here rather than taken from `foundation_model`
                        "modelArn": "arn:aws:bedrock:{}::foundation-model/anthropic.claude-3-5-sonnet-20240620-v1:0".format(region),
                    },
                },
            },
        },
    },
)

In [61]:
# Show response: a blank line is inserted after every ". " so each sentence
# is printed as its own paragraph (this also splits list markers such as "1. ")
print(response['output']['text'].replace('. ', '.\n\n'))

In 2023, Amazon's total revenue grew 12% year-over-year (YoY) from $514 billion to $575 billion.

This growth was seen across different segments of the company:

1.

North America revenue increased 12% YoY from $316 billion to $353 billion.
2.

International revenue grew 11% YoY from $118 billion to $131 billion.
3.

AWS (Amazon Web Services) revenue increased 13% YoY from $80 billion to $91 billion.

Additionally, Amazon's operating income significantly improved in 2023, increasing by 201% YoY from $12.2 billion to $36.9 billion.

The operating margin also improved from 2.4% to 6.4%.


> Ahora vamos a Cloudwatch, para revisar si está usando Metadata Filters o no.

---

In [None]:
# Alternative questions to try (uncomment one and comment out the active query):
# query = "What was the YoY growth of Amazon, in the first year of the pandemic, during years 2020 and 2021?"
# query = "What was the net income of Amazon"
query = "How many prime members does Amazon have after 2021?"


In [None]:
# Retrieve the chunks matching `query`, with implicit metadata filtering.
# No `nextToken` is sent: it is a pagination token that must come from a
# previous retrieve() response, and the API-reference placeholder 'string'
# is not a valid token for a first request.
response_ret_with_implicit_fiters = bedrock_agent_runtime_client.retrieve(
    knowledgeBaseId=kb_id,
    retrievalConfiguration={
        "vectorSearchConfiguration": {
            "numberOfResults": 10,
            "implicitFilterConfiguration": {
                # Attributes written to the .metadata.json files of the documents
                "metadataAttributes": [
                    {
                        "key": "year",
                        "type": "NUMBER",
                        "description": "The year in which the document is about."
                    },
                    {
                        "key": "company",
                        "type": "STRING",
                        "description": "The company name the document is describing. Possible values include ['Amazon']"
                    },
                    {
                        "key": "ticker",
                        "type": "STRING",
                        "description": "The ticker name of the company. Possible values include ['AMZN']"
                    }
                ],
                "modelArn": "arn:aws:bedrock:{}::foundation-model/anthropic.claude-3-5-sonnet-20240620-v1:0".format(region)
            },
        }
    },
    retrievalQuery={
        "text": query
    }
)

def response_print(retrieve_resp):
    """Print text, location, score and metadata of every retrieved chunk.

    Args:
        retrieve_resp: Mapping with a ``'retrievalResults'`` list; each item
            provides ``content.text``, ``location``, ``score`` and
            ``metadata``.
    """
    blank_line = '\n' * 2
    for position, result in enumerate(retrieve_resp['retrievalResults'], start=1):
        label = f'Chunk {position}'
        print(f'{label}: ', result['content']['text'], end=blank_line)
        print(f'{label} Location: ', result['location'], end=blank_line)
        print(f'{label} Score: ', result['score'], end=blank_line)
        print(f'{label} Metadata: ', result['metadata'], end=blank_line)

# Print every chunk returned by the retrieve() call above
response_print(response_ret_with_implicit_fiters)

In [None]:
# Show response
# NOTE(review): the retrieve() cell stores its result in
# `response_ret_with_implicit_fiters`, not in `response`. When the cells run
# top to bottom, `response` still holds the earlier retrieve_and_generate
# answer. Confirm whether a new retrieve_and_generate call with the current
# `query` was intended here.
print(response['output']['text'].replace('. ', '.\n\n'))

In [None]:
# Pretty-print the whole `response` dict, not only the answer text
pprint.pprint(response)

## Limpiamos entorno

In [None]:
# Delete the data source created for this lab
bedrock_agent_client = boto3.client('bedrock-agent')
response = bedrock_agent_client.delete_data_source(
    dataSourceId=data_source_id,
    knowledgeBaseId=kb_id
)

In [None]:
# Remove the local download directory and everything inside it
import shutil

# Path is set again here so this cell can run on its own
local_dir_amzn_reports = "data/pdf-amzn-report"

try:
    shutil.rmtree(local_dir_amzn_reports)
except FileNotFoundError:
    print(f"Directory '{local_dir_amzn_reports}' not found.")
except Exception as err:
    print(f"An error occurred: {err}")
else:
    print(f"Directory '{local_dir_amzn_reports}' and its contents have been deleted successfully.")

In [None]:
def delete_bedrock_invokemodel_log_group(log_group_name="/aws/bedrock/invokemodel"):
    """Disable Bedrock model-invocation logging and delete a CloudWatch log group.

    Args:
        log_group_name (str): Name of the CloudWatch log group to delete.
            The argument is now honoured; it was previously overwritten by
            a hard-coded name, which remains the default.

    Returns:
        bool: True when both steps succeeded or the error code was
        ``ResourceNotFoundException``; False on any other client error.
    """
    logs_client = boto3.client('logs')
    try:
        # First, disable the logging configuration in Bedrock
        bedrock_client = boto3.client('bedrock')
        bedrock_client.delete_model_invocation_logging_configuration()
        print("Successfully disabled InvokeModel logging for Bedrock")

        # Then, delete the log group
        logs_client.delete_log_group(logGroupName=log_group_name)
        print(f"Successfully deleted CloudWatch log group: {log_group_name}")
        return True
    except ClientError as e:
        # A missing resource is treated as "already cleaned up"
        if e.response['Error']['Code'] == 'ResourceNotFoundException':
            print(f"Log group {log_group_name} does not exist.")
            return True

        print(f"Error deleting log group or disabling logging: {e}")
        return False

In [None]:
# Delete the CloudWatch log group (and disable Bedrock invocation logging)
cw_log_group_name = "/aws/bedrock/invokemodel"
delete_bedrock_invokemodel_log_group(cw_log_group_name)