Original notebook at [AWS Samples](https://github.com/aws-samples/amazon-bedrock-samples/blob/main/rag/knowledge-bases/features-examples/02-optimizing-accuracy-retrieved-results/autogenerated-metadata-filters.ipynb)

---
#### IMPORTANTE: Vamos a borrar todos los data sources de nuestro KB
---

In [None]:
import os
import pprint
import time

import boto3

### List Knowledge Bases

In [None]:
# Client for the 'bedrock-agent' service, which exposes the Knowledge Base APIs
bedrock = boto3.client('bedrock-agent')

# Request the Knowledge Base listing (a single list_knowledge_bases call)
response = bedrock.list_knowledge_bases()

# Leave the raw response as the last expression so the notebook displays it
response

### Añadimos nuevo Data Source al Knowledge Base (KB) que ya tenemos

### 1. Creamos el Data Source en el KB
> Es más fácil a través de una función

#### Importante: 
> Ahora creamos el Data Source con `dataDeletionPolicy` = `DELETE`: al eliminar el Data Source se borran los datos convertidos a embeddings, pero no el vector store en sí.

In [None]:
def create_s3_data_source(kb_id,
                          kb_data_source_name,
                          kb_s3_bucket_name_arn,
                          kb_s3_data_source_path,
                          kb_s3_bucket_account_id,
                          vector_ingestion_configuration):
    """Register an S3 prefix as a new data source of a Bedrock Knowledge Base.

    The data source is always created with ``dataDeletionPolicy='DELETE'`` and
    a fixed description string.

    Args:
        kb_id: Identifier of the Knowledge Base that receives the data source.
        kb_data_source_name: Name given to the new data source.
        kb_s3_bucket_name_arn: ARN of the bucket, sent as ``bucketArn``.
        kb_s3_data_source_path: Key prefix, sent as the single entry of
            ``inclusionPrefixes``.
        kb_s3_bucket_account_id: Account ID sent as ``bucketOwnerAccountId``.
        vector_ingestion_configuration: Dict forwarded unchanged as
            ``vectorIngestionConfiguration``.

    Returns:
        The unmodified response of ``create_data_source``.
    """
    # Describe where the documents live
    s3_settings = {
        'bucketArn': kb_s3_bucket_name_arn,
        'bucketOwnerAccountId': kb_s3_bucket_account_id,
        'inclusionPrefixes': [kb_s3_data_source_path],
    }
    source_settings = {'s3Configuration': s3_settings, 'type': 'S3'}

    # Issue the request and hand the raw response back to the caller
    bedrock_agent = boto3.client('bedrock-agent')
    return bedrock_agent.create_data_source(
        knowledgeBaseId=kb_id,
        name=kb_data_source_name,
        description='S3 data source with different chunking strategy for testing purposes',
        dataDeletionPolicy='DELETE',
        dataSourceConfiguration=source_settings,
        vectorIngestionConfiguration=vector_ingestion_configuration,
    )

---
## Chunking Strategy: FIXED SIZE  
> IMPORTANTE! Cambien los detalles debajo

In [None]:
# CHANGE ME!!
# Chunking strategy label: it is sent in the chunking configuration and is also
# embedded in the data source name and in the S3 path built below.
kb_chunking_strategy = "FIXED_SIZE"

# Knowledge Base and New Data Source details:
# - Note: Account ID can be fetched using sts_client.get_caller_identity()["Account"]
# NOTE(review): these three values are environment-specific; replace them with your own.
kb_id = "O0RJHPYXA4"
kb_s3_bucket_name_arn = "arn:aws:s3:::genai-carlos-contreras-bucket-data-quarks-labs-oregon-01"
kb_s3_bucket_account_id = "992382616037"

# No need to change the following values:
# Bucket name is whatever follows the last ":::" of the ARN
kb_s3_bucket_name = kb_s3_bucket_name_arn.split(":::")[-1]
kb_data_source_name = f"virtual-assistant-rrhh-wiki-s3-{kb_chunking_strategy}"
# Key prefix (with trailing "/") used both as inclusion prefix and upload target
kb_s3_data_source_path = f"datasets/demo_kb/knowledge-base-rrhh-wiki-s3-001/{kb_data_source_name}/"

In [None]:
# Define Lab:
vectorIngestionConfiguration={
    'chunkingConfiguration': {
        'chunkingStrategy': kb_chunking_strategy,
        'fixedSizeChunkingConfiguration': {
            'maxTokens': 500,
            'overlapPercentage': 10
        }
    }
}

In [None]:
# Create data source
response = create_s3_data_source(kb_id=kb_id,
                                 kb_data_source_name=kb_data_source_name,
                                 kb_s3_bucket_name_arn=kb_s3_bucket_name_arn,
                                 kb_s3_bucket_account_id=kb_s3_bucket_account_id,
                                 kb_s3_data_source_path=kb_s3_data_source_path,
                                 vector_ingestion_configuration=vectorIngestionConfiguration)

# Get Data Source ID, so we can delete it after this lab
data_source_id = response['dataSource']['dataSourceId']
print(f"New Data Source ID: {data_source_id}")

### Subimos manualmente los datos

In [None]:
def upload_directory(path, bucket_name, s3_key_dir, s3_client=None):
    """Upload every file found under a local directory to an S3 key prefix.

    The directory is walked recursively. Files whose name starts with
    '.DS_Store' are skipped. Each file's path relative to ``path`` is appended
    to ``s3_key_dir``, so files inside sub-directories keep their layout and
    cannot overwrite a same-named file coming from another folder. Files
    located directly in ``path`` get the key ``s3_key_dir + file name``.

    Args:
        path (str): Local directory to walk.
        bucket_name (str): Name (not ARN) of the destination bucket.
        s3_key_dir (str): Key prefix prepended verbatim to each relative path;
            include the trailing '/' if one is wanted.
        s3_client: Optional object exposing
            ``upload_file(filename, bucket, key)``. When omitted, a boto3 S3
            client is created once and reused for all files.

    Returns:
        None
    """
    # Create the client a single time instead of once per uploaded file
    if s3_client is None:
        s3_client = boto3.client('s3')

    for root, _dirs, files in os.walk(path):
        for file in files:
            if file.startswith('.DS_Store'):
                continue

            file_to_upload = os.path.join(root, file)

            # S3 keys always use '/', whatever the local path separator is
            relative_key = os.path.relpath(file_to_upload, path).replace(os.sep, '/')
            s3_key = f"{s3_key_dir}{relative_key}"

            print(f"uploading file {file_to_upload} to {bucket_name}")
            s3_client.upload_file(file_to_upload, bucket_name, s3_key)

In [None]:
# Creamos directorio local
# Local directory whose files are uploaded to S3 (the path is only defined here, not created)
local_dir = "data/kb_privileges_demo"

In [None]:
# upload metadata file to S3
# Upload every file under local_dir to the data source prefix of the bucket
upload_directory(local_dir, kb_s3_bucket_name, kb_s3_data_source_path)

### Sync KB
> IMPORTANTE: Confirma permisos dados en sesión anterior, otorgados al rol del KB sobre nueva ruta en S3

In [None]:
# Sync del KB
bedrock_agent_client = boto3.client('bedrock-agent')
response = bedrock_agent_client.start_ingestion_job(
    dataSourceId=data_source_id,
    description='Ingesting PDF data from your HR Wiki',
    knowledgeBaseId=kb_id
)
print(response)

In [None]:
# Despite its name, ingestion_job holds the job ID string taken from the
# start_ingestion_job response; it is needed to query the job status.
ingestion_job = response['ingestionJob']['ingestionJobId']
print(f'Ingestion Job ID: {ingestion_job}')

In [None]:
# Check ingestion status
response = bedrock_agent_client.get_ingestion_job(
    dataSourceId=data_source_id,
    ingestionJobId=ingestion_job,
    knowledgeBaseId=kb_id
)

# Show status
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(response['ingestionJob'])

### Query Knowledge Base

In [None]:
# Set KB Details
foundation_model = "anthropic.claude-3-5-haiku-20241022-v1:0"
bedrock_agent_runtime_client = boto3.client('bedrock-agent-runtime') 
region = boto3.session.Session().region_name

#### Define tu propio Profile

In [None]:
# Define access level
user_profile = 'general'

# Metadata Access Filter
access_filter = {
    "listContains": {
        "key": "access_level",
        "value": user_profile
    }
}

In [None]:
# Question sent to the Knowledge Base; the commented line is an alternative to try
query = "¿Los managers reciben bonos?"
# query = "¿Cuántos días de vacaciones tenemos al año?"

In [None]:
# Retrieval + generation in one API call, using the access_filter and query
# currently defined in the notebook.
response = bedrock_agent_runtime_client.retrieve_and_generate(
    input={
        "text": query
    },
    retrieveAndGenerateConfiguration={
        "type": "KNOWLEDGE_BASE",
        "knowledgeBaseConfiguration": {
            'knowledgeBaseId': kb_id,
            # Model ARN assembled from the session region and the model ID
            "modelArn": "arn:aws:bedrock:{}::foundation-model/{}".format(region, foundation_model),
            "retrievalConfiguration": {
                "vectorSearchConfiguration": {
                    "numberOfResults":10,
                    "filter": access_filter,
                    "overrideSearchType": "HYBRID"
                } 
            }
        }
    }
)

# Show response
# A blank line is inserted after every ". " so sentences print as separate paragraphs
print(response['output']['text'].replace('. ', '.\n\n'))

#### Ahora intentamos con manager

In [None]:
# Define access level
user_profile = 'manager'

# Metadata Access Filter
access_filter = {
    "listContains": {
        "key": "access_level",
        "value": user_profile
    }
}

In [None]:
# Same retrieval + generation request as before; only the access_filter value
# defined in the notebook has changed, so the answers can be compared.
response = bedrock_agent_runtime_client.retrieve_and_generate(
    input={
        "text": query
    },
    retrieveAndGenerateConfiguration={
        "type": "KNOWLEDGE_BASE",
        "knowledgeBaseConfiguration": {
            'knowledgeBaseId': kb_id,
            # Model ARN assembled from the session region and the model ID
            "modelArn": "arn:aws:bedrock:{}::foundation-model/{}".format(region, foundation_model),
            "retrievalConfiguration": {
                "vectorSearchConfiguration": {
                    "numberOfResults":10,
                    "filter": access_filter,
                    "overrideSearchType": "HYBRID"
                } 
            }
        }
    }
)

# Show response
# A blank line is inserted after every ". " so sentences print as separate paragraphs
print(response['output']['text'].replace('. ', '.\n\n'))

### Muestra retrieve solamente

In [None]:
# Retrieval only (no generation): fetch the chunks that match the query and
# the current access_filter so they can be inspected one by one.
# nextToken is deliberately not sent: it is only meaningful when it carries the
# token returned by a previous retrieve() page, and the literal 'string' was the
# placeholder copied from the API reference, not a real pagination token.
response_retrieve = bedrock_agent_runtime_client.retrieve(
    knowledgeBaseId=kb_id,
    retrievalConfiguration={
        "vectorSearchConfiguration": {
            "numberOfResults": 10,
            "filter": access_filter,
            "overrideSearchType": "HYBRID"
        }
    },
    retrievalQuery={
        "text": query
    }
)

def response_print(retrieve_resp):
    """Print text, location, score and metadata of every retrieved chunk.

    Chunks are numbered from 1 and each printed field is followed by a blank
    line.

    Args:
        retrieve_resp: Mapping with a 'retrievalResults' list. Each item is
            read through ['content']['text'], ['location'], ['score'] and
            ['metadata'].

    Returns:
        None
    """
    blank_line = '\n\n'
    num = 0
    for hit in retrieve_resp['retrievalResults']:
        num += 1
        print(f'Chunk {num}: ', hit['content']['text'], end=blank_line)
        print(f'Chunk {num} Location: ', hit['location'], end=blank_line)
        print(f'Chunk {num} Score: ', hit['score'], end=blank_line)
        print(f'Chunk {num} Metadata: ', hit['metadata'], end=blank_line)

# Print every chunk returned by the retrieve-only call
response_print(response_retrieve)

## Limpiamos entorno

In [None]:
# Delete de Data Source
# Remove the data source created for this lab from the Knowledge Base
bedrock_agent_client = boto3.client('bedrock-agent')
response = bedrock_agent_client.delete_data_source(
    knowledgeBaseId=kb_id,
    dataSourceId=data_source_id,
)
print(response)