## Clean up the OpenSearch Indexes/S3 bucket to Rerun this solution with new data
---

1. This notebook cleans the indexes created in OpenSearch Serverless and the content in the S3 bucket

1. Run this step if you want to rerun the entire solution from scratch with new data; there is no need to clean up the content manually in the console

Set this step to `yes` in the [config.yaml](config.yaml) file to clean up the content after the run. Keep it set to `no` if you want to rerun the solution with the same data.

***To clean up all the resources that are created in your account, delete the CFT stack manually from the CloudFormation console. This notebook only deletes the S3 folders containing pre-existing images and texts, and the indexes created in OpenSearch. This is for users who want to rerun this solution with new indexes, new embeddings and new data stored in S3.***

In [None]:
# install the requirements before running this notebook
import sys
!{sys.executable} -m pip install -r requirements.txt

In [None]:
# import the libraries that are needed to run this notebook
import os
import re
import ray
import time
import glob
import json
import yaml
import time
import boto3
import logging
import botocore
import sagemaker
import globals as g
from typing import List
from requests_auth_aws_sigv4 import AWSSigV4
from opensearchpy import OpenSearch, RequestsHttpConnection, AWSV4SignerAuth
from utils import get_cfn_outputs, get_bucket_name, download_image_files_from_s3, get_text_embedding

In [None]:
# configure logging for the notebook (INFO level, with timestamp, process id
# and source location in every record) and create the logger used by the cells below
logging.basicConfig(
    level=logging.INFO,
    format='[%(asctime)s] p%(process)s {%(filename)s:%(lineno)d} %(levelname)s - %(message)s',
)
logger = logging.getLogger(__name__)

In [None]:
# global constants
# relative path of the YAML configuration file that is loaded into `config` below
CONFIG_FILE_PATH: str = "config.yaml"

In [None]:
# read the config yaml file
# The file is opened explicitly as UTF-8 so that parsing does not depend on
# the platform's default text encoding (which is not UTF-8 everywhere).
fpath = CONFIG_FILE_PATH
with open(fpath, 'r', encoding='utf-8') as yaml_in:
    # safe_load only builds plain Python objects (no arbitrary object construction)
    config = yaml.safe_load(yaml_in)
logger.info(f"config read from {fpath} -> {json.dumps(config, indent=2)}")

In [None]:
# Resolve the S3 bucket for the CloudFormation stack named in the config.
# NOTE(review): get_bucket_name is a helper imported from utils; how it derives
# the bucket from the stack name is not visible here - confirm in utils.
bucket_name: str = get_bucket_name(config['aws']['cfn_stack_name'])
logger.info(f"Bucket name being used to store extracted images and texts from data: {bucket_name}")

In [None]:
# fetch the outputs of the CloudFormation stack named in the config
outputs = get_cfn_outputs(config['aws']['cfn_stack_name'])

# the collection endpoint output includes the scheme ("<scheme>//<host>");
# keep only the part after the "//" as the OpenSearch host
_collection_endpoint = outputs['MultimodalCollectionEndpoint']
host = _collection_endpoint.split('//')[1]

# names of the text and image indexes to clean up
text_index_name = outputs['OpenSearchTextIndexName']
img_index_name = outputs['OpenSearchImgIndexName']
logger.info(f"opensearchhost={host}, text index={text_index_name}, image index={img_index_name}")

# ingestion URLs built from the pipeline endpoint outputs
_text_pipeline_endpoint = outputs['OpenSearchPipelineTextEndpoint']
osi_text_endpoint = f"https://{_text_pipeline_endpoint}/data/ingest"
_img_pipeline_endpoint = outputs['OpenSearchPipelineImgEndpoint']
osi_img_endpoint = f"https://{_img_pipeline_endpoint}/data/ingest"

### Clean up the indexes and the images/texts in the S3 bucket
---

In [None]:
# build the SigV4 signer from the credentials of a default boto3 session,
# for the region given in the config and the service name held in globals
session = boto3.Session()
credentials = session.get_credentials()
auth = AWSV4SignerAuth(credentials, config['aws']['region'], g.OS_SERVICE)


def _new_os_client():
    """
    Create an OpenSearch client for the collection host (HTTPS on port 443,
    certificate verification on, requests signed with `auth`).
    A fresh hosts list is built on every call so the clients share no state.
    """
    return OpenSearch(
        hosts=[{'host': host, 'port': 443}],
        http_auth=auth,
        use_ssl=True,
        verify_certs=True,
        connection_class=RequestsHttpConnection,
        pool_maxsize=20,
    )


# OpenSearch client used for the image index
img_os_client = _new_os_client()

# OpenSearch client used for the text index
text_os_client = _new_os_client()

In [None]:
# delete the text and image indexes created in the opensearch serverless collection
def _delete_index_if_exists(os_client, index_name: str, label: str):
    """
    Delete the index `index_name` through `os_client` if it exists.

    Args:
        os_client: OpenSearch client used to check for and delete the index.
        index_name: name of the index to delete.
        label: short description of the index ("image" or "text"), used only
            in the log messages.

    Returns:
        The response of the delete call, or None when the index does not
        exist or the request failed.

    Errors are logged instead of raised so that a failure on one index does
    not prevent the other index from being cleaned up.
    """
    try:
        if not os_client.indices.exists(index_name):
            logger.info(f"The {label} index '{index_name}' does not exist and cannot be deleted.")
            return None
        response = os_client.indices.delete(index_name)
        logger.info(f"response received for the delete index for {label}s -> {response}")
        return response
    except Exception as e:
        # best-effort cleanup: report the failure and carry on
        logger.error(f"Error in deleting {label} index '{index_name}', exception: {e}")
        return None


# each index is handled independently of the other
img_response = _delete_index_if_exists(img_os_client, img_index_name, "image")
txt_response = _delete_index_if_exists(text_os_client, text_index_name, "text")

In [None]:
# clean up the image and text folders in the S3 bucket
def clean_up_s3_folders(bucket_name: str, prefixes: List[str]):
    """
    Delete every object stored under each of the given prefixes ("folders")
    of an S3 bucket.

    The listing is paginated, so prefixes holding more objects than a single
    `list_objects_v2` response can return are emptied completely. Each listed
    page is deleted with one `delete_objects` request.

    Args:
        bucket_name: name of the S3 bucket to clean.
        prefixes: key prefixes whose objects should be deleted.

    Returns:
        None. The delete responses (and any per-object errors reported by S3)
        are written to the log.
    """
    client = boto3.client('s3')
    paginator = client.get_paginator('list_objects_v2')
    for prefix in prefixes:
        for page in paginator.paginate(Bucket=bucket_name, Prefix=prefix):
            obj_list = [{'Key': obj['Key']} for obj in page.get('Contents', [])]
            if not obj_list:
                # nothing stored under this prefix (or an empty page)
                continue
            delete_response = client.delete_objects(
                Bucket=bucket_name,
                Delete={'Objects': obj_list}
            )
            logger.info(f"Deleted objects from {prefix}: {delete_response}")
            # delete_objects reports per-key failures in the response body
            # rather than raising, so surface them explicitly
            failed = delete_response.get('Errors', [])
            if failed:
                logger.error(f"Failed to delete {len(failed)} object(s) from {prefix}: {failed}")

In [None]:
# delete the text and image folders from the s3 bucket
# the two prefixes are constants defined in the project's `globals` module (imported as `g`)
prefixes: List[str] = [g.BUCKET_IMG_PREFIX, g.BUCKET_PDF_TEXT_PREFIX]
clean_up_s3_folders(bucket_name, prefixes)