# File Name: simple_knwl_bases_chunking_strategy.ipynb
### Location: Chapter 7
### Purpose: 
#####             1. Examples of a variety of chunking strategies:
#####                a) Fixed chunking strategy with Character splitting
#####                b) Fixed chunking strategy with Recursive Character Text Splitting
#####                c) Semantic Chunking strategy with Langchain
#####                d) Hierarchical chunking Strategy 
##### Dependency: simple-sagemaker-bedrock.ipynb from Chapter 3 must run successfully before this notebook is used.
# <ins>-----------------------------------------------------------------------------------</ins>

# <ins>Amazon SageMaker Classic</ins>
#### If you are new to Amazon SageMaker Classic, follow this link for details: https://docs.aws.amazon.com/sagemaker/latest/dg/studio.html

# <ins>Environment setup of Kernel</ins>
##### Fill "Image" as "Data Science"
##### Fill "Kernel" as "Python 3"
##### Fill "Instance type" as "ml.t3.medium"
##### Fill "Start-up script" as "No Scripts"
##### Click "Select"

###### Refer https://docs.aws.amazon.com/sagemaker/latest/dg/notebooks-create-open.html for details.

# <ins>Mandatory installation on the kernel through pip</ins>

##### This lab works with the software versions listed below. If you use newer versions of boto3, awscli, or botocore, this code may fail, and you might need to update the corresponding API calls.

##### You will see pip dependency errors. You can safely ignore these errors and continue executing the rest of the cells.

In [None]:
%pip install --no-build-isolation --force-reinstall -q \
    "boto3>=1.34.84" \
    "opensearch-py>=2.7.1" \
    "retrying>=1.3.4" \
    "bs4" \
    "pypdf" \
    "langchain_experimental" \
    "langchain>=0.2.16" \
    "langchain_community>=0.2.17" \
    "awscli>=1.32.84" \
    "botocore>=1.34.84" \
    "langchain-aws>=0.1.7" \
    "langchain-core" \
    "llama-index"

# <ins>Disclaimer</ins>

##### You will see pip dependency errors. You can safely ignore these errors and continue executing the rest of the cells.

# <ins>Restart the kernel</ins>

In [None]:
# Restart the kernel (this cell follows the pip installation cell above).
# The HTML object is the last expression in the cell, so the notebook renders it;
# the embedded script then calls `Jupyter.notebook.kernel.restart()` in the browser.
# NOTE(review): this depends on a global `Jupyter` JavaScript object being available in the
# notebook front end — confirm it exists in the UI you are using; if nothing happens,
# restart the kernel manually from the "Kernel" menu.
from IPython.core.display import HTML
HTML("<script>Jupyter.notebook.kernel.restart()</script>")

# <ins>Python package import</ins>

##### boto3 offers various clients for Amazon Bedrock to execute various actions.
##### botocore is a low-level interface to AWS tools, while boto3 is built on top of botocore and provides additional features

In [None]:
import json
import os
import boto3
import botocore
from retrying import retry
import warnings
import time
from langchain.document_loaders import PyPDFLoader

### Ignore warning 

In [None]:
# Install a blanket 'ignore' filter: with no category or module argument it applies to
# every warning raised after this cell runs (presumably to keep the cell output readable).
warnings.filterwarnings('ignore')

# Download and prepare dataset

In [None]:
# Resolve the dataset folder relative to the directory the notebook is running in.
# Both names are module-level on purpose: later cells read `data_directory`.
current_directory = os.getcwd()
data_directory = os.path.join(current_directory, 'data', 'rag_use_cases')

# Echo both paths so a wrong working directory is easy to spot.
print(f"Current working directory: {current_directory}")
print(f"Data directory path: {data_directory}")

In [None]:
documents = []

def load_pdf_documents(data_directory, documents=None):
    """Load every PDF found directly inside ``data_directory``.

    Each PDF is read with ``PyPDFLoader`` and whatever ``loader.load()`` returns
    is appended to ``documents``. A file that fails to load is reported and
    skipped so that one bad PDF does not stop the others from loading.

    Args:
        data_directory: Path of the folder to scan (sub-folders are not visited).
        documents: Optional list to append to. It is extended in place and also
            returned, so existing callers that pass their own list keep working.
            When omitted, a new list is created.

    Returns:
        list: ``documents`` with the newly loaded entries appended. The list is
        returned unchanged when the directory does not exist or holds no PDFs.
    """
    if documents is None:
        documents = []

    # A missing folder is reported instead of raising, in line with the
    # print-and-continue error handling used by the other loaders in this notebook.
    if not os.path.isdir(data_directory):
        print(f"Data directory not found: {data_directory}")
        return documents

    # os.listdir() returns names in arbitrary order; sorting makes the load
    # order (and therefore the "first page" printed below) reproducible.
    for filename in sorted(os.listdir(data_directory)):
        # Match the extension case-insensitively so "REPORT.PDF" is not skipped.
        if not filename.lower().endswith('.pdf'):
            continue

        file_path = os.path.join(data_directory, filename)
        try:
            loader = PyPDFLoader(file_path)
            documents.extend(loader.load())
        except Exception as e:
            print(f"Error loading {filename}: {e}")  # Report and move on to the next file

    # Show the first loaded entry as a quick sanity check.
    if documents:
        print(documents[0].page_content)
    else:
        print("No PDF files found in the folder.")

    return documents

documents = load_pdf_documents(data_directory, documents )

## Define important environment variable

In [None]:
# Try-except block to handle potential errors
try:
    # Create a new Boto3 session to interact with AWS services
    boto3_session_name = boto3.session.Session()

    # Retrieve the current AWS region from the session
    aws_region_name = boto3_session_name.region_name

    # Create a Bedrock Agent client using the current session and region
    bedrock_agent_client = boto3_session_name.client('bedrock-agent', region_name=aws_region_name)

    # Create an STS client to interact with AWS Security Token Service (STS)
    sts_client = boto3.client('sts')
    
    # Get the AWS account ID of the caller
    aws_account_id = sts_client.get_caller_identity()["Account"]

    # Create boto3_bedrock_runtime_client
    boto3_bedrock_runtime_client = boto3.client('bedrock-runtime', region_name = aws_region_name)
       
    # Create boto3_bedrock_agent_runtime_client
    boto3_bedrock_agent_runtime_client = boto3.client("bedrock-agent-runtime", region_name=aws_region_name)
    
    # Store all variables in a dictionary
    variables_store = {
        "aws_region_name": aws_region_name,
        "bedrock_agent_client": bedrock_agent_client,
        "aws_account_id": aws_account_id,
        "boto3_bedrock_runtime_client": boto3_bedrock_runtime_client,
        "boto3_bedrock_agent_runtime_client": boto3_bedrock_agent_runtime_client,
        "boto3_session_name": boto3_session_name,
        "sts_client": sts_client
    }

    # Print all variables
    for var_name, value in variables_store.items():
        print(f"{var_name}: {value}")

except Exception as e:
    print(f"An unexpected error occurred: {e}")


In [None]:
# Define the Bedrock embedding model ID.
# This is a single model ID (not a list); the semantic chunking section passes it to BedrockEmbeddings.
bedrock_embed_model_id = "amazon.titan-embed-text-v1"

# Fixed chunking strategy with Character splitting

##### The code is for splitting text documents into chunks using the CharacterTextSplitter from LangChain, which is particularly useful when dealing with large documents and ensuring that the chunks are of manageable size. 

##### Example of CharacterTextSplitter
#####   a) CharacterTextSplitter: This class splits documents into chunks based on character count.
#####   b) chunk_size=100: Each chunk will contain up to 100 characters.
#####   c) chunk_overlap=10: There will be a 10-character overlap between consecutive chunks. This can be useful to preserve context between chunks.
#####   d) separator="": An empty separator makes the split purely character-based, so no delimiter (such as a newline or punctuation) is used to split the text.

In [None]:
# Important package for Fixed chunking with Character splitting
from langchain.text_splitter import CharacterTextSplitter

# Function to split text documents into chunks using CharacterTextSplitter
def split_text_documents(documents):
    """Split documents into fixed-size character chunks.

    Uses LangChain's ``CharacterTextSplitter`` configured with a chunk size of
    100 characters, a 10-character overlap and an empty separator.

    Args:
        documents: The documents to split; passed unchanged to
            ``CharacterTextSplitter.split_documents``.

    Returns:
        list: The chunks produced by the splitter, or an empty list when the
        splitter could not be created or the split failed. Returning ``[]``
        rather than ``None`` matches ``split_documents_with_text_splitter``.
    """
    try:
        # Build the splitter inside the try block so that a construction failure
        # is reported the same way as a splitting failure.
        text_splitter = CharacterTextSplitter(
            chunk_size=100,         # Maximum length (in characters) of each chunk
            chunk_overlap=10,       # Number of characters to overlap between consecutive chunks
            separator="",           # Empty separator instead of the default "\n\n"
        )
        return text_splitter.split_documents(documents)
    except Exception as e:
        print(f"An error occurred while splitting documents: {e}")  # Report and fall back to no chunks
        return []

# Call the function to split text documents and retrieve the chunks
chunked_documents = split_text_documents(documents)

# Print the first five chunks to verify the output
# (the truthiness check covers an empty or missing result)
if chunked_documents:
    print(chunked_documents[:5])  # Display the first five chunks
else:
    print("No chunks were generated.")


# Fixed chunking strategy with Recursive Character Text Splitting

###### The code uses RecursiveCharacterTextSplitter from LangChain to split documents into smaller chunks. This class is useful for when you need to split documents into chunks while considering the structure of the text (e.g., paragraphs, sentences) and maintaining continuity.

##### Example of RecursiveCharacterTextSplitter
#####   a) chunk_size=100: This parameter sets the maximum size of each chunk (in characters). Each chunk will not exceed this size.
#####   b) chunk_overlap=10: This parameter specifies how many characters should overlap between consecutive chunks, helping preserve context between the chunks.
#####   c) Recursive Splitting: Unlike CharacterTextSplitter, which splits based on characters only, RecursiveCharacterTextSplitter attempts to split documents recursively by trying to respect the document structure, breaking at logical points such as sentences or paragraphs if necessary.

In [None]:
# Important package for Fixed chunking with Recursive Character Text Splitting

from langchain.text_splitter import RecursiveCharacterTextSplitter

def split_documents_with_text_splitter(documents):
    """Split documents with LangChain's RecursiveCharacterTextSplitter.

    The splitter is configured with ``chunk_size=100`` and ``chunk_overlap=10``.

    Args:
        documents: The documents to split; passed unchanged to
            ``RecursiveCharacterTextSplitter.split_documents``.

    Returns:
        list: Every chunk produced by the splitter (not just a sample), or an
        empty list if creating the splitter or splitting raised an exception.
    """
    try:
        splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=10)
        return splitter.split_documents(documents)
    except Exception as err:
        # Report the failure and fall back to "no chunks"
        print(f"Error during document splitting: {err}")
        return []

# Example usage: split the loaded PDF documents with the recursive splitter
split_results = split_documents_with_text_splitter(documents)

# An empty list (nothing to split, or an error inside the helper) takes the else branch
if split_results:
    print("First five document splits:", split_results[:5])
else:
    print("No splits generated.")


# Semantic Chunking strategy with Langchain

Semantic chunking is a Natural Language Processing (NLP) technique that divides text into meaningful segments based on semantic similarity. This approach enhances information retrieval by focusing on meaning rather than syntax. The process is guided by embedding models that measure the similarity between sentences. Below is a summary of different methods used to determine the strategy for chunking text:

<ins>Percentile:</ins> This is the default method, where chunks are split based on sentence similarity differences that exceed a certain percentile threshold.

<ins>Standard Deviation:</ins> Chunks are created when the sentence similarity difference exceeds a specified number of standard deviations.

<ins>Interquartile:</ins> This method uses the interquartile range to set breakpoints, ensuring that the chunks are more evenly sized.

<ins>Gradient:</ins> This approach combines percentile-based splitting with anomaly detection based on gradient changes, making it particularly effective in domains with high semantic correlation, such as legal or medical texts.

In all cases, the embedding model—such as Amazon's Titan Embeddings—plays a crucial role in calculating these differences and ensuring that text is divided meaningfully.

<ins>Use Cases:</ins> Semantic chunking is especially beneficial in scenarios where the structure of the text matters more than simple syntactic cues like paragraph or sentence length.

# --------------------------------------
##### The code implements a Semantic Chunking strategy using LangChain's SemanticChunker and Bedrock Embeddings. 
##### SemanticChunker: Performs document splitting based on semantic information instead of fixed-size or character-based splitting. It uses embeddings to identify breakpoints in the text where splits should occur.
##### Breakpoint Types:
#####    a) "percentile": Splits at semantic breakpoints based on percentile distribution.
#####    b) "standard_deviation": Splits where semantic changes deviate significantly from the mean.
#####    c) "interquartile": Focuses on semantic shifts within the interquartile range.
#####    d) "gradient": Splits based on gradients in semantic change.

In [None]:
# Important packages for Semantic Chunking strategy with LangChain
from langchain_aws.embeddings.bedrock import BedrockEmbeddings
from langchain_experimental.text_splitter import SemanticChunker

def initialize_embeddings(client, model_id):
    """Create a BedrockEmbeddings object for the given client and model ID.

    Args:
        client: Client object handed to ``BedrockEmbeddings`` as ``client``.
        model_id: Embedding model identifier handed to ``BedrockEmbeddings``.

    Returns:
        The ``BedrockEmbeddings`` instance, or ``None`` if construction raised
        (the error is printed).
    """
    try:
        bedrock_embeddings = BedrockEmbeddings(model_id=model_id, client=client)
        print("Embeddings model initialized successfully.")
    except Exception as err:
        print(f"Error initializing embeddings model: {err}")
        return None
    return bedrock_embeddings

def create_semantic_chunker(embeddings_model, breakpoint_type):
    """Build a SemanticChunker that uses the given breakpoint threshold type.

    Args:
        embeddings_model: Embeddings object passed as the chunker's first argument.
        breakpoint_type: Value passed as ``breakpoint_threshold_type``.

    Returns:
        The ``SemanticChunker`` instance, or ``None`` if construction raised
        (the error is printed).
    """
    try:
        semantic_chunker = SemanticChunker(
            embeddings_model,
            breakpoint_threshold_type=breakpoint_type,
        )
        print(f"SemanticChunker created with breakpoint type: {breakpoint_type}.")
    except Exception as err:
        print(f"Error creating SemanticChunker: {err}")
        return None
    return semantic_chunker

def split_documents(chunker, documents):
    """Run ``chunker.split_documents`` on the documents.

    Args:
        chunker: Object exposing ``split_documents(documents)``.
        documents: The documents to hand to the chunker.

    Returns:
        Whatever the chunker returns, or an empty list if the call raised
        (the error is printed).
    """
    try:
        chunks = chunker.split_documents(documents)
        print("Documents split successfully.")
    except Exception as err:
        print(f"Error splitting documents: {err}")
        chunks = []
    return chunks

# Main execution section
def get_semantic(breakpoint_types=("percentile", "standard_deviation", "interquartile", "gradient")):
    """Run semantic chunking once per breakpoint strategy and print a sample.

    Reads the notebook globals ``boto3_bedrock_runtime_client``,
    ``bedrock_embed_model_id`` and ``documents``.

    Args:
        breakpoint_types: Breakpoint threshold types to try, in order. Defaults
            to the four strategies demonstrated in this notebook; pass a shorter
            sequence (e.g. ``["percentile"]``) to run only some of them.

    Returns:
        None. The first two splits of each strategy are printed.
    """
    # Step 1: Initialize the embeddings model once; it is shared by every strategy
    embeddings_model = initialize_embeddings(boto3_bedrock_runtime_client, bedrock_embed_model_id)
    if embeddings_model is None:
        return  # Nothing can be chunked without an embeddings model

    for brk_typ in breakpoint_types:
        # Step 2: Create the SemanticChunker for this strategy
        semantic_text_splitter = create_semantic_chunker(embeddings_model, brk_typ)
        if semantic_text_splitter is None:
            # Skip only this strategy: one failing breakpoint type should not
            # hide the output of the remaining ones.
            continue

        # Step 3: Split documents
        semantic_text_splits = split_documents(semantic_text_splitter, documents)

        # Display the first two splits for verification, followed by two blank lines
        print(semantic_text_splits[:2])
        print()
        print()

get_semantic()

# Hierarchical chunking Strategy 

Hierarchical Chunking is a technique that organizes documents into parent and child chunks, creating a structured hierarchy. This approach allows models to better understand relationships between different parts of a document, resulting in more contextually relevant and coherent responses from large language models (LLMs).

Using the HierarchicalNodeParser from the llama-index library, documents are segmented into multiple layers of chunks, where each smaller chunk (child) maintains a reference to its larger containing chunk (parent). This hierarchical structure preserves context, even when processing individual sections separately.

When paired with the AutoMergingRetriever, the model can dynamically replace retrieved child nodes with their parent nodes if the majority of child nodes are retrieved. This ensures the model accesses a more complete and cohesive context, enhancing response quality.

<ins>HierarchicalNodeParser:</ins> Segments documents into a hierarchy of nodes, with each node referencing its parent, enabling structured contextual representation.

<ins>AutoMergingRetriever:</ins> Automatically substitutes retrieved child nodes with their parent to provide the model with a broader context for synthesizing responses.

<ins>Use Cases:</ins> This technique is particularly advantageous for managing large or complex documents, as it maintains the integrity and context of information across varying levels of granularity.

##### This code demonstrates a hierarchical chunking strategy using llama_index, progressively splitting text into smaller, contextually relevant chunks. The key parameter is:
##### chunk_sizes: A list of integers defining the hierarchical chunk sizes, e.g., [512, 254, 128].

In [None]:
# Important package for Hierarchical chunking
from llama_index.core import SimpleDirectoryReader
from llama_index.core.node_parser import HierarchicalNodeParser

def load_documents(input_dir):
    """Read the contents of ``input_dir`` with llama_index's SimpleDirectoryReader.

    Args:
        input_dir: Directory path passed to ``SimpleDirectoryReader``.

    Returns:
        The result of ``load_data()``, or ``None`` if creating the reader or
        loading raised (the error is printed).
    """
    try:
        loaded = SimpleDirectoryReader(input_dir=input_dir).load_data()
        print("Documents loaded successfully.")
    except Exception as err:
        print(f"Error loading documents: {err}")
        return None
    return loaded

def initialize_hierarchical_node_parser(chunk_sizes):
    """Create a HierarchicalNodeParser via ``from_defaults`` for the given sizes.

    Args:
        chunk_sizes: Chunk sizes passed as ``chunk_sizes`` to
            ``HierarchicalNodeParser.from_defaults``.

    Returns:
        The parser, or ``None`` if construction raised (the error is printed).
    """
    try:
        parser = HierarchicalNodeParser.from_defaults(chunk_sizes=chunk_sizes)
        print(f"HierarchicalNodeParser initialized with chunk sizes: {chunk_sizes}")
    except Exception as err:
        print(f"Error initializing HierarchicalNodeParser: {err}")
        return None
    return parser

def chunk_documents(node_parser, documents):
    """Turn documents into nodes with ``node_parser.get_nodes_from_documents``.

    Args:
        node_parser: Object exposing ``get_nodes_from_documents(documents)``.
        documents: The documents to hand to the parser.

    Returns:
        Whatever the parser returns, or ``None`` if the call raised
        (the error is printed).
    """
    try:
        parsed = node_parser.get_nodes_from_documents(documents)
        print("Documents chunked successfully.")
    except Exception as err:
        print(f"Error chunking documents: {err}")
        parsed = None
    return parsed

def display_node_content(nodes, num_nodes=2):
    """Print a preview of the leading nodes.

    Args:
        nodes: Sequence of objects with a ``text`` attribute.
        num_nodes: Maximum number of nodes to show (default 2).

    Each preview is the first 500 characters of the node's text followed by
    "...". Any exception raised while printing is reported, not propagated.
    """
    try:
        # Never index past the end when fewer nodes exist than requested
        limit = min(num_nodes, len(nodes))
        for position in range(limit):
            preview = nodes[position].text[:500]
            print(f"Node {position} content: {preview}...")
    except Exception as err:
        print(f"Error displaying node content: {err}")

# Main execution function
def get_hierarchical():
    """Run the hierarchical chunking demo end to end.

    Loads documents from the notebook-level ``data_directory``, builds a
    hierarchical node parser for the chunk sizes ``[512, 254, 128]``, chunks
    the documents and prints a preview of the first nodes. The demo stops
    quietly as soon as any stage reports failure by returning ``None``.
    """
    level_sizes = [512, 254, 128]  # Chunk sizes handed to the hierarchical parser

    # Stage 1: read the source files
    source_docs = load_documents(data_directory)
    if source_docs is None:
        return

    # Stage 2: build the parser
    parser = initialize_hierarchical_node_parser(level_sizes)
    if parser is None:
        return

    # Stage 3: chunk, then Stage 4: preview the result when chunking succeeded
    parsed_nodes = chunk_documents(parser, source_docs)
    if parsed_nodes is not None:
        display_node_content(parsed_nodes)


get_hierarchical()

# End of Notebook

#### <ins>Step 1</ins> 

##### Please ensure that you close the kernel after using this notebook to avoid any potential charges to your account.

##### Process: Open the "Kernel" menu at the top and choose "Shut Down Kernel".
##### Refer https://docs.aws.amazon.com/sagemaker/latest/dg/studio-ui.html