## Jupyter Notebook: `chunking_system_analysis.ipynb`

This notebook provides an interactive interface to test and analyze the document chunking system.

### **Cell 0: Install Requirements**

- This cell ensures all necessary Python packages are installed from `requirements.txt`.



In [None]:
# Cell 0: Install Requirements
# Ensure you are in the project's root directory or adjust path accordingly
# This command assumes requirements.txt is in the notebook's parent directory
# or the project root if the notebook is run from there.
!pip install -r requirements.txt --index-url https://download.pytorch.org/whl/cpu

# If 'uv' is preferred and available in your environment, you could use:
# !uv pip install -r requirements.txt --index-url https://download.pytorch.org/whl/cpu

# This ensures the kernel has access to all installed packages immediately.
import sys
import os

# Define project root for consistent pathing
# This tries to find the project root from common Jupyter contexts
current_dir = os.getcwd()
if os.path.basename(current_dir) == 'src': # If notebook is in src/
    project_root = os.path.abspath(os.path.join(current_dir, '..'))
elif os.path.basename(os.path.basename(current_dir)) == 'chunking_system_analysis.ipynb': # If notebook in data/input or similar
    project_root = os.path.abspath(os.path.join(current_dir, '..', '..'))
else: # Assume notebook is in project root
    project_root = current_dir

# Add the project root and src directory to sys.path
if project_root not in sys.path:
    sys.path.insert(0, project_root)
if os.path.join(project_root, 'src') not in sys.path:
    sys.path.insert(0, os.path.join(project_root, 'src'))

print("Requirements installation attempted.")
print(f"Project root set to: {project_root}")
print(f"sys.path updated: {sys.path}")

### **Cell 1: Setup and Imports**

- Import the necessary modules from our system (`HybridMarkdownChunker`, `ChunkQualityEvaluator`, `MetadataEnricher`, `config`, `FileHandler`).

- Set up paths to input/output directories.

- Display a reminder about `.env` file for API keys.


In [None]:
# Cell 1: Setup and Imports
import os
import sys
import asyncio
from IPython.display import display, Markdown, JSON # For richer display in Jupyter

# Resolve the project base directory from the current working directory:
# when the notebook runs from inside 'src', the project root is one level up;
# otherwise the working directory itself is treated as the project root.
if os.path.basename(os.getcwd()) == 'src':
    base_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))
else:
    base_dir = os.getcwd()

# `project_root` is kept as a notebook-level name and always matches
# `base_dir`. It was previously the parent of the working directory, which
# pointed outside the project whenever the notebook ran from the project root.
project_root = base_dir

# Make both the project root and its 'src' directory importable so that the
# chunkers.*, utils.* and config.* imports below resolve.
for _import_dir in (project_root, os.path.join(base_dir, 'src')):
    if _import_dir not in sys.path:
        sys.path.insert(0, _import_dir)

# Now import our modules
from chunkers.hybrid_chunker import HybridMarkdownChunker
from chunkers.evaluators import ChunkQualityEvaluator
from utils.file_handler import FileHandler
from utils.metadata_enricher import MetadataEnricher
from config.settings import config
from langchain_core.documents import Document

# Define file paths (relative to project root for consistency)
config.INPUT_DIR = os.path.join(base_dir, "data", "input", "markdown_files")
config.OUTPUT_DIR = os.path.join(base_dir, "data", "output")
config.TEMP_DIR = os.path.join(base_dir, "data", "temp")

# Create the directories used by the later cells. INPUT_DIR is included
# because the example cells write their dummy documents into it.
os.makedirs(config.INPUT_DIR, exist_ok=True)
os.makedirs(os.path.join(config.OUTPUT_DIR, "chunks"), exist_ok=True)
os.makedirs(os.path.join(config.OUTPUT_DIR, "reports"), exist_ok=True)

print("System modules and configurations loaded.")
print(f"Project base directory set to: {base_dir}")
print(f"LLM Metadata Enrichment Enabled: {config.ENABLE_LLM_METADATA_ENRICHMENT}")
print(f"LLM Image Description Enabled: {config.ENABLE_LLM_IMAGE_DESCRIPTION}")
if not config.GEMINI_API_KEY:
    display(Markdown("--- \n**WARNING: `GEMINI_API_KEY` is not set in your `.env` file.** \nLLM-based summaries and image descriptions will use mock data. \nTo enable live LLM calls, please set `GEMINI_API_KEY=\"YOUR_KEY_HERE\"` in your `.env` file. \n---"))

# Initialize core components
chunker = HybridMarkdownChunker(enable_semantic=True) # Semantic chunking enabled by default for testing
evaluator = ChunkQualityEvaluator()
metadata_enricher = MetadataEnricher()

### **Cell 2: Helper Functions for Notebook Display**

- Functions to easily display chunks and reports.


In [None]:
# Cell 2: Helper Functions for Notebook Display
# `List` is needed by the annotation below and is not imported by any earlier
# cell; without this import the `def` statement raises NameError.
from typing import List


def display_chunks(chunks: List[Document], title="Generated Chunks"):
    """Render each chunk and selected metadata as Markdown in the notebook output.

    Args:
        chunks: Documents to display; each one's ``metadata`` mapping and
            ``page_content`` string are read.
        title: Heading shown above the listing, followed by the chunk count.
    """
    display(Markdown(f"### {title} ({len(chunks)} Chunks)"))
    for i, chunk in enumerate(chunks):
        display(Markdown(f"#### Chunk {i} (Index: {chunk.metadata.get('chunk_index')})"))
        display(Markdown(f"**Content Type:** `{chunk.metadata.get('content_type', 'text')}`"))
        display(Markdown(f"**Chunk Strategy:** `{chunk.metadata.get('chunking_strategy', 'N/A')}`"))
        display(Markdown(f"**Tokens:** {chunk.metadata.get('chunk_tokens')}, **Words:** {chunk.metadata.get('word_count')}"))
        # The next three fields are optional and shown only when present.
        if 'llm_summary' in chunk.metadata:
            display(Markdown(f"**LLM Summary:** {chunk.metadata['llm_summary']}"))
        if 'image_alt_text' in chunk.metadata:
            display(Markdown(f"**Original Image Alt Text:** {chunk.metadata['image_alt_text']}"))
        if 'image_url' in chunk.metadata:
            display(Markdown(f"**Original Image URL:** {chunk.metadata['image_url']}"))

        # Show the raw chunk text inside a fenced block, then a separator.
        display(Markdown("```markdown\n" + chunk.page_content + "\n```"))
        display(Markdown("---"))

def display_report(report_content: str):
    """Show a fixed report heading followed by *report_content*, both as Markdown."""
    for markdown_text in ("### Quality Evaluation Report", report_content):
        display(Markdown(markdown_text))

async def process_and_display(file_path: str, chunker_instance: HybridMarkdownChunker, evaluator_instance: ChunkQualityEvaluator, enricher_instance: MetadataEnricher):
    """Chunk one file, enrich the chunks, then display the chunks and a quality report.

    Args:
        file_path: Path of the UTF-8 text file to read.
        chunker_instance: Object whose awaitable ``chunk_document(content, metadata)`` yields the chunks.
        evaluator_instance: Object whose ``generate_report(chunks)`` result is passed to ``display_report``.
        enricher_instance: Object whose awaitable ``enrich_chunks_with_llm_summaries(chunks)`` yields the enriched chunks.

    Returns:
        The enriched chunks, so the caller can inspect them further.
    """
    doc_name = os.path.basename(file_path)
    print(f"\n--- Processing Document: {doc_name} ---")

    with open(file_path, 'r', encoding='utf-8') as source:
        document_text = source.read()

    # Metadata handed to the chunker together with the document text.
    base_metadata = {'source_file': doc_name, 'document_type': 'markdown'}
    raw_chunks = await chunker_instance.chunk_document(document_text, base_metadata)
    print(f"Initial chunking generated {len(raw_chunks)} chunks.")

    final_chunks = await enricher_instance.enrich_chunks_with_llm_summaries(raw_chunks)
    print(f"Enrichment completed for {len(final_chunks)} chunks.")

    display_chunks(final_chunks, f"Chunks from {doc_name}")
    display_report(evaluator_instance.generate_report(final_chunks))

    return final_chunks

### **Cell 3: Chunking Example - Prose Document (`sample_prose_document.md`)**

- Run chunking and evaluation for a prose document.


In [None]:
# Cell 3: Chunking Example - Prose Document
PROSE_FILE = os.path.join(config.INPUT_DIR, "sample_prose_document.md")

# Ensure dummy file exists for demonstration
if not os.path.exists(PROSE_FILE):
    print(f"Creating dummy prose test file: {PROSE_FILE}")
    dummy_content_prose = """
# The Future of Artificial Intelligence

Artificial intelligence (AI) is rapidly transforming industries and daily life. From self-driving cars to advanced medical diagnostics, AI's capabilities are expanding at an unprecedented pace. This technology promises to solve complex problems, enhance efficiency, and unlock new frontiers of innovation. However, its development also presents ethical and societal challenges that require careful consideration.

## Machine Learning and Deep Learning

At the core of many AI advancements are machine learning (ML) and deep learning (DL). Machine learning algorithms enable systems to learn from data without explicit programming. Deep learning, a subset of ML, uses neural networks with multiple layers to learn complex patterns. These techniques are behind breakthroughs in image recognition, natural language processing, and predictive analytics. The sheer volume of data available today fuels these algorithms, allowing them to achieve remarkable accuracy.

### The Role of Large Language Models (LLMs)

Large Language Models (LLMs) like Gemini, GPT, and Llama have revolutionized natural language understanding and generation. These models are trained on vast datasets of text and code, allowing them to perform tasks such as translation, summarization, and creative writing. They represent a significant leap towards more human-like AI interactions. The ability of LLMs to generate coherent and contextually relevant text has opened new possibilities for applications in customer service, content creation, and education.

## Ethical Considerations and Societal Impact

The increasing power of AI also brings forth critical ethical considerations. Issues such as algorithmic bias, privacy concerns, job displacement, and the potential for misuse of AI technologies are paramount. Ensuring fairness, transparency, and accountability in AI systems is essential for their responsible deployment. Discussions around AI governance and regulation are gaining momentum as societies grapple with the profound impact of these technologies.

### AI in Healthcare

AI is poised to transform healthcare by assisting with drug discovery, personalized treatment plans, and early disease detection. AI-powered tools can analyze vast amounts of patient data to identify patterns that human doctors might miss. This can lead to more accurate diagnoses and more effective interventions, ultimately improving patient outcomes. The integration of AI into healthcare systems requires robust validation and careful integration to ensure patient safety and data security.

## Conclusion

The journey of AI is far from over. As researchers continue to push the boundaries of what's possible, AI will undoubtedly become even more integrated into the fabric of society. Navigating its complexities, harnessing its potential, and mitigating its risks will be a collective endeavor, requiring collaboration across technology, policy, and ethics. The future is exciting, but it also demands thoughtful stewardship of this powerful technology.
    """
    with open(PROSE_FILE, 'w', encoding='utf-8') as f:
        f.write(dummy_content_prose)

prose_chunks = await process_and_display(PROSE_FILE, chunker, evaluator, metadata_enricher)


### **Cell 4: Chunking Example - Table Document (`sample_table_document.md`)**

- Run chunking and evaluation for a document containing tables.


In [None]:
# Cell 4: Chunking Example - Table Document
TABLE_FILE = os.path.join(config.INPUT_DIR, "sample_table_document.md")

# Ensure dummy file exists for demonstration
if not os.path.exists(TABLE_FILE):
    print(f"Creating dummy table test file: {TABLE_FILE}")
    dummy_content_table = """
# Document with Tables

This is some introductory text before a table.

| Header 1 | Header 2 | Header 3 |
|----------|----------|----------|
| Row 1 Col 1 | Row 1 Col 2 | Row 1 Col 3 |
| Row 2 Col 1 | Row 2 Col 2 | Row 2 Col 3 |
| This is a very long piece of text that spans multiple words in a single cell, to test how the chunker handles long content within a table. It should ideally split this row if it exceeds the token limit set for table chunks. | Another cell | Last cell of a long row |
| Row 4 Col 1 | Row 4 Col 2 | Row 4 Col 3 |

Some text after the first table.

## Another Section with a Small Table

| Key | Value |
|-----|-------|
| Apple | Fruit |
| Carrot| Veggie|

End of document.
    """
    with open(TABLE_FILE, 'w', encoding='utf-8') as f:
        f.write(dummy_content_table)

table_chunks = await process_and_display(TABLE_FILE, chunker, evaluator, metadata_enricher)


### **Cell 5: Chunking Example - Image Document (`sample_image_document.md`)**

- Run chunking and evaluation for a document containing images.


In [None]:
# Cell 5: Chunking Example - Image Document
IMAGE_FILE = os.path.join(config.INPUT_DIR, "sample_image_document.md")

# Ensure dummy file exists for demonstration
if not os.path.exists(IMAGE_FILE):
    print(f"Creating dummy image test file: {IMAGE_FILE}")
    dummy_content_image = """
# Document with Images

This document contains text and some images that need to be processed.

## Section 1: Introduction to AI

Artificial intelligence (AI) is transforming many aspects of our lives. From smart assistants to complex data analysis, AI is becoming increasingly prevalent.

Here's an example of an AI model's architecture:
![Neural Network Architecture](https://placehold.co/600x400/FF0000/FFFFFF?text=Neural%20Network)
A visual representation of a deep neural network, showing layers of interconnected nodes.

## Section 2: Data Visualization

Data visualization is key to understanding complex datasets. Charts and graphs help us interpret trends and patterns.

This chart illustrates market trends over the last quarter:
![Market Trends Chart](https://placehold.co/800x500/00FF00/000000?text=Market%20Trends%20Q1)
A bar chart depicting sales performance across different product categories for the first quarter.

## Conclusion

Images play a crucial role in conveying information effectively.
    """
    with open(IMAGE_FILE, 'w', encoding='utf-8') as f:
        f.write(dummy_content_image)

# For image processing, ensure ENABLE_LLM_IMAGE_DESCRIPTION is True in settings.py
# and GEMINI_API_KEY is set in .env for live LLM calls.
image_chunks = await process_and_display(IMAGE_FILE, chunker, evaluator, metadata_enricher)



### **Cell 6: Interactive Experimentation / Custom Runs**

- Instructions on how to modify parameters and run custom tests.


In [None]:
# Cell 6: Interactive Experimentation / Custom Runs
display(Markdown("### Interactive Experimentation / Custom Runs"))
display(Markdown("""
This section demonstrates how to create new `HybridMarkdownChunker` instances with different parameters
and process documents to see the effects.

**Important Note:** If you modify values in `src/config/settings.py` (e.g., `DEFAULT_CHUNK_SIZE`, `SEMANTIC_SIMILARITY_THRESHOLD`, `ENABLE_LLM_IMAGE_DESCRIPTION`), you **MUST restart your Jupyter Kernel** (Kernel -> Restart) and then re-run **Cell 1** to ensure the changes are loaded.
"""))

# --- Experiment 1: Change Chunk Size for Prose Document ---
display(Markdown("#### Experiment 1: Prose Document with Smaller Chunk Size"))

# Create a new chunker instance with a smaller chunk size for more splits
# This will use the new chunk_size directly, overriding the config.DEFAULT_CHUNK_SIZE for this instance
chunker_small_prose = HybridMarkdownChunker(chunk_size=200, enable_semantic=True) 

# Process the prose document with the new chunker
prose_chunks_small_size = await process_and_display(PROSE_FILE, chunker_small_prose, evaluator, metadata_enricher)

display(Markdown("---"))

# --- Experiment 2: Table Document with different TABLE_CHUNK_MAX_TOKENS ---
display(Markdown("#### Experiment 2: Table Document with Very Small Table Chunk Limit"))

# Create a new chunker instance with a very small table chunk limit
# This will force tables to split into very granular row-by-row chunks
chunker_tiny_tables = HybridMarkdownChunker(chunk_size=800, enable_semantic=True) 
# Note: For this to work, you'd typically modify `TABLE_CHUNK_MAX_TOKENS` in settings.py,
# restart kernel, and then rerun cell 1. This is just for demonstration.
# For immediate effect, you would need to either:
# 1. Directly override it in settings.py and restart.
# 2. Modify the chunker's internal config directly (less clean):
#    chunker_tiny_tables.config.TABLE_CHUNK_MAX_TOKENS = 20 # This would apply to all subsequent calls

# Let's directly process with the default chunker but explain the settings change
display(Markdown(f"""
*(To truly see extreme table splitting, you would set `config.TABLE_CHUNK_MAX_TOKENS = 20` in `src/config/settings.py`, restart the kernel, and rerun Cell 1 and this cell.)*
"""))
table_chunks_default_settings = await process_and_display(TABLE_FILE, chunker, evaluator, metadata_enricher)

display(Markdown("---"))

# --- Experiment 3: Disable Semantic Chunking for Prose Document ---
display(Markdown("#### Experiment 3: Prose Document with Semantic Chunking Disabled"))

# Create a new chunker instance with semantic chunking disabled
chunker_no_semantic = HybridMarkdownChunker(enable_semantic=False)

# Process the prose document with semantic chunking disabled
prose_chunks_no_semantic = await process_and_display(PROSE_FILE, chunker_no_semantic, evaluator, metadata_enricher)

display(Markdown("---"))


: 