# Markdown Chunking for RAG System

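This notebook implements text chunking specifically optimized for Markdown content in a retrieval-augmented generation (RAG) system. It processes markdown files from `data/markdown_clean_final` (configured as `INPUT_MARKDOWN_DIR`) and creates coherent, overlapping chunks that preserve the structure of the documents for optimal RAG performance. The resulting chunks are saved as JSON files in the directory configured as `OUTPUT_CHUNKS_DIR`.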

In [None]:
# Import required libraries
import os
import json
import time
import pandas as pd
from pathlib import Path
from tqdm.notebook import tqdm # For progress bars in Jupyter
import re
import logging # Added for better logging
from datetime import datetime

# Import LangChain's Markdown text splitter and Document object
from langchain_text_splitters import MarkdownTextSplitter
from langchain_core.documents import Document # For creating LangChain Document objects

In [None]:
# --- Configuration ---
# Filesystem locations (relative paths are resolved against the working directory).
INPUT_MARKDOWN_DIR = "../data/markdown_clean_final"  # cleaned markdown files to be chunked
OUTPUT_CHUNKS_DIR = "../data/chunks"  # where the chunked JSON files are written
LOGS_DIR = "../logs"  # where log files are written

# Chunking parameters, both measured in characters.
CHUNK_SIZE = 500  # characters per chunk
CHUNK_OVERLAP = 150  # characters shared between neighbouring chunks to maintain context

In [None]:
# Make sure the chunk and log directories are present before anything writes to them
for required_dir in (OUTPUT_CHUNKS_DIR, LOGS_DIR):
    Path(required_dir).mkdir(parents=True, exist_ok=True)

In [None]:
# Set up logging
# Every run logs both to a fresh timestamped file in LOGS_DIR and to the
# notebook output stream.
log_file = Path(LOGS_DIR) / f"markdown_chunking_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log"
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(module)s - %(funcName)s - %(message)s',
    handlers=[
        logging.FileHandler(log_file, encoding='utf-8'),
        logging.StreamHandler()
    ],
    # basicConfig() silently does nothing when the root logger already has
    # handlers (for example when this cell is re-run, or when logging was
    # configured earlier in the session). force=True replaces the existing
    # handlers so that `log_file` is really the file being written.
    force=True,
)
logger = logging.getLogger(__name__)

In [None]:
def parse_frontmatter(content):
    """
    Parse markdown frontmatter without external YAML dependencies.

    The frontmatter must open with a line consisting solely of ``---`` at the
    very start of the document and must be closed by a later line consisting
    solely of ``---``. Delimiters are matched per line, so a ``---`` that
    appears inside a value (``title: a --- b``) or a longer run of dashes
    (``----``) is not mistaken for a delimiter.

    Only flat ``key: value`` lines are understood. Blank lines, lines without
    a colon, comment lines starting with ``#`` and lines with an empty key are
    skipped. A value wrapped in a matching pair of single or double quotes has
    the quotes removed. All values are kept as strings.

    Args:
        content (str): Markdown content, optionally starting with YAML frontmatter

    Returns:
        tuple: (metadata_dict, content_without_frontmatter). When no valid
        frontmatter is found, the result is ``({}, content)`` with ``content``
        returned unchanged; otherwise the remaining document is returned with
        surrounding whitespace stripped.
    """
    lines = content.splitlines(keepends=True)

    if not lines or lines[0].rstrip() != '---':
        logger.debug("No frontmatter found (doesn't start with '---').")
        return {}, content

    # Locate the closing delimiter: the first later line that is exactly '---'.
    closing_index = next(
        (idx for idx, line in enumerate(lines[1:], start=1) if line.strip() == '---'),
        None,
    )
    if closing_index is None:
        logger.debug("Invalid frontmatter structure (not enough '---' separators).")
        return {}, content  # Not a valid frontmatter structure

    content_text = ''.join(lines[closing_index + 1:]).strip()  # The rest of the document

    metadata = {}
    for line in lines[1:closing_index]:
        line = line.strip()
        if not line or line.startswith('#') or ':' not in line:
            continue

        # Split on the first colon only so values such as URLs stay intact.
        key, _, value = line.partition(':')
        key = key.strip()
        value = value.strip()
        if not key:
            continue

        # Remove one pair of matching quotes (single or double). The length
        # check keeps a lone quote character from being treated as a pair.
        if len(value) >= 2 and value[0] == value[-1] and value[0] in ('"', "'"):
            value = value[1:-1]

        metadata[key] = value

    logger.debug(f"Parsed frontmatter: {metadata}")
    return metadata, content_text

In [None]:
def create_markdown_splitter(chunk_size=500, chunk_overlap=150):
    """
    Build the MarkdownTextSplitter used to chunk documents.

    Args:
        chunk_size (int): Forwarded to the splitter as ``chunk_size``.
        chunk_overlap (int): Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        MarkdownTextSplitter: The newly constructed splitter.
    """
    splitter_options = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
    splitter = MarkdownTextSplitter(**splitter_options)
    logger.info(f"MarkdownTextSplitter created with chunk_size={chunk_size}, chunk_overlap={chunk_overlap}")
    return splitter

In [None]:
def extract_file_category(file_path):
    """
    Extract category from file path based on filename pattern.
    For example: people_*.md -> 'people', news_*.md -> 'news', etc.

    The category is the part of the filename before the first underscore.

    Args:
        file_path (str or Path): Path to the file; only its final component
            (the filename) is inspected.

    Returns:
        str: The filename prefix before the first underscore, or "unknown"
        when the filename contains no underscore or the prefix is empty.
    """
    prefix, separator, _ = Path(file_path).name.partition('_')
    # Without an underscore the filename does not follow the
    # '<category>_<rest>' pattern, and a leading underscore would give an
    # empty category; both cases are reported as "unknown".
    if not separator or not prefix:
        return "unknown"
    return prefix

In [None]:
def split_markdown_file(file_path, markdown_splitter):
    """
    Read one markdown file and turn it into metadata-carrying chunks.

    The frontmatter is parsed into base metadata, which is extended with the
    file's absolute path, filename, filename-derived category and (when
    present) the first level-1 heading of the body. The body is then split
    with ``markdown_splitter`` and every piece receives a copy of that
    metadata plus its own chunk id, position and the total chunk count.

    Args:
        file_path (str or Path): Path to the markdown file
        markdown_splitter (MarkdownTextSplitter): Configured markdown splitter

    Returns:
        list: LangChain Document objects, one per chunk. If building a
        Document fails, a plain dict with the same content and an
        'error_creating_document' flag is stored instead. An empty list is
        returned when the file cannot be read or the text cannot be split.
    """
    source_path = Path(file_path)
    logger.info(f"Processing file: {source_path}")

    try:
        raw_text = source_path.read_text(encoding='utf-8')
    except Exception as e:
        logger.error(f"Could not read file {source_path}: {e}")
        return []

    base_metadata, body = parse_frontmatter(raw_text)

    # Source information is applied after the frontmatter, so it takes
    # precedence over frontmatter keys with the same names.
    base_metadata.update({
        'source_file': str(source_path.resolve()),  # absolute path
        'filename': source_path.name,
        'category': extract_file_category(source_path),
    })

    # Look for the first level-1 heading ("# Title") in the body
    heading = re.search(r'^\s*#\s+(.+?)(?:\n|$)', body, re.MULTILINE)
    if heading is None:
        logger.debug("No main heading found in content body.")
    else:
        base_metadata['main_heading'] = heading.group(1).strip()
        logger.debug(f"Extracted main_heading: {base_metadata['main_heading']}")

    try:
        pieces = markdown_splitter.split_text(body)
        logger.info(f"Split '{source_path.name}' into {len(pieces)} text chunks.")
    except Exception as e:
        logger.error(f"Error splitting text for {source_path.name}: {e}", exc_info=True)
        return []

    documents = []
    for index, piece in enumerate(pieces):
        # Each chunk gets its own metadata dict: the shared base entries
        # followed by the chunk-specific ones.
        piece_metadata = {
            **base_metadata,
            'chunk_id': f"{source_path.stem}_{index}",
            'chunk_number': index,
            'total_chunks_in_file': len(pieces),
        }

        try:
            documents.append(Document(page_content=piece.strip(), metadata=piece_metadata))
        except Exception as e:
            logger.error(f"Error creating Document object for chunk {index} in {source_path.name}: {e}", exc_info=True)
            # Keep the chunk as a plain dict so its content is not lost
            documents.append({
                'page_content': piece.strip(),
                'metadata': piece_metadata,
                'error_creating_document': True
            })

    return documents

In [None]:
def process_markdown_directory(input_dir_path_str, output_dir_path_str,
                               chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP):
    """
    Process all markdown files in a directory, chunk them, and save chunks.

    Every ``*.md`` file found under the input directory (recursively) is
    chunked. Chunks are saved to individual JSON files (one per original
    Markdown file, named ``<stem>_chunks.json`` and placed in a subdirectory
    mirroring the input layout) and also aggregated into a single
    'all_chunks_aggregated.json' file in the output directory.

    Files are processed in sorted path order so that the aggregated file has
    the same ordering on every run.

    Args:
        input_dir_path_str (str or Path): Directory searched for markdown files.
        output_dir_path_str (str or Path): Directory receiving the JSON output;
            created if it does not exist.
        chunk_size (int): Chunk size passed to the markdown splitter.
        chunk_overlap (int): Chunk overlap passed to the markdown splitter.

    Returns:
        tuple: (number of files that produced at least one chunk,
        total number of chunks created). ``(0, 0)`` when no markdown files
        are found. Failures to write an output file are logged and do not
        change the returned counts.
    """
    input_path = Path(input_dir_path_str)
    output_path = Path(output_dir_path_str)
    output_path.mkdir(parents=True, exist_ok=True)  # Ensure output directory exists

    markdown_splitter = create_markdown_splitter(chunk_size, chunk_overlap)

    # Recursively find all .md files in all subdirectories of input_path.
    # rglob() yields paths in no particular order, so sort for reproducibility.
    markdown_files = sorted(input_path.rglob("*.md"))
    logger.info(f"Found {len(markdown_files)} markdown files in '{input_path}' and its subdirectories.")

    if not markdown_files:
        logger.warning(f"No markdown files found in {input_path}. Exiting.")
        return 0, 0

    all_processed_chunks_list = []  # To store all chunks from all files
    total_files_processed = 0

    # Use tqdm for a progress bar
    for md_file_path in tqdm(markdown_files, desc="Processing Markdown Files"):
        file_chunks_data = split_markdown_file(md_file_path, markdown_splitter)

        if not file_chunks_data:
            logger.warning(f"No chunks generated for {md_file_path.name}.")
            continue

        total_files_processed += 1

        # Convert Document objects to dictionaries for JSON serialization.
        # Items that are already dicts (fallback from a Document creation
        # error) are passed through unchanged.
        file_chunks_serializable = [
            {"page_content": chunk_item.page_content, "metadata": chunk_item.metadata}
            if isinstance(chunk_item, Document)
            else chunk_item
            for chunk_item in file_chunks_data
        ]
        all_processed_chunks_list.extend(file_chunks_serializable)

        # Save chunks for each individual file in a subdirectory structure
        # of the output directory that mirrors the input directory.
        relative_path = md_file_path.relative_to(input_path)
        individual_chunk_output_dir = output_path / relative_path.parent
        individual_chunk_output_dir.mkdir(parents=True, exist_ok=True)
        output_json_path = individual_chunk_output_dir / f"{md_file_path.stem}_chunks.json"

        try:
            with open(output_json_path, "w", encoding="utf-8") as f:
                json.dump(file_chunks_serializable, f, indent=2, ensure_ascii=False)
            logger.info(f"Saved {len(file_chunks_serializable)} chunks for {md_file_path.name} to {output_json_path}")
        except Exception as e:
            logger.error(f"Could not save chunks for {md_file_path.name} to {output_json_path}: {e}", exc_info=True)

    total_chunks_created = len(all_processed_chunks_list)
    logger.info(f"Successfully processed {total_files_processed} files, creating a total of {total_chunks_created} chunks.")

    # Save all chunks to a single aggregated file
    if all_processed_chunks_list:
        all_chunks_aggregated_file_path = output_path / "all_chunks_aggregated.json"
        try:
            with open(all_chunks_aggregated_file_path, "w", encoding="utf-8") as f:
                json.dump(all_processed_chunks_list, f, indent=2, ensure_ascii=False)  # Use indent for readability
            logger.info(f"Saved all {total_chunks_created} aggregated chunks to {all_chunks_aggregated_file_path}")
        except Exception as e:
            logger.error(f"Could not save aggregated chunks file: {e}", exc_info=True)

    return total_files_processed, total_chunks_created

In [None]:
def analyze_chunk_distribution(all_chunks_aggregated_file_path_str):
    """
    Summarise and plot the chunks stored in the aggregated JSON file.

    Logs the number of chunks, the mean/min/max chunk length and the number
    of chunks per category, then shows a histogram of chunk lengths and a bar
    chart of chunks per category.

    Args:
        all_chunks_aggregated_file_path_str (str or Path): Path to the
            aggregated chunks JSON file.

    Returns:
        pandas.DataFrame or None: One row per chunk with the columns
        'length' and 'category'. None when the file is missing, cannot be
        loaded, or contains no chunks.
    """
    import matplotlib.pyplot as plt

    aggregated_path = Path(all_chunks_aggregated_file_path_str)
    if not aggregated_path.exists():
        logger.error(f"Aggregated chunks file not found: {aggregated_path}")
        return None

    try:
        records = json.loads(aggregated_path.read_text(encoding="utf-8"))
    except Exception as e:
        logger.error(f"Could not load or parse aggregated chunks file {aggregated_path}: {e}")
        return None

    if not records:
        logger.info("No chunks found in the aggregated file for analysis.")
        return None

    # Chunk text is read from 'page_content', falling back to 'content'
    chunk_lengths = []
    for record in records:
        text = record.get("page_content", record.get("content", ""))
        chunk_lengths.append(len(text))

    categories = []
    for record in records:
        categories.append(record.get("metadata", {}).get("category", "unknown"))

    df = pd.DataFrame({"length": chunk_lengths, "category": categories})
    length_column = df["length"]

    logger.info("\n--- Chunk Analysis ---")
    logger.info(f"Total chunks analyzed: {len(records)}")
    logger.info(f"Average chunk length: {length_column.mean():.2f} characters")
    logger.info(f"Min chunk length: {length_column.min()} characters")
    logger.info(f"Max chunk length: {length_column.max()} characters")

    category_counts = df["category"].value_counts()
    logger.info("\nCategory distribution of chunks:")
    for category_name, chunk_count in category_counts.items():
        logger.info(f"  - {category_name}: {chunk_count}")

    # Histogram of chunk lengths
    plt.figure(figsize=(12, 7))
    plt.hist(chunk_lengths, bins=50, edgecolor='black')
    plt.xlabel("Chunk Length (characters)")
    plt.ylabel("Number of Chunks")
    plt.title("Distribution of Chunk Lengths")
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.show()

    # Bar chart of chunks per category
    if category_counts.empty:
        logger.info("No categories found for plotting category distribution.")
    else:
        plt.figure(figsize=(12, 8))
        category_counts.plot(kind="bar", color='skyblue', edgecolor='black')
        plt.xlabel("Category")
        plt.ylabel("Number of Chunks")
        plt.title("Number of Chunks by Category")
        plt.xticks(rotation=45, ha="right")
        plt.tight_layout()
        plt.grid(True, axis='y', linestyle='--', alpha=0.7)
        plt.show()

    return df

In [None]:
# Main execution
# Runs the full pipeline: chunk every markdown file under INPUT_MARKDOWN_DIR,
# write the results to OUTPUT_CHUNKS_DIR, then analyse the aggregated output.
logger.info("--- Starting Markdown Chunking Process ---")
chunking_start_time = time.time()

# Ensure the input directory exists
if not Path(INPUT_MARKDOWN_DIR).exists():
    logger.error(f"Input directory '{INPUT_MARKDOWN_DIR}' does not exist.")
else:
    files_processed_count, chunks_created_count = process_markdown_directory(
        INPUT_MARKDOWN_DIR,
        OUTPUT_CHUNKS_DIR,
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP
    )

    # The analysis is only run when at least one chunk was created
    if chunks_created_count > 0:
        # Analyze the chunk distribution from the aggregated file
        aggregated_file = Path(OUTPUT_CHUNKS_DIR) / "all_chunks_aggregated.json"
        chunk_stats_df = analyze_chunk_distribution(aggregated_file)
        if chunk_stats_df is not None:
            logger.info("\nChunk statistics DataFrame head:")
            # NOTE(review): `display` is not imported in this file; presumably
            # the IPython/Jupyter builtin - confirm before running as a script.
            display(chunk_stats_df.head())

# The elapsed time is logged on every path, including when the input
# directory was missing and nothing was processed.
chunking_end_time = time.time()
elapsed_time = chunking_end_time - chunking_start_time
logger.info(f"--- Markdown Chunking Process Completed in {elapsed_time:.2f} seconds ---")