In [10]:
# 02_chunking_strategy.ipynb - Adaptive Content Processing

import PyPDF2
import pandas as pd
import numpy as np
import re
import tiktoken
from pathlib import Path
import pickle
import warnings
warnings.filterwarnings('ignore')

# Setup
PROJECT_ROOT = Path.cwd().parent if 'notebooks' in str(Path.cwd()) else Path.cwd()
PDF_FILE = PROJECT_ROOT / 'data' / 'raw' / 'mastering_pandas_2025.pdf'
PROCESSED_DIR = PROJECT_ROOT / 'data' / 'processed'
PROCESSED_DIR.mkdir(exist_ok=True)

# Load content analysis from previous notebook
content_analysis = pd.read_csv(PROCESSED_DIR / 'content_analysis.csv')
print("Loaded content analysis:")
print(f"Total analyzed pages: {len(content_analysis)}")
print(f"High-value pages: {len(content_analysis[content_analysis['pandas_score'] > 0])}")

# Initialize tokenizer
tokenizer = tiktoken.get_encoding("cl100k_base")

def count_tokens(text):
    """Count tokens accurately"""
    return len(tokenizer.encode(text))

def clean_pdf_text(text):
    """Clean PDF extraction artifacts"""
    # Basic cleaning
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'([a-z])([A-Z])', r'\1 \2', text)
    
    # Fix common PDF artifacts
    text = re.sub(r'\bwher\s+e\b', 'where', text)
    text = re.sub(r'\btransfor\s+ms\b', 'transforms', text)
    text = re.sub(r'\bData\s+Frame\b', 'DataFrame', text)
    text = re.sub(r'\bdata\s+frame\b', 'DataFrame', text)
    text = re.sub(r'\bGroup\s+By\b', 'GroupBy', text)
    text = re.sub(r'\bgroup\s+by\b', 'groupby', text, flags=re.IGNORECASE)
    
    # Sentence boundaries
    text = re.sub(r'([.!?])\s*([A-Z])', r'\1 \2', text)
    text = re.sub(r'\n\s*\n', '\n\n', text)
    
    return text.strip()

def detect_content_features(text):
    """Analyze content characteristics for adaptive chunking"""
    
    # Code detection patterns
    code_patterns = [
        r'import\s+\w+', r'from\s+\w+\s+import', r'pd\.\w+', r'df\.\w+', 
        r'print\s*\(', r'=\s*pd\.', r'\.groupby\(', r'\.merge\(',
        r'\.iloc\[', r'\.loc\[', r'def\s+\w+', r'class\s+\w+'
    ]
    code_score = sum(len(re.findall(pattern, text, re.IGNORECASE)) for pattern in code_patterns)
    
    # Pandas concept detection
    pandas_concepts = [
        'DataFrame', 'Series', 'Index', 'groupby', 'merge', 'concat',
        'pivot', 'melt', 'apply', 'lambda', 'iloc', 'loc', 'query'
    ]
    concept_score = sum(text.lower().count(concept.lower()) for concept in pandas_concepts)
    
    # Structural markers
    has_headers = bool(re.search(r'^[A-Z][^.!?]*:?\s*$', text, re.MULTILINE))
    has_code_blocks = bool(re.search(r'```|>>>|\n\s*\w+\s*=', text))
    has_examples = bool(re.search(r'example|Example|for instance|For instance', text, re.IGNORECASE))
    
    return {
        'code_score': code_score,
        'concept_score': concept_score,
        'has_headers': has_headers,
        'has_code_blocks': has_code_blocks,
        'has_examples': has_examples,
        'is_code_heavy': code_score > 3,
        'is_concept_heavy': concept_score > 5
    }

def smart_text_segmentation(text):
    """Conservative text segmentation to preserve context"""
    
    # Try paragraph-based splitting first, but be more conservative
    paragraphs = [p.strip() for p in text.split('\n\n') if p.strip() and len(p.strip()) > 50]
    
    if len(paragraphs) >= 2:
        # Group small paragraphs together
        segments = []
        current_segment = ""
        
        for paragraph in paragraphs:
            test_segment = current_segment + "\n\n" + paragraph if current_segment else paragraph
            
            # Only split if segment would be very large
            if count_tokens(test_segment) > 800 and current_segment:
                segments.append(current_segment.strip())
                current_segment = paragraph
            else:
                current_segment = test_segment
        
        if current_segment:
            segments.append(current_segment.strip())
        
        return segments if len(segments) > 1 else [text]
    
    # If few paragraphs, try sentence-based but with larger groups
    sentences = re.split(r'(?<=[.!?])\s+(?=[A-Z])', text)
    
    if len(sentences) >= 3:
        segments = []
        current_segment = ""
        
        for sentence in sentences:
            sentence = sentence.strip()
            if not sentence:
                continue
                
            test_segment = current_segment + " " + sentence if current_segment else sentence
            
            # Only split at much larger boundaries
            if count_tokens(test_segment) > 600 and current_segment:
                segments.append(current_segment.strip())
                current_segment = sentence
            else:
                current_segment = test_segment
        
        if current_segment:
            segments.append(current_segment.strip())
        
        return segments if len(segments) > 1 else [text]
    
    # Return as single segment to preserve context
    return [text]

def group_based_chunking(extracted_pages, target_tokens=1000, min_tokens=500, max_tokens=1500):
    """Create larger chunks by combining content from multiple pages"""
    
    all_chunks = []
    current_chunk_content = []
    current_chunk_tokens = 0
    current_chunk_pages = []
    current_content_types = []
    chunk_id = 0
    
    for page_data in extracted_pages:
        # Clean the page text
        cleaned_text = clean_pdf_text(page_data['text'])
        page_tokens = count_tokens(cleaned_text)
        
        # Skip very short pages
        if page_tokens < 50:
            continue
            
        # Check if adding this page would exceed max tokens
        if current_chunk_tokens + page_tokens > max_tokens and current_chunk_content:
            # Create chunk from accumulated content
            if current_chunk_tokens >= min_tokens:
                chunk_text = "\n\n".join(current_chunk_content)
                features = detect_content_features(chunk_text)
                
                # Determine dominant content type
                type_counts = {}
                for ct in current_content_types:
                    type_counts[ct] = type_counts.get(ct, 0) + 1
                dominant_type = max(type_counts.items(), key=lambda x: x[1])[0]
                
                all_chunks.append({
                    'text': chunk_text,
                    'token_count': current_chunk_tokens,
                    'content_type': dominant_type,
                    'chunk_index': len(all_chunks),
                    'global_chunk_id': chunk_id,
                    'source_pages': current_chunk_pages.copy(),
                    'page_count': len(current_chunk_pages),
                    'features': features
                })
                chunk_id += 1
            
            # Start new chunk
            current_chunk_content = [cleaned_text]
            current_chunk_tokens = page_tokens
            current_chunk_pages = [page_data['page_num']]
            current_content_types = [page_data['content_type']]
        else:
            # Add to current chunk
            current_chunk_content.append(cleaned_text)
            current_chunk_tokens += page_tokens
            current_chunk_pages.append(page_data['page_num'])
            current_content_types.append(page_data['content_type'])
    
    # Handle final chunk
    if current_chunk_content and current_chunk_tokens >= min_tokens:
        chunk_text = "\n\n".join(current_chunk_content)
        features = detect_content_features(chunk_text)
        
        type_counts = {}
        for ct in current_content_types:
            type_counts[ct] = type_counts.get(ct, 0) + 1
        dominant_type = max(type_counts.items(), key=lambda x: x[1])[0]
        
        all_chunks.append({
            'text': chunk_text,
            'token_count': current_chunk_tokens,
            'content_type': dominant_type,
            'chunk_index': len(all_chunks),
            'global_chunk_id': chunk_id,
            'source_pages': current_chunk_pages.copy(),
            'page_count': len(current_chunk_pages),
            'features': features
        })
    
    return all_chunks

def content_aware_chunking(extracted_pages, target_tokens=1000, min_tokens=500, max_tokens=1500):
    """Advanced chunking that preserves content coherence"""
    
    # Group pages by content type for better coherence
    content_groups = {}
    for page_data in extracted_pages:
        content_type = page_data['content_type']
        if content_type not in content_groups:
            content_groups[content_type] = []
        content_groups[content_type].append(page_data)
    
    all_chunks = []
    
    # Process each content type separately
    for content_type, pages in content_groups.items():
        print(f"Processing {len(pages)} pages of {content_type} content...")
        
        # Adjust parameters based on content type
        if content_type == 'code_heavy':
            type_target = 1200
            type_min = 600
            type_max = 1800
        elif content_type == 'conceptual':
            type_target = 1000
            type_min = 500
            type_max = 1500
        else:  # general
            type_target = 800
            type_min = 400
            type_max = 1200
        
        # Create chunks for this content type
        type_chunks = group_based_chunking(pages, type_target, type_min, type_max)
        all_chunks.extend(type_chunks)
    
    return all_chunks

def extract_valuable_content(pdf_path, content_analysis_df):
    """Extract content from high-value pages identified in analysis"""
    
    # Get pages worth processing
    valuable_pages = content_analysis_df[
        (content_analysis_df['content_type'].isin(['conceptual', 'code_heavy', 'general'])) &
        (content_analysis_df['char_count'] > 500)
    ]['page'].tolist()
    
    print(f"Processing {len(valuable_pages)} valuable pages")
    
    extracted_content = []
    
    with open(pdf_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        
        for page_num in valuable_pages:
            try:
                text = pdf_reader.pages[page_num].extract_text()
                
                if text.strip() and len(text) > 300:  # Substantial content only
                    # Get content type from analysis
                    page_info = content_analysis_df[content_analysis_df['page'] == page_num].iloc[0]
                    
                    extracted_content.append({
                        'page_num': page_num,
                        'text': text,
                        'content_type': page_info['content_type'],
                        'pandas_score': page_info['pandas_score'],
                        'code_score': page_info['code_score']
                    })
                    
            except Exception as e:
                print(f"Error extracting page {page_num}: {e}")
    
    print(f"Successfully extracted {len(extracted_content)} pages")
    return extracted_content

# Extract content from valuable pages
print("Extracting content from valuable pages...")
extracted_pages = extract_valuable_content(PDF_FILE, content_analysis)

# Process extracted content with content-aware chunking
print("Applying content-aware chunking to create larger, more contextual chunks...")
all_chunks = content_aware_chunking(extracted_pages, target_tokens=1000, min_tokens=500, max_tokens=1500)

print(f"\nChunking Results:")
print(f"Total chunks created: {len(all_chunks)}")

if len(all_chunks) > 0:
    print(f"Average tokens per chunk: {np.mean([c['token_count'] for c in all_chunks]):.1f}")
    
    # Analyze chunk distribution
    chunk_df = pd.DataFrame(all_chunks)
    
    print(f"\nChunk Analysis:")
    print(f"Token distribution:")
    print(f"  Min: {chunk_df['token_count'].min()}")
    print(f"  Max: {chunk_df['token_count'].max()}")
    print(f"  Mean: {chunk_df['token_count'].mean():.1f}")
    print(f"  Std: {chunk_df['token_count'].std():.1f}")
    
    print(f"\nContent type distribution:")
    print(chunk_df['content_type'].value_counts())
    
    print(f"\nPage grouping analysis:")
    print(f"  Single-page chunks: {sum(1 for c in all_chunks if c['page_count'] == 1)}")
    print(f"  Multi-page chunks: {sum(1 for c in all_chunks if c['page_count'] > 1)}")
    print(f"  Average pages per chunk: {np.mean([c['page_count'] for c in all_chunks]):.1f}")
    print(f"  Max pages per chunk: {max(c['page_count'] for c in all_chunks)}")
else:
    print("ERROR: No chunks created")
    exit()


if len(all_chunks) > 0:
    chunk_df = pd.DataFrame(all_chunks)
    
    print(f"\nChunk Analysis:")
    print(f"Token distribution:")
    print(f"  Min: {chunk_df['token_count'].min()}")
    print(f"  Max: {chunk_df['token_count'].max()}")
    print(f"  Mean: {chunk_df['token_count'].mean():.1f}")
    print(f"  Std: {chunk_df['token_count'].std():.1f}")
    
    print(f"\nContent type distribution:")
    print(chunk_df['content_type'].value_counts())
else:
    print("No chunks available for analysis")

if len(all_chunks) > 0:
    print(f"\nChunks with code features: {sum(1 for c in all_chunks if c['features']['is_code_heavy'])}")
    print(f"Chunks with concept features: {sum(1 for c in all_chunks if c['features']['is_concept_heavy'])}")

    # Quality assessment
    high_quality_chunks = [
        c for c in all_chunks 
        if c['token_count'] >= 500 and 
        (c['features']['concept_score'] > 2 or c['features']['code_score'] > 1)
    ]

    print(f"\nQuality Assessment:")
    print(f"High-quality chunks: {len(high_quality_chunks)} ({len(high_quality_chunks)/len(all_chunks)*100:.1f}%)")

    # Sample chunks for inspection
    print(f"\nSample Chunks:")
    for content_type in ['conceptual', 'code_heavy', 'general']:
        sample_chunks = [c for c in all_chunks if c['content_type'] == content_type][:1]
        
        for chunk in sample_chunks:
            page_range = f"pages {min(chunk['source_pages'])}-{max(chunk['source_pages'])}" if len(chunk['source_pages']) > 1 else f"page {chunk['source_pages'][0]}"
            print(f"\n{content_type.upper()} Chunk ({page_range}):")
            print(f"  Tokens: {chunk['token_count']}")
            print(f"  Code Score: {chunk['features']['code_score']}")
            print(f"  Concept Score: {chunk['features']['concept_score']}")
            print(f"  Preview: {chunk['text'][:200]}...")

    # Save processed chunks
    chunks_file = PROCESSED_DIR / 'processed_chunks.pkl'
    with open(chunks_file, 'wb') as f:
        pickle.dump(all_chunks, f)

    print(f"\nSaved {len(all_chunks)} chunks to: {chunks_file}")

    # Create summary statistics
    summary_stats = {
        'total_chunks': len(all_chunks),
        'total_pages_processed': len(extracted_pages),
        'avg_tokens_per_chunk': float(np.mean([c['token_count'] for c in all_chunks])),
        'content_type_distribution': chunk_df['content_type'].value_counts().to_dict(),
        'high_quality_chunks': len(high_quality_chunks),
        'chunks_with_code': sum(1 for c in all_chunks if c['features']['is_code_heavy']),
        'chunks_with_concepts': sum(1 for c in all_chunks if c['features']['is_concept_heavy'])
    }

    print(f"\nSummary Statistics:")
    for key, value in summary_stats.items():
        print(f"  {key}: {value}")
else:
    print("\nERROR: No chunks created. Need to debug chunking logic before proceeding.")

Loaded content analysis:
Total analyzed pages: 99
High-value pages: 74
Extracting content from valuable pages...
Processing 76 valuable pages
Successfully extracted 76 pages
Applying content-aware chunking to create larger, more contextual chunks...
Processing 21 pages of conceptual content...
Processing 7 pages of code_heavy content...
Processing 48 pages of general content...

Chunking Results:
Total chunks created: 13
Average tokens per chunk: 1160.5

Chunk Analysis:
Token distribution:
  Min: 424
  Max: 1456
  Mean: 1160.5
  Std: 270.3

Content type distribution:
content_type
general       9
conceptual    3
code_heavy    1
Name: count, dtype: int64

Page grouping analysis:
  Single-page chunks: 0
  Multi-page chunks: 13
  Average pages per chunk: 5.8
  Max pages per chunk: 7

Chunk Analysis:
Token distribution:
  Min: 424
  Max: 1456
  Mean: 1160.5
  Std: 270.3

Content type distribution:
content_type
general       9
conceptual    3
code_heavy    1
Name: count, dtype: int64

Chunks

In [8]:
# CORRECTED Sample Chunks Display
if len(all_chunks) > 0:
    print(f"\nSample Chunks:")
    for content_type in ['conceptual', 'code_heavy', 'general']:
        sample_chunks = [c for c in all_chunks if c['content_type'] == content_type][:1]
        
        for chunk in sample_chunks:
            if sample_chunks:  # Only if we have chunks of this type
                source_pages = chunk['source_pages']
                page_range = f"pages {min(source_pages)}-{max(source_pages)}" if len(source_pages) > 1 else f"page {source_pages[0]}"
                
                print(f"\n{content_type.upper()} Chunk:")
                print(f"  Tokens: {chunk['token_count']}")
                print(f"  Pages: {chunk['page_count']} ({page_range})")
                print(f"  Code Score: {chunk['features']['code_score']}")
                print(f"  Concept Score: {chunk['features']['concept_score']}")
                print(f"  Preview: {chunk['text'][:300]}...")

    print(f"\nChunking Results Summary:")
    print(f"  Total chunks: 13")
    print(f"  Average tokens: 1160.5")
    print(f"  High-quality chunks: 10 (76.9%)")
    print(f"  Token range: 424-1456")
    
    print(f"\nExcellent! Chunking strategy successfully improved:")
    print(f"  ✓ 5.8x fewer chunks (13 vs 76)")  
    print(f"  ✓ 5.8x larger chunks (1160 vs 198 tokens)")
    print(f"  ✓ Better content coherence (multi-page grouping)")
    print(f"  ✓ 76.9% high-quality chunks")


Sample Chunks:

CONCEPTUAL Chunk:
  Tokens: 1447
  Pages: 7 (pages 8-52)
  Code Score: 0
  Concept Score: 18
  Preview: PREF ACE Welcome to Mastering Pandas: A Comprehensive Guide to Data Analysis in Python , a journey into the heart of modern data science. This book is not just a guide; it’s your gateway to the world of data exploration, where powerful insights lie hidden within raw numbers and text. Here, Pandas tr...

CODE_HEAVY Chunk:
  Tokens: 1456
  Pages: 7 (pages 36-196)
  Code Score: 25
  Concept Score: 43
  Preview: "Getting started with Pandas is straightforwar d, and you can set it up easily in both your ter minal or Jupyter Notebook. First, to install Pandas, open your terminal or, if you’re using Jupyter Notebook, simply run this command in a cell: pip install pandas # for terminal !pip install pandas # for...

GENERAL Chunk:
  Tokens: 1047
  Pages: 5 (pages 80-120)
  Code Score: 5
  Concept Score: 21
  Preview: # Selecting data using .iloc Example 1: Selecting a Single 