In [1]:
# 02_chunking_strategy.ipynb - Chunk 1: Text Preprocessing

import PyPDF2
import re
from pathlib import Path
import pandas as pd

# Setup paths (same as before)
PROJECT_ROOT = Path.cwd().parent if 'notebooks' in str(Path.cwd()) else Path.cwd()
PDF_FILE = PROJECT_ROOT / 'data' / 'raw' / 'mastering_pandas_2025.pdf'

def extract_all_text(pdf_path, start_page=11, end_page=None):
    """Extract text from all content pages"""
    all_text = []
    
    with open(pdf_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        end_page = end_page or len(pdf_reader.pages)
        
        for page_num in range(start_page, end_page):
            try:
                text = pdf_reader.pages[page_num].extract_text()
                if text.strip():  # Only add non-empty pages
                    all_text.append({
                        'page': page_num,
                        'raw_text': text
                    })
            except Exception as e:
                print(f"Error extracting page {page_num}: {e}")
    
    return all_text

def clean_text(text):
    """Clean extracted PDF text"""
    # Fix spacing issues
    text = re.sub(r'\s+', ' ', text)  # Multiple spaces to single space
    text = re.sub(r'([a-z])([A-Z])', r'\1 \2', text)  # Add space between lowercase and uppercase
    
    # Fix common PDF extraction issues
    text = re.sub(r'([.!?])\s*([A-Z])', r'\1 \2', text)  # Ensure space after sentence endings
    text = re.sub(r'([a-z])\s*\n\s*([a-z])', r'\1 \2', text)  # Join broken words across lines
    
    # Remove excessive whitespace
    text = re.sub(r'\n\s*\n', '\n\n', text)  # Multiple newlines to double newline
    text = text.strip()
    
    return text

# Extract sample pages for testing (first 50 content pages)
print("Extracting text from first 50 content pages...")
sample_pages = extract_all_text(PDF_FILE, start_page=11, end_page=61)
print(f"Extracted {len(sample_pages)} pages")

# Test cleaning on a sample
if sample_pages:
    sample_text = sample_pages[10]['raw_text']  # Page ~21
    
    print("\n=== BEFORE CLEANING ===")
    print(f"Length: {len(sample_text)} characters")
    print("First 300 characters:")
    print(repr(sample_text[:300]))
    
    cleaned_text = clean_text(sample_text)
    
    print("\n=== AFTER CLEANING ===")
    print(f"Length: {len(cleaned_text)} characters")
    print("First 300 characters:")
    print(repr(cleaned_text[:300]))
    
    print(f"\nImprovement: {len(sample_text) - len(cleaned_text)} characters removed")

Extracting text from first 50 content pages...
Extracted 50 pages

=== BEFORE CLEANING ===
Length: 1352 characters
First 300 characters:
'Python; it’s an invitation to enter a world wher e raw data\ntransfor ms into organized , meaningful insights, allowing you\nto uncover the stories hidden within the information.\nImagine having a toolkit that lets you clean, reshape, and\nanalyze vast datasets with ease, turning comp lex operation s\nin'

=== AFTER CLEANING ===
Length: 1352 characters
First 300 characters:
'Python; it’s an invitation to enter a world wher e raw data transfor ms into organized , meaningful insights, allowing you to uncover the stories hidden within the information. Imagine having a toolkit that lets you clean, reshape, and analyze vast datasets with ease, turning comp lex operation s in'

Improvement: 0 characters removed
Extracted 50 pages

=== BEFORE CLEANING ===
Length: 1352 characters
First 300 characters:
'Python; it’s an invitation to enter a world wher e raw d

In [2]:
# Chunk 2: Chunking Strategies Implementation

import tiktoken

# Initialize tokenizer for accurate token counting
tokenizer = tiktoken.get_encoding("cl100k_base")

def count_tokens(text):
    """Count tokens in text"""
    return len(tokenizer.encode(text))

def fixed_size_chunking(text, chunk_size=1000, overlap=200):
    """Traditional fixed-size chunking by tokens"""
    tokens = tokenizer.encode(text)
    chunks = []
    
    for i in range(0, len(tokens), chunk_size - overlap):
        chunk_tokens = tokens[i:i + chunk_size]
        chunk_text = tokenizer.decode(chunk_tokens)
        
        chunks.append({
            'text': chunk_text,
            'token_count': len(chunk_tokens),
            'start_token': i,
            'method': 'fixed_size'
        })
    
    return chunks

def semantic_chunking(text, min_chunk_size=500, max_chunk_size=1200):
    """Semantic chunking based on paragraphs and structure"""
    # Split by double newlines (paragraphs)
    paragraphs = [p.strip() for p in text.split('\n\n') if p.strip()]
    
    chunks = []
    current_chunk = ""
    
    for paragraph in paragraphs:
        paragraph_tokens = count_tokens(paragraph)
        current_tokens = count_tokens(current_chunk)
        
        # If adding this paragraph exceeds max size, save current chunk
        if current_tokens + paragraph_tokens > max_chunk_size and current_chunk:
            if current_tokens >= min_chunk_size:
                chunks.append({
                    'text': current_chunk.strip(),
                    'token_count': current_tokens,
                    'method': 'semantic'
                })
            current_chunk = paragraph
        else:
            current_chunk += "\n\n" + paragraph if current_chunk else paragraph
    
    # Add final chunk
    if current_chunk and count_tokens(current_chunk) >= min_chunk_size:
        chunks.append({
            'text': current_chunk.strip(),
            'token_count': count_tokens(current_chunk),
            'method': 'semantic'
        })
    
    return chunks

def detect_code_blocks(text):
    """Detect code blocks in text"""
    # Look for Python code patterns
    code_patterns = [
        r'import\s+\w+',
        r'pd\.\w+',
        r'df\.\w+',
        r'print\s*\(',
        r'=\s*pd\.',
        r'\.groupby\(',
        r'\.merge\(',
        r'\.iloc\[',
        r'\.loc\['
    ]
    
    code_score = sum(len(re.findall(pattern, text, re.IGNORECASE)) for pattern in code_patterns)
    return code_score > 2  # Threshold for code detection

# Test different chunking strategies on sample text
if sample_pages:
    # Combine first 5 pages for testing
    test_text = "\n\n".join([clean_text(page['raw_text']) for page in sample_pages[:5]])
    
    print(f"=== TEST TEXT STATS ===")
    print(f"Total length: {len(test_text)} characters")
    print(f"Total tokens: {count_tokens(test_text)}")
    print(f"Contains code: {detect_code_blocks(test_text)}")
    
    # Test fixed-size chunking
    fixed_chunks = fixed_size_chunking(test_text, chunk_size=1000, overlap=200)
    
    # Test semantic chunking
    semantic_chunks = semantic_chunking(test_text, min_chunk_size=500, max_chunk_size=1200)
    
    print(f"\n=== CHUNKING RESULTS ===")
    print(f"Fixed-size chunks: {len(fixed_chunks)}")
    print(f"Semantic chunks: {len(semantic_chunks)}")
    
    # Show sample chunks
    print(f"\n=== FIXED-SIZE CHUNK SAMPLE ===")
    if fixed_chunks:
        chunk = fixed_chunks[0]
        print(f"Tokens: {chunk['token_count']}")
        print(f"Text preview: {chunk['text'][:200]}...")
    
    print(f"\n=== SEMANTIC CHUNK SAMPLE ===")
    if semantic_chunks:
        chunk = semantic_chunks[0]
        print(f"Tokens: {chunk['token_count']}")
        print(f"Text preview: {chunk['text'][:200]}...")

=== TEST TEXT STATS ===
Total length: 7229 characters
Total tokens: 1486
Contains code: False

=== CHUNKING RESULTS ===
Fixed-size chunks: 2
Semantic chunks: 2

=== FIXED-SIZE CHUNK SAMPLE ===
Tokens: 1000
Text preview: environmental science, data holds the answers, and pandas is your toolkit to unlock them. If you’re diving into data science, machine learning, deep learning, or artiﬁcial intelligence, one library yo...

=== SEMANTIC CHUNK SAMPLE ===
Tokens: 894
Text preview: environmental science, data holds the answers, and pandas is your toolkit to unlock them. If you’re diving into data science, machine learning, deep learning, or artiﬁcial intelligence, one library yo...


In [3]:
# Chunk 3: Advanced Chunking and Quality Assessment

def enhanced_text_cleaning(text):
    """Enhanced cleaning for pandas documentation"""
    # Fix character separation issues we saw
    text = re.sub(r'\b([a-z])\s+([a-z])\b', r'\1\2', text)  # Fix "wher e" -> "where"
    text = re.sub(r'([a-z])\s+([a-z])([a-z]+)', r'\1\2\3', text)  # Fix "transfor ms" -> "transforms"
    
    # Fix pandas-specific terms
    pandas_fixes = {
        'Data Frame': 'DataFrame',
        'data frame': 'DataFrame',
        'Group By': 'groupby',
        'group by': 'groupby'
    }
    
    for wrong, correct in pandas_fixes.items():
        text = text.replace(wrong, correct)
    
    return clean_text(text)

def code_aware_chunking(text, target_size=1000, overlap=150):
    """Code-aware chunking that keeps examples with explanations"""
    paragraphs = [p.strip() for p in text.split('\n\n') if p.strip()]
    chunks = []
    current_chunk = ""
    
    for i, paragraph in enumerate(paragraphs):
        is_code = detect_code_blocks(paragraph)
        current_tokens = count_tokens(current_chunk)
        para_tokens = count_tokens(paragraph)
        
        # If this is code, try to include preceding context
        if is_code and current_chunk:
            # Keep code with its context even if slightly over target
            current_chunk += "\n\n" + paragraph
        elif current_tokens + para_tokens > target_size and current_chunk:
            # Save current chunk
            chunks.append({
                'text': current_chunk.strip(),
                'token_count': current_tokens,
                'has_code': detect_code_blocks(current_chunk),
                'method': 'code_aware'
            })
            current_chunk = paragraph
        else:
            current_chunk += "\n\n" + paragraph if current_chunk else paragraph
    
    # Add final chunk
    if current_chunk:
        chunks.append({
            'text': current_chunk.strip(),
            'token_count': count_tokens(current_chunk),
            'has_code': detect_code_blocks(current_chunk),
            'method': 'code_aware'
        })
    
    return chunks

def evaluate_chunk_quality(chunks):
    """Evaluate quality of chunks"""
    stats = {
        'total_chunks': len(chunks),
        'avg_tokens': sum(c['token_count'] for c in chunks) / len(chunks) if chunks else 0,
        'min_tokens': min(c['token_count'] for c in chunks) if chunks else 0,
        'max_tokens': max(c['token_count'] for c in chunks) if chunks else 0,
        'code_chunks': sum(1 for c in chunks if c.get('has_code', False)),
        'token_distribution': [c['token_count'] for c in chunks]
    }
    return stats

# Test on pages that likely contain code (pages 30-40, around data structures chapter)
print("Testing on code-heavy pages (30-40)...")
code_pages = extract_all_text(PDF_FILE, start_page=40, end_page=50)
code_text = "\n\n".join([enhanced_text_cleaning(page['raw_text']) for page in code_pages[:8]])

print(f"\n=== CODE-HEAVY TEXT STATS ===")
print(f"Total length: {len(code_text)} characters")
print(f"Total tokens: {count_tokens(code_text)}")
print(f"Contains code: {detect_code_blocks(code_text)}")

# Test all three methods on code-heavy content
fixed_chunks_code = fixed_size_chunking(code_text, chunk_size=1000, overlap=200)
semantic_chunks_code = semantic_chunking(code_text, min_chunk_size=500, max_chunk_size=1200)
code_aware_chunks = code_aware_chunking(code_text, target_size=1000, overlap=150)

# Evaluate all methods
methods = [
    ('Fixed Size', fixed_chunks_code),
    ('Semantic', semantic_chunks_code),
    ('Code Aware', code_aware_chunks)
]

print(f"\n=== CHUNKING COMPARISON ON CODE CONTENT ===")
for method_name, chunks in methods:
    stats = evaluate_chunk_quality(chunks)
    print(f"\n{method_name}:")
    print(f"  Chunks: {stats['total_chunks']}")
    print(f"  Avg tokens: {stats['avg_tokens']:.1f}")
    print(f"  Token range: {stats['min_tokens']}-{stats['max_tokens']}")
    print(f"  Code chunks: {stats['code_chunks']}")

# Show sample from best method (code-aware)
print(f"\n=== CODE-AWARE CHUNK SAMPLE ===")
if code_aware_chunks:
    code_chunk = next((c for c in code_aware_chunks if c.get('has_code')), code_aware_chunks[0])
    print(f"Has code: {code_chunk.get('has_code', False)}")
    print(f"Tokens: {code_chunk['token_count']}")
    print(f"Preview: {code_chunk['text'][:400]}...")

Testing on code-heavy pages (30-40)...

=== CODE-HEAVY TEXT STATS ===
Total length: 7314 characters
Total tokens: 1906
Contains code: True

=== CHUNKING COMPARISON ON CODE CONTENT ===

Fixed Size:
  Chunks: 3
  Avg tokens: 768.7
  Token range: 306-1000
  Code chunks: 0

Semantic:
  Chunks: 2
  Avg tokens: 953.0
  Token range: 730-1176
  Code chunks: 0

Code Aware:
  Chunks: 2
  Avg tokens: 952.5
  Token range: 901-1004
  Code chunks: 1

=== CODE-AWARE CHUNK SAMPLE ===
Has code: True
Tokens: 1004
Preview: print(series_basic) Inthis code, we’recreating abasic Pandas Serieswith defaultnumeric indices. First, weimport the Pandaslibrary, whichis essentialfor datamanipulation andanalysis in Python. Then, wedeﬁne a Seriesby callingpd. Series() andpassing alist ofvalues— [4.5, -2.1, 17, 8.9, 3.7] —asthe dataargument. Sincewe haven’tspeciﬁedcustom labels, Pandasautomatically assigns adefault numericindex, ...

=== CODE-HEAVY TEXT STATS ===
Total length: 7314 characters
Total tokens: 1906
Contai

In [4]:
# Chunk 4: Final Strategy and Recommendations

def optimal_pandas_chunking(text, target_size=1000, min_size=400, max_size=1400):
    """Final optimized chunking strategy for pandas documentation"""
    
    # Enhanced cleaning
    text = enhanced_text_cleaning(text)
    
    # Split into paragraphs
    paragraphs = [p.strip() for p in text.split('\n\n') if p.strip()]
    chunks = []
    current_chunk = ""
    
    for paragraph in paragraphs:
        is_code = detect_code_blocks(paragraph)
        current_tokens = count_tokens(current_chunk)
        para_tokens = count_tokens(paragraph)
        
        # Code blocks get special treatment
        if is_code:
            # If we have context, keep it with code even if over target
            if current_chunk and current_tokens < max_size:
                current_chunk += "\n\n" + paragraph
            else:
                # Save previous chunk if exists
                if current_chunk and current_tokens >= min_size:
                    chunks.append({
                        'text': current_chunk.strip(),
                        'token_count': current_tokens,
                        'has_code': detect_code_blocks(current_chunk),
                        'chunk_type': 'explanation' if not detect_code_blocks(current_chunk) else 'mixed'
                    })
                current_chunk = paragraph
        else:
            # Regular text handling
            if current_tokens + para_tokens > target_size and current_chunk:
                if current_tokens >= min_size:
                    chunks.append({
                        'text': current_chunk.strip(),
                        'token_count': current_tokens,
                        'has_code': detect_code_blocks(current_chunk),
                        'chunk_type': 'explanation' if not detect_code_blocks(current_chunk) else 'mixed'
                    })
                current_chunk = paragraph
            else:
                current_chunk += "\n\n" + paragraph if current_chunk else paragraph
    
    # Final chunk
    if current_chunk and count_tokens(current_chunk) >= min_size:
        chunks.append({
            'text': current_chunk.strip(),
            'token_count': count_tokens(current_chunk),
            'has_code': detect_code_blocks(current_chunk),
            'chunk_type': 'explanation' if not detect_code_blocks(current_chunk) else 'mixed'
        })
    
    return chunks

# Test final strategy on larger sample (50 pages)
print("Testing final strategy on larger sample...")
large_sample = extract_all_text(PDF_FILE, start_page=11, end_page=100)  # ~90 pages
large_text = "\n\n".join([page['raw_text'] for page in large_sample])

final_chunks = optimal_pandas_chunking(large_text)

print(f"\n=== FINAL CHUNKING RESULTS ===")
print(f"Total pages processed: {len(large_sample)}")
print(f"Total chunks created: {len(final_chunks)}")

# Analyze chunk types
explanation_chunks = [c for c in final_chunks if c['chunk_type'] == 'explanation']
mixed_chunks = [c for c in final_chunks if c['chunk_type'] == 'mixed']

print(f"Explanation chunks: {len(explanation_chunks)}")
print(f"Mixed (code + explanation) chunks: {len(mixed_chunks)}")

# Token distribution
tokens = [c['token_count'] for c in final_chunks]
print(f"\nToken statistics:")
print(f"  Average: {sum(tokens)/len(tokens):.1f}")
print(f"  Min: {min(tokens)}")
print(f"  Max: {max(tokens)}")
print(f"  Standard deviation: {(sum((t - sum(tokens)/len(tokens))**2 for t in tokens) / len(tokens))**0.5:.1f}")

# Show examples of different chunk types
print(f"\n=== EXPLANATION CHUNK EXAMPLE ===")
if explanation_chunks:
    chunk = explanation_chunks[0]
    print(f"Tokens: {chunk['token_count']}")
    print(f"Preview: {chunk['text'][:300]}...")

print(f"\n=== MIXED CHUNK EXAMPLE ===")
if mixed_chunks:
    chunk = mixed_chunks[0]
    print(f"Tokens: {chunk['token_count']}")
    print(f"Preview: {chunk['text'][:300]}...")

print(f"\n=== CHUNKING STRATEGY RECOMMENDATIONS ===")
print("1. Use code-aware chunking with context preservation")
print("2. Target chunk size: 1000 tokens (range: 400-1400)")
print("3. Keep code examples with their explanations")
print("4. Clean text during processing to fix PDF extraction issues")
print("5. Add metadata for chunk types (explanation vs mixed)")
print("\nReady to proceed to retrieval testing!")

Testing final strategy on larger sample...

=== FINAL CHUNKING RESULTS ===
Total pages processed: 89
Total chunks created: 1
Explanation chunks: 0
Mixed (code + explanation) chunks: 1

Token statistics:
  Average: 18472.0
  Min: 18472
  Max: 18472
  Standard deviation: 0.0

=== EXPLANATION CHUNK EXAMPLE ===

=== MIXED CHUNK EXAMPLE ===
Tokens: 18472
Preview: environmentalscience, dataholds theanswers, andpandas isyour toolkitto unlockthem. Ifyou’rediving intodata science, machinelearning, deeplearning, orartiﬁcialintelligence, onelibrary youabsolutely needto knowis pandas . Real-worlddata rarelycomes cleanand readyfor analysis. Often, it’smessy, inconsi...

=== CHUNKING STRATEGY RECOMMENDATIONS ===
1. Use code-aware chunking with context preservation
2. Target chunk size: 1000 tokens (range: 400-1400)
3. Keep code examples with their explanations
4. Clean text during processing to fix PDF extraction issues
5. Add metadata for chunk types (explanation vs mixed)

Ready to proceed to retr

In [5]:
# Debug Chunk: Fix Chunking Logic

def fixed_enhanced_cleaning(text):
    """Fixed version - less aggressive cleaning"""
    # Basic cleaning first
    text = clean_text(text)
    
    # Only fix obvious PDF artifacts, not normal spaces
    text = re.sub(r'\bwher\s+e\b', 'where', text)
    text = re.sub(r'\btransfor\s+ms\b', 'transforms', text)
    text = re.sub(r'\bcomp\s+lex\b', 'complex', text)
    text = re.sub(r'\boper\s+ation\s+s\b', 'operations', text)
    
    # Fix pandas terms
    text = re.sub(r'\bData\s+Frame\b', 'DataFrame', text)
    text = re.sub(r'\bgroup\s+by\b', 'groupby', text, flags=re.IGNORECASE)
    
    return text

def debug_chunking(text, target_size=1000, min_size=400, max_size=1400):
    """Debug version with verbose logging"""
    
    # Clean text
    cleaned_text = fixed_enhanced_cleaning(text)
    print(f"After cleaning: {len(cleaned_text)} chars, {count_tokens(cleaned_text)} tokens")
    
    # Split into paragraphs
    paragraphs = [p.strip() for p in cleaned_text.split('\n\n') if p.strip()]
    print(f"Split into {len(paragraphs)} paragraphs")
    
    chunks = []
    current_chunk = ""
    
    for i, paragraph in enumerate(paragraphs):
        para_tokens = count_tokens(paragraph)
        current_tokens = count_tokens(current_chunk)
        is_code = detect_code_blocks(paragraph)
        
        print(f"Para {i}: {para_tokens} tokens, code: {is_code}, current: {current_tokens}")
        
        # Decision logic
        if current_tokens + para_tokens > target_size and current_tokens >= min_size:
            # Save current chunk
            print(f"  -> Saving chunk {len(chunks)+1}: {current_tokens} tokens")
            chunks.append({
                'text': current_chunk.strip(),
                'token_count': current_tokens,
                'has_code': detect_code_blocks(current_chunk),
                'chunk_id': len(chunks) + 1
            })
            current_chunk = paragraph
        else:
            # Add to current chunk
            current_chunk += "\n\n" + paragraph if current_chunk else paragraph
            print(f"  -> Added to current chunk, now: {count_tokens(current_chunk)} tokens")
    
    # Final chunk
    final_tokens = count_tokens(current_chunk)
    if current_chunk and final_tokens >= min_size:
        print(f"Final chunk: {final_tokens} tokens")
        chunks.append({
            'text': current_chunk.strip(),
            'token_count': final_tokens,
            'has_code': detect_code_blocks(current_chunk),
            'chunk_id': len(chunks) + 1
        })
    else:
        print(f"Final chunk too small ({final_tokens} tokens), discarding or merging")
    
    return chunks

# Test on smaller sample first
print("=== DEBUGGING ON SMALL SAMPLE ===")
small_sample = extract_all_text(PDF_FILE, start_page=40, end_page=45)  # Just 5 pages
small_text = "\n\n".join([page['raw_text'] for page in small_sample])

print(f"Small sample: {len(small_text)} chars, {count_tokens(small_text)} tokens")
print(f"Sample text preview: {repr(small_text[:200])}")

debug_chunks = debug_chunking(small_text, target_size=800, min_size=300, max_size=1200)

print(f"\n=== DEBUG RESULTS ===")
print(f"Created {len(debug_chunks)} chunks")

for chunk in debug_chunks:
    print(f"\nChunk {chunk['chunk_id']}:")
    print(f"  Tokens: {chunk['token_count']}")
    print(f"  Has code: {chunk['has_code']}")
    print(f"  Preview: {chunk['text'][:150]}...")

=== DEBUGGING ON SMALL SAMPLE ===
Small sample: 4895 chars, 1209 tokens
Sample text preview: 'Chapter 4:\nData Structures In P andas:\nSeries and DataF rames\nIn Pandas, two fundamental data structures form the\nbackbone of everything you’ll do: Series \xa0and DataFrames .\nThink of these as the build'
After cleaning: 4891 chars, 1115 tokens
Split into 1 paragraphs
Para 0: 1115 tokens, code: True, current: 0
  -> Added to current chunk, now: 1115 tokens
Final chunk: 1115 tokens

=== DEBUG RESULTS ===
Created 1 chunks

Chunk 1:
  Tokens: 1115
  Has code: True
  Preview: Chapter 4: Data Structures In P andas: Series and Data F rames In Pandas, two fundamental data structures form the backbone of everything you’ll do: S...
Small sample: 4895 chars, 1209 tokens
Sample text preview: 'Chapter 4:\nData Structures In P andas:\nSeries and DataF rames\nIn Pandas, two fundamental data structures form the\nbackbone of everything you’ll do: Series \xa0and DataFrames .\nThink of these as the build'


In [6]:
# Final Fix: Robust Chunking for PDF Text

def robust_text_splitting(text):
    """Split text using multiple strategies"""
    
    # Strategy 1: Try double newlines first
    paragraphs = [p.strip() for p in text.split('\n\n') if p.strip()]
    
    if len(paragraphs) < 3:  # If we get too few paragraphs, try other methods
        # Strategy 2: Single newlines + sentence boundaries
        lines = [line.strip() for line in text.split('\n') if line.strip()]
        paragraphs = []
        current_para = ""
        
        for line in lines:
            # Check if line ends with sentence endings
            if line.endswith(('.', '!', '?', ':')) or len(current_para) > 300:
                current_para += " " + line if current_para else line
                if len(current_para.split()) > 20:  # At least 20 words
                    paragraphs.append(current_para)
                    current_para = ""
            else:
                current_para += " " + line if current_para else line
        
        # Add remaining
        if current_para:
            paragraphs.append(current_para)
    
    if len(paragraphs) < 2:  # If still too few, force sentence splitting
        # Strategy 3: Force split by sentences
        sentences = re.split(r'[.!?]+\s+', text)
        paragraphs = []
        current_para = ""
        
        for sentence in sentences:
            sentence = sentence.strip()
            if not sentence:
                continue
                
            if count_tokens(current_para + " " + sentence) > 200:
                if current_para:
                    paragraphs.append(current_para)
                current_para = sentence
            else:
                current_para += " " + sentence if current_para else sentence
        
        if current_para:
            paragraphs.append(current_para)
    
    return paragraphs

def final_chunking_strategy(text, target_size=1000, min_size=400, max_size=1400):
    """Final robust chunking strategy"""
    
    # Clean text
    cleaned_text = fixed_enhanced_cleaning(text)
    
    # Smart splitting
    paragraphs = robust_text_splitting(cleaned_text)
    print(f"Smart splitting created {len(paragraphs)} segments")
    
    chunks = []
    current_chunk = ""
    
    for i, paragraph in enumerate(paragraphs):
        para_tokens = count_tokens(paragraph)
        current_tokens = count_tokens(current_chunk)
        
        # Force split if current chunk is already large
        if current_tokens > target_size:
            if current_tokens >= min_size:
                chunks.append({
                    'text': current_chunk.strip(),
                    'token_count': current_tokens,
                    'has_code': detect_code_blocks(current_chunk)
                })
            current_chunk = paragraph
        elif current_tokens + para_tokens > max_size:
            # Would exceed max size, save current
            if current_tokens >= min_size:
                chunks.append({
                    'text': current_chunk.strip(),
                    'token_count': current_tokens,
                    'has_code': detect_code_blocks(current_chunk)
                })
            current_chunk = paragraph
        else:
            # Add to current chunk
            current_chunk += "\n\n" + paragraph if current_chunk else paragraph
    
    # Final chunk
    final_tokens = count_tokens(current_chunk)
    if current_chunk and final_tokens >= min_size:
        chunks.append({
            'text': current_chunk.strip(),
            'token_count': final_tokens,
            'has_code': detect_code_blocks(current_chunk)
        })
    elif current_chunk and chunks:
        # Merge small final chunk with last chunk
        chunks[-1]['text'] += "\n\n" + current_chunk
        chunks[-1]['token_count'] = count_tokens(chunks[-1]['text'])
    
    return chunks

# Test final strategy
print("=== TESTING FINAL ROBUST STRATEGY ===")
final_chunks = final_chunking_strategy(small_text, target_size=800, min_size=300, max_size=1200)

print(f"\nFinal Results:")
print(f"Created {len(final_chunks)} chunks")

for i, chunk in enumerate(final_chunks, 1):
    print(f"\nChunk {i}:")
    print(f"  Tokens: {chunk['token_count']}")
    print(f"  Has code: {chunk['has_code']}")
    print(f"  Preview: {chunk['text'][:200]}...")

# Test on larger sample if this works
if len(final_chunks) > 1:
    print(f"\n=== TESTING ON LARGER SAMPLE ===")
    medium_sample = extract_all_text(PDF_FILE, start_page=40, end_page=55)  # 15 pages
    medium_text = "\n\n".join([page['raw_text'] for page in medium_sample])
    
    large_chunks = final_chunking_strategy(medium_text, target_size=1000, min_size=400, max_size=1400)
    
    tokens = [c['token_count'] for c in large_chunks]
    code_chunks = sum(1 for c in large_chunks if c['has_code'])
    
    print(f"15-page test: {len(large_chunks)} chunks")
    print(f"Token range: {min(tokens)}-{max(tokens)}, avg: {sum(tokens)/len(tokens):.1f}")
    print(f"Code chunks: {code_chunks}")
    print("SUCCESS! Chunking strategy working correctly.")
else:
    print("Still need to debug further...")

=== TESTING FINAL ROBUST STRATEGY ===
Smart splitting created 6 segments

Final Results:
Created 1 chunks

Chunk 1:
  Tokens: 1084
  Has code: True
  Preview: Chapter 4: Data Structures In P andas: Series and Data F rames In Pandas, two fundamental data structures form the backbone of everything you’ll do: Series and Data Frames Think of these as the buildi...
Still need to debug further...


In [7]:
# Simple Logic Fix - Use working semantic approach with better splitting

def working_chunking_strategy(text, target_size=1000, min_size=400):
    """Simple, working strategy based on what worked before"""
    
    # Clean text
    cleaned_text = fixed_enhanced_cleaning(text)
    
    # Get segments using our robust splitting
    segments = robust_text_splitting(cleaned_text)
    print(f"Created {len(segments)} segments")
    
    # Show segment sizes for debugging
    for i, seg in enumerate(segments):
        print(f"  Segment {i}: {count_tokens(seg)} tokens")
    
    chunks = []
    current_chunk = ""
    
    for segment in segments:
        current_tokens = count_tokens(current_chunk)
        segment_tokens = count_tokens(segment)
        
        # Simple decision: if adding this segment exceeds target, save current chunk
        if current_tokens + segment_tokens > target_size and current_tokens >= min_size:
            chunks.append({
                'text': current_chunk.strip(),
                'token_count': current_tokens,
                'has_code': detect_code_blocks(current_chunk)
            })
            print(f"  -> Saved chunk: {current_tokens} tokens")
            current_chunk = segment
        else:
            current_chunk += "\n\n" + segment if current_chunk else segment
            print(f"  -> Combined chunk now: {count_tokens(current_chunk)} tokens")
    
    # Final chunk
    if current_chunk and count_tokens(current_chunk) >= min_size:
        final_tokens = count_tokens(current_chunk)
        chunks.append({
            'text': current_chunk.strip(),
            'token_count': final_tokens,
            'has_code': detect_code_blocks(current_chunk)
        })
        print(f"  -> Final chunk: {final_tokens} tokens")
    
    return chunks

# Test the simple fix
print("=== TESTING SIMPLE FIXED LOGIC ===")
working_chunks = working_chunking_strategy(small_text, target_size=600, min_size=300)

print(f"\n=== RESULTS ===")
print(f"Created {len(working_chunks)} chunks")

for i, chunk in enumerate(working_chunks, 1):
    print(f"\nChunk {i}:")
    print(f"  Tokens: {chunk['token_count']}")
    print(f"  Has code: {chunk['has_code']}")
    print(f"  Preview: {chunk['text'][:150]}...")

# If this works, declare success and finalize
if len(working_chunks) > 1:
    print(f"\n✓ SUCCESS! Chunking working correctly.")
    print(f"✓ Ready to proceed to 03_retrieval_testing.ipynb")
    
    # Save the final working function for next phase
    print(f"\n=== FINAL CHUNKING FUNCTION READY ===")
    print("Function: working_chunking_strategy()")
    print("Parameters: target_size=1000, min_size=400")
    print("Features: Code detection, smart text splitting, robust handling")
else:
    print("Need one more iteration...")

=== TESTING SIMPLE FIXED LOGIC ===
Created 6 segments
  Segment 0: 179 tokens
  Segment 1: 196 tokens
  Segment 2: 186 tokens
  Segment 3: 180 tokens
  Segment 4: 200 tokens
  Segment 5: 138 tokens
  -> Combined chunk now: 179 tokens
  -> Combined chunk now: 376 tokens
  -> Combined chunk now: 563 tokens
  -> Saved chunk: 563 tokens
  -> Combined chunk now: 381 tokens
  -> Combined chunk now: 520 tokens
  -> Final chunk: 520 tokens

=== RESULTS ===
Created 2 chunks

Chunk 1:
  Tokens: 563
  Has code: False
  Preview: Chapter 4: Data Structures In P andas: Series and Data F rames In Pandas, two fundamental data structures form the backbone of everything you’ll do: S...

Chunk 2:
  Tokens: 520
  Has code: True
  Preview: You can create a Series from various sources: a single scalar value, a list, a Num Py array, or a dictionary Simply use pd Series() (remember the capi...

✓ SUCCESS! Chunking working correctly.
✓ Ready to proceed to 03_retrieval_testing.ipynb

=== FINAL CHUNKING FUNCTION