In [1]:
# Setup and Load Previous Analysis

import pandas as pd
import numpy as np
import PyPDF2
import json
import re
from pathlib import Path
from collections import defaultdict
import warnings
warnings.filterwarnings('ignore')

# Setup paths
# When the working-directory path mentions "notebooks", the project root is taken
# to be the parent directory; otherwise the working directory itself is used.
PROJECT_ROOT = Path.cwd().parent if 'notebooks' in str(Path.cwd()) else Path.cwd()
RAW_DATA_PATH = PROJECT_ROOT / 'data' / 'raw'
PROCESSED_DATA_PATH = PROJECT_ROOT / 'data' / 'processed'
PDF_FILE = RAW_DATA_PATH / 'mastering_pandas_2025.pdf'

print("TIERED CONTENT PROCESSING FOR 100% PDF UTILIZATION")
print("=" * 60)

# Load comprehensive analysis results
content_analysis_file = PROCESSED_DATA_PATH / 'comprehensive_content_analysis.csv'
summary_file = PROCESSED_DATA_PATH / 'analysis_summary.json'

# Both files are read below, so both are checked up front. Raising (instead of
# calling exit()) stops the cell at this exact point and leaves the kernel alive.
for required_input in (content_analysis_file, summary_file):
    if not required_input.exists():
        print("ERROR: Content analysis file not found!")
        print("Please run notebook 01_comprehensive_content_analysis.ipynb first")
        raise FileNotFoundError(f"Missing analysis input: {required_input}")

# Load data
content_df = pd.read_csv(content_analysis_file)
with open(summary_file, 'r', encoding='utf-8') as f:
    summary_stats = json.load(f)

print(f"Loaded comprehensive analysis:")
print(f"  Total pages: {len(content_df)}")
print(f"  Analysis file size: {content_analysis_file.stat().st_size / 1024:.1f} KB")

# Display tier distribution from previous analysis
print(f"\nTier Distribution Summary:")
total_pages = summary_stats['total_pages']
for tier, count in summary_stats['tier_distribution'].items():
    # Guard against an empty summary so the report cannot divide by zero.
    percentage = (count / total_pages) * 100 if total_pages else 0.0
    print(f"  {tier.replace('_', ' ').title()}: {count:3d} pages ({percentage:5.1f}%)")

# The processing target is the sum of the tier 1 and tier 2 page counts.
utilization_strategy = summary_stats['utilization_strategy']
priority_page_count = utilization_strategy['tier_1_primary'] + utilization_strategy['tier_2_secondary']

print(f"\nReady to process content by tiers for optimal chunk creation")
print(f"Target: Extract and process all {priority_page_count} high and medium value pages")

TIERED CONTENT PROCESSING FOR 100% PDF UTILIZATION
Loaded comprehensive analysis:
  Total pages: 473
  Analysis file size: 127.1 KB

Tier Distribution Summary:
  Tier 3 Reference: 232 pages ( 49.0%)
  Tier 2 Secondary: 105 pages ( 22.2%)
  Tier 1 Primary:  79 pages ( 16.7%)
  Tier 4 Context:  55 pages ( 11.6%)
  Empty:   2 pages (  0.4%)

Ready to process content by tiers for optimal chunk creation
Target: Extract and process all 184 high and medium value pages


In [2]:
# Extracting Content from PDF Organized by Tiers

def clean_extracted_text(text):
    """
    Clean PDF extraction artifacts while preserving content structure.

    The text is split into paragraphs on blank lines *before* whitespace is
    collapsed, so paragraph breaks survive as a single blank line ('\\n\\n').
    Within a paragraph every whitespace run (including single newlines)
    becomes one space. Text without blank lines therefore comes back as a
    single line.

    Args:
        text: Raw page text; None or an empty string is accepted.

    Returns:
        The cleaned text, or "" when the input is falsy or only whitespace.

    Note:
        A space is inserted at every lowercase->uppercase boundary. Only
        "Data Frame" and "Group By" are re-joined afterwards, so other
        camelCase identifiers remain split.
    """
    if not text:
        return ""

    # (pattern, replacement, flags) applied in order to each paragraph.
    # The case-sensitive 'Group By' rule must precede the case-insensitive
    # one so that the capitalised form maps to 'GroupBy' rather than 'groupby'.
    rewrite_rules = (
        (r'\s+', ' ', 0),                                # Normalize whitespace
        (r'([a-z])([A-Z])', r'\1 \2', 0),                # Fix missing spaces
        (r'\bwher\s+e\b', 'where', 0),                   # Common PDF artifacts
        (r'\btransfor\s+ms\b', 'transforms', 0),
        (r'\bdata\s+frame\b', 'DataFrame', re.IGNORECASE),
        (r'\bGroup\s+By\b', 'GroupBy', 0),
        (r'\bgroup\s+by\b', 'groupby', re.IGNORECASE),
        (r'([.!?])([A-Z])', r'\1 \2', 0),                # Sentence boundaries
    )

    cleaned_paragraphs = []
    for paragraph in re.split(r'\n\s*\n', text):
        for pattern, replacement, flags in rewrite_rules:
            paragraph = re.sub(pattern, replacement, paragraph, flags=flags)
        paragraph = paragraph.strip()
        if paragraph:
            cleaned_paragraphs.append(paragraph)

    return '\n\n'.join(cleaned_paragraphs)

def extract_content_by_tiers(pdf_path, content_df):
    """
    Extract full text content organized by content tiers.

    Args:
        pdf_path: Path of the PDF file; it is opened in binary mode.
        content_df: DataFrame with one row per page. The columns read here are
            page_number, content_tier, content_type, pandas_score, code_score
            and quiz_potential.

    Returns:
        (tier_content, extraction_stats):
        tier_content maps a tier label to a list of page records (dicts with
        the page scores, char/word counts, the cleaned text and a 200-char
        preview). extraction_stats counts successful, failed and empty pages
        and the total number of cleaned characters.

    Notes:
        * Pages whose cleaned text is empty are counted in 'empty_pages' and
          are not added to any tier.
        * A tier label that is not one of the five predefined buckets gets its
          own bucket instead of being reported as an extraction failure.
        * Per-page errors are caught, counted and printed so one bad page does
          not abort the whole run.
    """
    print("Extracting content from PDF organized by tiers...")

    tier_content = {
        'tier_1_primary': [],
        'tier_2_secondary': [],
        'tier_3_reference': [],
        'tier_4_context': [],
        'empty': []
    }

    extraction_stats = {
        'successful_extractions': 0,
        'failed_extractions': 0,
        'empty_pages': 0,
        'total_characters': 0
    }

    with open(pdf_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)

        for _, row in content_df.iterrows():
            page_num = row['page_number']
            content_tier = row['content_tier']

            try:
                # Extract text from page
                # NOTE(review): page_number is used directly as the index into
                # pdf_reader.pages, i.e. it is assumed to be 0-based — confirm
                # against the notebook that produced the analysis CSV.
                raw_text = pdf_reader.pages[page_num].extract_text()
                cleaned_text = clean_extracted_text(raw_text)

                if not cleaned_text:
                    extraction_stats['empty_pages'] += 1
                    continue

                # Create content record
                content_record = {
                    'page_number': page_num,
                    'content_tier': content_tier,
                    'content_type': row['content_type'],
                    'pandas_score': row['pandas_score'],
                    'code_score': row['code_score'],
                    'quiz_potential': row['quiz_potential'],
                    'char_count': len(cleaned_text),
                    'word_count': len(cleaned_text.split()),
                    'text': cleaned_text,
                    'text_preview': cleaned_text[:200] + "..." if len(cleaned_text) > 200 else cleaned_text
                }

                # setdefault keeps pages with an unexpected tier label instead
                # of raising KeyError and discarding already-extracted text.
                tier_content.setdefault(content_tier, []).append(content_record)

                extraction_stats['successful_extractions'] += 1
                extraction_stats['total_characters'] += len(cleaned_text)

            except Exception as e:
                extraction_stats['failed_extractions'] += 1
                print(f"  Error extracting page {page_num}: {e}")

    # Per-tier summary (the PDF is no longer needed at this point)
    for tier, content_list in tier_content.items():
        if content_list:
            avg_chars = np.mean([c['char_count'] for c in content_list])
            print(f"  {tier.replace('_', ' ').title()}: {len(content_list)} pages, avg {avg_chars:.0f} chars")

    print(f"\nExtraction Statistics:")
    print(f"  Successful: {extraction_stats['successful_extractions']}")
    print(f"  Failed: {extraction_stats['failed_extractions']}")
    print(f"  Empty: {extraction_stats['empty_pages']}")
    print(f"  Total characters: {extraction_stats['total_characters']:,}")

    return tier_content, extraction_stats

# Execute content extraction
print("Starting tier-based content extraction...")
tier_content, extraction_stats = extract_content_by_tiers(PDF_FILE, content_df)

# Detailed tier analysis
print(f"\nDETAILED TIER ANALYSIS:")
print(f"=" * 40)

# Tiers without any extracted page are left out of the report entirely.
populated_tiers = {name: records for name, records in tier_content.items() if records}

for tier_name, content_list in populated_tiers.items():
    print(f"\n{tier_name.replace('_', ' ').upper()}:")
    print(f"  Pages: {len(content_list)}")

    # Column-wise view of the per-page metrics for this tier
    char_counts = [c['char_count'] for c in content_list]
    pandas_scores = [c['pandas_score'] for c in content_list]
    code_scores = [c['code_score'] for c in content_list]
    quiz_scores = [c['quiz_potential'] for c in content_list]

    print(f"  Character range: {min(char_counts)} - {max(char_counts)}")
    print(f"  Average characters: {np.mean(char_counts):.0f}")
    print(f"  Average pandas score: {np.mean(pandas_scores):.1f}")
    print(f"  Average code score: {np.mean(code_scores):.1f}")
    print(f"  Average quiz potential: {np.mean(quiz_scores):.1f}")

    # Show content type distribution within tier (first-seen order is kept)
    content_types = {}
    for ct in (content['content_type'] for content in content_list):
        content_types[ct] = content_types.get(ct, 0) + 1

    print(f"  Content types: {dict(content_types)}")

print(f"\nContent extraction by tiers completed successfully!")

Starting tier-based content extraction...
Extracting content from PDF organized by tiers...
  Tier 1 Primary: 79 pages, avg 1010 chars
  Tier 2 Secondary: 105 pages, avg 783 chars
  Tier 3 Reference: 232 pages, avg 818 chars
  Tier 4 Context: 55 pages, avg 354 chars

Extraction Statistics:
  Successful: 471
  Failed: 0
  Empty: 2
  Total characters: 371,221

DETAILED TIER ANALYSIS:

TIER 1 PRIMARY:
  Pages: 79
  Character range: 528 - 1817
  Average characters: 1010
  Average pandas score: 9.2
  Average code score: 1.4
  Average quiz potential: 2.4
  Content types: {'navigation': 25, 'conceptual': 19, 'structural': 29, 'code_heavy': 4, 'general': 2}

TIER 2 SECONDARY:
  Pages: 105
  Character range: 330 - 1677
  Average characters: 783
  Average pandas score: 3.8
  Average code score: 0.5
  Average quiz potential: 1.9
  Content types: {'structural': 78, 'code_heavy': 2, 'navigation': 10, 'general': 14, 'conceptual': 1}

TIER 3 REFERENCE:
  Pages: 232
  Character range: 298 - 1711
  Ave

In [3]:
# Advanced Tiered Chunking Strategy 

import tiktoken

# Initialize tokenizer for accurate token counting.
# The "cl100k_base" encoding is looked up once at module level; count_tokens
# reads this shared object on every call instead of re-creating it.
tokenizer = tiktoken.get_encoding("cl100k_base")

def count_tokens(text):
    """
    Count tokens accurately using tiktoken.

    Args:
        text: The string to measure; None or "" is accepted.

    Returns:
        Number of tokens produced by the module-level `tokenizer`, or 0 for
        falsy input.

    Note:
        `disallowed_special=()` makes the encoder treat special-token literals
        (for example "<|endoftext|>") appearing in the text as ordinary text.
        With the default setting the encoder raises on such input, which would
        abort chunking for a page that merely mentions one.
    """
    if not text:
        return 0
    return len(tokenizer.encode(text, disallowed_special=()))

def enhanced_content_features(text):
    """
    Enhanced content feature detection for chunking decisions.

    Args:
        text: Chunk or page text to inspect.

    Returns:
        dict with these keys:
          has_code_examples        bool  - import / pd. / df. / print( / >>> seen
          has_function_definitions bool  - 'def name' or 'class name' seen
          has_pandas_methods       int   - count of the listed pandas accessors
          has_examples             bool  - 'example' or 'for instance' seen
          has_explanations         bool  - a connective such as 'because' seen
          has_lists                int   - bullet markers or 'N. ' occurrences
          has_headers              bool  - a capitalised line without .!? seen
          conceptual_density       int   - count of concept-related words

    Note:
        The line-anchored patterns (bullets, headers) can only match per line
        when `text` still contains newlines.
    """
    # .loc / .iloc are bracket indexers, so they are matched when followed by
    # '[' ('(' is still accepted for backward compatibility); the remaining
    # names are ordinary method calls and must be followed by '('.
    pandas_accessor_hits = re.findall(
        r'\.(?:groupby|merge|concat|pivot|melt|apply)\(|\.(?:loc|iloc)[\[(]',
        text,
        re.IGNORECASE,
    )

    features = {
        'has_code_examples': bool(re.search(r'(import\s+\w+|pd\.|df\.|print\s*\(|>>>)', text)),
        'has_function_definitions': bool(re.search(r'(def\s+\w+|class\s+\w+)', text)),
        'has_pandas_methods': len(pandas_accessor_hits),
        # IGNORECASE already covers the capitalised spellings.
        'has_examples': bool(re.search(r'(example|for instance)', text, re.IGNORECASE)),
        'has_explanations': bool(re.search(r'(because|therefore|however|moreover|furthermore)', text, re.IGNORECASE)),
        # Bullets are anchored to the line start; 'N. ' is counted anywhere.
        'has_lists': len(re.findall(r'(^\s*[*+-]\s+|\d+\.\s+)', text, re.MULTILINE)),
        'has_headers': bool(re.search(r'^[A-Z][^.!?]*:?\s*$', text, re.MULTILINE)),
        'conceptual_density': len(re.findall(r'(understand|concept|principle|approach|method)', text, re.IGNORECASE))
    }
    return features

def create_chunk_record(content, chunk_id, tier, pages, metadata):
    """
    Build the standard chunk dictionary.

    Args:
        content: Full chunk text.
        chunk_id: Identifier stored under 'chunk_id'.
        tier: Tier label stored under 'tier'.
        pages: Source page numbers; stored as-is and counted for 'page_count'.
        metadata: dict holding the lists 'pandas_scores', 'code_scores' and
            'quiz_scores'; each is averaged, an empty list yields 0.

    Returns:
        dict with the chunk text, its token count, page info, the three
        averaged scores, the detected content features and a preview of at
        most 300 characters (followed by "..." when truncated).
    """
    token_total = count_tokens(content)
    detected_features = enhanced_content_features(content)

    def _mean_or_zero(score_key):
        """Average the score list stored under score_key, 0 when it is empty."""
        scores = metadata[score_key]
        return np.mean(scores) if scores else 0

    record = {
        'chunk_id': chunk_id,
        'tier': tier,
        'content': content,
        'token_count': token_total,
        'source_pages': pages,
        'page_count': len(pages),
        'avg_pandas_score': _mean_or_zero('pandas_scores'),
        'avg_code_score': _mean_or_zero('code_scores'),
        'avg_quiz_score': _mean_or_zero('quiz_scores'),
        'features': detected_features,
    }

    # Only texts longer than 300 characters are truncated and marked.
    if len(content) > 300:
        record['preview'] = content[:300] + "..."
    else:
        record['preview'] = content

    return record

def create_tier1_chunks(tier1_pages, start_chunk_id):
    """
    Create high-quality chunks from Tier 1 primary content.

    Pages are packed greedily in input order: a chunk is closed as soon as the
    next page would push it past 1500 tokens. The 800-token lower bound is a
    target only - a buffer that ends up smaller (because the following page
    does not fit, or because it is the last one) is still emitted, so no page
    is ever silently discarded.

    Args:
        tier1_pages: Page records with 'text', 'page_number', 'pandas_score',
            'code_score' and 'quiz_potential'.
        start_chunk_id: Id given to the first chunk; later chunks count up.

    Returns:
        List of chunk records built by create_chunk_record. A single page that
        alone exceeds 1500 tokens becomes its own oversized chunk.
    """
    max_tokens = 1500
    chunks = []

    def empty_buffer():
        """Fresh accumulator for the pages of one chunk."""
        return {
            'texts': [],
            'pages': [],
            'tokens': 0,
            'metadata': {'pandas_scores': [], 'code_scores': [], 'quiz_scores': []},
        }

    def flush(buffer):
        """Emit the buffered pages as one chunk; does nothing when empty."""
        if buffer['texts']:
            chunks.append(create_chunk_record(
                content='\n\n'.join(buffer['texts']),
                chunk_id=start_chunk_id + len(chunks),
                tier='tier_1_primary',
                pages=buffer['pages'],
                metadata=buffer['metadata'],
            ))

    buffer = empty_buffer()
    for page in tier1_pages:
        page_tokens = count_tokens(page['text'])

        # Close the running chunk when this page would overflow it.
        if buffer['texts'] and buffer['tokens'] + page_tokens > max_tokens:
            flush(buffer)
            buffer = empty_buffer()

        buffer['texts'].append(page['text'])
        buffer['tokens'] += page_tokens
        buffer['pages'].append(page['page_number'])
        buffer['metadata']['pandas_scores'].append(page['pandas_score'])
        buffer['metadata']['code_scores'].append(page['code_score'])
        buffer['metadata']['quiz_scores'].append(page['quiz_potential'])

    # Whatever is left forms the final chunk, regardless of its size.
    flush(buffer)
    return chunks

def create_tier2_chunks(tier2_pages, start_chunk_id):
    """
    Create supporting chunks from Tier 2 secondary content.

    Pages are accumulated in input order and a chunk is closed when the next
    page would take it beyond 1200 tokens. The 600-token minimum is a target,
    not a filter: undersized buffers (mid-stream or at the end) are emitted as
    chunks rather than thrown away, so every page ends up in some chunk.

    Args:
        tier2_pages: Page records with 'text', 'page_number', 'pandas_score',
            'code_score' and 'quiz_potential'.
        start_chunk_id: Id of the first chunk produced; ids then increase by 1.

    Returns:
        List of chunk records built by create_chunk_record.
    """
    token_cap = 1200
    chunks = []

    pending_texts = []
    pending_pages = []
    pending_tokens = 0
    pending_scores = {'pandas_scores': [], 'code_scores': [], 'quiz_scores': []}

    def emit_pending():
        """Append one chunk built from the pending pages (if there are any)."""
        if pending_texts:
            chunks.append(create_chunk_record(
                content='\n\n'.join(pending_texts),
                chunk_id=start_chunk_id + len(chunks),
                tier='tier_2_secondary',
                pages=pending_pages,
                metadata=pending_scores,
            ))

    for page in tier2_pages:
        page_tokens = count_tokens(page['text'])

        if pending_texts and pending_tokens + page_tokens > token_cap:
            emit_pending()
            # Rebind (not clear) so the emitted chunk keeps its own lists.
            pending_texts = []
            pending_pages = []
            pending_tokens = 0
            pending_scores = {'pandas_scores': [], 'code_scores': [], 'quiz_scores': []}

        pending_texts.append(page['text'])
        pending_pages.append(page['page_number'])
        pending_tokens += page_tokens
        pending_scores['pandas_scores'].append(page['pandas_score'])
        pending_scores['code_scores'].append(page['code_score'])
        pending_scores['quiz_scores'].append(page['quiz_potential'])

    # Remaining pages become the last chunk even when below the target size.
    emit_pending()
    return chunks

def group_similar_pages(pages, max_group_size=5):
    """
    Split pages into consecutive batches for reference chunks.

    Pages are taken in the order given; no similarity measure is applied.
    A batch is closed once it holds max_group_size pages, and a final partial
    batch is kept.

    Args:
        pages: Iterable of page records.
        max_group_size: Number of pages after which a batch is closed.

    Returns:
        List of batches (each a list of pages).
    """
    batches = []
    batch = []

    for entry in pages:
        if len(batch) < max_group_size:
            batch.append(entry)
            continue
        # Batch is full: store it and start the next one with this page.
        batches.append(batch)
        batch = [entry]

    if batch:
        batches.append(batch)

    return batches

def create_tier3_chunks(tier3_pages, start_chunk_id):
    """
    Create reference chunks from Tier 3 content.

    The pages are batched by group_similar_pages and every non-empty batch is
    turned into one chunk. The 400-800 token range is a target only: there is
    no upper cap on a batch, and batches below 400 tokens are kept instead of
    being dropped, so every Tier 3 page is represented in some chunk.

    Args:
        tier3_pages: Page records with 'text', 'page_number', 'pandas_score',
            'code_score' and 'quiz_potential'.
        start_chunk_id: Id of the first chunk produced; ids then increase by 1.

    Returns:
        List of chunk records built by create_chunk_record.
    """
    chunks = []

    for group in group_similar_pages(tier3_pages):
        if not group:
            continue

        chunk_metadata = {
            'pandas_scores': [page['pandas_score'] for page in group],
            'code_scores': [page['code_score'] for page in group],
            'quiz_scores': [page['quiz_potential'] for page in group]
        }

        chunks.append(create_chunk_record(
            content='\n\n'.join(page['text'] for page in group),
            chunk_id=start_chunk_id + len(chunks),
            tier='tier_3_reference',
            pages=[page['page_number'] for page in group],
            metadata=chunk_metadata
        ))

    return chunks

def create_tier4_chunks(tier4_pages, start_chunk_id):
    """
    Create context chunks from Tier 4 content.

    Pages are combined in input order until the next page would exceed 600
    tokens. The 300-token minimum is a target only: a smaller buffer (either
    mid-stream or at the end) is still emitted, so no page is silently lost.

    Every chunk is recorded with fixed scores (pandas 0, code 0, quiz 1); the
    per-page scores are not read for this tier.

    Args:
        tier4_pages: Page records with at least 'text' and 'page_number'.
        start_chunk_id: Id of the first chunk produced; ids then increase by 1.

    Returns:
        List of chunk records built by create_chunk_record.
    """
    token_cap = 600
    chunks = []
    pending_texts = []
    pending_pages = []
    pending_tokens = 0

    def emit_pending():
        """Append one chunk built from the pending pages (if there are any)."""
        if pending_texts:
            chunks.append(create_chunk_record(
                content='\n\n'.join(pending_texts),
                chunk_id=start_chunk_id + len(chunks),
                tier='tier_4_context',
                pages=pending_pages,
                metadata={
                    'pandas_scores': [0],  # Context chunks have minimal pandas content
                    'code_scores': [0],
                    'quiz_scores': [1]
                },
            ))

    for page in tier4_pages:
        page_tokens = count_tokens(page['text'])

        if pending_texts and pending_tokens + page_tokens > token_cap:
            emit_pending()
            # Rebind (not clear) so the emitted chunk keeps its own page list.
            pending_texts = []
            pending_pages = []
            pending_tokens = 0

        pending_texts.append(page['text'])
        pending_pages.append(page['page_number'])
        pending_tokens += page_tokens

    # Remaining pages become the last chunk even when below the target size.
    emit_pending()
    return chunks

def tier_specific_chunking_strategy(tier_content):
    """
    Run the four tier chunkers in order and return all chunks in one list.

    Each chunker receives the number of chunks produced so far as its starting
    id, which makes chunk ids run consecutively from 0 across the tiers. The
    'empty' bucket of tier_content is not processed.

    Args:
        tier_content: dict with the keys 'tier_1_primary', 'tier_2_secondary',
            'tier_3_reference' and 'tier_4_context', each mapping to a list of
            page records.

    Returns:
        List of chunk records, tier 1 first and tier 4 last.
    """
    all_chunks = []

    print("Applying tier-specific chunking strategies...")

    # TIER 1 PRIMARY - High-quality, context-preserving chunks
    print(f"\nProcessing Tier 1 Primary ({len(tier_content['tier_1_primary'])} pages):")
    tier1_chunks = create_tier1_chunks(tier_content['tier_1_primary'], len(all_chunks))
    all_chunks += tier1_chunks
    print(f"  Created {len(tier1_chunks)} high-quality chunks")

    # TIER 2 SECONDARY - Supporting content chunks
    print(f"\nProcessing Tier 2 Secondary ({len(tier_content['tier_2_secondary'])} pages):")
    tier2_chunks = create_tier2_chunks(tier_content['tier_2_secondary'], len(all_chunks))
    all_chunks += tier2_chunks
    print(f"  Created {len(tier2_chunks)} supporting chunks")

    # TIER 3 REFERENCE - Structured reference chunks
    print(f"\nProcessing Tier 3 Reference ({len(tier_content['tier_3_reference'])} pages):")
    tier3_chunks = create_tier3_chunks(tier_content['tier_3_reference'], len(all_chunks))
    all_chunks += tier3_chunks
    print(f"  Created {len(tier3_chunks)} reference chunks")

    # TIER 4 CONTEXT - Contextual background chunks
    print(f"\nProcessing Tier 4 Context ({len(tier_content['tier_4_context'])} pages):")
    tier4_chunks = create_tier4_chunks(tier_content['tier_4_context'], len(all_chunks))
    all_chunks += tier4_chunks
    print(f"  Created {len(tier4_chunks)} context chunks")

    return all_chunks

# Execute tiered chunking strategy
print("Starting advanced tiered chunking...")
all_chunks = tier_specific_chunking_strategy(tier_content)

print(f"\nCHUNKING RESULTS SUMMARY:")
print(f"=" * 40)
print(f"Total chunks created: {len(all_chunks)}")

# Analyze chunks by tier: counts and totals are accumulated, while the three
# per-chunk averages are collected in lists and averaged when printed.
chunk_analysis = {}
for chunk in all_chunks:
    stats = chunk_analysis.setdefault(chunk['tier'], {
        'count': 0,
        'total_tokens': 0,
        'total_pages': 0,
        'avg_pandas_score': [],
        'avg_code_score': [],
        'avg_quiz_score': []
    })

    stats['count'] += 1
    stats['total_tokens'] += chunk['token_count']
    stats['total_pages'] += chunk['page_count']
    stats['avg_pandas_score'].append(chunk['avg_pandas_score'])
    stats['avg_code_score'].append(chunk['avg_code_score'])
    stats['avg_quiz_score'].append(chunk['avg_quiz_score'])

for tier, stats in chunk_analysis.items():
    print(f"\n{tier.replace('_', ' ').upper()}:")
    print(f"  Chunks: {stats['count']}")
    if stats['count'] > 0:
        print(f"  Avg tokens per chunk: {stats['total_tokens'] / stats['count']:.0f}")
        print(f"  Total pages covered: {stats['total_pages']}")
        print(f"  Avg pandas score: {np.mean(stats['avg_pandas_score']):.1f}")
        print(f"  Avg code score: {np.mean(stats['avg_code_score']):.1f}")
        print(f"  Avg quiz potential: {np.mean(stats['avg_quiz_score']):.1f}")

total_tokens = sum(chunk['token_count'] for chunk in all_chunks)
total_pages_in_chunks = sum(chunk['page_count'] for chunk in all_chunks)

# Utilization is measured against the pages that actually yielded text in the
# extraction step, taken from extraction_stats rather than a hard-coded count.
extractable_pages = extraction_stats['successful_extractions']

print(f"\nOVERALL STATISTICS:")
print(f"  Total tokens across all chunks: {total_tokens:,}")
if all_chunks:
    print(f"  Average tokens per chunk: {total_tokens / len(all_chunks):.0f}")
else:
    print(f"  Average tokens per chunk: n/a")
print(f"  Total pages utilized: {total_pages_in_chunks}")
if extractable_pages:
    print(f"  Utilization efficiency: {total_pages_in_chunks / extractable_pages * 100:.1f}%")
else:
    print(f"  Utilization efficiency: n/a")

print(f"\nTiered chunking strategy completed successfully!")

Starting advanced tiered chunking...
Applying tier-specific chunking strategies...

Processing Tier 1 Primary (79 pages):
  Created 13 high-quality chunks

Processing Tier 2 Secondary (105 pages):
  Created 16 supporting chunks

Processing Tier 3 Reference (232 pages):
  Created 47 reference chunks

Processing Tier 4 Context (55 pages):
  Created 9 context chunks

CHUNKING RESULTS SUMMARY:
Total chunks created: 85

TIER 1 PRIMARY:
  Chunks: 13
  Avg tokens per chunk: 1326
  Total pages covered: 79
  Avg pandas score: 9.2
  Avg code score: 1.4
  Avg quiz potential: 2.4

TIER 2 SECONDARY:
  Chunks: 16
  Avg tokens per chunk: 1099
  Total pages covered: 105
  Avg pandas score: 3.8
  Avg code score: 0.6
  Avg quiz potential: 2.0

TIER 3 REFERENCE:
  Chunks: 47
  Avg tokens per chunk: 873
  Total pages covered: 232
  Avg pandas score: 0.7
  Avg code score: 0.2
  Avg quiz potential: 1.8

TIER 4 CONTEXT:
  Chunks: 9
  Avg tokens per chunk: 493
  Total pages covered: 55
  Avg pandas score: 0.0

In [4]:
# Save Tiered Chunking Results

import pickle
import json

print("Saving comprehensive chunking results...")

# Baseline figures of the original system that the new results are compared
# against (13 chunks, about 16% of the PDF used).
ORIGINAL_CHUNK_COUNT = 13
ORIGINAL_UTILIZATION_PCT = 16

# Save all chunks with complete metadata
chunks_file = PROCESSED_DATA_PATH / 'tiered_chunks_comprehensive.pkl'
with open(chunks_file, 'wb') as f:
    pickle.dump(all_chunks, f)
print(f"All chunks saved to: {chunks_file}")

# Headline numbers are computed from the chunks produced in this run, so the
# saved summary and the printed report cannot drift from the real results.
total_chunk_count = len(all_chunks)
total_chunk_tokens = sum(chunk['token_count'] for chunk in all_chunks)
pages_utilized = sum(chunk['page_count'] for chunk in all_chunks)
extractable_pages = extraction_stats['successful_extractions']

utilization_pct = pages_utilized / extractable_pages * 100 if extractable_pages else 0.0
avg_tokens_per_chunk = total_chunk_tokens / total_chunk_count if total_chunk_count else 0.0
chunk_gain = total_chunk_count / ORIGINAL_CHUNK_COUNT
utilization_gain = utilization_pct / ORIGINAL_UTILIZATION_PCT

if extractable_pages and pages_utilized >= extractable_pages:
    coverage_note = "Complete PDF utilization achieved"
else:
    coverage_note = f"{utilization_pct:.1f}% of extractable pages utilized"

# Create summary statistics for next notebooks
chunking_summary = {
    'total_chunks': total_chunk_count,
    'total_tokens': total_chunk_tokens,
    'total_pages_utilized': pages_utilized,
    'utilization_efficiency': utilization_pct,
    'tier_statistics': {},
    'quality_metrics': {
        'high_pandas_chunks': len([c for c in all_chunks if c['avg_pandas_score'] > 5]),
        'code_heavy_chunks': len([c for c in all_chunks if c['avg_code_score'] > 1]),
        'quiz_potential_chunks': len([c for c in all_chunks if c['avg_quiz_score'] > 2]),
        'large_chunks': len([c for c in all_chunks if c['token_count'] > 1000]),
        'comprehensive_chunks': len([c for c in all_chunks if c['page_count'] > 3])
    },
    'improvement_metrics': {
        'vs_original_chunk_count': f"{total_chunk_count} vs {ORIGINAL_CHUNK_COUNT} ({chunk_gain:.1f}x improvement)",
        'vs_original_utilization': f"{utilization_pct:.0f}% vs {ORIGINAL_UTILIZATION_PCT}% ({utilization_gain:.2f}x improvement)",
        'avg_tokens_per_chunk': f"{avg_tokens_per_chunk:.0f}",
        'content_coverage': coverage_note
    }
}

# Calculate tier-specific statistics
for tier in ['tier_1_primary', 'tier_2_secondary', 'tier_3_reference', 'tier_4_context']:
    tier_chunks = [c for c in all_chunks if c['tier'] == tier]
    if tier_chunks:
        chunking_summary['tier_statistics'][tier] = {
            'chunk_count': len(tier_chunks),
            'avg_tokens': sum(c['token_count'] for c in tier_chunks) / len(tier_chunks),
            'total_pages': sum(c['page_count'] for c in tier_chunks),
            # float() converts NumPy scalars to plain floats for JSON output.
            'avg_pandas_score': float(np.mean([c['avg_pandas_score'] for c in tier_chunks])),
            'avg_code_score': float(np.mean([c['avg_code_score'] for c in tier_chunks])),
            'avg_quiz_score': float(np.mean([c['avg_quiz_score'] for c in tier_chunks]))
        }

# Save summary statistics
# NOTE: this rebinds summary_file, which the setup cell used for
# analysis_summary.json; from here on it refers to chunking_summary.json.
summary_file = PROCESSED_DATA_PATH / 'chunking_summary.json'
with open(summary_file, 'w', encoding='utf-8') as f:
    json.dump(chunking_summary, f, indent=2)
print(f"Chunking summary saved to: {summary_file}")

# Create tier-specific chunk collections for specialized processing
tier_collections = {}
for tier in ['tier_1_primary', 'tier_2_secondary', 'tier_3_reference', 'tier_4_context']:
    tier_chunks = [c for c in all_chunks if c['tier'] == tier]
    tier_collections[tier] = tier_chunks

# Save tier collections for quiz generation
tier_collections_file = PROCESSED_DATA_PATH / 'tier_collections.pkl'
with open(tier_collections_file, 'wb') as f:
    pickle.dump(tier_collections, f)
print(f"Tier collections saved to: {tier_collections_file}")

# Display comprehensive improvement summary
print(f"\nCOMPREHENSIVE CHUNKING COMPLETED!")
print(f"=" * 50)
print(f"MASSIVE IMPROVEMENTS ACHIEVED:")
print(f"  Original system: {ORIGINAL_CHUNK_COUNT} chunks, ~{ORIGINAL_UTILIZATION_PCT}% PDF utilization")
print(f"  New system: {total_chunk_count} chunks, {utilization_pct:.0f}% PDF utilization")
print(f"  Improvement factor: {chunk_gain:.1f}x more chunks, {utilization_gain:.2f}x better utilization")

print(f"\nQUALITY METRICS:")
print(f"  High pandas content chunks: {chunking_summary['quality_metrics']['high_pandas_chunks']}")
print(f"  Code-heavy chunks: {chunking_summary['quality_metrics']['code_heavy_chunks']}")
print(f"  High quiz potential chunks: {chunking_summary['quality_metrics']['quiz_potential_chunks']}")
print(f"  Large comprehensive chunks: {chunking_summary['quality_metrics']['large_chunks']}")

print(f"\nTIER DISTRIBUTION:")
for tier, stats in chunking_summary['tier_statistics'].items():
    tier_name = tier.replace('_', ' ').title()
    print(f"  {tier_name}: {stats['chunk_count']} chunks, {stats['avg_tokens']:.0f} avg tokens")

print(f"\nREADY FOR NEXT STAGE:")
print(f"  All chunks processed and saved")
print(f"  Tier collections prepared for quiz generation")
print(f"  {utilization_pct:.0f}% PDF content ready for retrieval system")

# Verification of saved files
print(f"\nFILE VERIFICATION:")
print(f"  Chunks file: {chunks_file.exists()} ({chunks_file.stat().st_size / 1024:.1f} KB)")
print(f"  Summary file: {summary_file.exists()} ({summary_file.stat().st_size / 1024:.1f} KB)")
print(f"  Tier collections: {tier_collections_file.exists()} ({tier_collections_file.stat().st_size / 1024:.1f} KB)")


print(f"Ready to proceed with optimized retrieval and quiz generation!")

Saving comprehensive chunking results...
All chunks saved to: c:\Users\MOHAMMED ZAKEER\Downloads\pandas_rag_project\data\processed\tiered_chunks_comprehensive.pkl
Chunking summary saved to: c:\Users\MOHAMMED ZAKEER\Downloads\pandas_rag_project\data\processed\chunking_summary.json
Tier collections saved to: c:\Users\MOHAMMED ZAKEER\Downloads\pandas_rag_project\data\processed\tier_collections.pkl

COMPREHENSIVE CHUNKING COMPLETED!
MASSIVE IMPROVEMENTS ACHIEVED:
  Original system: 13 chunks, ~16% PDF utilization
  New system: 85 chunks, 100% PDF utilization
  Improvement factor: 6.5x more chunks, 6.25x better utilization

QUALITY METRICS:
  High pandas content chunks: 13
  Code-heavy chunks: 8
  High quiz potential chunks: 31
  Large comprehensive chunks: 37

TIER DISTRIBUTION:
  Tier 1 Primary: 13 chunks, 1326 avg tokens
  Tier 2 Secondary: 16 chunks, 1099 avg tokens
  Tier 3 Reference: 47 chunks, 873 avg tokens
  Tier 4 Context: 9 chunks, 493 avg tokens

READY FOR NEXT STAGE:
  All chun