In [1]:
# Setup and Load Comprehensive Chunks

import pandas as pd
import numpy as np
import pickle
import json
import re
from pathlib import Path
from collections import defaultdict
import warnings
warnings.filterwarnings('ignore')

# Setup paths
# Step up one level only when the notebook is launched from the `notebooks/`
# directory itself. A substring test on the whole path would also trigger for
# any ancestor folder whose name merely contains "notebooks" and silently
# resolve the wrong project root.
_cwd = Path.cwd()
PROJECT_ROOT = _cwd.parent if _cwd.name == 'notebooks' else _cwd
PROCESSED_DATA_PATH = PROJECT_ROOT / 'data' / 'processed'

print("ADVANCED CHUNK OPTIMIZATION AND PREPARATION")
print("=" * 60)

# Load comprehensive chunking results
chunks_file = PROCESSED_DATA_PATH / 'tiered_chunks_comprehensive.pkl'
summary_file = PROCESSED_DATA_PATH / 'chunking_summary.json'
tier_collections_file = PROCESSED_DATA_PATH / 'tier_collections.pkl'

# All three artifacts are opened below, so verify all of them up front.
# Raising (instead of calling exit()) stops this cell and any "Run All"
# without asking the Jupyter kernel to shut down.
missing_inputs = [
    path for path in (chunks_file, summary_file, tier_collections_file)
    if not path.exists()
]
if missing_inputs:
    print("ERROR: Chunking results not found!")
    print("Please run notebook 02_tiered_content_processing.ipynb first")
    raise FileNotFoundError(
        "Missing chunking artifacts: " + ", ".join(str(path) for path in missing_inputs)
    )

# Load all data
# NOTE: pickle.load executes arbitrary code embedded in the file; only load
# artifacts produced by this project's own pipeline.
with open(chunks_file, 'rb') as f:
    all_chunks = pickle.load(f)

with open(summary_file, 'r') as f:
    chunking_summary = json.load(f)

with open(tier_collections_file, 'rb') as f:
    tier_collections = pickle.load(f)

# Report the real chunk count rather than a hard-coded figure.
print(f"Optimizing {len(all_chunks)} chunks for retrieval and quiz generation")

print(f"Loaded comprehensive chunk data:")
print(f"  Total chunks: {len(all_chunks)}")
print(f"  Total tokens: {chunking_summary['total_tokens']:,}")
print(f"  Utilization: {chunking_summary['utilization_efficiency']:.1f}%")

print(f"\nChunk distribution by tier:")
for tier, stats in chunking_summary['tier_statistics'].items():
    tier_name = tier.replace('_', ' ').title()
    print(f"  {tier_name}: {stats['chunk_count']} chunks, {stats['avg_tokens']:.0f} avg tokens")

print(f"\nQuality metrics loaded:")
quality = chunking_summary['quality_metrics']
print(f"  High pandas chunks: {quality['high_pandas_chunks']}")
print(f"  Code-heavy chunks: {quality['code_heavy_chunks']}")
print(f"  Quiz potential chunks: {quality['quiz_potential_chunks']}")
print(f"  Large comprehensive chunks: {quality['large_chunks']}")

print(f"\nReady to optimize chunks for dual-purpose system:")
print(f"  1. Enhanced retrieval performance")
print(f"  2. Rich quiz content generation")
print(f"  3. Intelligent content tagging and categorization")

ADVANCED CHUNK OPTIMIZATION AND PREPARATION
Optimizing 85 chunks for retrieval and quiz generation
Loaded comprehensive chunk data:
  Total chunks: 85
  Total tokens: 80,277
  Utilization: 100.0%

Chunk distribution by tier:
  Tier 1 Primary: 13 chunks, 1326 avg tokens
  Tier 2 Secondary: 16 chunks, 1099 avg tokens
  Tier 3 Reference: 47 chunks, 873 avg tokens
  Tier 4 Context: 9 chunks, 493 avg tokens

Quality metrics loaded:
  High pandas chunks: 13
  Code-heavy chunks: 8
  Quiz potential chunks: 31
  Large comprehensive chunks: 37

Ready to optimize chunks for dual-purpose system:
  1. Enhanced retrieval performance
  2. Rich quiz content generation
  3. Intelligent content tagging and categorization


In [2]:
# Advanced Content Analysis and Enhancement

def analyze_chunk_content_depth(chunk):
    """
    Score one chunk's text with keyword/regex heuristics.

    Reads ``chunk['content']``, ``chunk['features']`` (keys
    ``has_code_examples`` and ``conceptual_density``), ``chunk['page_count']``
    and ``chunk['token_count']``.

    Returns a dict with five sub-dicts:
      * ``function_coverage``     - per-category count of pandas API names
      * ``quiz_indicators``       - counts of phrases hinting at question styles
      * ``difficulty_indicators`` - counts of difficulty-related vocabulary
      * ``technical_depth``       - flags derived from the counts and features
      * ``retrieval_features``    - one keyword count plus boolean flags

    All matching is case-insensitive. API names are matched as whole words, so
    ordinary English uses of words such as "head" or "where" are counted too;
    the indicator patterns have no word boundaries, so they also match inside
    longer words.
    """
    body = chunk['content']

    def occurrences(pattern):
        # Number of case-insensitive matches of `pattern` in the chunk text.
        return len(re.findall(pattern, body, re.IGNORECASE))

    # pandas API names grouped by the kind of task they serve
    api_groups = {
        'data_loading': ['read_csv', 'read_excel', 'read_json', 'read_sql', 'read_html'],
        'data_inspection': ['head', 'tail', 'info', 'describe', 'shape', 'dtypes', 'columns'],
        'data_selection': ['loc', 'iloc', 'query', 'filter', 'where', 'mask'],
        'data_manipulation': ['groupby', 'merge', 'concat', 'join', 'pivot', 'melt'],
        'data_cleaning': ['dropna', 'fillna', 'drop_duplicates', 'replace', 'astype'],
        'data_analysis': ['value_counts', 'unique', 'nunique', 'sort_values', 'sort_index'],
        'data_visualization': ['plot', 'hist', 'scatter', 'boxplot', 'bar']
    }
    function_coverage = {
        group: sum(occurrences(rf'\b{name}\b') for name in names)
        for group, names in api_groups.items()
    }

    # Phrases that suggest which question styles the text can support
    quiz_patterns = {
        'conceptual_questions': r'(what\s+is|define|concept|principle|understand)',
        'practical_examples': r'(example|for\s+instance|let\'s\s+try|consider)',
        'code_exercises': r'(import|pd\.|df\.|print\s*\(|\=\s*pd\.)',
        'comparison_content': r'(difference|compare|versus|vs\.|better|alternative)',
        'step_by_step': r'(step|first|then|next|finally|process)',
        'best_practices': r'(best\s+practice|recommend|should|important|tip)'
    }
    quiz_indicators = {label: occurrences(rx) for label, rx in quiz_patterns.items()}

    # Vocabulary associated with each difficulty band
    difficulty_patterns = {
        'beginner': r'(basic|simple|introduction|getting\s+started|beginner)',
        'intermediate': r'(advanced|complex|sophisticated|optimization)',
        'expert': r'(expert|professional|production|performance|scale)'
    }
    difficulty_indicators = {
        band: occurrences(rx) for band, rx in difficulty_patterns.items()
    }

    manipulation_hits = function_coverage['data_manipulation']
    example_hits = quiz_indicators['practical_examples']

    # Flags combining upstream chunk features with the counts above
    technical_depth = {
        'syntax_heavy': chunk['features']['has_code_examples'],
        'concept_heavy': chunk['features']['conceptual_density'] > 3,
        'method_focused': manipulation_hits > 2,
        'practical_oriented': example_hits > 2,
        'theoretical_content': quiz_indicators['conceptual_questions'] > 2
    }

    # Categories that were mentioned at least once
    touched_groups = [group for group, hits in function_coverage.items() if hits > 0]

    retrieval_features = {
        'keyword_density': occurrences(r'(pandas|dataframe|series|index)'),
        'context_richness': chunk['page_count'] > 2,
        'example_rich': example_hits > 1,
        'comprehensive_coverage': chunk['token_count'] > 1000,
        'multi_topic': len(touched_groups) > 3
    }

    return {
        'function_coverage': function_coverage,
        'quiz_indicators': quiz_indicators,
        'difficulty_indicators': difficulty_indicators,
        'technical_depth': technical_depth,
        'retrieval_features': retrieval_features
    }

def categorize_chunk_for_quiz_generation(chunk, analysis):
    """
    Decide which quiz question formats a chunk can feed.

    ``analysis`` is the dict produced by ``analyze_chunk_content_depth``; the
    only field read from ``chunk`` is ``chunk['features']['has_code_examples']``.

    Returns a list containing any of ``'multiple_choice'``,
    ``'code_completion'``, ``'true_false'``, ``'fill_blank'`` and
    ``'scenario_based'``, always in that order.
    """
    signals = analysis['quiz_indicators']

    # Each rule is evaluated lazily so that a field is only looked up when the
    # preceding part of its condition has not already decided the outcome.
    rules = (
        # Conceptual or comparative wording
        ('multiple_choice',
         lambda: signals['conceptual_questions'] > 1
         or signals['comparison_content'] > 1),
        # Code-like tokens in the text, or upstream flagged code examples
        ('code_completion',
         lambda: signals['code_exercises'] > 2
         or chunk['features']['has_code_examples']),
        # Recommendation wording or beginner vocabulary
        ('true_false',
         lambda: signals['best_practices'] > 1
         or analysis['difficulty_indicators']['beginner'] > 1),
        # Repeated mentions of manipulation or selection APIs
        ('fill_blank',
         lambda: analysis['function_coverage']['data_manipulation'] > 1
         or analysis['function_coverage']['data_selection'] > 1),
        # Needs both worked examples and procedural wording
        ('scenario_based',
         lambda: signals['practical_examples'] > 1
         and signals['step_by_step'] > 1),
    )

    return [label for label, applies in rules if applies()]

def enhance_chunk_metadata(chunk):
    """
    Build the extra metadata attached to every chunk.

    Runs ``analyze_chunk_content_depth`` and
    ``categorize_chunk_for_quiz_generation`` on the chunk, then derives three
    weighted scores and a set of boolean tags from them. Besides the fields
    those helpers read, this uses ``chunk['avg_pandas_score']``,
    ``chunk['avg_quiz_score']``, ``chunk['avg_code_score']`` and
    ``chunk['token_count']``.

    Returns a dict with keys ``content_analysis``, ``quiz_categories``,
    ``primary_functions``, ``content_scores`` and ``optimization_tags``.
    """
    analysis = analyze_chunk_content_depth(chunk)
    quiz_categories = categorize_chunk_for_quiz_generation(chunk, analysis)

    coverage = analysis['function_coverage']
    retrieval = analysis['retrieval_features']

    # API categories mentioned more than once are treated as the chunk's focus
    primary_functions = [name for name in coverage if coverage[name] > 1]

    def weighted_total(*terms):
        # Sum of value * weight; boolean values contribute 0 or the weight.
        return sum([value * weight for value, weight in terms])

    retrieval_score = weighted_total(
        (chunk['avg_pandas_score'], 0.3),
        (retrieval['keyword_density'], 0.2),
        (retrieval['comprehensive_coverage'], 0.2),
        (retrieval['example_rich'], 0.15),
        (retrieval['multi_topic'], 0.15),
    )
    quiz_score = weighted_total(
        (sum(analysis['quiz_indicators'].values()), 0.4),
        (len(quiz_categories), 0.3),
        (chunk['avg_quiz_score'], 0.3),
    )
    technical_score = weighted_total(
        (chunk['avg_code_score'], 0.4),
        (coverage['data_manipulation'], 0.3),
        (analysis['technical_depth']['syntax_heavy'], 0.3),
    )

    content_scores = {
        'retrieval_score': retrieval_score,
        'quiz_score': quiz_score,
        'technical_score': technical_score
    }

    # Threshold each score (and the raw size / category count) into a tag
    optimization_tags = {
        'high_retrieval_value': retrieval_score > 5,
        'excellent_quiz_source': quiz_score > 3,
        'technical_reference': technical_score > 2,
        'comprehensive_content': chunk['token_count'] > 1000,
        'multi_purpose': len(quiz_categories) > 2
    }

    return {
        'content_analysis': analysis,
        'quiz_categories': quiz_categories,
        'primary_functions': primary_functions,
        'content_scores': content_scores,
        'optimization_tags': optimization_tags
    }

# Report the real chunk count instead of a hard-coded figure.
print(f"Performing advanced content analysis on all {len(all_chunks)} chunks...")

# Enhance all chunks with comprehensive metadata
enhanced_chunks = []
for i, chunk in enumerate(all_chunks):
    enhanced_metadata = enhance_chunk_metadata(chunk)

    # Create enhanced chunk record: original fields first, then the analysis
    # (on a key clash the analysis value wins).
    enhanced_chunk = {
        **chunk,  # Original chunk data
        **enhanced_metadata  # Enhanced analysis
    }

    enhanced_chunks.append(enhanced_chunk)

    # Progress indicator: every 20 chunks and once more at the end
    if (i + 1) % 20 == 0 or i == len(all_chunks) - 1:
        print(f"  Enhanced {i + 1}/{len(all_chunks)} chunks")

print(f"\nAdvanced content analysis completed!")

# Analyze enhancement results
print(f"\nENHANCEMENT ANALYSIS:")
print(f"=" * 40)

# Quiz category distribution (how many chunks carry each category)
quiz_category_counts = {}
for chunk in enhanced_chunks:
    for category in chunk['quiz_categories']:
        quiz_category_counts[category] = quiz_category_counts.get(category, 0) + 1

print(f"Quiz generation potential:")
for category, count in quiz_category_counts.items():
    print(f"  {category.replace('_', ' ').title()}: {count} chunks")

# Function coverage analysis (chunks mentioning each API category at least once)
function_coverage_summary = defaultdict(int)
for chunk in enhanced_chunks:
    for category, count in chunk['content_analysis']['function_coverage'].items():
        if count > 0:
            function_coverage_summary[category] += 1

print(f"\nPandas function coverage:")
for category, chunk_count in function_coverage_summary.items():
    print(f"  {category.replace('_', ' ').title()}: {chunk_count} chunks")

# Optimization tag analysis. The known tags are pre-seeded so they are listed
# even when no chunk carries them.
optimization_stats = {
    'high_retrieval_value': 0,
    'excellent_quiz_source': 0,
    'technical_reference': 0,
    'comprehensive_content': 0,
    'multi_purpose': 0
}

for chunk in enhanced_chunks:
    for tag, value in chunk['optimization_tags'].items():
        if value:
            # .get() keeps the tally working if a new tag is introduced upstream
            optimization_stats[tag] = optimization_stats.get(tag, 0) + 1

print(f"\nOptimization tag distribution:")
for tag, count in optimization_stats.items():
    # Guard against an empty chunk list (division by zero)
    percentage = (count / len(enhanced_chunks)) * 100 if enhanced_chunks else 0.0
    print(f"  {tag.replace('_', ' ').title()}: {count} chunks ({percentage:.1f}%)")

# Content score analysis
retrieval_scores = [chunk['content_scores']['retrieval_score'] for chunk in enhanced_chunks]
quiz_scores = [chunk['content_scores']['quiz_score'] for chunk in enhanced_chunks]
technical_scores = [chunk['content_scores']['technical_score'] for chunk in enhanced_chunks]

print(f"\nContent score statistics:")
if enhanced_chunks:
    print(f"  Retrieval scores - Avg: {np.mean(retrieval_scores):.1f}, Max: {np.max(retrieval_scores):.1f}")
    print(f"  Quiz scores - Avg: {np.mean(quiz_scores):.1f}, Max: {np.max(quiz_scores):.1f}")
    print(f"  Technical scores - Avg: {np.mean(technical_scores):.1f}, Max: {np.max(technical_scores):.1f}")
else:
    # np.max raises on an empty sequence, so report the absence explicitly
    print(f"  No chunks available - score statistics skipped")

print(f"\nChunk enhancement completed successfully!")
print(f"All {len(enhanced_chunks)} chunks now optimized for dual-purpose system")

Performing advanced content analysis on all 85 chunks...
  Enhanced 20/85 chunks
  Enhanced 40/85 chunks
  Enhanced 60/85 chunks
  Enhanced 80/85 chunks
  Enhanced 85/85 chunks

Advanced content analysis completed!

ENHANCEMENT ANALYSIS:
Quiz generation potential:
  Multiple Choice: 26 chunks
  Code Completion: 29 chunks
  True False: 53 chunks
  Fill Blank: 65 chunks
  Scenario Based: 36 chunks

Pandas function coverage:
  Data Inspection: 71 chunks
  Data Selection: 77 chunks
  Data Manipulation: 24 chunks
  Data Analysis: 38 chunks
  Data Cleaning: 16 chunks
  Data Loading: 6 chunks

Optimization tag distribution:
  High Retrieval Value: 24 chunks (28.2%)
  Excellent Quiz Source: 72 chunks (84.7%)
  Technical Reference: 13 chunks (15.3%)
  Comprehensive Content: 37 chunks (43.5%)
  Multi Purpose: 40 chunks (47.1%)

Content score statistics:
  Retrieval scores - Avg: 3.8, Max: 19.8
  Quiz scores - Avg: 5.9, Max: 14.4
  Technical scores - Avg: 0.9, Max: 6.9

Chunk enhancement completed successfully!

In [3]:
# Final Chunk Optimization and Preparation

def create_specialized_collections(enhanced_chunks):
    """
    Group enhanced chunks into six named, possibly overlapping collections.

    A chunk is appended to every collection whose rule it satisfies, so one
    chunk can appear in several lists. Chunk order within each list follows
    the order of ``enhanced_chunks``.

    Returns a dict with keys ``high_retrieval``, ``quiz_generation``,
    ``code_examples``, ``conceptual``, ``comprehensive`` and ``reference``.
    """
    membership_rules = {
        # Tagged as valuable for the retrieval system
        'high_retrieval': lambda c: c['optimization_tags']['high_retrieval_value'],
        # Tagged as a strong source for quiz creation
        'quiz_generation': lambda c: c['optimization_tags']['excellent_quiz_source'],
        # Technical score above 1, or suitable for code-completion questions
        'code_examples': lambda c: (c['content_scores']['technical_score'] > 1
                                    or 'code_completion' in c['quiz_categories']),
        # High pandas relevance combined with dense conceptual content
        'conceptual': lambda c: (c['avg_pandas_score'] > 5
                                 and c['content_analysis']['technical_depth']['concept_heavy']),
        # Tagged as large / comprehensive content
        'comprehensive': lambda c: c['optimization_tags']['comprehensive_content'],
        # Every chunk from the tier 3 reference and tier 2 secondary tiers
        'reference': lambda c: c['tier'] in ['tier_3_reference', 'tier_2_secondary'],
    }

    buckets = {name: [] for name in membership_rules}

    for chunk in enhanced_chunks:
        for name, belongs in membership_rules.items():
            if belongs(chunk):
                buckets[name].append(chunk)

    return buckets

def create_quiz_question_bank_preparation(enhanced_chunks):
    """
    Sort enhanced chunks into a two-level quiz bank.

    The outer key is the quiz type, the inner key a sub-bucket of that type.
    For every entry in ``chunk['quiz_categories']`` the chunk is appended to
    exactly one sub-bucket of the matching type; unrecognised entries are
    ignored.

    Returns a dict of the form ``{quiz_type: {sub_bucket: [chunk, ...]}}``.
    """
    layout = (
        ('multiple_choice', ('beginner', 'intermediate', 'advanced')),
        ('code_completion', ('syntax', 'functions', 'practical')),
        ('true_false', ('concepts', 'best_practices', 'facts')),
        ('fill_blank', ('parameters', 'methods', 'syntax')),
        ('scenario_based', ('data_analysis', 'problem_solving', 'real_world')),
    )
    quiz_bank = {
        quiz_type: {bucket: [] for bucket in bucket_names}
        for quiz_type, bucket_names in layout
    }

    for chunk in enhanced_chunks:
        analysis = chunk['content_analysis']
        levels = analysis['difficulty_indicators']

        # Beginner wins when it outnumbers the intermediate vocabulary;
        # otherwise any expert vocabulary at all marks the chunk as advanced.
        difficulty = (
            'beginner' if levels['beginner'] > levels['intermediate']
            else 'advanced' if levels['expert'] > 0
            else 'intermediate'
        )

        for quiz_type in chunk['quiz_categories']:
            if quiz_type == 'multiple_choice':
                bucket = difficulty
            elif quiz_type == 'code_completion':
                bucket = (
                    'practical' if chunk['content_scores']['technical_score'] > 2
                    else 'functions' if analysis['function_coverage']['data_manipulation'] > 1
                    else 'syntax'
                )
            elif quiz_type == 'true_false':
                bucket = (
                    'best_practices' if analysis['quiz_indicators']['best_practices'] > 1
                    else 'concepts' if analysis['quiz_indicators']['conceptual_questions'] > 1
                    else 'facts'
                )
            elif quiz_type == 'fill_blank':
                bucket = (
                    'methods' if analysis['function_coverage']['data_manipulation'] > 1
                    else 'parameters' if analysis['function_coverage']['data_selection'] > 1
                    else 'syntax'
                )
            elif quiz_type == 'scenario_based':
                bucket = (
                    'real_world' if analysis['quiz_indicators']['practical_examples'] > 2
                    else 'problem_solving' if analysis['quiz_indicators']['step_by_step'] > 2
                    else 'data_analysis'
                )
            else:
                # Not one of the five known quiz types: nothing to file
                continue

            quiz_bank[quiz_type][bucket].append(chunk)

    return quiz_bank

def calculate_system_readiness_metrics(enhanced_chunks, collections, quiz_bank,
                                       utilization_efficiency=100.0):
    """
    Summarise how ready the chunk set is for retrieval and quiz generation.

    Args:
        enhanced_chunks: list of enhanced chunk dicts; each needs
            ``token_count``, ``content_scores`` (``retrieval_score``,
            ``quiz_score``) and ``optimization_tags['multi_purpose']``.
        collections: dict with at least the keys ``high_retrieval``,
            ``comprehensive``, ``code_examples``, ``conceptual`` and
            ``quiz_generation``, each mapping to a list of chunks.
        quiz_bank: dict mapping quiz type to either a dict of sub-bucket
            lists or a flat list of chunks.
        utilization_efficiency: source utilisation percentage to record.
            Defaults to 100.0, the value this function previously hard-coded.

    Returns:
        dict with the sections ``retrieval_system``, ``quiz_system`` and
        ``content_quality``. All percentages and averages are plain Python
        floats, and are 0.0 when ``enhanced_chunks`` is empty (instead of
        raising ZeroDivisionError or yielding NaN).
    """
    total_chunks = len(enhanced_chunks)

    def _share(members):
        # Percentage of all chunks that `members` represents; 0.0 when empty.
        return len(members) / total_chunks * 100 if total_chunks else 0.0

    def _mean(values):
        # np.mean of an empty list is NaN (with a warning); report 0.0 instead.
        return float(np.mean(values)) if values else 0.0

    # A quiz type counts as covered when any of its buckets holds a chunk.
    covered_types = [
        quiz_type for quiz_type, entry in quiz_bank.items()
        if any(entry.values() if isinstance(entry, dict) else entry)
    ]

    # Total number of chunk-to-bucket assignments across the whole bank.
    # A chunk filed under several quiz types is counted once per type.
    assignment_count = sum(
        len(chunks)
        for entry in quiz_bank.values()
        for chunks in (entry.values() if isinstance(entry, dict) else [entry])
    )

    metrics = {
        'retrieval_system': {
            'total_chunks': total_chunks,
            'high_quality_chunks': len(collections['high_retrieval']),
            'comprehensive_chunks': len(collections['comprehensive']),
            'code_example_chunks': len(collections['code_examples']),
            'conceptual_chunks': len(collections['conceptual']),
            'coverage_score': _share(collections['high_retrieval'])
        },
        'quiz_system': {
            'total_quiz_chunks': len(collections['quiz_generation']),
            'quiz_types_covered': len(covered_types),
            'difficulty_levels': 3,  # beginner, intermediate, advanced
            'question_generation_potential': assignment_count,
            'quiz_coverage_score': _share(collections['quiz_generation'])
        },
        'content_quality': {
            'avg_tokens_per_chunk': _mean([chunk['token_count'] for chunk in enhanced_chunks]),
            'avg_retrieval_score': _mean([chunk['content_scores']['retrieval_score'] for chunk in enhanced_chunks]),
            'avg_quiz_score': _mean([chunk['content_scores']['quiz_score'] for chunk in enhanced_chunks]),
            'multi_purpose_chunks': len([c for c in enhanced_chunks if c['optimization_tags']['multi_purpose']]),
            'utilization_efficiency': utilization_efficiency
        }
    }

    return metrics

print("Creating specialized chunk collections...")

# Create specialized collections
specialized_collections = create_specialized_collections(enhanced_chunks)

print(f"Specialized collections created:")
for collection_name, chunks in specialized_collections.items():
    print(f"  {collection_name.replace('_', ' ').title()}: {len(chunks)} chunks")

print(f"\nPreparing quiz question bank structure...")

# Create quiz question bank preparation
quiz_question_bank = create_quiz_question_bank_preparation(enhanced_chunks)

print(f"Quiz question bank prepared:")
for quiz_type, categories in quiz_question_bank.items():
    # A quiz type maps either to a dict of sub-bucket lists or to a flat list
    total_chunks = sum(len(chunks) for chunks in (categories.values() if isinstance(categories, dict) else [categories]))
    print(f"  {quiz_type.replace('_', ' ').title()}: {total_chunks} chunks")

    if isinstance(categories, dict):
        for category, chunks in categories.items():
            if chunks:  # empty sub-buckets are not listed
                print(f"    - {category.replace('_', ' ').title()}: {len(chunks)} chunks")

print(f"\nCalculating system readiness metrics...")

# Calculate comprehensive readiness metrics
readiness_metrics = calculate_system_readiness_metrics(enhanced_chunks, specialized_collections, quiz_question_bank)

print(f"\nSYSTEM READINESS ANALYSIS:")
print(f"=" * 50)

print(f"\nRETRIEVAL SYSTEM READINESS:")
retrieval = readiness_metrics['retrieval_system']
print(f"  Total chunks available: {retrieval['total_chunks']}")
print(f"  High-quality retrieval chunks: {retrieval['high_quality_chunks']}")
print(f"  Comprehensive content chunks: {retrieval['comprehensive_chunks']}")
print(f"  Code example chunks: {retrieval['code_example_chunks']}")
print(f"  Conceptual chunks: {retrieval['conceptual_chunks']}")
print(f"  Retrieval coverage score: {retrieval['coverage_score']:.1f}%")

print(f"\nQUIZ SYSTEM READINESS:")
quiz = readiness_metrics['quiz_system']
print(f"  Quiz-ready chunks: {quiz['total_quiz_chunks']}")
# The denominator is the number of quiz types actually defined in the bank
print(f"  Quiz types covered: {quiz['quiz_types_covered']}/{len(quiz_question_bank)}")
print(f"  Difficulty levels: {quiz['difficulty_levels']}")
# NOTE(review): this figure counts chunk-to-bucket assignments in the quiz
# bank, not generated questions - confirm the wording is what is intended.
print(f"  Question generation potential: {quiz['question_generation_potential']} total questions")
print(f"  Quiz coverage score: {quiz['quiz_coverage_score']:.1f}%")

print(f"\nCONTENT QUALITY METRICS:")
quality = readiness_metrics['content_quality']
print(f"  Average tokens per chunk: {quality['avg_tokens_per_chunk']:.0f}")
print(f"  Average retrieval score: {quality['avg_retrieval_score']:.1f}")
print(f"  Average quiz score: {quality['avg_quiz_score']:.1f}")
print(f"  Multi-purpose chunks: {quality['multi_purpose_chunks']}")
print(f"  PDF utilization efficiency: {quality['utilization_efficiency']:.1f}%")

# Calculate improvement vs original system
original_chunks = 13  # From previous system
original_utilization_pct = 16.0  # Approximate utilization of the previous system
improvement_factor = len(enhanced_chunks) / original_chunks
utilization_improvement = quality['utilization_efficiency'] / original_utilization_pct

# Every figure below is derived from the values above so the report cannot
# drift from the data it describes.
print(f"\nIMPROVEMENT ANALYSIS:")
print(f"  Chunk count improvement: {improvement_factor:.1f}x ({len(enhanced_chunks)} vs {original_chunks})")
print(f"  Content utilization: {quality['utilization_efficiency']:.0f}% vs ~{original_utilization_pct:.0f}% ({utilization_improvement:.2f}x improvement)")
print(f"  Quiz generation capability: NEW FEATURE ADDED")
print(f"  Dual-purpose optimization: FULLY IMPLEMENTED")

print(f"\nAdvanced chunk optimization completed!")

Creating specialized chunk collections...
Specialized collections created:
  High Retrieval: 24 chunks
  Quiz Generation: 72 chunks
  Code Examples: 40 chunks
  Conceptual: 8 chunks
  Comprehensive: 37 chunks
  Reference: 63 chunks

Preparing quiz question bank structure...
Quiz question bank prepared:
  Multiple Choice: 26 chunks
    - Beginner: 9 chunks
    - Intermediate: 8 chunks
    - Advanced: 9 chunks
  Code Completion: 29 chunks
    - Syntax: 23 chunks
    - Functions: 2 chunks
    - Practical: 4 chunks
  True False: 53 chunks
    - Concepts: 3 chunks
    - Best Practices: 37 chunks
    - Facts: 13 chunks
  Fill Blank: 65 chunks
    - Parameters: 45 chunks
    - Methods: 20 chunks
  Scenario Based: 36 chunks
    - Data Analysis: 5 chunks
    - Problem Solving: 7 chunks
    - Real World: 24 chunks

Calculating system readiness metrics...

SYSTEM READINESS ANALYSIS:

RETRIEVAL SYSTEM READINESS:
  Total chunks available: 85
  High-quality retrieval chunks: 24
  Comprehensive content chunks: 37

In [4]:
# Save Optimized Dual-Purpose System

print("Saving optimized dual-purpose system...")

# Save enhanced chunks with all metadata
enhanced_chunks_file = PROCESSED_DATA_PATH / 'enhanced_chunks_complete.pkl'
with open(enhanced_chunks_file, 'wb') as f:
    pickle.dump(enhanced_chunks, f)
print(f"Enhanced chunks saved to: {enhanced_chunks_file}")

# Save specialized collections
collections_file = PROCESSED_DATA_PATH / 'specialized_collections.pkl'
with open(collections_file, 'wb') as f:
    pickle.dump(specialized_collections, f)
print(f"Specialized collections saved to: {collections_file}")

# Save quiz question bank structure
quiz_bank_file = PROCESSED_DATA_PATH / 'quiz_question_bank.pkl'
with open(quiz_bank_file, 'wb') as f:
    pickle.dump(quiz_question_bank, f)
print(f"Quiz question bank saved to: {quiz_bank_file}")

# Save system readiness metrics
metrics_file = PROCESSED_DATA_PATH / 'system_readiness_metrics.json'
with open(metrics_file, 'w') as f:
    json.dump(readiness_metrics, f, indent=2)
print(f"System metrics saved to: {metrics_file}")

# Values reused by both the saved summary and the printed report. They are
# read from the computed metrics so the summary cannot drift from the data.
retrieval_metrics = readiness_metrics['retrieval_system']
quiz_metrics = readiness_metrics['quiz_system']
quality_metrics = readiness_metrics['content_quality']

baseline_utilization_pct = 16.0  # Approximate utilization of the previous system
utilization_improvement_factor = round(
    quality_metrics['utilization_efficiency'] / baseline_utilization_pct, 2
)
# improvement_factor is computed in the readiness-analysis cell above
chunk_improvement_factor = round(improvement_factor, 1)

# Chunks filed under each quiz type, summed over every sub-bucket (including
# fill_blank's 'syntax' bucket).
quiz_type_distribution = {
    quiz_type: sum(len(chunks) for chunks in buckets.values())
    for quiz_type, buckets in quiz_question_bank.items()
}

# Create comprehensive system summary
system_summary = {
    'system_overview': {
        'total_chunks': len(enhanced_chunks),
        'pdf_utilization': quality_metrics['utilization_efficiency'],
        'dual_purpose_optimization': True,
        'chunk_improvement_factor': chunk_improvement_factor,
        'utilization_improvement_factor': utilization_improvement_factor
    },
    'retrieval_capabilities': {
        'high_quality_chunks': len(specialized_collections['high_retrieval']),
        'code_example_chunks': len(specialized_collections['code_examples']),
        'conceptual_chunks': len(specialized_collections['conceptual']),
        'comprehensive_chunks': len(specialized_collections['comprehensive']),
        'reference_chunks': len(specialized_collections['reference']),
        'avg_retrieval_score': quality_metrics['avg_retrieval_score']
    },
    'quiz_capabilities': {
        'quiz_ready_chunks': len(specialized_collections['quiz_generation']),
        'question_types_covered': quiz_metrics['quiz_types_covered'],
        'difficulty_levels_covered': quiz_metrics['difficulty_levels'],
        'total_question_potential': quiz_metrics['question_generation_potential'],
        'quiz_coverage_percentage': quiz_metrics['quiz_coverage_score'],
        'avg_quiz_score': quality_metrics['avg_quiz_score']
    },
    'quiz_type_distribution': quiz_type_distribution,
    'quality_metrics': {
        'avg_tokens_per_chunk': quality_metrics['avg_tokens_per_chunk'],
        'multi_purpose_chunks': quality_metrics['multi_purpose_chunks'],
        'content_utilization_efficiency': quality_metrics['utilization_efficiency']
    }
}

# Save comprehensive summary
# NOTE: this rebinds `summary_file`, which the setup cell used for the
# chunking summary input path.
summary_file = PROCESSED_DATA_PATH / 'comprehensive_system_summary.json'
with open(summary_file, 'w') as f:
    json.dump(system_summary, f, indent=2)
print(f"Comprehensive summary saved to: {summary_file}")

# Display final achievement summary
print(f"\nCOMPREHENSIVE DUAL-PURPOSE SYSTEM COMPLETED!")
print(f"=" * 60)

print(f"\nSYSTEM ACHIEVEMENTS:")
print(f"  MASSIVE SCALE: {len(enhanced_chunks)} chunks from {quality_metrics['utilization_efficiency']:.0f}% PDF content")
print(f"  DUAL PURPOSE: Optimized for retrieval AND quiz generation")
print(f"  QUIZ EXCELLENCE: {quiz_metrics['quiz_coverage_score']:.1f}% chunks ready for quiz generation")
print(f"  QUESTION DIVERSITY: {quiz_metrics['quiz_types_covered']}/{len(quiz_question_bank)} quiz types, {quiz_metrics['question_generation_potential']} question potential")
print(f"  RETRIEVAL QUALITY: {retrieval_metrics['coverage_score']:.1f}% high-quality retrieval chunks")
print(f"  IMPROVEMENT: {chunk_improvement_factor}x more chunks, {utilization_improvement_factor}x better utilization")

print(f"\nQUIZ SYSTEM CAPABILITIES:")
print(f"  Multiple Choice: {system_summary['quiz_type_distribution']['multiple_choice']} chunks")
print(f"  Code Completion: {system_summary['quiz_type_distribution']['code_completion']} chunks")
print(f"  True/False: {system_summary['quiz_type_distribution']['true_false']} chunks")
print(f"  Fill in Blank: {system_summary['quiz_type_distribution']['fill_blank']} chunks")
print(f"  Scenario Based: {system_summary['quiz_type_distribution']['scenario_based']} chunks")

print(f"\nRETRIEVAL SYSTEM CAPABILITIES:")
print(f"  High-quality retrieval: {system_summary['retrieval_capabilities']['high_quality_chunks']} chunks")
print(f"  Code examples: {system_summary['retrieval_capabilities']['code_example_chunks']} chunks")
print(f"  Conceptual content: {system_summary['retrieval_capabilities']['conceptual_chunks']} chunks")
print(f"  Comprehensive content: {system_summary['retrieval_capabilities']['comprehensive_chunks']} chunks")
print(f"  Reference material: {system_summary['retrieval_capabilities']['reference_chunks']} chunks")

print(f"\nSAVED FILES VERIFICATION:")
files_to_check = [
    (enhanced_chunks_file, "Enhanced chunks"),
    (collections_file, "Specialized collections"),
    (quiz_bank_file, "Quiz question bank"),
    (metrics_file, "System metrics"),
    (summary_file, "Comprehensive summary")
]

for file_path, description in files_to_check:
    exists = file_path.exists()
    size = file_path.stat().st_size / 1024 if exists else 0
    print(f"  {description}: {exists} ({size:.1f} KB)")

print(f"\nREADY FOR NEXT STAGE:")
print(f"  All optimized chunks prepared and saved")
print(f"  Quiz question bank structured and ready")
print(f"  Specialized collections created for targeted use")
print(f"  System metrics calculated and documented")
print(f"  Dual-purpose optimization fully implemented")

print(f"\nNext step: Run notebook 04_quiz_content_generation.ipynb")
print(f"Ready to implement comprehensive quiz generation system!")

# Final verification summary
total_files_created = sum(1 for file_path, _ in files_to_check if file_path.exists())
print(f"\nSUCCESS: {total_files_created}/{len(files_to_check)} files saved successfully")
print(f"Advanced chunking strategy implementation COMPLETE!")

Saving optimized dual-purpose system...
Enhanced chunks saved to: c:\Users\MOHAMMED ZAKEER\Downloads\pandas_rag_project\data\processed\enhanced_chunks_complete.pkl
Specialized collections saved to: c:\Users\MOHAMMED ZAKEER\Downloads\pandas_rag_project\data\processed\specialized_collections.pkl
Quiz question bank saved to: c:\Users\MOHAMMED ZAKEER\Downloads\pandas_rag_project\data\processed\quiz_question_bank.pkl
System metrics saved to: c:\Users\MOHAMMED ZAKEER\Downloads\pandas_rag_project\data\processed\system_readiness_metrics.json
Comprehensive summary saved to: c:\Users\MOHAMMED ZAKEER\Downloads\pandas_rag_project\data\processed\comprehensive_system_summary.json

COMPREHENSIVE DUAL-PURPOSE SYSTEM COMPLETED!

SYSTEM ACHIEVEMENTS:
  MASSIVE SCALE: 85 chunks from 100% PDF content
  DUAL PURPOSE: Optimized for retrieval AND quiz generation
  QUIZ EXCELLENCE: 84.7% chunks ready for quiz generation
  QUESTION DIVERSITY: 5/5 quiz types, 209 question potential
  RETRIEVAL QUALITY: 28.2% high-quality retrieval chunks