In [1]:
# Setup and Imports

import PyPDF2
import pandas as pd
import numpy as np
import os
import re
from pathlib import Path
import warnings
import json
from collections import defaultdict
import time

warnings.filterwarnings('ignore')

print("Libraries imported successfully")

Libraries imported successfully


In [2]:
# Path Setup and Verification

# Setup paths
PROJECT_ROOT = Path.cwd().parent if 'notebooks' in str(Path.cwd()) else Path.cwd()
RAW_DATA_PATH = PROJECT_ROOT / 'data' / 'raw'
PROCESSED_DATA_PATH = PROJECT_ROOT / 'data' / 'processed'
PDF_FILE = RAW_DATA_PATH / 'mastering_pandas_2025.pdf'

# Create processed directory if it doesn't exist
PROCESSED_DATA_PATH.mkdir(exist_ok=True)

print("Project Structure Verification:")
print(f"Project Root: {PROJECT_ROOT}")
print(f"Raw Data Path: {RAW_DATA_PATH}")
print(f"Processed Data Path: {PROCESSED_DATA_PATH}")
print(f"PDF File Path: {PDF_FILE}")
print(f"PDF File Exists: {PDF_FILE.exists()}")

if PDF_FILE.exists():
    file_size = PDF_FILE.stat().st_size
    print(f"File Size: {file_size / (1024*1024):.2f} MB")
else:
    print("ERROR: PDF file not found!")
    print("Please ensure 'mastering_pandas_2025.pdf' is in the data/raw/ directory")

Project Structure Verification:
Project Root: c:\Users\MOHAMMED ZAKEER\Downloads\pandas_rag_project
Raw Data Path: c:\Users\MOHAMMED ZAKEER\Downloads\pandas_rag_project\data\raw
Processed Data Path: c:\Users\MOHAMMED ZAKEER\Downloads\pandas_rag_project\data\processed
PDF File Path: c:\Users\MOHAMMED ZAKEER\Downloads\pandas_rag_project\data\raw\mastering_pandas_2025.pdf
PDF File Exists: True
File Size: 29.36 MB


In [3]:
# Basic PDF Structure Analysis

print("Analyzing PDF structure...")

try:
    with open(PDF_FILE, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        total_pages = len(pdf_reader.pages)
        
        print(f"\nPDF Analysis Results:")
        print(f"Total Pages: {total_pages}")
        
        # Check metadata
        if pdf_reader.metadata:
            print(f"\nDocument Metadata:")
            for key, value in pdf_reader.metadata.items():
                print(f"  {key}: {value}")
        
        # Quick sample of first few pages to verify reading
        print(f"\nTesting text extraction on first 3 pages...")
        for i in range(min(3, total_pages)):
            try:
                text = pdf_reader.pages[i].extract_text()
                char_count = len(text.strip())
                word_count = len(text.split()) if text.strip() else 0
                print(f"  Page {i}: {char_count} characters, {word_count} words")
                
                if i == 0 and text.strip():
                    print(f"  Sample text: {text[:100].replace('\n', ' ')}...")
                    
            except Exception as e:
                print(f"  Page {i}: Error extracting text - {e}")

except Exception as e:
    print(f"Error opening PDF: {e}")
    print("Please check if the PDF file is not corrupted and accessible")

Analyzing PDF structure...

PDF Analysis Results:
Total Pages: 473

Document Metadata:
  /Author: Yildiz, Muslum
  /CreationDate: D:20250520051054+00'00'
  /Creator: calibre 7.16.0
  /ModDate: D:20250520051054+00'00'
  /Producer: calibre 7.16.0
  /Title: MASTERING PANDAS: A Comprehensive Guide to Data Analysis in Python

Testing text extraction on first 3 pages...
  Page 0: 0 characters, 0 words
  Page 1: 121 characters, 22 words
  Page 2: 452 characters, 83 words


In [4]:
# Content Classification Functions

def analyze_content_characteristics(text):
    """
    Comprehensive content analysis for tiered classification
    """
    if not text or not text.strip():
        return {
            'char_count': 0,
            'word_count': 0,
            'line_count': 0,
            'content_tier': 'empty',
            'content_type': 'empty',
            'pandas_score': 0,
            'code_score': 0,
            'structural_score': 0,
            'quiz_potential': 0
        }
    
    text = text.strip()
    char_count = len(text)
    word_count = len(text.split())
    line_count = len(text.split('\n'))
    
    # Content type detection patterns
    navigation_patterns = [
        r'table\s+of\s+contents', r'chapter\s+\d+', r'page\s+\d+',
        r'section\s+\d+', r'appendix', r'index', r'bibliography'
    ]
    
    code_patterns = [
        r'import\s+\w+', r'from\s+\w+\s+import', r'pd\.', r'df\.',
        r'print\s*\(', r'=\s*pd\.', r'\.groupby\(', r'\.merge\(',
        r'\.iloc\[', r'\.loc\[', r'def\s+\w+', r'class\s+\w+',
        r'>>>', r'```', r'#.*', r'pandas\.'
    ]
    
    pandas_concepts = [
        r'dataframe', r'series', r'index', r'groupby', r'merge', r'concat',
        r'pivot', r'melt', r'apply', r'lambda', r'iloc', r'loc', r'query',
        r'fillna', r'dropna', r'read_csv', r'to_csv', r'head\(', r'tail\(',
        r'describe\(', r'info\(', r'value_counts', r'unique\('
    ]
    
    structural_patterns = [
        r'^[A-Z][^.!?]*:?\s*$',  # Headers
        r'example\s*\d*[:\-]?', r'note[:\-]?', r'important[:\-]?',
        r'figure\s+\d+', r'table\s+\d+', r'listing\s+\d+',
        r'step\s+\d+', r'method\s+\d+', r'approach\s+\d+'
    ]
    
    quiz_indicators = [
        r'example', r'exercise', r'practice', r'try', r'test',
        r'solution', r'answer', r'result', r'output', r'returns?',
        r'what\s+is', r'how\s+to', r'why', r'when', r'where'
    ]
    
    # Score calculations
    navigation_score = sum(len(re.findall(pattern, text, re.IGNORECASE)) for pattern in navigation_patterns)
    code_score = sum(len(re.findall(pattern, text, re.IGNORECASE)) for pattern in code_patterns)
    pandas_score = sum(len(re.findall(pattern, text, re.IGNORECASE)) for pattern in pandas_concepts)
    structural_score = sum(len(re.findall(pattern, text, re.MULTILINE | re.IGNORECASE)) for pattern in structural_patterns)
    quiz_potential = sum(len(re.findall(pattern, text, re.IGNORECASE)) for pattern in quiz_indicators)
    
    # Content type classification
    if char_count < 100:
        content_type = 'minimal'
    elif navigation_score > 2:
        content_type = 'navigation'
    elif code_score > 5:
        content_type = 'code_heavy'
    elif pandas_score > 8:
        content_type = 'conceptual'
    elif structural_score > 3:
        content_type = 'structural'
    else:
        content_type = 'general'
    
    # Content tier assignment (for 100% utilization strategy)
    if pandas_score > 5 and char_count > 500:
        content_tier = 'tier_1_primary'      # High-value content
    elif (pandas_score > 2 or code_score > 3) and char_count > 300:
        content_tier = 'tier_2_secondary'    # Supporting content
    elif content_type in ['navigation', 'structural'] and char_count > 200:
        content_tier = 'tier_3_reference'    # Reference content
    elif char_count > 100:
        content_tier = 'tier_4_context'      # Background context
    else:
        content_tier = 'tier_5_minimal'      # Minimal content
    
    return {
        'char_count': char_count,
        'word_count': word_count,
        'line_count': line_count,
        'content_tier': content_tier,
        'content_type': content_type,
        'pandas_score': pandas_score,
        'code_score': code_score,
        'structural_score': structural_score,
        'quiz_potential': quiz_potential,
        'navigation_score': navigation_score
    }

def detect_quiz_opportunities(text):
    """
    Identify content suitable for quiz generation
    """
    quiz_types = {
        'multiple_choice': 0,
        'code_completion': 0,
        'true_false': 0,
        'fill_blank': 0,
        'scenario': 0
    }
    
    # Multiple choice indicators
    if re.search(r'(what\s+is|which\s+of|select\s+the|choose\s+the)', text, re.IGNORECASE):
        quiz_types['multiple_choice'] += 2
    
    # Code completion opportunities
    if re.search(r'(example|code|syntax|function)', text, re.IGNORECASE) and re.search(r'(pd\.|df\.|\(|\))', text):
        quiz_types['code_completion'] += 3
    
    # True/false opportunities
    if re.search(r'(always|never|only|must|cannot|should)', text, re.IGNORECASE):
        quiz_types['true_false'] += 1
    
    # Fill in the blank
    if re.search(r'(parameter|argument|method|attribute)', text, re.IGNORECASE):
        quiz_types['fill_blank'] += 1
    
    # Scenario-based
    if re.search(r'(dataset|data|analysis|problem|task)', text, re.IGNORECASE):
        quiz_types['scenario'] += 1
    
    return quiz_types

print("Content classification functions defined successfully")
print("Ready to analyze all pages for comprehensive content extraction")

Content classification functions defined successfully
Ready to analyze all pages for comprehensive content extraction


In [5]:
# Comprehensive Page Analysis - Processing ALL Pages

def analyze_complete_document(pdf_path):
    """
    Analyze every single page of the PDF for 100% content utilization
    """
    print("Starting comprehensive analysis of ALL pages...")
    
    page_analysis = []
    error_pages = []
    
    with open(pdf_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        total_pages = len(pdf_reader.pages)
        
        print(f"Processing {total_pages} pages...")
        
        for page_num in range(total_pages):
            try:
                # Extract text
                text = pdf_reader.pages[page_num].extract_text()
                
                # Analyze content characteristics
                analysis = analyze_content_characteristics(text)
                
                # Detect quiz opportunities
                quiz_types = detect_quiz_opportunities(text)
                
                # Create text preview
                clean_text = text.replace('\n', ' ').replace('\r', ' ')
                text_preview = clean_text[:200] if clean_text else ""
                
                # Combine all analysis
                page_data = {
                    'page_number': page_num,
                    'text_preview': text_preview,
                    'full_text_length': len(text),
                    **analysis,
                    **{f'quiz_{k}': v for k, v in quiz_types.items()}
                }
                
                page_analysis.append(page_data)
                
                # Progress indicator
                if (page_num + 1) % 50 == 0 or page_num == total_pages - 1:
                    print(f"  Processed {page_num + 1}/{total_pages} pages")
                    
            except Exception as e:
                error_pages.append({'page_number': page_num, 'error': str(e)})
                print(f"  Error on page {page_num}: {e}")
    
    print(f"\nAnalysis Complete!")
    print(f"Successfully processed: {len(page_analysis)} pages")
    print(f"Error pages: {len(error_pages)}")
    
    return page_analysis, error_pages

# Run the comprehensive analysis
start_time = time.time()
all_pages_analysis, error_pages = analyze_complete_document(PDF_FILE)
end_time = time.time()

print(f"\nProcessing completed in {end_time - start_time:.2f} seconds")
print(f"Average time per page: {(end_time - start_time) / len(all_pages_analysis):.3f} seconds")

Starting comprehensive analysis of ALL pages...
Processing 473 pages...
  Processed 50/473 pages
  Processed 100/473 pages
  Processed 150/473 pages
  Processed 200/473 pages
  Processed 250/473 pages
  Processed 300/473 pages
  Processed 350/473 pages
  Processed 400/473 pages
  Processed 450/473 pages
  Processed 473/473 pages

Analysis Complete!
Successfully processed: 473 pages
Error pages: 0

Processing completed in 9.24 seconds
Average time per page: 0.020 seconds


In [6]:
# Results Analysis and Content Distribution

# Create comprehensive DataFrame
content_df = pd.DataFrame(all_pages_analysis)

print("COMPREHENSIVE CONTENT ANALYSIS RESULTS")
print("=" * 60)
print(f"Total pages analyzed: {len(content_df)}")
print(f"Pages with errors: {len(error_pages)}")

# Content tier distribution
print(f"\nCONTENT TIER DISTRIBUTION (100% Utilization Strategy):")
tier_distribution = content_df['content_tier'].value_counts().sort_index()
for tier, count in tier_distribution.items():
    percentage = (count / len(content_df)) * 100
    print(f"  {tier.replace('_', ' ').title()}: {count:3d} pages ({percentage:5.1f}%)")

# Content type distribution
print(f"\nCONTENT TYPE DISTRIBUTION:")
type_distribution = content_df['content_type'].value_counts()
for content_type, count in type_distribution.items():
    percentage = (count / len(content_df)) * 100
    print(f"  {content_type.title()}: {count:3d} pages ({percentage:5.1f}%)")

# Quality metrics
print(f"\nCONTENT QUALITY METRICS:")
print(f"  Pages with substantial content (>500 chars): {(content_df['char_count'] > 500).sum()}")
print(f"  Pages with high pandas content (score >5): {(content_df['pandas_score'] > 5).sum()}")
print(f"  Pages with code examples (score >3): {(content_df['code_score'] > 3).sum()}")
print(f"  Pages with quiz potential (score >2): {(content_df['quiz_potential'] > 2).sum()}")

# Statistical summary
print(f"\nSTATISTICAL SUMMARY:")
print(f"  Average characters per page: {content_df['char_count'].mean():.0f}")
print(f"  Average words per page: {content_df['word_count'].mean():.0f}")
print(f"  Average pandas score: {content_df['pandas_score'].mean():.1f}")
print(f"  Average code score: {content_df['code_score'].mean():.1f}")

# Top content by different criteria
print(f"\nTOP 10 PAGES BY PANDAS CONTENT:")
top_pandas = content_df.nlargest(10, 'pandas_score')[['page_number', 'pandas_score', 'content_tier', 'content_type']]
for _, row in top_pandas.iterrows():
    print(f"  Page {row['page_number']:3d}: Score {row['pandas_score']:2d} | {row['content_tier']} | {row['content_type']}")

print(f"\nTOP 10 PAGES BY CODE CONTENT:")
top_code = content_df.nlargest(10, 'code_score')[['page_number', 'code_score', 'content_tier', 'content_type']]
for _, row in top_code.iterrows():
    print(f"  Page {row['page_number']:3d}: Score {row['code_score']:2d} | {row['content_tier']} | {row['content_type']}")

print(f"\nTOP 10 PAGES BY QUIZ POTENTIAL:")
top_quiz = content_df.nlargest(10, 'quiz_potential')[['page_number', 'quiz_potential', 'content_tier', 'content_type']]
for _, row in top_quiz.iterrows():
    print(f"  Page {row['page_number']:3d}: Score {row['quiz_potential']:2d} | {row['content_tier']} | {row['content_type']}")

# Content utilization summary
tier_1_pages = len(content_df[content_df['content_tier'] == 'tier_1_primary'])
tier_2_pages = len(content_df[content_df['content_tier'] == 'tier_2_secondary'])
tier_3_pages = len(content_df[content_df['content_tier'] == 'tier_3_reference'])
tier_4_pages = len(content_df[content_df['content_tier'] == 'tier_4_context'])
tier_5_pages = len(content_df[content_df['content_tier'] == 'tier_5_minimal'])

print(f"\n100% CONTENT UTILIZATION STRATEGY:")
print(f"  Tier 1 (Primary): {tier_1_pages} pages - High-value pandas content")
print(f"  Tier 2 (Secondary): {tier_2_pages} pages - Supporting concepts and code")
print(f"  Tier 3 (Reference): {tier_3_pages} pages - Navigation and structure")
print(f"  Tier 4 (Context): {tier_4_pages} pages - Background information")
print(f"  Tier 5 (Minimal): {tier_5_pages} pages - Minimal content")
print(f"  Total: {tier_1_pages + tier_2_pages + tier_3_pages + tier_4_pages + tier_5_pages} pages")

utilizable_pages = len(content_df[content_df['content_tier'] != 'tier_5_minimal'])
print(f"\nUtilizable content: {utilizable_pages}/{len(content_df)} pages ({utilizable_pages/len(content_df)*100:.1f}%)")

COMPREHENSIVE CONTENT ANALYSIS RESULTS
Total pages analyzed: 473
Pages with errors: 0

CONTENT TIER DISTRIBUTION (100% Utilization Strategy):
  Empty:   2 pages (  0.4%)
  Tier 1 Primary:  79 pages ( 16.7%)
  Tier 2 Secondary: 105 pages ( 22.2%)
  Tier 3 Reference: 232 pages ( 49.0%)
  Tier 4 Context:  55 pages ( 11.6%)

CONTENT TYPE DISTRIBUTION:
  Structural: 335 pages ( 70.8%)
  General:  71 pages ( 15.0%)
  Navigation:  39 pages (  8.2%)
  Conceptual:  20 pages (  4.2%)
  Code_Heavy:   6 pages (  1.3%)
  Empty:   2 pages (  0.4%)

CONTENT QUALITY METRICS:
  Pages with substantial content (>500 chars): 369
  Pages with high pandas content (score >5): 85
  Pages with code examples (score >3): 19
  Pages with quiz potential (score >2): 133

STATISTICAL SUMMARY:
  Average characters per page: 787
  Average words per page: 125
  Average pandas score: 2.8
  Average code score: 0.5

TOP 10 PAGES BY PANDAS CONTENT:
  Page  84: Score 22 | tier_1_primary | navigation
  Page  55: Score 21 | t

In [7]:
# Save Comprehensive Analysis Results

# Save the complete content analysis
content_analysis_file = PROCESSED_DATA_PATH / 'comprehensive_content_analysis.csv'
content_df.to_csv(content_analysis_file, index=False)
print(f"Comprehensive content analysis saved to: {content_analysis_file}")

# Save error log if any errors occurred
if error_pages:
    error_df = pd.DataFrame(error_pages)
    error_file = PROCESSED_DATA_PATH / 'extraction_errors.csv'
    error_df.to_csv(error_file, index=False)
    print(f"Error log saved to: {error_file}")

# Create summary statistics for next notebooks
summary_stats = {
    'total_pages': len(content_df),
    'error_pages': len(error_pages),
    'tier_distribution': content_df['content_tier'].value_counts().to_dict(),
    'type_distribution': content_df['content_type'].value_counts().to_dict(),
    'quality_metrics': {
        'substantial_content_pages': int((content_df['char_count'] > 500).sum()),
        'high_pandas_pages': int((content_df['pandas_score'] > 5).sum()),
        'code_heavy_pages': int((content_df['code_score'] > 3).sum()),
        'quiz_potential_pages': int((content_df['quiz_potential'] > 2).sum())
    },
    'statistical_summary': {
        'avg_chars_per_page': float(content_df['char_count'].mean()),
        'avg_words_per_page': float(content_df['word_count'].mean()),
        'avg_pandas_score': float(content_df['pandas_score'].mean()),
        'avg_code_score': float(content_df['code_score'].mean())
    },
    'utilization_strategy': {
        'tier_1_primary': int(len(content_df[content_df['content_tier'] == 'tier_1_primary'])),
        'tier_2_secondary': int(len(content_df[content_df['content_tier'] == 'tier_2_secondary'])),
        'tier_3_reference': int(len(content_df[content_df['content_tier'] == 'tier_3_reference'])),
        'tier_4_context': int(len(content_df[content_df['content_tier'] == 'tier_4_context'])),
        'tier_5_minimal': int(len(content_df[content_df['content_tier'] == 'tier_5_minimal']))
    }
}

# Save summary statistics
summary_file = PROCESSED_DATA_PATH / 'analysis_summary.json'
with open(summary_file, 'w') as f:
    json.dump(summary_stats, f, indent=2)
print(f"Analysis summary saved to: {summary_file}")

# Display final summary
print(f"\nCOMPREHENSIVE ANALYSIS COMPLETE!")
print(f"=" * 50)
print(f"Results saved and ready for next stage processing")


# Quick verification of saved data
print(f"\nSaved files verification:")
print(f"  Content analysis: {content_analysis_file.exists()} ({content_analysis_file.stat().st_size / 1024:.1f} KB)")
print(f"  Summary stats: {summary_file.exists()} ({summary_file.stat().st_size / 1024:.1f} KB)")
if error_pages:
    print(f"  Error log: {error_file.exists()} ({error_file.stat().st_size / 1024:.1f} KB)")

Comprehensive content analysis saved to: c:\Users\MOHAMMED ZAKEER\Downloads\pandas_rag_project\data\processed\comprehensive_content_analysis.csv
Analysis summary saved to: c:\Users\MOHAMMED ZAKEER\Downloads\pandas_rag_project\data\processed\analysis_summary.json

COMPREHENSIVE ANALYSIS COMPLETE!
Results saved and ready for next stage processing

Saved files verification:
  Content analysis: True (127.1 KB)
  Summary stats: True (0.9 KB)
