In [3]:
# 01_data_exploration.ipynb - Complete Document Analysis

import PyPDF2
import pandas as pd
import numpy as np
import os
import re
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Setup paths
PROJECT_ROOT = Path.cwd().parent if 'notebooks' in str(Path.cwd()) else Path.cwd()
RAW_DATA_PATH = PROJECT_ROOT / 'data' / 'raw'
PDF_FILE = RAW_DATA_PATH / 'mastering_pandas_2025.pdf'

print("Project Structure Check:")
print(f"Project Root: {PROJECT_ROOT}")
print(f"PDF File Path: {PDF_FILE}")
print(f"PDF File Exists: {PDF_FILE.exists()}")

if PDF_FILE.exists():
    file_size = PDF_FILE.stat().st_size
    print(f"File Size: {file_size / (1024*1024):.2f} MB")
else:
    print("ERROR: PDF file not found!")
    exit()

Project Structure Check:
Project Root: c:\Users\MOHAMMED ZAKEER\Downloads\pandas_rag_project
PDF File Path: c:\Users\MOHAMMED ZAKEER\Downloads\pandas_rag_project\data\raw\mastering_pandas_2025.pdf
PDF File Exists: True
File Size: 29.36 MB


In [4]:
# PDF Structure Analysis
with open(PDF_FILE, 'rb') as file:
    pdf_reader = PyPDF2.PdfReader(file)
    total_pages = len(pdf_reader.pages)
    print(f"\nPDF Structure:")
    print(f"Total Pages: {total_pages}")
    
    if pdf_reader.metadata:
        print(f"\nMetadata:")
        for key, value in pdf_reader.metadata.items():
            print(f"{key}: {value}")


PDF Structure:
Total Pages: 473

Metadata:
/Author: Yildiz, Muslum
/CreationDate: D:20250520051054+00'00'
/Creator: calibre 7.16.0
/ModDate: D:20250520051054+00'00'
/Producer: calibre 7.16.0
/Title: MASTERING PANDAS: A Comprehensive Guide to Data Analysis in Python


In [6]:
# Content Quality Analysis Function
def analyze_page_content(pdf_path, sample_size=50):
    """Analyze content quality across entire document"""
    
    content_analysis = []
    
    with open(pdf_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        total_pages = len(pdf_reader.pages)
        
        # Sample pages evenly across document
        if sample_size > total_pages:
            sample_pages = list(range(total_pages))
        else:
            step = total_pages // sample_size
            sample_pages = list(range(0, total_pages, step))[:sample_size]
        
        for page_num in sample_pages:
            try:
                text = pdf_reader.pages[page_num].extract_text()
                
                if text.strip():
                    # Content quality metrics
                    char_count = len(text)
                    word_count = len(text.split())
                    line_count = len(text.split('\n'))
                    
                    # Technical content indicators
                    code_patterns = ['import ', 'pd.', 'df.', 'print(', '=', 'def ', 'class ']
                    code_score = sum(text.count(pattern) for pattern in code_patterns)
                    
                    # Pandas-specific content
                    pandas_terms = ['DataFrame', 'Series', 'pandas', 'groupby', 'merge', 'concat']
                    pandas_score = sum(text.lower().count(term.lower()) for term in pandas_terms)
                    
                    # Content type classification
                    if char_count < 200:
                        content_type = 'minimal'
                    elif 'TABLE OF CONTENTS' in text.upper() or 'CHAPTER' in text.upper()[:100]:
                        content_type = 'navigation'
                    elif code_score > 3:
                        content_type = 'code_heavy'
                    elif pandas_score > 2:
                        content_type = 'conceptual'
                    else:
                        content_type = 'general'
                    
                    content_analysis.append({
                        'page': page_num,
                        'char_count': char_count,
                        'word_count': word_count,
                        'line_count': line_count,
                        'code_score': code_score,
                        'pandas_score': pandas_score,
                        'content_type': content_type,
                        'text_preview': text[:200].replace('\n', ' ')
                    })
                    
            except Exception as e:
                print(f"Error processing page {page_num}: {e}")
    
    return pd.DataFrame(content_analysis)

# Run comprehensive content analysis
print(f"\nAnalyzing content quality across {total_pages} pages...")
content_df = analyze_page_content(PDF_FILE, sample_size=100)

print(f"\nContent Analysis Results:")
print(f"Analyzed {len(content_df)} pages")
print(f"\nContent Type Distribution:")
print(content_df['content_type'].value_counts())

print(f"\nContent Quality Statistics:")
print(f"Average words per page: {content_df['word_count'].mean():.1f}")
print(f"Pages with substantial content (>500 chars): {(content_df['char_count'] > 500).sum()}")
print(f"Pages with code content: {(content_df['code_score'] > 0).sum()}")
print(f"Pages with pandas content: {(content_df['pandas_score'] > 0).sum()}")



Analyzing content quality across 473 pages...

Content Analysis Results:
Analyzed 99 pages

Content Type Distribution:
content_type
general       61
conceptual    25
code_heavy     7
navigation     5
minimal        1
Name: count, dtype: int64

Content Quality Statistics:
Average words per page: 131.1
Pages with substantial content (>500 chars): 79
Pages with code content: 25
Pages with pandas content: 74


In [7]:
# Identify valuable content regions
substantial_pages = content_df[content_df['char_count'] > 500]
conceptual_pages = content_df[content_df['content_type'] == 'conceptual']
code_pages = content_df[content_df['content_type'] == 'code_heavy']

print(f"\nValuable Content Regions:")
print(f"Substantial content pages: {len(substantial_pages)}")
print(f"Conceptual content pages: {len(conceptual_pages)}")
print(f"Code-heavy pages: {len(code_pages)}")

if len(conceptual_pages) > 0:
    print(f"Conceptual content range: pages {conceptual_pages['page'].min()}-{conceptual_pages['page'].max()}")
if len(code_pages) > 0:
    print(f"Code content range: pages {code_pages['page'].min()}-{code_pages['page'].max()}")

# Document structure detection
def detect_document_structure(pdf_path):
    """Detect chapters, sections, and document organization"""
    
    structure_info = []
    
    with open(pdf_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        
        for page_num in range(min(50, len(pdf_reader.pages))):  # First 50 pages for structure
            try:
                text = pdf_reader.pages[page_num].extract_text()
                
                # Look for structural elements
                if 'CHAPTER' in text.upper():
                    chapters = re.findall(r'CHAPTER\s+(\d+)', text.upper())
                    for chapter in chapters:
                        structure_info.append({
                            'page': page_num,
                            'type': 'chapter',
                            'number': chapter,
                            'context': text[:300]
                        })
                
                if 'TABLE OF CONTENTS' in text.upper():
                    structure_info.append({
                        'page': page_num,
                        'type': 'toc',
                        'number': None,
                        'context': text[:500]
                    })
                    
            except Exception as e:
                continue
    
    return structure_info

print(f"\nDocument Structure Analysis:")
structure = detect_document_structure(PDF_FILE)

for item in structure:
    if item['type'] == 'chapter':
        print(f"Chapter {item['number']} found on page {item['page']}")
    elif item['type'] == 'toc':
        print(f"Table of Contents found on page {item['page']}")

# Content extraction recommendations
print(f"\nContent Extraction Recommendations:")

# Find the main content start
main_content_start = 11  # Default from your analysis
if len(conceptual_pages) > 0:
    main_content_start = min(conceptual_pages['page'].min(), main_content_start)

# Find content-rich regions
content_rich_pages = content_df[
    (content_df['char_count'] > 800) & 
    (content_df['content_type'].isin(['conceptual', 'code_heavy']))
]['page'].tolist()

print(f"Recommended content extraction range: pages {main_content_start} to {total_pages}")
print(f"High-value pages identified: {len(content_rich_pages)}")
print(f"Estimated valuable content: {len(content_rich_pages)/total_pages*100:.1f}% of document")

# Save analysis results
output_dir = PROJECT_ROOT / 'data' / 'processed'
output_dir.mkdir(exist_ok=True)

content_df.to_csv(output_dir / 'content_analysis.csv', index=False)
print(f"\nContent analysis saved to: {output_dir / 'content_analysis.csv'}")

# Text quality assessment
sample_pages = content_df[content_df['content_type'] == 'conceptual'].head(3)

print(f"\nText Quality Assessment:")
for _, page in sample_pages.iterrows():
    print(f"\nPage {page['page']} ({page['content_type']}):")
    print(f"  Length: {page['char_count']} chars, {page['word_count']} words")
    print(f"  Preview: {page['text_preview']}")

print(f"\nData exploration complete. Ready for chunking strategy development.")


Valuable Content Regions:
Substantial content pages: 79
Conceptual content pages: 25
Code-heavy pages: 7
Conceptual content range: pages 8-364
Code content range: pages 36-196

Document Structure Analysis:
Chapter 1 found on page 4
Chapter 2 found on page 4
Chapter 3 found on page 4
Table of Contents found on page 4
Chapter 4 found on page 5
Chapter 5 found on page 5
Chapter 6 found on page 5
Chapter 7 found on page 5
Chapter 8 found on page 5
Chapter 9 found on page 5
Chapter 10 found on page 6
Chapter 11 found on page 6
Chapter 12 found on page 6
Chapter 13 found on page 6
Chapter 14 found on page 6
Chapter 15 found on page 6
Chapter 16 found on page 6
Chapter 17 found on page 7
Chapter 18 found on page 7
Chapter 19 found on page 7
Chapter 20 found on page 7
Chapter 1 found on page 12
Chapter 1 found on page 17
Chapter 2 found on page 18
Chapter 3 found on page 18
Chapter 4 found on page 18
Chapter 5 found on page 18
Chapter 6 found on page 18
Chapter 7 found on page 18
Chapter 8 fo