In [1]:
# 01_data_exploration.ipynb - Chunk 1: Setup and PDF Loading

# Import required libraries
import PyPDF2
import pandas as pd
import numpy as np
import os
import re
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Set up paths
def resolve_project_root(cwd):
    """Return the project root for the given working directory.

    When the working directory itself is named ``notebooks`` the project
    root is its parent; otherwise the working directory is used as the root.

    Only the final path component is compared. A substring test on the whole
    path would also fire for unrelated ancestors (for example
    ``/home/me/notebooks_archive/project``) and select the wrong directory.

    Args:
        cwd: Working directory as a ``str`` or ``Path``.

    Returns:
        ``Path`` of the project root.
    """
    cwd = Path(cwd)
    return cwd.parent if cwd.name == 'notebooks' else cwd


PROJECT_ROOT = resolve_project_root(Path.cwd())
RAW_DATA_PATH = PROJECT_ROOT / 'data' / 'raw'
PDF_FILE = RAW_DATA_PATH / 'mastering_pandas_2025.pdf'

print("Project Structure Check:")
print(f"Project Root: {PROJECT_ROOT}")
print(f"Raw Data Path: {RAW_DATA_PATH}")
print(f"PDF File Path: {PDF_FILE}")
print(f"PDF File Exists: {PDF_FILE.exists()}")

# Basic file information
if PDF_FILE.exists():
    file_size = PDF_FILE.stat().st_size  # size in bytes
    print("\nFile Information:")
    print(f"File Size: {file_size / (1024*1024):.2f} MB")
    print(f"File Name: {PDF_FILE.name}")
else:
    # Only reported here; nothing is raised, so execution continues
    print("ERROR: PDF file not found!")

Project Structure Check:
Project Root: c:\Users\MOHAMMED ZAKEER\Downloads\pandas_rag_project
Raw Data Path: c:\Users\MOHAMMED ZAKEER\Downloads\pandas_rag_project\data\raw
PDF File Path: c:\Users\MOHAMMED ZAKEER\Downloads\pandas_rag_project\data\raw\mastering_pandas_2025.pdf
PDF File Exists: True

File Information:
File Size: 29.36 MB
File Name: mastering_pandas_2025.pdf


In [2]:
# Open the PDF once and report its page count plus any embedded metadata.
# Any failure while opening or parsing is reported as a single message.
try:
    with open(PDF_FILE, 'rb') as pdf_handle:
        pdf_reader = PyPDF2.PdfReader(pdf_handle)

        print(f"\nPDF Structure:")
        print(f"Total Pages: {len(pdf_reader.pages)}")

        # Metadata may be missing or empty, so check before iterating
        doc_info = pdf_reader.metadata
        if not doc_info:
            print("\nNo metadata available")
        else:
            print(f"\nPDF Metadata:")
            for key, value in doc_info.items():
                print(f"{key}: {value}")

except Exception as e:
    print(f"Error reading PDF: {e}")


PDF Structure:
Total Pages: 473

PDF Metadata:
/Author: Yildiz, Muslum
/CreationDate: D:20250520051054+00'00'
/Creator: calibre 7.16.0
/ModDate: D:20250520051054+00'00'
/Producer: calibre 7.16.0
/Title: MASTERING PANDAS: A Comprehensive Guide to Data Analysis in Python


In [3]:
# Extract sample pages to understand document structure
def extract_sample_pages(pdf_path, sample_pages=(0, 1, 2, 50, 100, 200, 300, 472)):
    """Extract text from selected pages to understand the document structure.

    Args:
        pdf_path: Path to the PDF file to open.
        sample_pages: Iterable of zero-based page indices to sample. Indices
            at or beyond the end of the document are skipped silently.

    Returns:
        dict mapping ``"Page_<index>"`` to the extracted text, or to the
        string ``"Extraction failed"`` when extracting that page raised.
    """
    samples = {}

    with open(pdf_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        # The page count does not change while sampling; read it once
        total_pages = len(pdf_reader.pages)

        for page_num in sample_pages:
            if page_num >= total_pages:
                continue
            try:
                text = pdf_reader.pages[page_num].extract_text()
                samples[f"Page_{page_num}"] = text
                print(f"Page {page_num} - Length: {len(text)} characters")
            except Exception:
                # Best effort per page, but let KeyboardInterrupt/SystemExit
                # through so the cell can still be interrupted.
                samples[f"Page_{page_num}"] = "Extraction failed"
                print(f"Page {page_num} - Extraction failed")

    return samples

# Pull the sample pages once; later cells reuse `page_samples`
page_samples = extract_sample_pages(PDF_FILE)

# Preview the opening pages: (heading, sample key, number of characters shown)
front_matter_previews = (
    ("\n=== FIRST PAGE CONTENT (First 500 chars) ===", 'Page_0', 500),
    ("\n=== SECOND PAGE CONTENT (First 500 chars) ===", 'Page_1', 500),
    ("\n=== TABLE OF CONTENTS AREA (Page 2, First 800 chars) ===", 'Page_2', 800),
)

for heading, sample_key, char_limit in front_matter_previews:
    print(heading)
    # Fall back to a placeholder when the page was not sampled
    print(page_samples.get(sample_key, 'Not available')[:char_limit])

Page 0 - Length: 0 characters
Page 1 - Length: 121 characters
Page 2 - Length: 452 characters
Page 50 - Length: 600 characters
Page 100 - Length: 644 characters
Page 200 - Length: 1401 characters
Page 300 - Length: 945 characters
Page 472 - Length: 1110 characters

=== FIRST PAGE CONTENT (First 500 chars) ===


=== SECOND PAGE CONTENT (First 500 chars) ===
MASTERING
PANDAS
A Comprehensive Guide to
Data Analysis in P ython
By Dr . Muslum Yildiz
Copyright © Muslum Y ildiz, 2024

=== TABLE OF CONTENTS AREA (Page 2, First 800 chars) ===
All rights r eserved.
No part of this book may be r eproduced, distributed,
or transmitted in any for m or by any means without
the prior written per mission of the author, e xcept in
the case of brief quotations embodied in r eviews and
certain other noncommer cial uses per mitted by
copyright law .
To my one true love, my wife,
and to my two precious
daughters—thank you for being
my greatest adventure, my
constant joy, and my endless
inspiration.


In [4]:
# Chunk 3: Content Pattern Analysis

# Find Table of Contents and Chapter Structure
def find_toc_and_chapters(pdf_path, start_page=3, end_page=20):
    """Print a preview of each scanned page that looks like TOC or chapter material.

    Pages from ``start_page`` up to, but not including, ``end_page`` are
    scanned, capped at the number of pages in the document. A page is
    reported when its lower-cased text contains 'contents', 'chapter' or
    'introduction'. Output goes to stdout; nothing is returned.
    """
    toc_keywords = ('contents', 'chapter', 'introduction')

    with open(pdf_path, 'rb') as pdf_handle:
        reader = PyPDF2.PdfReader(pdf_handle)
        stop_page = min(end_page, len(reader.pages))

        for page_num in range(start_page, stop_page):
            text = reader.pages[page_num].extract_text()
            lowered = text.lower()

            # Skip pages with none of the TOC/chapter indicators
            if not any(keyword in lowered for keyword in toc_keywords):
                continue

            print(f"\n=== PAGE {page_num} - POTENTIAL TOC/CHAPTER ===")
            print(f"Length: {len(text)} chars")
            print("Content preview:")
            print(text[:600])
            print("---")

# Analyze text patterns in content pages
def analyze_content_patterns(samples):
    """Print simple size, code and pandas-vocabulary counts for each sampled page.

    ``samples`` maps a page label to its extracted text. Entries whose text
    is empty/missing or no longer than 200 characters are skipped. For the
    rest, the line count, word count, the number of occurrences of
    'import ', 'pd.' and 'df.', and the number of case-insensitive
    occurrences of a few pandas terms are printed. Nothing is returned.
    """
    code_markers = ('import ', 'pd.', 'df.')
    pandas_vocabulary = ('dataframe', 'series', 'pandas', 'groupby', 'merge')

    print("\n=== CONTENT ANALYSIS ===")
    for page_name, content in samples.items():
        # Only substantial pages are worth profiling
        if not content or len(content) <= 200:
            continue

        lines = content.split('\n')
        words = content.split()

        # Case-sensitive substring counts as a rough signal for code
        code_indicators = sum(content.count(marker) for marker in code_markers)

        # Vocabulary is matched case-insensitively
        lowered = content.lower()
        pandas_terms = sum(lowered.count(term) for term in pandas_vocabulary)

        print(f"\n{page_name}:")
        print(f"  Lines: {len(lines)}, Words: {len(words)}")
        print(f"  Code indicators: {code_indicators}")
        print(f"  Pandas terms: {pandas_terms}")

# Run analyses: scan the default page range of the PDF for TOC/chapter pages,
# then profile the pages already sampled into `page_samples`
find_toc_and_chapters(PDF_FILE)
analyze_content_patterns(page_samples)


=== PAGE 4 - POTENTIAL TOC/CHAPTER ===
Length: 420 chars
Content preview:
TABLE OF CONTENTS
TABLE OF
CONTENTS………………………………………………………
…………………………4          
PREF ACE…………………………………………………………
………………………………………….7        
INTRODUCTION…………………………………………………
………………………………………9          
CHAPTER 1 :
INTRODUCTION TO
PANDAS………………………………………….………………
………...11                 
CHAPTER 2 :
WHY USE PANDAS? IMPOR TANCE IN DATA
ANAL YSIS……………………………20  
CHAPTER 3 :
INSTALLING AND SETTING UP
PANDAS………………………….……………………….26
---

=== PAGE 5 - POTENTIAL TOC/CHAPTER ===
Length: 410 chars
Content preview:
CHAPTER 4 :
DATA STRUCTURES IN PANDAS: SERIE S AND
DATAFRAMES…………….……30
CHAPTER 5 :
INDEXING AND SELECTION
TECHNIQUES…………………………….…………..……50
CHAPTER 6 :
PANDAS DATA TYPES AND
CONVERSIONS………………………………………………80
 
CHAPTER 7 :
HANDLING MISSING DATA IN
PANDAS………………………………………..…..……100  
CHAPTER 8 :
WORKING WITH TEXT
DATA………………………………………….……………………
…128
CHAPTER 9 :
PANDAS DATE AND TIME
HANDLING………………………………………………….…
152
---

=== PAGE 

In [5]:
# Chunk 4: Complete Structure Mapping

# Extract complete Table of Contents
def extract_complete_toc(pdf_path, start_page=4, end_page=9):
    """Concatenate the extracted text of the pages expected to hold the TOC.

    Args:
        pdf_path: Path to the PDF file to open.
        start_page: Zero-based index of the first page to read (default 4).
        end_page: Zero-based index one past the last page to read
            (default 9, i.e. pages 4-8). Capped at the document length.

    Returns:
        One string containing the text of every page in the range, each
        followed by a newline. Empty string when the range holds no pages.
    """
    with open(pdf_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        # Clamp to the document so short PDFs do not need a per-page check
        last_page = min(end_page, len(pdf_reader.pages))

        # Join once instead of growing a string page by page
        return "".join(
            pdf_reader.pages[page_num].extract_text() + "\n"
            for page_num in range(start_page, last_page)
        )

# Sample actual chapter content
def sample_chapter_content(pdf_path, chapter_pages=[11, 25, 60, 120, 250]):
    """Print the opening lines of selected pages to inspect chapter layout.

    For every index in ``chapter_pages`` that exists in the document, a
    header and the page's text length are printed, followed by the non-blank
    lines among the first 15 lines. Each printed line is stripped, cut to 80
    characters and prefixed with its 1-based position on the page.
    Nothing is returned.
    """
    with open(pdf_path, 'rb') as pdf_handle:
        reader = PyPDF2.PdfReader(pdf_handle)

        for page_num in chapter_pages:
            # Indices past the end of the document are ignored
            if page_num >= len(reader.pages):
                continue

            text = reader.pages[page_num].extract_text()

            print(f"\n=== CHAPTER CONTENT - PAGE {page_num} ===")
            print(f"Length: {len(text)} characters")

            # Blank lines are not printed but still count towards numbering
            for i, line in enumerate(text.split('\n')[:15]):
                if not line.strip():
                    continue
                print(f"{i+1:2d}: {line.strip()[:80]}")

# Dump the full table of contents, then preview a few chapter pages
print("=== COMPLETE TABLE OF CONTENTS ===")
full_toc = extract_complete_toc(PDF_FILE)
print(full_toc)

print("\n" + "="*60)
sample_chapter_content(PDF_FILE)

# Rough extraction-quality heuristics on the page sampled at index 200
print("\n=== TEXT QUALITY ASSESSMENT ===")
sample_text = page_samples.get('Page_200', '')
if sample_text:
    # Occurrences of two consecutive spaces (non-overlapping)
    spacing_issues = sample_text.count('  ')
    # Single alphabetic characters standing alone; genuine one-letter
    # words such as "a" are counted too, so this is only an upper bound
    broken_words = sum(
        1
        for word in sample_text.split()
        if len(word) == 1 and word.isalpha()
    )

    print(f"Sample from Page 200:")
    print(f"  Multiple spaces found: {spacing_issues}")
    print(f"  Potential broken words: {broken_words}")
    print(f"  First 200 chars: {sample_text[:200]}")

=== COMPLETE TABLE OF CONTENTS ===
TABLE OF CONTENTS
TABLE OF
CONTENTS………………………………………………………
…………………………4          
PREF ACE…………………………………………………………
………………………………………….7        
INTRODUCTION…………………………………………………
………………………………………9          
CHAPTER 1 :
INTRODUCTION TO
PANDAS………………………………………….………………
………...11                 
CHAPTER 2 :
WHY USE PANDAS? IMPOR TANCE IN DATA
ANAL YSIS……………………………20  
CHAPTER 3 :
INSTALLING AND SETTING UP
PANDAS………………………….……………………….26
CHAPTER 4 :
DATA STRUCTURES IN PANDAS: SERIE S AND
DATAFRAMES…………….……30
CHAPTER 5 :
INDEXING AND SELECTION
TECHNIQUES…………………………….…………..……50
CHAPTER 6 :
PANDAS DATA TYPES AND
CONVERSIONS………………………………………………80
 
CHAPTER 7 :
HANDLING MISSING DATA IN
PANDAS………………………………………..…..……100  
CHAPTER 8 :
WORKING WITH TEXT
DATA………………………………………….……………………
…128
CHAPTER 9 :
PANDAS DATE AND TIME
HANDLING………………………………………………….…
152
CHAPTER 10 :
MASTERING DATA IMPOR T AND EXPOR T IN PANDAS
FOR AI……….………163
CHAPTER 11 :   
ESSENTIAL DATA EXPL ORATION TECHNIQUES IN


In [None]:
# Document Analysis Summary
# Structure:

# 473-page comprehensive pandas guide
# 26 chapters covering basics to advanced topics (time series, performance, visualization)
# Clear hierarchical organization with logical progression
# Chapter 1 starts page 11, substantial content throughout

# Content Quality:

# Good pandas-specific content with code examples
# Some spacing issues from PDF extraction, e.g. split words such as "P ython" and "PREF ACE" (manageable)
# Mixed explanatory text and code blocks
# High information density in content pages

# Key Findings for Chunking:

# Section-based chunking will work best (follow chapter/subsection boundaries)
# Code-context preservation needed (keep examples with explanations)
# Text preprocessing required to fix spacing issues
# Hierarchical metadata available (chapter numbers, topics)

# Chunking Strategy Implications
# Based on this analysis, our optimal approach will be:

# Clean text during extraction (fix spacing)
# Detect section boundaries (chapters, subsections)
# Preserve code blocks with their context
# Add metadata (chapter, topic, page numbers)
# Use variable chunk sizes based on logical content units

# The document structure is ideal for semantic chunking rather than fixed-size chunking.
