In [None]:
!pip install PyPDF2

In [1]:
import PyPDF2
from pathlib import Path
import io

def read_pdf_text(pdf_path, page_range=None):
    """
    Extract text from a PDF with PyPDF2, labelling the output page by page.

    Every page in the selected range contributes a "--- Page N ---" marker.
    Pages that yield no text, or whose extraction raises, get a marker that
    says so instead of page content.

    Args:
        pdf_path (str): Location of the PDF file on disk
        page_range (tuple, optional): (start_page, end_page), 1-indexed and
            inclusive. Resolved through _get_page_range; None selects all pages

    Returns:
        str: The stripped, page-marked text, or None when the file is missing,
        has no pages, cannot be read, or produces an empty result
    """
    if not Path(pdf_path).exists():
        print(f"Error: File '{pdf_path}' not found")
        return None

    sections = []

    try:
        with open(pdf_path, 'rb') as handle:
            reader = PyPDF2.PdfReader(handle)
            page_total = len(reader.pages)

            if page_total == 0:
                print("Error: PDF has no pages")
                return None

            first, last = _get_page_range(page_range, page_total)

            print(f"Extracting text from pages {first} to {last} of {page_total} total pages...")

            # `number` is the 1-based page number shown to the user;
            # reader.pages is 0-based, hence the `- 1` on lookup.
            for number in range(first, last + 1):
                try:
                    content = reader.pages[number - 1].extract_text()

                    if content.strip():
                        section = f"\n--- Page {number} ---\n" + content + "\n"
                    else:
                        section = f"\n--- Page {number} (No extractable text) ---\n"

                except Exception as e:
                    # A single bad page should not abort the whole document.
                    print(f"Error extracting text from page {number}: {e}")
                    section = f"\n--- Page {number} (Error during extraction) ---\n"

                sections.append(section)

    except Exception as e:
        print(f"Error reading PDF: {e}")
        return None

    combined = "".join(sections).strip()
    return combined if combined else None

def read_pdf_text_simple(pdf_path):
    """
    Pull all text out of a PDF as one string, with no page markers.

    Pages whose extraction returns nothing are skipped; the remaining page
    texts are separated by a newline.

    Args:
        pdf_path (str): Location of the PDF file on disk

    Returns:
        str: The stripped, concatenated text, or None when the file is
        missing, cannot be read, or contains no extractable text
    """
    if not Path(pdf_path).exists():
        print(f"Error: File '{pdf_path}' not found")
        return None

    try:
        with open(pdf_path, 'rb') as handle:
            reader = PyPDF2.PdfReader(handle)
            extracted = (page.extract_text() for page in reader.pages)
            # Materialise inside the `with` so the file is still open
            # while pages are being read.
            page_texts = [chunk for chunk in extracted if chunk]

        combined = "\n".join(page_texts).strip()
        return combined or None

    except Exception as e:
        print(f"Error reading PDF: {e}")
        return None

def _get_page_range(page_range, total_pages):
    """Helper function to determine start and end pages."""
    if page_range is None:
        return 1, total_pages
    
    start_page, end_page = page_range
    start_page = max(1, start_page)
    end_page = min(total_pages, end_page)
    
    if start_page > end_page:
        print(f"Warning: Invalid page range ({start_page}, {end_page}). Using full document.")
        return 1, total_pages
    
    return start_page, end_page

def get_pdf_info(pdf_path):
    """
    Collect summary details about a PDF file via PyPDF2.

    The result always carries 'total_pages', 'file_size' (bytes on disk) and
    'encrypted'. When the document has metadata, the title, author, subject,
    creator, producer, creation date and modification date are added, each
    defaulting to 'N/A' if absent; otherwise a single 'metadata' entry notes
    that none is available.

    Args:
        pdf_path (str): Location of the PDF file on disk

    Returns:
        dict: The collected details, or None if the file is missing or
        cannot be read
    """
    if not Path(pdf_path).exists():
        print(f"Error: File '{pdf_path}' not found")
        return None

    # Output key -> key used in the PDF's metadata dictionary.
    metadata_fields = (
        ('title', '/Title'),
        ('author', '/Author'),
        ('subject', '/Subject'),
        ('creator', '/Creator'),
        ('producer', '/Producer'),
        ('creation_date', '/CreationDate'),
        ('modification_date', '/ModDate'),
    )

    try:
        with open(pdf_path, 'rb') as handle:
            reader = PyPDF2.PdfReader(handle)

            details = {}
            details['total_pages'] = len(reader.pages)
            details['file_size'] = Path(pdf_path).stat().st_size
            details['encrypted'] = reader.is_encrypted

            document_meta = reader.metadata
            if not document_meta:
                details['metadata'] = 'No metadata available'
                return details

            for label, pdf_key in metadata_fields:
                details[label] = document_meta.get(pdf_key, 'N/A')

            return details

    except Exception as e:
        print(f"Error getting PDF info: {e}")
        return None

def save_text_to_file(text, output_path):
    """
    Write extracted text to a UTF-8 encoded file.

    Args:
        text (str): Content to write; empty or None means nothing is written
        output_path (str): Destination path of the text file

    Returns:
        bool: True when the file was written, False when there was no text
        or the write failed
    """
    if not text:
        print("No text to save")
        return False

    try:
        with open(output_path, 'w', encoding='utf-8') as destination:
            destination.write(text)
    except Exception as e:
        print(f"Error saving text file: {e}")
        return False
    else:
        print(f"Text saved to: {output_path}")
        return True


def count_pages(pdf_path):
    """
    Return how many pages a PDF contains.

    Args:
        pdf_path (str): Location of the PDF file on disk

    Returns:
        int: The page count, or None if the file is missing or unreadable
    """
    pdf_exists = Path(pdf_path).exists()
    if not pdf_exists:
        print(f"Error: File '{pdf_path}' not found")
        return None

    try:
        with open(pdf_path, 'rb') as handle:
            return len(PyPDF2.PdfReader(handle).pages)
    except Exception as e:
        print(f"Error counting pages: {e}")
        return None

# Demonstration run: exercises each helper against a sample PDF
if __name__ == "__main__":
    # Point this at the PDF you want to inspect.
    pdf_file = "Recommended_List_Under_ATAL_OFFLINE_FDPs_2025_26.pdf"
    # Other samples that were tried:
    #   "sample_with_1_page.pdf" - did not work, probably because the PDF
    #                              was created using the print command
    #   "sample-tables.pdf"

    banner = "=" * 60
    print(banner)
    print("PDF TEXT READER USING PyPDF2")
    print(banner)

    # Step 1: metadata and basic facts about the file
    print("\n1. PDF Information:")
    print("-" * 30)
    info = get_pdf_info(pdf_file)
    if info:
        for key, value in info.items():
            # File size reads better with thousands separators and a unit.
            shown = f"{value:,} bytes" if key == 'file_size' else value
            print(f"  {key}: {shown}")

    # Independent page count, skipped when it is None or zero
    page_count = count_pages(pdf_file)
    if page_count:
        print(f"\nQuick page count: {page_count} pages")

    # Step 2: full-document extraction with page markers
    print("\n2. Extracting All Text:")
    print("-" * 30)
    text = read_pdf_text(pdf_file)

    if not text:
        print("No text could be extracted from the PDF")
    else:
        print(f"Successfully extracted {len(text)} characters")
        print(f"Word count (approximate): {len(text.split())}")

        # Single-line preview of the opening 300 characters
        print("\nFirst 300 characters:")
        print("-" * 40)
        preview = text[:300].replace('\n', ' ').strip()
        ellipsis = '...' if len(text) > 300 else ''
        print(preview + ellipsis)

        # Persist the complete text next to the script
        output_file = "extracted_text.txt"
        if save_text_to_file(text, output_file):
            print(f"\nFull text saved to: {output_file}")
    

PDF TEXT READER USING PyPDF2

1. PDF Information:
------------------------------
  total_pages: 23
  file_size: 833,978 bytes
  encrypted: False
  title: Approved List Under ATAL OFFLINE FDPs 2025-26
  author: N/A
  subject: N/A
  creator: Google Sheets
  producer: N/A
  creation_date: N/A
  modification_date: N/A

Quick page count: 23 pages

2. Extracting All Text:
------------------------------
Extracting text from pages 1 to 23 of 23 total pages...
Successfully extracted 121160 characters
Word count (approximate): 13242

First 300 characters:
----------------------------------------
--- Page 1 --- 1ALL India council for Technical Education (AICTE) Training and Learning (ATAL)  Face to Face (Offline)  Faculty Development Programme 2025-26  Sanctioned List Sr NoFDP  Application  NumberCoordinator  NameCocoordinator  NameInstitute Name Institute State Institute DistrictInstitute T...
Text saved to: extracted_text.txt

Full text saved to: extracted_text.txt
