In [9]:
# Import necessary libraries and define a custom warning filter
import warnings

# Define a custom filter function
def custom_warning_filter(message, category, filename, lineno, file=None, line=None):
    """Drop-in replacement for ``warnings.showwarning`` that hides one Camelot warning.

    Warnings whose text contains "camelot only works on text-based pages" are
    discarded. Every other warning is formatted with ``warnings.formatwarning``
    and written to ``file`` (or ``sys.stderr`` when ``file`` is None).

    A ``showwarning`` hook is responsible for the output itself; merely
    returning a value displays nothing, which would hide every warning.

    Args:
        message: Warning instance or message text.
        category: Warning class (for example ``UserWarning``).
        filename: File in which the warning was raised.
        lineno: Line number at which the warning was raised.
        file: Optional writable stream; defaults to ``sys.stderr``.
        line: Optional source line to include in the formatted text.

    Returns:
        None.
    """
    if "camelot only works on text-based pages" in str(message):
        return

    import sys

    stream = file if file is not None else sys.stderr
    if stream is None:
        # No usable stream (stderr can be None in some embedded environments).
        return
    try:
        stream.write(warnings.formatwarning(message, category, filename, lineno, line))
    except OSError:
        # Reporting a warning must never crash the caller if the stream is unusable.
        pass

# Install the hook: the warnings module calls ``warnings.showwarning`` to
# display a warning, so replacing it routes displayed warnings through
# custom_warning_filter.
warnings.showwarning = custom_warning_filter

# Smoke test: emit a warning whose text matches the substring checked in
# custom_warning_filter; nothing should be printed for it.
warnings.warn("camelot only works on text-based pages", UserWarning)

In [38]:
# Import necessary libraries for PDF processing and extraction
import os
import logging
import time
import json
from pathlib import Path
from datetime import datetime
from typing import Dict, List, Optional, Tuple, Union
from dataclasses import dataclass, asdict
from enum import Enum

# Table extraction (pdfplumber) and OCR (pytesseract) libraries
import pdfplumber
import pytesseract
from pytesseract import Output

# Core PDF processing libraries
import pymupdf
import camelot
import pandas as pd
from PIL import Image
import io
# from unstructured.partition.pdf import partition_pdf
from langchain.document_loaders import PyPDFLoader

# Define enums and data classes for extraction methods and statistics
class ExtractionMethod(Enum):
    """Names of the extraction backends; each value is the lowercase backend name."""

    PYMUPDF = "pymupdf"
    # UNSTRUCTURED = "unstructured"  # backend currently disabled
    CAMELOT = "camelot"
    PYPDF = "pypdf"
    PDFPLUMBER = "pdfplumber"


@dataclass
class ExtractionStats:
    """Outcome and performance record of a single extraction run."""

    # Name of the backend that performed the extraction.
    method: str
    # Wall-clock timestamps taken before and after the run.
    start_time: datetime
    end_time: datetime
    # Elapsed time in seconds.
    execution_time: float
    # Memory figure in megabytes.
    memory_usage_mb: float
    # Page count and number of extracted items reported for the run.
    num_pages: int
    items_extracted: int
    # True when the run finished without raising.
    success: bool
    # Error text for a failed run; None otherwise.
    error_message: Optional[str] = None
    # Free-form per-item details.
    additional_info: Optional[Dict] = None

# Define the main PDFExtractor class
class PDFExtractor:
    """
    PDF Extractor with multiple extraction methods
    
    Features:
    - Multiple text extraction methods (PyMuPDF, PyPDF via LangChain)
    - Table extraction (Camelot, pdfplumber)
    - Image extraction (PyMuPDF) with Tesseract OCR of each extracted image
    - Comprehensive statistics and metadata
    - Experiment logging
    """
    
    def __init__(
        self,
        output_dir: str = "extracted_content",
        extract_text_method: str = "pymupdf",
        extract_tables_method: str = "camelot",
        extract_images_method: str = "pymupdf",
        save_metadata: bool = True,
        log_level: str = "INFO",
        experiment_name: Optional[str] = None
    ):
        # Setup logging
        self.setup_logging(log_level)
        
        # Initialize parameters
        self.output_dir = Path(output_dir)
        self.save_metadata = save_metadata
        self.experiment_name = experiment_name or datetime.now().strftime("%Y%m%d_%H%M%S")
        
        # Validate and set extraction methods
        self.methods = {
            'text': self._validate_method(
                extract_text_method, 
                ['pymupdf', 'pypdf']
            ),
            'tables': self._validate_method(
                extract_tables_method, 
                ['camelot', 'pdfplumber']
            ),
            'images': self._validate_method(
                extract_images_method, 
                ['pymupdf']
            )
        }
        
        # Setup directory structure
        self.setup_directories()
        
        # Initialize statistics
        self.current_stats = {}

    def setup_logging(self, log_level: str):
        """Configure logging"""
        self.logger = logging.getLogger(self.__class__.__name__)
        self.logger.setLevel(log_level)
        
        if not self.logger.handlers:
            handler = logging.StreamHandler()
            formatter = logging.Formatter(
                '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
            )
            handler.setFormatter(formatter)
            self.logger.addHandler(handler)

    def setup_directories(self):
        """Create output directories"""
        self.dirs = {
            'text': self.output_dir / 'text',
            'tables': self.output_dir / 'tables',
            'images': self.output_dir / 'images',
            'metadata': self.output_dir / 'metadata',
            'experiments': self.output_dir / 'experiments'
        }
        
        for path in self.dirs.values():
            path.mkdir(parents=True, exist_ok=True)

    def _validate_method(self, method: str, valid_methods: List[str]) -> str:
        """Validate extraction method"""
        method = method.lower()
        if method not in valid_methods:
            raise ValueError(
                f"Invalid method '{method}'. Valid options are: {valid_methods}"
            )
        return method

    def _get_memory_usage(self) -> float:
        """Return the resident set size of the current process in megabytes."""
        import psutil

        rss_bytes = psutil.Process(os.getpid()).memory_info().rss
        rss_mb = rss_bytes / 1024 / 1024
        # Clamp at zero, as before.
        return max(rss_mb, 0)
    
    # Add this new function to save metadata
    def save_structured_metadata(self, extraction_type: str, page_num: int, item_num: int, metadata: Dict):
        """Save structured metadata for each extracted element"""
        metadata_file = self.dirs['metadata'] / f'{extraction_type}_metadata.json'
        
        # Load existing metadata if file exists
        if metadata_file.exists():
            with open(metadata_file, 'r') as f:
                all_metadata = json.load(f)
        else:
            all_metadata = {}
        
        # Create unique identifier for the item
        item_id = f"page_{page_num}_item_{item_num}"
        all_metadata[item_id] = {
            'extraction_type': extraction_type,
            'page_number': page_num,
            'item_number': item_num,
            'timestamp': datetime.now().isoformat(),
            **metadata
        }
        
        # Save updated metadata
        with open(metadata_file, 'w') as f:
            json.dump(all_metadata, f, indent=2)

    def extract_text(self, pdf_path: str) -> Tuple[List[Dict], ExtractionStats]:
        """
        Extract text page by page using the configured method.

        Each page's text is written to ``page_<n>.txt`` (1-based) in the text
        output folder. Any exception is logged and reported through the
        returned statistics instead of being raised.

        Args:
            pdf_path: Path of the PDF to read.

        Returns:
            Tuple containing:
            - List of dictionaries with extracted text and metadata (one per
              page; empty if extraction failed)
            - ExtractionStats with performance metrics; also stored in
              ``self.current_stats['text']``
        """
        method = self.methods['text']
        start_time = datetime.now()
        start_mem = self._get_memory_usage()

        try:
            docs = []
            additional_info = {}
            num_pages = 0

            if method == 'pymupdf':
                # The context manager closes the document even if a page fails,
                # so the handle on the PDF is not leaked.
                with pymupdf.open(pdf_path) as doc:
                    num_pages = doc.page_count

                    for page_num in range(num_pages):
                        page = doc[page_num]
                        text_dict = page.get_text("dict")
                        text = page.get_text()

                        # Save text
                        output_path = self.dirs['text'] / f'page_{page_num + 1}.txt'
                        with open(output_path, 'w', encoding='utf-8') as f:
                            f.write(text)

                        # Save page metadata
                        docs.append({
                            'content': text,
                            'metadata': {
                                'page': page_num + 1,
                                'method': method,
                                'blocks': text_dict.get('blocks', []),
                                'file_path': str(output_path)
                            }
                        })

                        # Collect additional information
                        additional_info[f'page_{page_num + 1}'] = {
                            'word_count': len(text.split()),
                            'char_count': len(text)
                        }

            elif method == 'pypdf':
                loader = PyPDFLoader(pdf_path)
                langchain_docs = loader.load()
                num_pages = len(langchain_docs)

                for page_num, page_doc in enumerate(langchain_docs):
                    output_path = self.dirs['text'] / f'page_{page_num + 1}.txt'
                    with open(output_path, 'w', encoding='utf-8') as f:
                        f.write(page_doc.page_content)

                    docs.append({
                        'content': page_doc.page_content,
                        'metadata': {
                            'page': page_num + 1,
                            'method': method,
                            'source': page_doc.metadata.get('source'),
                            'file_path': str(output_path),
                            'langchain_metadata': page_doc.metadata
                        }
                    })

                    additional_info[f'page_{page_num + 1}'] = {
                        'word_count': len(page_doc.page_content.split()),
                        'char_count': len(page_doc.page_content)
                    }

            # NOTE: an 'unstructured' backend (partition_pdf, strategy="hi_res")
            # was previously sketched here but is disabled.

            success = True
            error_msg = None
            items = len(docs)

        except Exception as e:
            self.logger.error(f"Error in text extraction: {str(e)}")
            docs = []
            num_pages = 0
            items = 0
            success = False
            error_msg = str(e)
            additional_info = {'error_details': str(e)}

        # One timestamp so end_time and execution_time agree with each other.
        end_time = datetime.now()
        stats = ExtractionStats(
            method=method,
            start_time=start_time,
            end_time=end_time,
            execution_time=(end_time - start_time).total_seconds(),
            memory_usage_mb=self._get_memory_usage() - start_mem,
            num_pages=num_pages,
            items_extracted=items,
            success=success,
            error_message=error_msg,
            additional_info=additional_info
        )

        self.current_stats['text'] = stats
        return docs, stats

    def extract_tables(self, pdf_path: str) -> Tuple[List[Dict], ExtractionStats]:
        """
        Extract tables using the configured method.

        With 'camelot', the whole document is parsed twice (lattice, then
        stream) and every table from either pass is kept. With 'pdfplumber',
        tables are extracted page by page and the first row of each table is
        used as the column header. Every table is written as CSV and JSON to
        the tables output folder and recorded via save_structured_metadata.
        Any exception is logged and reported through the returned statistics
        instead of being raised.

        Args:
            pdf_path: Path of the PDF to read.

        Returns:
            Tuple containing:
            - List of dictionaries with ``content`` (``DataFrame.to_dict()``)
              and ``metadata`` per table (empty if extraction failed)
            - ExtractionStats; ``num_pages`` is the number of distinct pages on
              which a table was found. Also stored in
              ``self.current_stats['tables']``
        """
        method = self.methods['tables']
        start_time = datetime.now()
        start_mem = self._get_memory_usage()

        try:
            tables = []
            additional_info = {}

            if method == 'camelot':
                # Extract tables with both lattice and stream parsers
                for parser in ['lattice', 'stream']:
                    extracted_tables = camelot.read_pdf(
                        pdf_path,
                        pages='all',
                        flavor=parser
                    )

                    for table in extracted_tables:
                        report = table.parsing_report
                        page_num = report['page']
                        # Running number across both parsers, so it is unique.
                        table_num = len(tables) + 1

                        # Save as CSV. The name carries the running number and
                        # the parser; a per-parser index would let the stream
                        # pass overwrite CSVs written by the lattice pass.
                        csv_path = self.dirs['tables'] / f'page_{page_num}_table_{table_num}_{parser}.csv'
                        table.df.to_csv(csv_path, index=False)

                        # Save as JSON for better metadata preservation
                        json_path = self.dirs['tables'] / f'table_{table_num}_{parser}.json'
                        table_data = {
                            'data': table.data,
                            'parsing_report': report
                        }
                        with open(json_path, 'w') as f:
                            json.dump(table_data, f, indent=2)

                        metadata = {
                            'table_number': table_num,
                            'parser': parser,
                            'page': page_num,
                            'accuracy': report['accuracy'],
                            'whitespace': report['whitespace'],
                            'csv_path': str(csv_path),
                            'json_path': str(json_path)
                        }

                        # Save structured metadata, as the pdfplumber branch does
                        self.save_structured_metadata(
                            'table',
                            page_num,
                            table_num,
                            metadata
                        )

                        tables.append({
                            'content': table.df.to_dict(),
                            'metadata': metadata
                        })

                        additional_info[f'table_{table_num}'] = {
                            'rows': len(table.df),
                            'columns': len(table.df.columns),
                            'parser': parser,
                            'accuracy': report['accuracy']
                        }
            elif method == 'pdfplumber':
                with pdfplumber.open(pdf_path) as pdf:
                    for page_num, page in enumerate(pdf.pages, 1):
                        extracted_tables = page.extract_tables()

                        for table_num, table in enumerate(extracted_tables, 1):
                            # Skip empty results
                            if not table:
                                continue

                            # Convert to DataFrame (first row is the header)
                            df = pd.DataFrame(table[1:], columns=table[0])

                            # Save as CSV
                            csv_path = self.dirs['tables'] / f'page_{page_num}_table_{table_num}.csv'
                            df.to_csv(csv_path, index=False)

                            # Save as JSON
                            json_path = self.dirs['tables'] / f'page_{page_num}_table_{table_num}.json'
                            table_data = {
                                'data': table,
                                'columns': table[0],
                                'rows': table[1:],
                                'dimensions': {
                                    'rows': len(table),
                                    'columns': len(table[0])
                                }
                            }

                            with open(json_path, 'w') as f:
                                json.dump(table_data, f, indent=2)

                            metadata = {
                                'table_number': table_num,
                                'page': page_num,
                                'dimensions': table_data['dimensions'],
                                'csv_path': str(csv_path),
                                'json_path': str(json_path)
                            }

                            # Save structured metadata
                            self.save_structured_metadata(
                                'table',
                                page_num,
                                table_num,
                                metadata
                            )

                            tables.append({
                                'content': df.to_dict(),
                                'metadata': metadata
                            })

                            additional_info[f'table_{page_num}_{table_num}'] = {
                                'rows': len(df),
                                'columns': len(df.columns)
                            }

            # NOTE: an 'unstructured' backend (partition_pdf, strategy="hi_res")
            # was previously sketched here but is disabled.

            success = True
            error_msg = None
            num_pages = len(set(t['metadata']['page'] for t in tables))
            items = len(tables)

        except Exception as e:
            self.logger.error(f"Error in table extraction: {str(e)}")
            tables = []
            num_pages = 0
            items = 0
            success = False
            error_msg = str(e)
            additional_info = {'error_details': str(e)}

        # One timestamp so end_time and execution_time agree with each other.
        end_time = datetime.now()
        stats = ExtractionStats(
            method=method,
            start_time=start_time,
            end_time=end_time,
            execution_time=(end_time - start_time).total_seconds(),
            memory_usage_mb=self._get_memory_usage() - start_mem,
            num_pages=num_pages,
            items_extracted=items,
            success=success,
            error_message=error_msg,
            additional_info=additional_info
        )

        self.current_stats['tables'] = stats
        return tables, stats

    def extract_images(self, pdf_path: str) -> Tuple[List[Dict], ExtractionStats]:
        """
        Extract embedded images using the configured method and OCR each one.

        Every image is saved as ``page_<p>_img_<i>.png`` (both 1-based) in the
        images output folder, run through Tesseract, and recorded via
        save_structured_metadata. A failed OCR call is logged and leaves the
        image with empty OCR data. Any other exception is logged and reported
        through the returned statistics instead of being raised.

        Args:
            pdf_path: Path of the PDF to read.

        Returns:
            Tuple containing:
            - List of dictionaries with the PIL ``image`` and its ``metadata``
              (empty if extraction failed)
            - ExtractionStats; ``num_pages`` is the number of distinct pages on
              which an image was found. Also stored in
              ``self.current_stats['images']``
        """
        method = self.methods['images']
        start_time = datetime.now()
        start_mem = self._get_memory_usage()

        # Pixel modes Pillow can write as PNG; anything else (e.g. CMYK) is
        # converted to RGB for the saved file only.
        png_modes = {'1', 'L', 'LA', 'I', 'I;16', 'P', 'RGB', 'RGBA'}

        try:
            images = []
            additional_info = {}

            if method == 'pymupdf':
                # The context manager closes the document even if an image
                # fails, so the handle on the PDF is not leaked.
                with pymupdf.open(pdf_path) as doc:
                    for page_num in range(doc.page_count):
                        page = doc[page_num]

                        for img_idx, img in enumerate(page.get_images()):
                            xref = img[0]
                            base_image = doc.extract_image(xref)
                            if not base_image:
                                continue

                            image = Image.open(io.BytesIO(base_image["image"]))

                            # Save image
                            output_path = self.dirs['images'] / f'page_{page_num + 1}_img_{img_idx + 1}.png'
                            saveable = image if image.mode in png_modes else image.convert('RGB')
                            saveable.save(output_path)

                            # Perform OCR
                            try:
                                ocr_result = pytesseract.image_to_data(image, output_type=Output.DICT)
                                ocr_text = pytesseract.image_to_string(image)
                            except Exception as e:
                                self.logger.warning(
                                    f"OCR failed for page {page_num + 1} image {img_idx + 1}: {e}"
                                )
                                ocr_result = {"error": str(e)}
                                ocr_text = ""

                            # Mean confidence over recognised words only:
                            # Tesseract reports -1 for rows that are not words,
                            # and the list can be empty.
                            word_confidences = []
                            for raw_conf in ocr_result.get('conf', []):
                                try:
                                    conf_value = float(raw_conf)
                                except (TypeError, ValueError):
                                    continue
                                if conf_value >= 0:
                                    word_confidences.append(conf_value)
                            mean_confidence = (
                                sum(word_confidences) / len(word_confidences)
                                if word_confidences else 0.0
                            )

                            metadata = {
                                'page': page_num + 1,
                                'image_number': img_idx + 1,
                                'width': image.width,
                                'height': image.height,
                                'format': image.format,
                                'mode': image.mode,
                                'file_path': str(output_path),
                                'colorspace': base_image.get('colorspace'),
                                'extension': base_image.get('ext'),
                                'ocr': {
                                    'text': ocr_text,
                                    'confidence': ocr_result.get('conf', []),
                                    'words': ocr_result.get('text', []),
                                    'word_coordinates': list(zip(
                                        ocr_result.get('left', []),
                                        ocr_result.get('top', []),
                                        ocr_result.get('width', []),
                                        ocr_result.get('height', [])
                                    ))
                                }
                            }

                            # Save structured metadata
                            self.save_structured_metadata(
                                'image',
                                page_num + 1,
                                img_idx + 1,
                                metadata
                            )

                            images.append({
                                'image': image,
                                'metadata': metadata
                            })

                            additional_info[f'image_{page_num + 1}_{img_idx + 1}'] = {
                                'size': os.path.getsize(output_path),
                                'dimensions': f"{image.width}x{image.height}",
                                'format': image.format,
                                'ocr_confidence': mean_confidence
                            }

            success = True
            error_msg = None
            num_pages = len(set(img['metadata']['page'] for img in images))
            items = len(images)

        except Exception as e:
            self.logger.error(f"Error in image extraction: {str(e)}")
            images = []
            num_pages = 0
            items = 0
            success = False
            error_msg = str(e)
            additional_info = {'error_details': str(e)}

        # One timestamp so end_time and execution_time agree with each other.
        end_time = datetime.now()
        stats = ExtractionStats(
            method=method,
            start_time=start_time,
            end_time=end_time,
            execution_time=(end_time - start_time).total_seconds(),
            memory_usage_mb=self._get_memory_usage() - start_mem,
            num_pages=num_pages,
            items_extracted=items,
            success=success,
            error_message=error_msg,
            additional_info=additional_info
        )

        self.current_stats['images'] = stats
        return images, stats

    def save_experiment_results(self, pdf_path: str, results: Dict):
        """Save experiment results and metadata"""
        experiment_dir = self.dirs['experiments'] / self.experiment_name
        experiment_dir.mkdir(exist_ok=True)
        
        # Save statistics
        stats_file = experiment_dir / 'extraction_stats.json'
        stats_data = {
            'pdf_file': pdf_path,
            'timestamp': datetime.now().isoformat(),
            'methods_used': self.methods,
            'statistics': {
                k: asdict(v) for k, v in self.current_stats.items()
            }
        }
        
        with open(stats_file, 'w') as f:
            json.dump(stats_data, f, indent=2, default=str)
        
        # Save results summary
        summary_file = experiment_dir / 'results_summary.json'
        summary_data = {
            'text_extracted': len(results.get('text', {}).get('data', [])),
            'tables_extracted': len(results.get('tables', {}).get('data', [])),
            'images_extracted': len(results.get('images', {}).get('data', [])),
            'output_directory': str(self.output_dir)
        }
        
        with open(summary_file, 'w') as f:
            json.dump(summary_data, f, indent=2)
    
    def generate_performance_report(self) -> Dict:
        """
        Generate comprehensive performance report for the extraction process
        
        Returns:
            Dictionary containing detailed performance metrics
        """
        report = {
            'summary': {
                'total_execution_time': 0,
                'total_memory_used': 0,
                'total_items_extracted': 0,
                'success_rate': 0
            },
            'methods_used': self.methods,
            'detailed_metrics': {},
            'extraction_counts': {},
            'errors': []
        }
        
        successful_extractions = 0
        total_extractions = 0
        
        for content_type, stats in self.current_stats.items():
            # Accumulate summary metrics
            report['summary']['total_execution_time'] += stats.execution_time
            report['summary']['total_memory_used'] += stats.memory_usage_mb
            report['summary']['total_items_extracted'] += stats.items_extracted
            
            # Track success rate
            total_extractions += 1
            if stats.success:
                successful_extractions += 1
            
            # Detailed metrics per content type
            report['detailed_metrics'][content_type] = {
                'method_used': stats.method,
                'execution_time': f"{stats.execution_time:.2f} seconds",
                'memory_usage': f"{stats.memory_usage_mb:.2f} MB",
                'pages_processed': stats.num_pages,
                'items_extracted': stats.items_extracted,
                'success': stats.success
            }
            
            # Extraction counts
            report['extraction_counts'][content_type] = {
                'total_items': stats.items_extracted,
                'items_per_page': stats.items_extracted / stats.num_pages if stats.num_pages > 0 else 0
            }
            
            # Collect any errors
            if stats.error_message:
                report['errors'].append({
                    'content_type': content_type,
                    'error': stats.error_message
                })
        
        # Calculate final summary metrics
        report['summary']['success_rate'] = (
            successful_extractions / total_extractions * 100 
            if total_extractions > 0 else 0
        )
        
        return report

    def print_performance_report(self):
        """Print formatted performance report to console"""
        report = self.generate_performance_report()
        
        print("\n" + "="*50)
        print("PDF EXTRACTION PERFORMANCE REPORT")
        print("="*50)
        
        # Overall Summary
        print("\nüîç OVERALL SUMMARY:")
        print(f"Total Execution Time: {report['summary']['total_execution_time']:.2f} seconds")
        print(f"Total Memory Used: {report['summary']['total_memory_used']:.2f} MB")
        print(f"Total Items Extracted: {report['summary']['total_items_extracted']}")
        print(f"Overall Success Rate: {report['summary']['success_rate']:.1f}%")
        
        # Methods Used
        print("\nüõ†Ô∏è METHODS USED:")
        for content_type, method in report['methods_used'].items():
            print(f"{content_type.title()}: {method}")
        
        # Detailed Metrics
        print("\nüìä DETAILED METRICS:")
        for content_type, metrics in report['detailed_metrics'].items():
            print(f"\n{content_type.upper()}:")
            print(f"  Method: {metrics['method_used']}")
            print(f"  Execution Time: {metrics['execution_time']}")
            print(f"  Memory Usage: {metrics['memory_usage']}")
            print(f"  Pages Processed: {metrics['pages_processed']}")
            print(f"  Items Extracted: {metrics['items_extracted']}")
            print(f"  Success: {'‚úÖ' if metrics['success'] else '‚ùå'}")
        
        # Extraction Counts
        print("\nüìà EXTRACTION STATISTICS:")
        for content_type, counts in report['extraction_counts'].items():
            print(f"\n{content_type.title()}:")
            print(f"  Total Items: {counts['total_items']}")
            print(f"  Items per Page: {counts['items_per_page']:.2f}")
        
        # Errors
        if report['errors']:
            print("\n‚ö†Ô∏è ERRORS ENCOUNTERED:")
            for error in report['errors']:
                print(f"\n{error['content_type'].upper()}:")
                print(f"  {error['error']}")
        
        print("\n" + "="*50)

    def process_pdf(self, pdf_path: str, extract_types: Optional[List[str]] = None, show_performance: bool = True) -> Dict:
        """
        Process PDF with configured methods
        
        Args:
            pdf_path: Path to PDF file
            extract_types: List of types to extract ('text', 'tables', 'images'). 
                         If None, extracts all types.
        
        Returns:
            Dictionary with extraction results and statistics
        """
        self.logger.info(f"Processing PDF: {pdf_path}")
        extract_types = extract_types or ['text', 'tables', 'images']
        results = {}
        
        # Validate PDF exists
        if not os.path.exists(pdf_path):
            raise FileNotFoundError(f"PDF file not found: {pdf_path}")
        
        # Extract each type
        extraction_methods = {
            'text': self.extract_text,
            'tables': self.extract_tables,
            'images': self.extract_images
        }
        
        for extract_type in extract_types:
            if extract_type in extraction_methods:
                self.logger.info(
                    f"Extracting {extract_type} using {self.methods[extract_type]}"
                )
                data, stats = extraction_methods[extract_type](pdf_path)
                results[extract_type] = {
                    'data': data,
                    'stats': asdict(stats)
                }
        
        # Save experiment results if enabled
        if self.save_metadata:
            self.save_experiment_results(pdf_path, results)

        if show_performance:
            self.print_performance_report()
        
        return results

    def get_extraction_summary(self) -> Dict:
        """Get summary of current extraction process"""
        summary = {
            'methods_used': self.methods,
            'output_directory': str(self.output_dir),
            'statistics': {
                k: asdict(v) for k, v in self.current_stats.items()
            }
        }
        return summary
    
    # save results to a json file in experiment directory
    def save_results(self, results: Dict):
        """Save extraction results to a JSON file"""
        experiment_dir = self.dirs['experiments'] / self.experiment_name
        experiment_dir.mkdir(exist_ok=True)
        
        results_file = experiment_dir / 'extraction_results.json'
        with open(results_file, 'w') as f:
            json.dump(results, f, indent=4)


In [23]:
# Path to the PDF that the extraction experiments below process.
# It is a relative path, so it is resolved against the current working directory.
pdf_path = "2024-2029_NationalOCSProgram_PFP_Sept_2023_Compliant_distilled.pdf"

In [24]:
# Build an extractor without overriding any extraction method;
# the experiment name is reused as the output directory.
EXPERIMENT_NAME = "Default_Experiment_pymupdf_camelot"
extractor = PDFExtractor(output_dir=EXPERIMENT_NAME, experiment_name=EXPERIMENT_NAME)

In [25]:
# Run every extraction type on the PDF. process_pdf defaults to
# show_performance=True, so the performance report is printed by this call.
results = extractor.process_pdf(pdf_path)

# Or get performance report later
# extractor.print_performance_report()

# Get raw performance data for custom analysis
performance_data = extractor.generate_performance_report()

2024-12-13 01:43:07,884 - PDFExtractor - INFO - Processing PDF: 2024-2029_NationalOCSProgram_PFP_Sept_2023_Compliant_distilled.pdf
2024-12-13 01:43:07,885 - PDFExtractor - INFO - Extracting text using pymupdf
2024-12-13 01:43:08,640 - PDFExtractor - INFO - Extracting tables using camelot
2024-12-13 01:43:31,828 - PDFExtractor - INFO - Extracting images using pymupdf



PDF EXTRACTION PERFORMANCE REPORT

🔍 OVERALL SUMMARY:
Total Execution Time: 38.95 seconds
Total Memory Used: 1.21 MB
Total Items Extracted: 144
Overall Success Rate: 100.0%

🛠️ METHODS USED:
Text: pymupdf
Tables: camelot
Images: pymupdf

📊 DETAILED METRICS:

TEXT:
  Method: pymupdf
  Execution Time: 0.75 seconds
  Memory Usage: 0.00 MB
  Pages Processed: 53
  Items Extracted: 53
  Success: ✅

TABLES:
  Method: camelot
  Execution Time: 23.18 seconds
  Memory Usage: 1.21 MB
  Pages Processed: 49
  Items Extracted: 64
  Success: ✅

IMAGES:
  Method: pymupdf
  Execution Time: 15.01 seconds
  Memory Usage: 0.00 MB
  Pages Processed: 22
  Items Extracted: 27
  Success: ✅

📈 EXTRACTION STATISTICS:

Text:
  Total Items: 53
  Items per Page: 1.00

Tables:
  Total Items: 64
  Items per Page: 1.31

Images:
  Total Items: 27
  Items per Page: 1.23



In [28]:
# Same pipeline as before, but text extraction is switched to PyPDF and
# save_metadata is turned on; table and image methods are not overridden.
EXPERIMENT_NAME = "Default_Experiment_pypdf_camelot_pymupdf"
extractor = PDFExtractor(
    experiment_name=EXPERIMENT_NAME,
    output_dir=EXPERIMENT_NAME,
    extract_text_method="pypdf",
    save_metadata=True,
)

# Run the extraction; the performance report is printed by process_pdf
results = extractor.process_pdf(pdf_path)

2024-12-13 01:45:28,254 - PDFExtractor - INFO - Processing PDF: 2024-2029_NationalOCSProgram_PFP_Sept_2023_Compliant_distilled.pdf
2024-12-13 01:45:28,255 - PDFExtractor - INFO - Extracting text using pypdf
2024-12-13 01:45:29,126 - PDFExtractor - INFO - Extracting tables using camelot
2024-12-13 01:45:52,414 - PDFExtractor - INFO - Extracting images using pymupdf



PDF EXTRACTION PERFORMANCE REPORT

🔍 OVERALL SUMMARY:
Total Execution Time: 37.66 seconds
Total Memory Used: 268.45 MB
Total Items Extracted: 144
Overall Success Rate: 100.0%

🛠️ METHODS USED:
Text: pypdf
Tables: camelot
Images: pymupdf

📊 DETAILED METRICS:

TEXT:
  Method: pypdf
  Execution Time: 0.87 seconds
  Memory Usage: 0.00 MB
  Pages Processed: 53
  Items Extracted: 53
  Success: ✅

TABLES:
  Method: camelot
  Execution Time: 23.29 seconds
  Memory Usage: 268.45 MB
  Pages Processed: 49
  Items Extracted: 64
  Success: ✅

IMAGES:
  Method: pymupdf
  Execution Time: 13.50 seconds
  Memory Usage: 0.00 MB
  Pages Processed: 22
  Items Extracted: 27
  Success: ✅

📈 EXTRACTION STATISTICS:

Text:
  Total Items: 53
  Items per Page: 1.00

Tables:
  Total Items: 64
  Items per Page: 1.31

Images:
  Total Items: 27
  Items per Page: 1.23



In [39]:
# Initialize the PDFExtractor with PyPDF for text extraction and pdfplumber for table extraction.
# The experiment name (also used as the output directory) follows the
# <text>_<tables>_<images> method pattern; "pdfplumber" was previously misspelled here.
EXPERIMENT_NAME = "Default_Experiment_pypdf_pdfplumber_pymupdf"
extractor = PDFExtractor(
    output_dir=EXPERIMENT_NAME,
    experiment_name=EXPERIMENT_NAME,
    extract_text_method="pypdf",
    extract_tables_method="pdfplumber",
    save_metadata=True,
)

# Process PDF and see performance report
results = extractor.process_pdf(pdf_path)


2024-12-13 01:54:30,918 - PDFExtractor - INFO - Processing PDF: 2024-2029_NationalOCSProgram_PFP_Sept_2023_Compliant_distilled.pdf
2024-12-13 01:54:30,920 - PDFExtractor - INFO - Extracting text using pypdf
2024-12-13 01:54:33,509 - PDFExtractor - INFO - Extracting tables using pdfplumber
2024-12-13 01:54:42,199 - PDFExtractor - INFO - Extracting images using pymupdf



PDF EXTRACTION PERFORMANCE REPORT

🔍 OVERALL SUMMARY:
Total Execution Time: 25.09 seconds
Total Memory Used: 89.02 MB
Total Items Extracted: 102
Overall Success Rate: 100.0%

🛠️ METHODS USED:
Text: pypdf
Tables: pdfplumber
Images: pymupdf

📊 DETAILED METRICS:

TEXT:
  Method: pypdf
  Execution Time: 2.59 seconds
  Memory Usage: -240.00 MB
  Pages Processed: 53
  Items Extracted: 53
  Success: ✅

TABLES:
  Method: pdfplumber
  Execution Time: 8.69 seconds
  Memory Usage: 127.05 MB
  Pages Processed: 11
  Items Extracted: 22
  Success: ✅

IMAGES:
  Method: pymupdf
  Execution Time: 13.82 seconds
  Memory Usage: 201.96 MB
  Pages Processed: 22
  Items Extracted: 27
  Success: ✅

📈 EXTRACTION STATISTICS:

Text:
  Total Items: 53
  Items per Page: 1.00

Tables:
  Total Items: 22
  Items per Page: 2.00

Images:
  Total Items: 27
  Items per Page: 1.23

