# 02 - Text Extraction Experiments

This notebook tests and compares different PDF text extraction methods.

## Objectives
- Compare pdfplumber vs PyPDF2 extraction
- Test multi-column layout handling
- Measure extraction accuracy
- Handle edge cases

In [None]:
# Setup
import sys
sys.path.insert(0, '..')

import os
from pathlib import Path
import time

# Import extraction modules
from src.preprocessing.pdf_extractor import extract_text_from_pdf, PDFExtractionError

## 1. Test PDF Extraction

In [None]:
# List available PDF files
raw_data_dir = Path('../data/raw')
# glob() yields entries in arbitrary, filesystem-dependent order. Sorting makes
# the listing stable, so the "first N PDFs" taken from this list further down
# is the same set on every run and on every machine.
pdf_files = sorted(raw_data_dir.glob('*.pdf'))

print(f"Found {len(pdf_files)} PDF files:")
for pdf in pdf_files:
    print(f"  - {pdf.name}")

if not pdf_files:
    print("\n⚠️ No PDF files found. Add sample resumes to data/raw/ directory.")

In [None]:
# Function to test extraction with each requested method
def compare_extraction_methods(pdf_path, methods=('pdfplumber', 'pypdf2')):
    """Run ``extract_text_from_pdf`` on one PDF once per method and summarise each run.

    Args:
        pdf_path: Path of the PDF; passed through unchanged to
            ``extract_text_from_pdf``.
        methods: Names forwarded as the ``method`` keyword, tried in the given
            order. Defaults to the two backends this notebook compares.

    Returns:
        dict keyed by method name. A run that completed maps to a dict with
        ``time`` (elapsed seconds), ``text_length``, ``pages``, ``success`` and
        ``preview`` (first 500 characters of the extracted text). A run that
        raised maps to ``{'error': str(exception)}`` instead.
    """
    results = {}

    for method in methods:
        # Each method gets its own try block so that one failing backend does
        # not prevent the remaining ones from being measured.
        try:
            # perf_counter() is monotonic and high-resolution; time.time() can
            # jump when the system clock is adjusted and skew short timings.
            start = time.perf_counter()
            extraction = extract_text_from_pdf(pdf_path, method=method)
            elapsed = time.perf_counter() - start

            text = extraction['text']
            results[method] = {
                'time': elapsed,
                'text_length': len(text),
                'pages': extraction['num_pages'],
                'success': extraction['success'],
                'preview': text[:500]
            }
        except Exception as e:
            # Deliberately broad: this is a side-by-side comparison, so any
            # failure is recorded as data rather than aborting the run.
            results[method] = {'error': str(e)}

    return results

In [None]:
# Run comparison on available PDFs
MAX_PDFS_TO_TEST = 3  # keeps the demo quick; raise it to cover more files

# Slicing an empty list yields nothing, so no separate emptiness guard is needed.
for pdf in pdf_files[:MAX_PDFS_TO_TEST]:
    print(f"\n{'='*50}")
    print(f"Testing: {pdf.name}")
    print('='*50)

    results = compare_extraction_methods(str(pdf))

    for method, data in results.items():
        print(f"\n{method.upper()}:")
        if 'error' in data:
            print(f"  Error: {data['error']}")
        else:
            # Show the extractor's own success flag: a run can finish without
            # raising and still report success=False, which would otherwise
            # look like a normal result here.
            print(f"  Success: {data['success']}")
            print(f"  Time: {data['time']:.2f}s")
            print(f"  Text Length: {data['text_length']} chars")
            print(f"  Pages: {data['pages']}")

## 2. Text Cleaning Tests

In [None]:
from src.preprocessing.text_cleaner import clean_text, remove_headers_footers

# Sample input: contains "Page N of 3" lines, runs of spaces,
# consecutive blank lines and a trailing "Confidential" line.
messy_text = """
Page 1 of 3

John    Doe
Software      Engineer



Email:    john@example.com

Page 2 of 3

EXPERIENCE
Company ABC    —    Senior Developer

Confidential
"""

# Pass the raw sample through clean_text() and show the outcome
cleaned = clean_text(messy_text)
print("After clean_text():", cleaned, sep="\n")

# Visual divider between the two outputs
print(f"\n{'-' * 50}\n")

# Pass the same raw sample (not `cleaned`) through remove_headers_footers()
no_headers = remove_headers_footers(messy_text)
print("After remove_headers_footers():", no_headers, sep="\n")

## 3. Edge Case Testing

In [None]:
# Error-handling checks: each entry is (argument passed to the extractor, label)
test_cases = [
    ('nonexistent.pdf', 'File not found'),
    (None, 'None input'),
]

for bad_input, label in test_cases:
    print(f"\nTest: {label}")
    # Any exception is caught and reported so that the remaining cases still run.
    try:
        result = extract_text_from_pdf(bad_input)
        outcome = f"  Result: {result}"
    except Exception as exc:
        outcome = f"  Exception: {type(exc).__name__}: {exc}"
    print(outcome)

## 4. Performance Metrics

Targets from the v0.1 requirements (not yet measured in this notebook):
- PDF extraction accuracy ≥95% (single-column)
- Multi-column accuracy ≥80%
- Email detection ≥90%
- Processing speed ≤5s per resume

In [None]:
# Placeholder for accuracy metrics
# TODO: Add labeled test data and calculate metrics

# Every metric starts out as the same 'TBD' marker until real values exist.
metrics = dict.fromkeys(
    (
        'single_column_accuracy',
        'multi_column_accuracy',
        'email_detection_rate',
        'avg_processing_time',
    ),
    'TBD',
)

print("Performance Metrics (to be calculated with test data):")
print("\n".join(f"  {name}: {status}" for name, status in metrics.items()))