# Resume and Transcript Redaction
___
## Step 1. Install dependences

In [43]:
# Uncomment if needed.
#!pip install -r requirements.txt
#!pip install streamlit


## Step 2. Build a catalogue of resumes for creating test sets

In [14]:
import os
import pandas as pd

def scan_folders_to_catalogue(root_folder, output_csv_path):
    """
    Scans the /data folder structure starting from the root folder and creates a CSV file
    cataloging all files in the subfolders.

    Assumes you've extracted all the folders from the Kaggle dataset: 
    https://www.kaggle.com/datasets/snehaanbhawal/resume-dataset

    Args:
        root_folder (str): The path to the root folder to scan.
        output_csv_path (str): Path to save the resulting catalog CSV file.

    Returns:
        pd.DataFrame: DataFrame of the scanned file information.
    """
    # List to hold file details
    file_data = []

    # Walk through all subdirectories and files
    for dirpath, _, filenames in os.walk(root_folder):
        for filename in filenames:
            # Skip unnecessary files
            if filename in [".DS_Store", ".keep", "resume_sampling_catalogue.csv"]:
                continue         
            
            # Construct file details
            file_path = os.path.join(dirpath, filename)
            relative_path = os.path.relpath(file_path, root_folder)
            file_data.append({
                "folder": os.path.dirname(relative_path),  # Relative folder path
                "filename": filename,                     # File name
                "absolute_path": file_path                # Absolute file path
            })

    # Create a DataFrame from the collected data
    file_df = pd.DataFrame(file_data)

    # Save the catalog to CSV
    file_df.to_csv(output_csv_path, index=False)
    print(f"Resume catalog saved to {output_csv_path}")

    return file_df

# Example Usage
root_folder = "../data"  # Replace with the root directory containing the subfolders
output_csv_path = "../data/resume_sampling_catalogue.csv"  # Path to save the catalog CSV

# Run the function to generate the catalogue
resume_catalog = scan_folders_to_catalogue(root_folder, output_csv_path)

# Display the first few rows of the catalog
resume_catalog.head()


Resume catalog saved to ../data/resume_sampling_catalogue.csv


Unnamed: 0,folder,filename,absolute_path
0,AGRICULTURE,37201447.pdf,../data/AGRICULTURE/37201447.pdf
1,AGRICULTURE,12674256.pdf,../data/AGRICULTURE/12674256.pdf
2,AGRICULTURE,29968330.pdf,../data/AGRICULTURE/29968330.pdf
3,AGRICULTURE,81042872.pdf,../data/AGRICULTURE/81042872.pdf
4,AGRICULTURE,20006992.pdf,../data/AGRICULTURE/20006992.pdf


# Step 3. Create a random sample of 20 resumes for the Input Folder
Here’s a clean and modular code block to create the `../data/redact_input` folder and populate it with a test sample from the `resume_sampling_catalogue.csv`. This keeps the sampling and redaction workflows separate, allowing you to modify the input folder manually as needed.

---
### Explanation:
1. **Purpose**:
   - Populates `../data/redact_input` with a random sample of PDFs from `resume_sampling_catalogue.csv`.
   - Allows users to add or modify files in the `redact_input` folder later.

2. **Random Sampling**:
   - Reads `resume_sampling_catalogue.csv` to randomly select files.
   - If the catalogue has fewer files than the `SAMPLE_SIZE`, it copies all files.

3. **Copying Files**:
   - Creates the `redact_input` folder if it doesn’t exist.
   - Copies sampled PDFs to `../data/redact_input`.

4. **Print Statements**:
   - Tracks progress for better visibility.

---

### Directory Structure After Execution:
- `../data/redact_input/`: Contains the sampled files, ready for redaction.
- `../data/resume_sampling_catalogue.csv`: The source catalogue file remains untouched.

This approach ensures that you can **manually adjust the contents** of `../data/redact_input` before running the redaction step.

In [None]:
import os
import pandas as pd
import shutil
import random

def populate_redact_input(catalogue_path, input_folder, sample_size):
    """
    Populates the redact_input folder with a random sample of PDFs from the catalogue.
    Allows users to manually adjust the input folder later if needed.

    Args:
        catalogue_path (str): Path to the resume_sampling_catalogue.csv file.
        input_folder (str): Path to the redact_input folder to populate.
        sample_size (int): Number of PDFs to copy to the input folder.
    """
    # Read the catalogue
    print(f"Loading file catalogue from {catalogue_path}...")
    file_catalogue = pd.read_csv(catalogue_path)

    # Ensure the input folder exists
    os.makedirs(input_folder, exist_ok=True)

    # Select a random sample from the catalogue
    print(f"Selecting a random sample of {sample_size} PDFs...")
    if len(file_catalogue) <= sample_size:
        sample_catalogue = file_catalogue
    else:
        sample_catalogue = file_catalogue.sample(n=sample_size, random_state=42)

    # Copy the sampled PDFs to the redact_input folder
    print(f"Copying sampled files to {input_folder}...")
    for _, row in sample_catalogue.iterrows():
        src_path = row['absolute_path']
        filename = os.path.basename(src_path)
        dest_path = os.path.join(input_folder, filename)
        shutil.copy(src_path, dest_path)

    print(f"Populated {input_folder} with {len(sample_catalogue)} files.")

# Example Usage
CATALOGUE_PATH = "../data/resume_sampling_catalogue.csv"
INPUT_FOLDER = "../data/redact_input"
SAMPLE_SIZE = 20

populate_redact_input(CATALOGUE_PATH, INPUT_FOLDER, SAMPLE_SIZE)


# Step 4.Redaction Workflow

### Features:
1. **Test Mode**:
   - Controlled via the `TEST_MODE` parameter.
   - `True`: Adds semi-transparent grey redaction boxes.
   - `False`: Applies solid black redaction boxes permanently.

2. **Output Folder Management**:
   - Checks if `../data/redact_output` exists and contains files.
   - Prompts the user to clear it before proceeding.

3. **Clear Feedback**:
   - Summarizes the redaction process for all PDFs.
   - Tracks progress using a `tqdm` progress bar.

4. **Unified Input and Output**:
   - Processes files from `../data/redact_input`.
   - Outputs files to `../data/redact_output`.

---

## To use:
1. Place files to redact in `../data/redact_input`. (use sample creator above or add a custom set)
2. Run the script. If `../data/redact_output` contains files, you’ll be prompted to clear it.
3. Outputs:
   - Redacted files are saved in `../data/redact_output` with `_output` appended to their filenames.
   - A summary of the process is printed.
___

In [19]:
import os
import fitz  # PyMuPDF
from tqdm import tqdm  # For progress bar
from presidio_analyzer import AnalyzerEngine

##############################################################################
# CONFIGURATION
##############################################################################
INPUT_FOLDER = "../data/redact_input"
OUTPUT_FOLDER = "../data/redact_output"
TEST_MODE = True  # Set to True for test mode (transparent boxes), False for finalized redactions
LANGUAGE = "en"   # Language code for Presidio

# Initialize Presidio analyzer engine
analyzer = AnalyzerEngine()

##############################################################################
# HELPER FUNCTIONS
##############################################################################

def clear_output_folder(output_folder):
    """
    Clears the output folder after user confirmation.
    """
    if os.path.exists(output_folder) and os.listdir(output_folder):
        user_input = input(f"The folder '{output_folder}' is not empty. Do you want to clear it? (yes/no): ")
        if user_input.lower() in ["yes", "y"]:
            for file in os.listdir(output_folder):
                file_path = os.path.join(output_folder, file)
                os.remove(file_path)
            print(f"Cleared all files from '{output_folder}'.")
        else:
            print("Operation aborted. Please clear the folder and re-run the script.")
            exit()
    os.makedirs(output_folder, exist_ok=True)

def detect_pii_in_word(word_text):
    """
    Use Presidio to analyze a single word (string).
    Returns True if the word is detected as PII, otherwise False.
    """
    if not word_text.strip():
        return False
    results = analyzer.analyze(text=word_text, language=LANGUAGE)
    return len(results) > 0

def redact_pii_in_pdf(input_pdf_path, output_pdf_path, test_mode=True):
    """
    Opens a PDF, detects PII word-by-word, and applies redaction.
    - In test mode: Transparent redaction boxes are added.
    - In final mode: Solid black redactions are permanently applied.
    """
    doc = fitz.open(input_pdf_path)
    total_words = 0
    redacted_words = 0

    for page_index in range(len(doc)):
        page = doc[page_index]
        wordlist = page.get_text("words")
        redact_areas = []

        for w in wordlist:
            text = w[4]
            total_words += 1
            if detect_pii_in_word(text):
                x0, y0, x1, y1 = w[0], w[1], w[2], w[3]
                rect = fitz.Rect(x0, y0, x1, y1)
                redact_areas.append(rect)
                redacted_words += 1

        # Add redaction annotations to the page
        for rect in redact_areas:
            if test_mode:
                # Test mode: Transparent grey boxes
                page.add_redact_annot(rect, fill=(192, 192, 192, 128))  # Semi-transparent grey
            else:
                # Final mode: Solid black boxes
                page.add_redact_annot(rect, fill=(0, 0, 0))  # Solid black

        # Apply the redactions (only has an effect in final mode)
        if not test_mode:
            page.apply_redactions()

    # Save the document
    doc.save(output_pdf_path, garbage=4, deflate=True)
    doc.close()

    return {"total_words": total_words, "redacted_words": redacted_words}

##############################################################################
# MAIN WORKFLOW
##############################################################################

def main():
    # 1) Check and clear the output folder
    clear_output_folder(OUTPUT_FOLDER)

    # 2) Get all input PDFs
    input_pdfs = [os.path.join(INPUT_FOLDER, f) for f in os.listdir(INPUT_FOLDER) if f.lower().endswith(".pdf")]
    if not input_pdfs:
        print(f"No PDF files found in '{INPUT_FOLDER}'. Exiting.")
        return

    # 3) Process each PDF and redact PII
    print("Redacting PDFs...")
    overall_summary = []
    for pdf_path in tqdm(input_pdfs, desc="Processing PDFs", unit="file"):
        base_name = os.path.basename(pdf_path)
        base, ext = os.path.splitext(base_name)
        redacted_filename = f"{base}_output{ext}"
        redacted_filepath = os.path.join(OUTPUT_FOLDER, redacted_filename)
        summary = redact_pii_in_pdf(pdf_path, redacted_filepath, test_mode=TEST_MODE)
        overall_summary.append({
            "input_pdf": base_name,
            "redacted_pdf": os.path.basename(redacted_filepath),
            "total_words": summary["total_words"],
            "redacted_words": summary["redacted_words"]
        })

    # 4) Print summary statistics
    print("=" * 60)
    print(" Redaction Summary ")
    print("=" * 60)
    total_documents = len(overall_summary)
    total_words_processed = sum(d['total_words'] for d in overall_summary)
    total_redacted = sum(d['redacted_words'] for d in overall_summary)

    for doc_summary in overall_summary:
        print(f"File: {doc_summary['input_pdf']} -> {doc_summary['redacted_pdf']}")
        print(f"  Words: {doc_summary['total_words']}, Redacted: {doc_summary['redacted_words']}")
        print("-" * 60)

    print(f"Processed {total_documents} PDFs")
    print(f"Total words processed: {total_words_processed}")
    print(f"Total words redacted: {total_redacted}")
    print("=" * 60)

if __name__ == "__main__":
    main()


The folder '../data/redact_output' is not empty. Do you want to clear it? (yes/no):  yes


Cleared all files from '../data/redact_output'.
Redacting PDFs...


Processing PDFs: 100%|██████████████████████████| 1/1 [00:02<00:00,  2.08s/file]

 Redaction Summary 
File: brock-webb-resume-Sep2022.rtf.pdf -> brock-webb-resume-Sep2022.rtf_output.pdf
  Words: 1199, Redacted: 68
------------------------------------------------------------
Processed 1 PDFs
Total words processed: 1199
Total words redacted: 68





# Step 5 Extending Functionality using jobBERT 

## 5.1 Set up a Hugging Face environment 
1. install required libraries (M1 has special 
2. install ipykernel
3. add to jupyter `python -m ipykernel install --user --name=hf_env --display-name "Python (hf_env)"`
4. restart jupyter

### Packages (in order)
1. torch
1. torchvision 
1. torchaudio
1. transformers
1. tf-keras
1. pandas
1. numpy
1. matplotlib

### M1/M2/M3 Macs ... 
- You have to use the CPU (not GPU) no CUDA support.
- torch, torchvision, torchaudio are different
- `pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu`

In [21]:
# Mac M1 install:
#!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
#!pip install transformers

___
## 5.2 JobBERT Setup
- load and save the JobBERT model locally

In [23]:
from transformers import AutoModel, AutoTokenizer

# Download and save JobBERT locally (one-time)
model_name = "jjzha/jobbert-base-cased"  # Replace with the correct JobBERT model
save_directory = "./jobbert_model"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
tokenizer.save_pretrained(save_directory)
model.save_pretrained(save_directory)
print("JobBERT model saved locally.")


tokenizer_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/603 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/433M [00:00<?, ?B/s]

Some weights of BertModel were not initialized from the model checkpoint at jjzha/jobbert-base-cased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


JobBERT model saved locally.


## 5.2.1 Loading the model

In [24]:
from transformers import AutoModel, AutoTokenizer

# Load JobBERT locally
model_directory = "./jobbert_model"
tokenizer = AutoTokenizer.from_pretrained(model_directory)
model = AutoModel.from_pretrained(model_directory)

print("JobBERT loaded successfully.")


JobBERT loaded successfully.


In [2]:
import os
import fitz  # PyMuPDF
import yaml
import torch
import numpy as np
from transformers import AutoModel, AutoTokenizer
from presidio_analyzer import AnalyzerEngine, PatternRecognizer, Pattern
from presidio_analyzer.nlp_engine import NlpEngine
from tqdm import tqdm
from typing import Dict, List, Any
from pathlib import Path
import logging
from dataclasses import dataclass
from collections import defaultdict
import regex as re
import sys

##############################################################################
# CONFIGURATION
##############################################################################
@dataclass
class Config:
    input_folder: str
    output_folder: str
    model_directory: str
    test_mode: bool
    language: str
    confidence_threshold: float
    batch_size: int
    cache_enabled: bool
    pii_types: List[str]
    redaction_options: Dict[str, Any]
    logging: Dict[str, Any]


def load_config(config_path: str = "config.yaml") -> Config:
    """Load configuration from YAML file"""
    default_config = {
        "input_folder": "../data/redact_input",
        "output_folder": "../data/redact_output",
        "model_directory": "../models/jobbert_model",
        "test_mode": True,
        "language": "en",
        "confidence_threshold": 0.75,
        "batch_size": 32,
        "cache_enabled": True,
        "pii_types": ["PERSON", "EMAIL_ADDRESS", "PHONE_NUMBER", "LOCATION"],
        "redaction_options": {
            "minimum_confidence_score": 0.65,
            "context_words_before": 2,
            "context_words_after": 2,
            "preserve_formatting": True,
            "redaction_char": "█",
        },
        "logging": {
            "level": "INFO",
            "file": "redaction.log",
            "console_output": True,
            "detailed_summary": True,
        },
    }
    
    if os.path.exists(config_path):
        with open(config_path, 'r') as f:
            config_data = yaml.safe_load(f)
            default_config.update(config_data)
    
    return Config(**default_config)


##############################################################################
# LOGGING SETUP
##############################################################################
def setup_logging(config: Config):
    """Setup logging with proper handlers and avoid duplicate outputs"""
    # First, remove any existing handlers
    root = logging.getLogger()
    for handler in root.handlers[:]:
        root.removeHandler(handler)
    
    # Create formatter
    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    
    # Setup handlers based on config
    handlers = []
    if config.logging.get("console_output", True):
        console_handler = logging.StreamHandler()
        console_handler.setFormatter(formatter)
        handlers.append(console_handler)
    
    if config.logging.get("file"):
        file_handler = logging.FileHandler(config.logging["file"])
        file_handler.setFormatter(formatter)
        handlers.append(file_handler)

    # Configure root logger
    logging.basicConfig(
        level=getattr(logging, config.logging.get("level", "ERROR").upper(), logging.ERROR),
        handlers=handlers
    )

    # Enhanced logging suppression for external libraries
    logging.getLogger("transformers").setLevel(logging.ERROR)
    logging.getLogger("pytorch").setLevel(logging.ERROR)
    logging.getLogger("presidio").setLevel(logging.ERROR)
    logging.getLogger("presidio.analyzer").setLevel(logging.ERROR)
    logging.getLogger("presidio.analyzer.analyzer_engine").setLevel(logging.ERROR)
    logging.getLogger("presidio.analyzer.pattern").setLevel(logging.ERROR)
    logging.getLogger("presidio.analyzer.pattern_recognizer").setLevel(logging.ERROR)

##############################################################################
# MODEL MANAGEMENT
##############################################################################

class ModelManager:
    def __init__(self, model_directory: str, device: str = None):
        """
        Initialize the ModelManager to handle JobBERT and Presidio integration.
        """
        self.device = device if device else ('cuda' if torch.cuda.is_available() else 'cpu')
        self.tokenizer = None
        self.model = None
        self.analyzer = None
        self.word_cache = defaultdict(dict)
        
        # Completely suppress Presidio logging during initialization
        logging.getLogger("presidio").disabled = True
        
        self.custom_recognizers = self.setup_custom_recognizers()
        self.load_models(model_directory)
        
        # Re-enable logging after initialization if needed
        logging.getLogger("presidio").disabled = False

    def load_models(self, model_directory: str):
        """
        Load JobBERT and initialize Presidio Analyzer with custom recognizers.
        """
        try:
            # Load JobBERT model and tokenizer
            self.tokenizer = AutoTokenizer.from_pretrained(model_directory)
            self.model = AutoModel.from_pretrained(model_directory).to(self.device)

            # Initialize Presidio Analyzer
            self.analyzer = AnalyzerEngine()

            # Add custom recognizers to the Presidio registry
            for recognizer in self.custom_recognizers:
                self.analyzer.registry.add_recognizer(recognizer)

            logging.info("Models loaded successfully")

        except Exception as e:
            logging.error(f"Error loading models: {str(e)}")
            raise

    def setup_custom_recognizers(self) -> List[PatternRecognizer]:
        """
        Define and return custom recognizers for specific PII types.
        """
        custom_recognizers = []

        # Example: Custom recognizer for dates
        date_patterns = [
            Pattern(
                name="date_numeric",
                regex=r"(\d{1,2}/\d{4}|\d{4})–?(Present|\d{1,2}/\d{4})",
                score=0.85
            ),
            Pattern(
                name="date_alphanumeric",
                regex=r"(?i)(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s+\d{4}",
                score=0.85
            )
        ]
        date_recognizer = PatternRecognizer(
            supported_entity="DATE",
            patterns=date_patterns,
            context=["date", "period", "timeline"]
        )
        custom_recognizers.append(date_recognizer)

        # Add more custom recognizers as needed (e.g., certifications, financial terms)

        return custom_recognizers

    def detect_pii_in_word(self, word_text: str, config: Config) -> bool:
        """
        Detect PII in a given word using Presidio and custom recognizers.
        Args:
            word_text (str): The word to check for PII.
            config (Config): Configuration object with relevant settings.
        Returns:
            bool: True if the word is detected as PII, otherwise False.
        """
        if not word_text.strip():
            return False

        # Temporarily disable logging during analysis
        logging.getLogger("presidio").disabled = True
        try:
            results = self.analyzer.analyze(
                text=word_text,
                language=config.language
            )
        finally:
            # Re-enable logging after analysis
            logging.getLogger("presidio").disabled = False

        # Check if any recognized entity matches PII types with sufficient confidence
        return any(
            result.entity_type in config.pii_types
            and result.score >= config.redaction_options.get("minimum_confidence_score", 0.5)
            for result in results
        )

    def validate_with_jobbert(self, texts: List[str], config: Config) -> List[bool]:
        """
        Validate multiple texts with JobBERT embeddings.
        Args:
            texts (List[str]): List of text fragments to validate.
            config (Config): Configuration object.
        Returns:
            List[bool]: List of validation results for each text.
        """
        if not texts:
            return []

        # Check cache if enabled
        if config.cache_enabled:
            cached_results = [self.word_cache.get(text, None) for text in texts]
            if all(result is not None for result in cached_results):
                return cached_results

        # Process uncached texts
        inputs = self.tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=512).to(self.device)
        with torch.no_grad():
            outputs = self.model(**inputs)
        embeddings = outputs.last_hidden_state.mean(dim=1)
        confidence_scores = [torch.norm(emb).item() for emb in embeddings]
        results = [score > config.confidence_threshold for score in confidence_scores]

        # Update cache
        if config.cache_enabled:
            for text, result in zip(texts, results):
                self.word_cache[text] = result

        return results


##############################################################################
# PDF PROCESSING
##############################################################################
class PDFProcessor:
    def __init__(self, model_manager: ModelManager, config: Config):
        self.model_manager = model_manager
        self.config = config

    def process_page(self, page: fitz.Page) -> Dict[str, Any]:
        """Process a single PDF page for PII redaction."""
        wordlist = page.get_text("words")
        redact_areas = []
        batch_texts = []
        batch_coords = []
        context_words_before = self.config.redaction_options.get("context_words_before", 0)
        context_words_after = self.config.redaction_options.get("context_words_after", 0)

        for idx, word in enumerate(wordlist):
            text = word[4]
            if self.model_manager.detect_pii_in_word(text, self.config):
                # Include context words
                context_start = max(0, idx - context_words_before)
                context_end = min(len(wordlist), idx + context_words_after + 1)
                context = " ".join([w[4] for w in wordlist[context_start:context_end]])
                batch_texts.append(context)
                batch_coords.append((word[0], word[1], word[2], word[3]))

        # Validate with JobBERT
        if batch_texts:
            validation_results = self.model_manager.validate_with_jobbert(batch_texts, self.config)
            for (coords, should_redact) in zip(batch_coords, validation_results):
                if should_redact:
                    redact_areas.append(fitz.Rect(*coords))

        return {"redact_areas": redact_areas, "word_count": len(wordlist)}

    def redact_pdf(self, input_path: str, output_path: str) -> Dict[str, int]:
        """Redact PII from a PDF document."""
        try:
            doc = fitz.open(input_path)
            total_words = 0
            redacted_words = 0

            for page_index in range(len(doc)):
                page = doc[page_index]
                result = self.process_page(page)
                total_words += result["word_count"]
                redacted_words += len(result["redact_areas"])

                # Apply redactions
                for rect in result["redact_areas"]:
                    fill_color = (192, 192, 192, 128) if self.config.test_mode else (0, 0, 0)
                    page.add_redact_annot(rect, fill=fill_color)
                if not self.config.test_mode:
                    page.apply_redactions()

            doc.save(output_path, garbage=4, deflate=True)
            doc.close()
            return {"total_words": total_words, "redacted_words": redacted_words}

        except Exception as e:
            logging.error(f"Error processing PDF {input_path}: {str(e)}")
            return {"total_words": 0, "redacted_words": 0}


##############################################################################
# FILE MANAGEMENT
##############################################################################
class FileManager:
    @staticmethod
    def clear_output_folder(output_folder: str):
        """Clear output folder with user confirmation"""
        if os.path.exists(output_folder) and os.listdir(output_folder):
            user_input = input(f"The folder '{output_folder}' is not empty. Clear it? (yes/no): ")
            if user_input.lower() in ["yes", "y"]:
                for file in os.listdir(output_folder):
                    os.remove(os.path.join(output_folder, file))
                logging.info(f"Cleared all files from '{output_folder}'")
            else:
                logging.info("Operation aborted")
                exit()
        os.makedirs(output_folder, exist_ok=True)

    @staticmethod
    def get_pdf_files(input_folder: str) -> List[str]:
        """Get list of PDF files from input folder"""
        return [os.path.join(input_folder, f) for f in os.listdir(input_folder) 
                if f.lower().endswith(".pdf")]

##############################################################################
# MAIN WORKFLOW
##############################################################################
def main():
    # Load configuration
    config = load_config()

    # Setup logging with reduced output
    setup_logging(config)

    try:
        # Initialize components
        logging.info("Initializing model manager...")
        model_manager = ModelManager(config.model_directory)
        pdf_processor = PDFProcessor(model_manager, config)

        # Prepare folders
        FileManager.clear_output_folder(config.output_folder)
        input_pdfs = FileManager.get_pdf_files(config.input_folder)

        if not input_pdfs:
            logging.warning(f"No PDF files found in '{config.input_folder}'")
            return

        # Process PDFs
        logging.info(f"Found {len(input_pdfs)} PDF files to process")
        overall_summary = []

        # Disable tqdm in non-interactive environments
        disable_tqdm = not sys.stdout.isatty()
        
        with tqdm(total=len(input_pdfs), 
                 desc="Processing PDFs", 
                 unit="file",
                 disable=disable_tqdm) as pbar:
            
            for pdf_path in input_pdfs:
                try:
                    base_name = os.path.basename(pdf_path)
                    output_filename = f"{os.path.splitext(base_name)[0]}_redacted.pdf"
                    output_path = os.path.join(config.output_folder, output_filename)

                    logging.debug(f"Processing: {base_name}")
                    summary = pdf_processor.redact_pdf(pdf_path, output_path)
                    
                    overall_summary.append({
                        "input_pdf": base_name,
                        "redacted_pdf": output_filename,
                        **summary
                    })

                except Exception as e:
                    logging.error(f"Error processing {base_name}: {str(e)}")
                    continue
                finally:
                    pbar.update(1)

        # Print summary
        print("\n" + "=" * 60)
        print(" Redaction Summary ")
        print("=" * 60)

        total_documents = len(overall_summary)
        total_words = sum(d['total_words'] for d in overall_summary)
        total_redacted = sum(d['redacted_words'] for d in overall_summary)

        # Print detailed summary if enabled
        if config.logging.get("detailed_summary", True):
            for doc_summary in overall_summary:
                print(f"\nFile: {doc_summary['input_pdf']} → {doc_summary['redacted_pdf']}")
                print(f"  Words processed: {doc_summary['total_words']:,}")
                print(f"  Words redacted:  {doc_summary['redacted_words']:,}")
                redaction_rate = (doc_summary['redacted_words'] / doc_summary['total_words'] * 100 
                                if doc_summary['total_words'] > 0 else 0)
                print(f"  Redaction rate:  {redaction_rate:.2f}%")
                print("-" * 60)

        # Print overall statistics
        print("\nOverall Statistics:")
        print(f"Documents processed: {total_documents:,}")
        print(f"Total words processed: {total_words:,}")
        print(f"Total words redacted: {total_redacted:,}")
        overall_rate = (total_redacted / total_words * 100 if total_words > 0 else 0)
        print(f"Overall redaction rate: {overall_rate:.2f}%")
        print("=" * 60)

        # Log completion
        logging.info("PDF processing completed successfully")

    except Exception as e:
        logging.error(f"Fatal error in main execution: {str(e)}")
        raise

if __name__ == "__main__":
    main()

The folder '../data/redact_output' is not empty. Clear it? (yes/no):  yes



 Redaction Summary 

File: brock-webb-resume-Sep2022.rtf.pdf → brock-webb-resume-Sep2022.rtf_redacted.pdf
  Words processed: 1,199
  Words redacted:  67
  Redaction rate:  5.59%
------------------------------------------------------------

Overall Statistics:
Documents processed: 1
Total words processed: 1,199
Total words redacted: 67
Overall redaction rate: 5.59%
