In [2]:
!pip install -r requirements.txt

Collecting presidio_analyzer==2.2.30 (from -r requirements.txt (line 1))
  Downloading presidio_analyzer-2.2.30-py3-none-any.whl.metadata (2.3 kB)
Collecting presidio_anonymizer==2.2.30 (from -r requirements.txt (line 2))
  Downloading presidio_anonymizer-2.2.30-py3-none-any.whl.metadata (8.0 kB)
Collecting PyMuPDF==1.22.5 (from -r requirements.txt (line 3))
  Downloading PyMuPDF-1.22.5-cp311-cp311-macosx_11_0_arm64.whl.metadata (8.3 kB)
Collecting spacy>=3.2.0 (from presidio_analyzer==2.2.30->-r requirements.txt (line 1))
  Downloading spacy-3.8.3-cp311-cp311-macosx_11_0_arm64.whl.metadata (27 kB)
Collecting tldextract (from presidio_analyzer==2.2.30->-r requirements.txt (line 1))
  Downloading tldextract-5.1.3-py3-none-any.whl.metadata (11 kB)
Collecting phonenumbers>=8.12 (from presidio_analyzer==2.2.30->-r requirements.txt (line 1))
  Downloading phonenumbers-8.13.52-py2.py3-none-any.whl.metadata (10 kB)
Collecting pycryptodome>=3.10.1 (from presidio_anonymizer==2.2.30->-r requirem

In [None]:
import os
import random
import shutil
import fitz  # PyMuPDF
from presidio_analyzer import AnalyzerEngine
from presidio_analyzer import RecognizerResult

# If you have additional or custom recognizers, import them here
# from presidio_analyzer import PatternRecognizer

##############################################################################
# CONFIGURATION
##############################################################################
# Folder that main() scans for source PDFs.
INPUT_FOLDER = "/data/resumes"
# Folder that receives both the "_input" copies and the "_redacted" outputs.
OUTPUT_FOLDER = "/data/redaction_test"
SAMPLE_SIZE = 20  # Maximum number of resumes to sample (all are used if fewer exist)
LANGUAGE = "en"   # Language code passed to analyzer.analyze() for every word

# Single Presidio analyzer, built once when this cell runs and shared by every
# detect_pii_in_word() call.
analyzer = AnalyzerEngine()

##############################################################################
# HELPER FUNCTIONS
##############################################################################

def select_random_pdfs(input_folder, sample_size):
    """
    Selects a random sample of PDF files from the input folder.

    Only regular files whose name ends in ".pdf" (case-insensitive) are
    candidates. A sub-directory that happens to be named "something.pdf" is
    skipped, because it could be neither copied nor opened as a document.

    Args:
        input_folder: Directory to scan (top level only, not recursive).
        sample_size: Maximum number of PDFs to return.

    Returns:
        A list of paths built by joining ``input_folder`` with each chosen file
        name, so the paths are absolute only when ``input_folder`` is. When the
        folder holds ``sample_size`` PDFs or fewer, all of them are returned.

    Raises:
        OSError: If ``input_folder`` cannot be listed.
        ValueError: If ``sample_size`` is negative (raised by random.sample).
    """
    # os.listdir() order is arbitrary; sorting gives random.sample() a stable
    # population so a seeded run picks the same files every time.
    candidates = sorted(
        name for name in os.listdir(input_folder)
        if name.lower().endswith(".pdf")
        and os.path.isfile(os.path.join(input_folder, name))
    )
    if len(candidates) > sample_size:
        candidates = random.sample(candidates, sample_size)
    return [os.path.join(input_folder, name) for name in candidates]


def copy_and_rename_pdfs(pdfs, output_folder):
    """
    Duplicate each PDF into ``output_folder`` under a name tagged with '_input'.

    The tag is inserted between the file's stem and its extension, e.g.
    ``cv.pdf`` becomes ``cv_input.pdf``. The output folder is created if it
    does not exist yet.

    Args:
        pdfs: Iterable of source PDF paths.
        output_folder: Destination directory for the copies.

    Returns:
        The destination paths, in the same order as ``pdfs``.
    """
    os.makedirs(output_folder, exist_ok=True)

    def _copy_with_tag(source_path):
        # Split only the final extension so dotted stems stay intact.
        stem, suffix = os.path.splitext(os.path.basename(source_path))
        destination = os.path.join(output_folder, f"{stem}_input{suffix}")
        shutil.copy(source_path, destination)
        return destination

    return [_copy_with_tag(source_path) for source_path in pdfs]


def detect_pii_in_word(word_text):
    """
    Decide whether a single word counts as PII according to Presidio.

    The word is sent on its own to the module-level ``analyzer`` using the
    configured ``LANGUAGE``. Any recognized entity at all marks it as PII.

    Args:
        word_text: The word to check.

    Returns:
        True if the analyzer reports at least one entity; False otherwise,
        including for empty or whitespace-only input, which is never analyzed.
    """
    # Nothing to analyze in a blank token, so skip the analyzer call entirely.
    if word_text.strip() == "":
        return False

    findings = analyzer.analyze(text=word_text, language=LANGUAGE)
    return len(findings) != 0


def redact_pii_in_pdf(input_pdf_path, output_pdf_path):
    """
    Write a redacted copy of a PDF in which every PII word is blacked out.

    Each page's words are extracted with PyMuPDF (fitz) and checked one at a
    time with detect_pii_in_word(). A redaction annotation with a black fill is
    placed over every flagged word, and the page's redactions are then applied.
    The input file is only read; the result is saved to ``output_pdf_path``.

    Args:
        input_pdf_path: Path of the PDF to read.
        output_pdf_path: Path the redacted PDF is written to.

    Returns:
        A dict with ``"total_words"`` (words examined across all pages) and
        ``"redacted_words"`` (words flagged as PII and redacted).
    """
    total_words = 0
    redacted_words = 0

    # The context manager closes the document even if analysis or saving fails.
    with fitz.open(input_pdf_path) as doc:
        for page in doc:
            page_has_redactions = False

            # Each entry is (x0, y0, x1, y1, "word", block_no, line_no, word_no).
            # The list is fully extracted before any annotation is added.
            for x0, y0, x1, y1, text, *_ in page.get_text("words"):
                total_words += 1
                if detect_pii_in_word(text):
                    page.add_redact_annot(fitz.Rect(x0, y0, x1, y1), fill=(0, 0, 0))
                    redacted_words += 1
                    page_has_redactions = True

            # Redactions are applied per page: apply_redactions() is a Page
            # method, and it is what removes the covered content.
            if page_has_redactions:
                page.apply_redactions()

        # Save only to the output path. No incremental save is done on the
        # source, so the input file never receives the redaction annotations.
        # garbage=4 drops objects left unreferenced after redaction and
        # deflate recompresses the rewritten streams.
        doc.save(output_pdf_path, garbage=4, deflate=True)

    return {
        "total_words": total_words,
        "redacted_words": redacted_words
    }


def main():
    """
    Run the sampling / copy / redaction pipeline and print a summary.

    Steps:
      1. Pick up to SAMPLE_SIZE random PDFs from INPUT_FOLDER.
      2. Copy them into OUTPUT_FOLDER with '_input' appended to the file stem.
      3. Create a '<stem>_redacted' PDF next to each copy.
      4. Print per-file and overall word / redaction counts.
    """
    input_marker = "_input"

    # 1) Select random PDFs
    pdfs_to_process = select_random_pdfs(INPUT_FOLDER, SAMPLE_SIZE)

    # 2) Copy them into the output folder with _input appended
    input_pdfs = copy_and_rename_pdfs(pdfs_to_process, OUTPUT_FOLDER)

    # 3) For each newly copied PDF, create a redacted version with _redacted appended
    overall_summary = []

    for pdf_path in input_pdfs:
        base_name = os.path.basename(pdf_path)
        base, ext = os.path.splitext(base_name)

        # Strip only the trailing marker added in step 2. A blanket
        # str.replace() would also eat "_input" inside the original name
        # (e.g. "user_input_form"), mangling it and risking collisions.
        if base.endswith(input_marker):
            base = base[:-len(input_marker)]

        # Output redacted file
        redacted_filename = f"{base}_redacted{ext}"
        redacted_filepath = os.path.join(OUTPUT_FOLDER, redacted_filename)

        summary = redact_pii_in_pdf(pdf_path, redacted_filepath)

        # Store the summary data
        overall_summary.append({
            "input_pdf": base_name,
            "redacted_pdf": redacted_filename,
            "total_words": summary["total_words"],
            "redacted_words": summary["redacted_words"]
        })

    # 4) Print summary statistics
    print("="*60)
    print(" Redaction Summary ")
    print("="*60)

    for doc_summary in overall_summary:
        print(f"File: {doc_summary['input_pdf']} -> {doc_summary['redacted_pdf']}")
        print(f"  Words: {doc_summary['total_words']}, Redacted: {doc_summary['redacted_words']}")
        print("-"*60)

    total_documents = len(overall_summary)
    total_words_processed = sum(entry["total_words"] for entry in overall_summary)
    total_redacted = sum(entry["redacted_words"] for entry in overall_summary)

    print(f"Processed {total_documents} PDFs")
    print(f"Total words processed: {total_words_processed}")
    print(f"Total words redacted: {total_redacted}")
    print("="*60)


# Run the full pipeline only when this code is executed directly rather than
# imported as a module.
if __name__ == "__main__":
    main()
