In [1]:
import os
import sys
import argparse
import numpy as np
from typing import List, Tuple, Optional

In [2]:
# Third-party runtime dependencies: ONNX Runtime (inference) and the
# Hugging Face tokenizer loader. They are imported separately from the
# stdlib/numpy imports so that a missing package produces an install hint.
# NOTE(review): the ImportError is only printed, not re-raised, so execution
# continues and any later use of `ort` / `AutoTokenizer` fails with NameError.
try:
    import onnxruntime as ort
    from transformers import AutoTokenizer
except ImportError as e:
    print(f"Missing required dependency: {e}")
    print("Install with: pip install onnxruntime transformers")

In [5]:
# Build the redactor once and reuse it in the demo cells below; the
# constructor loads the tokenizer and opens the ONNX inference session.
# NOTE(review): SimplePIIRedactor is defined in a cell that appears later in
# this notebook (In [4]); that cell must be run first, and should be moved
# above this one so that Restart Kernel -> Run All works.
redactor = SimplePIIRedactor(model_path='models/onnx/pii-redaction-model')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading tokenizer from models/onnx/pii-redaction-model
Loading ONNX model from models/onnx/pii-redaction-model/model_optimized.onnx
Model loaded successfully!


In [6]:
# English demo: free text containing a name, an e-mail address, a phone
# number and a URL, followed by a sentence with no personal data.
english_text = """
My name is John Doe and my email is john@example.com
You can reach me at (123) 456-7890 or visit my website at https://www.johndoe.com.
I love programming in Python and my favorite library is NumPy.
"""
# Use "***" instead of the default "[REDACTED]" placeholder.
redacted = redactor.redact(english_text, redaction_token="***")
# Print input and output one after the other for visual comparison.
print(f"Original: {english_text}")
print(f"Redacted: {redacted}")

Original: 
My name is John Doe and my email is john@example.com
You can reach me at (123) 456-7890 or visit my website at https://www.johndoe.com.
I love programming in Python and my favorite library is NumPy.

Redacted: 
My name is *** and my email is ***
You can reach me at *** or visit my website at ***
I love programming in Python and my favorite library is NumPy.



In [7]:
# Hebrew demo: a mock patient record with labelled fields (patient name,
# date of birth, phone number, address, ID number) followed by free-text
# notes that mention the patient's first name again.
hebrew_text = """
שם חולה: יוסי כהן
תאריך לידה: 01/01/1980
מספר טלפון: 050-1234567
כתובת: רחוב ירושלים 10, תל אביב
תעודת זהות: 123456789
הערות: יוסי סובל מאלרגיות למזון מסוים.  
יש להימנע ממתן מזון המכיל אגוזים.
"""
# Use "***" instead of the default "[REDACTED]" placeholder.
redacted_hebrew = redactor.redact(hebrew_text, redaction_token="***")
# Print input and output one after the other for visual comparison.
print(f"Original Hebrew: {hebrew_text}")
print(f"Redacted Hebrew: {redacted_hebrew}")

Original Hebrew: 
שם חולה: יוסי כהן
תאריך לידה: 01/01/1980
מספר טלפון: 050-1234567
כתובת: רחוב ירושלים 10, תל אביב
תעודת זהות: 123456789
הערות: יוסי סובל מאלרגיות למזון מסוים.  
יש להימנע ממתן מזון המכיל אגוזים.

Redacted Hebrew: 
שם חולה: ***
תאריך לידה: ***
מספר טלפון: ***
כתובת: ***
תעודת זהות: ***
הערות: *** מאלרגיות למזון מסוים.  
יש להימנע ממתן מזון המכיל אגוזים.



In [4]:

class SimplePIIRedactor:
    """Minimal PII redactor using ONNX model.

    The model is a token classifier with three labels (``O``, ``B-PII``,
    ``I-PII``). Predicted PII tokens are mapped back to character ranges of
    the input through the tokenizer's offset mapping, and each range is
    replaced by a redaction token.

    Attributes:
        model_path: Directory the tokenizer and ONNX model were loaded from.
        max_length: Number of tokens per inference window.
        stride: Number of tokens shared by consecutive windows when the input
            is longer than one window.
        tokenizer: Tokenizer loaded from ``model_path``.
        session: ONNX Runtime inference session (CPU execution provider).
        id_to_label: Mapping from class index to label name.
    """

    def __init__(self, model_path: str = "models/onnx/pii-redaction-model"):
        """
        Initialize the PII redactor.

        Args:
            model_path: Path to directory containing ONNX model files
        """
        self.model_path = model_path
        self.max_length = 128
        # Overlap between consecutive windows, so that an entity cut by one
        # window boundary is seen with context in the next window.
        self.stride = 32

        # Load tokenizer
        print(f"Loading tokenizer from {model_path}")
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)

        # Load ONNX model, preferring the optimized export when it exists
        onnx_model_path = os.path.join(model_path, "model_optimized.onnx")
        if not os.path.exists(onnx_model_path):
            onnx_model_path = os.path.join(model_path, "model.onnx")

        print(f"Loading ONNX model from {onnx_model_path}")
        self.session = ort.InferenceSession(onnx_model_path, providers=['CPUExecutionProvider'])

        # Labels
        self.id_to_label = {0: "O", 1: "B-PII", 2: "I-PII"}

        print("Model loaded successfully!")

    def redact(self, text: str, redaction_token: str = "[REDACTED]") -> str:
        """
        Redact PII from text.

        Inputs longer than ``max_length`` tokens are processed as overlapping
        windows instead of being truncated, so PII anywhere in the text is
        covered. Inputs that fit in one window yield a single window.

        Args:
            text: Input text
            redaction_token: Token to replace PII with

        Returns:
            Text with PII redacted
        """
        if not text:
            return text

        # Tokenize into fixed-size windows. Offsets are always relative to the
        # original text, also for overflow windows.
        encoded = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            stride=self.stride,
            return_overflowing_tokens=True,
            return_offsets_mapping=True,
            return_tensors="np",
        )

        # The ONNX graph is fed only input_ids and attention_mask, so
        # token_type_ids are not requested from the tokenizer.
        input_ids = np.asarray(encoded["input_ids"]).astype(np.int64)
        attention_mask = np.asarray(encoded["attention_mask"]).astype(np.int64)
        offset_mapping = encoded["offset_mapping"]

        spans: List[Tuple[int, int]] = []
        for row in range(input_ids.shape[0]):
            # Run one window at a time so the batch dimension stays 1, as in
            # the single-window case.
            outputs = self.session.run(
                None,
                {
                    "input_ids": input_ids[row:row + 1],
                    "attention_mask": attention_mask[row:row + 1],
                },
            )
            predictions = np.argmax(outputs[0], axis=-1)[0]
            spans.extend(
                self._window_spans(predictions, attention_mask[row], offset_mapping[row])
            )

        # Apply redaction by replacing character ranges (reverse order to
        # avoid shifting the offsets of ranges not yet replaced)
        redacted_text = text
        for start, end in reversed(self._merge_overlapping(spans)):
            redacted_text = redacted_text[:start] + redaction_token + redacted_text[end:]

        return redacted_text

    def _window_spans(self, predictions, attention_mask, offsets) -> List[Tuple[int, int]]:
        """Decode one window's label ids into (start, end) character spans."""
        spans: List[Tuple[int, int]] = []
        span_start = None
        span_end = None

        for pred, mask, (start, end) in zip(predictions, attention_mask, offsets):
            if mask == 0:  # Padding token: nothing real follows
                break

            # Skip special tokens (offset mapping is (0, 0) for special tokens)
            if start == 0 and end == 0:
                continue

            label = self.id_to_label[int(pred)]

            if label == "B-PII" or (label == "I-PII" and span_start is None):
                # B-PII always opens a new span. An I-PII with no open span
                # (missed B-PII, or an entity continuing from the previous
                # window) is also treated as an opening so it is not leaked.
                if span_start is not None:
                    spans.append((span_start, span_end))
                span_start, span_end = int(start), int(end)
            elif label == "I-PII":
                span_end = int(end)
            elif span_start is not None:
                # Non-PII token closes the open span
                spans.append((span_start, span_end))
                span_start = None
                span_end = None

        if span_start is not None:
            spans.append((span_start, span_end))

        return spans

    @staticmethod
    def _merge_overlapping(spans: List[Tuple[int, int]]) -> List[Tuple[int, int]]:
        """Sort spans and fuse those that overlap (same text seen in two windows)."""
        merged: List[Tuple[int, int]] = []
        for start, end in sorted(spans):
            # Strict overlap only: spans that merely touch stay separate, so
            # back-to-back entities keep one redaction token each.
            if merged and start < merged[-1][1]:
                merged[-1] = (merged[-1][0], max(merged[-1][1], end))
            else:
                merged.append((start, end))
        return merged

