In [3]:
%pip install azure-ai-textanalytics azure-identity

Collecting azure-ai-textanalytics
  Using cached azure_ai_textanalytics-5.3.0-py3-none-any.whl.metadata (82 kB)
Collecting azure-identity
  Using cached azure_identity-1.25.2-py3-none-any.whl.metadata (90 kB)
Collecting azure-core<2.0.0,>=1.24.0 (from azure-ai-textanalytics)
  Using cached azure_core-1.38.2-py3-none-any.whl.metadata (48 kB)
Collecting azure-common~=1.1 (from azure-ai-textanalytics)
  Using cached azure_common-1.1.28-py2.py3-none-any.whl.metadata (5.0 kB)
Collecting isodate<1.0.0,>=0.6.1 (from azure-ai-textanalytics)
  Using cached isodate-0.7.2-py3-none-any.whl.metadata (11 kB)
Collecting cryptography>=2.5 (from azure-identity)
  Downloading cryptography-46.0.5-cp38-abi3-macosx_10_9_universal2.whl.metadata (5.7 kB)
Collecting msal>=1.31.0 (from azure-identity)
  Using cached msal-1.35.0-py3-none-any.whl.metadata (11 kB)
Collecting msal-extensions>=1.2.0 (from azure-identity)
  Using cached msal_extensions-1.3.1-py3-none-any.whl.metadata (7.8 kB)
Collecting cffi>=2.0.0 

In [18]:
import os
import json
from collections import defaultdict
from azure.core.credentials import AzureKeyCredential
from azure.ai.textanalytics import TextAnalyticsClient

# Mapping

In [23]:
AZURE_PII_MAPPING = {
    'ADDRESS': 'Address',
    'AGE': 'Age',
    'BIRTHDAY': 'DateTime',
    'CITY': 'Location',               # Azure often categorizes cities as Location or Address
    'CREDIT_CARD': 'CreditCardNumber',
    'CREDIT_CARD_NUMBER': 'CreditCardNumber',
    'DATE': 'DateTime',
    'DATE_OF_BIRTH': 'DateTime',
    'DATE_TIME': 'DateTime',
    'DOB': 'DateTime',
    'DOMAIN': 'URL',
    'DOMAIN_NAME': 'URL',
    'EMAIL': 'Email',
    'EMAIL_ADDRESS': 'Email',
    'FACILITY': 'Organization',       # Facilities/Hospitals roll into Organization
    'FIRST_NAME': 'Person',
    'GPE': 'Location',
    'HCW': 'Person',                  # Health Care Worker
    'HOSP': 'Organization',
    'HOSPITAL': 'Organization',
    'IBAN': 'InternationalBankingAccountNumber',
    'IBAN_CODE': 'InternationalBankingAccountNumber',
    'ID': 'NationalIdentityNumber',   # Generic fallback for IDs in Azure
    'IP_ADDRESS': 'IPAddress',
    'LAST_NAME': 'Person',
    'LOC': 'Location',
    'LOCATION': 'Location',
    'NAME': 'Person',
    'NATIONALITY': 'O',               # Azure PII doesn't strictly flag Nationality/NRP as PII
    'NORP': 'O',                      # (Nationalities, Religious, Political groups)
    'NRP': 'O',
    'O': 'O',
    'ORG': 'Organization',
    'ORGANIZATION': 'Organization',
    'PATIENT': 'Person',
    'PATORG': 'Organization',
    'PER': 'Person',
    'PERSON': 'Person',
    'PHONE': 'PhoneNumber',
    'PHONE_NUMBER': 'PhoneNumber',
    'PREFIX': 'PersonType',           # Azure uses PersonType for Mr., Mrs., Dr., etc.
    'SSN': 'USSocialSecurityNumber',
    'STAFF': 'Person',
    'STREET_ADDRESS': 'Address',
    'TIME': 'DateTime',
    'TITLE': 'PersonType',            # Azure uses PersonType for job titles/roles
    'URL': 'URL',
    'US_DRIVER_LICENSE': 'USDriversLicenseNumber',
    'US_SSN': 'USSocialSecurityNumber',
    'VENDOR': 'Organization',
    'ZIP': 'Address',                 # Azure groups zip codes under the larger Address entity
    'ZIP_CODE': 'Address'
}

# Load Dataset

In [24]:
def load_dataset(file_path):
    """Loads the JSON dataset from the specified file path."""
    with open(file_path, "r", encoding="utf-8") as f:
        return json.load(f)

In [25]:
# 1. Load the data from your specific JSON file
json_file_path = "synth_dataset_v2.json"
my_dataset = load_dataset(json_file_path)

# Evaluation

In [26]:
def evaluate_pii_dataset(dataset):
    # Metrics counters
    true_positives = 0
    false_positives = 0
    false_negatives = 0
    
    # Process in batches of 5 (Azure's standard limit for this API call)
    batch_size = 5
    for i in range(0, len(dataset), batch_size):
        batch = dataset[i:i + batch_size]
        texts = [doc["full_text"] for doc in batch]
        
        # Call Azure API
        response = client.recognize_pii_entities(texts, language="en")
        
        for doc_idx, result in enumerate(response):
            if result.is_error:
                print(f"Error in document: {result.error}")
                continue
            
            ground_truth_spans = batch[doc_idx]["spans"]
            
            # Extract predicted spans formatting them like your ground truth
            predicted_spans = []
            for entity in result.entities:
                predicted_spans.append({
                    "entity_type": entity.category,
                    "start_position": entity.offset,
                    "end_position": entity.offset + entity.length
                })
            
            # --- EVALUATION LOGIC ---
            # Create sets of tuples for easy comparison: (start, end, type)
            gt_set = set()
            for span in ground_truth_spans:
                # Map your dataset's category to Azure's format for a fair comparison
                mapped_type = AZURE_PII_MAPPING.get(span["entity_type"], span["entity_type"])
                gt_set.add((span["start_position"], span["end_position"], mapped_type))
                
            pred_set = set()
            for span in predicted_spans:
                pred_set.add((span["start_position"], span["end_position"], span["entity_type"]))

            # Calculate intersections and differences
            matched = gt_set.intersection(pred_set)
            
            # True Positives: Predicted perfectly matches Ground Truth (offset and type)
            tp = len(matched)
            # False Positives: Predicted by Azure, but not in Ground Truth
            fp = len(pred_set - gt_set)
            # False Negatives: In Ground Truth, but missed by Azure
            fn = len(gt_set - pred_set)
            
            true_positives += tp
            false_positives += fp
            false_negatives += fn

    # --- CALCULATE METRICS ---
    # Precision: When Azure predicts PII, how often is it right?
    precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0
    
    # Recall: Out of all actual PII, how much did Azure find?
    recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0
    
    # F1-Score: The harmonic mean of Precision and Recall
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    
    print("--- EVALUATION RESULTS ---")
    print(f"True Positives (Exact Matches): {true_positives}")
    print(f"False Positives (Hallucinations/Over-tagging): {false_positives}")
    print(f"False Negatives (Missed PII): {false_negatives}")
    print("-" * 26)
    print(f"Precision: {precision:.4f}")
    print(f"Recall:    {recall:.4f}")
    print(f"F1-Score:  {f1_score:.4f}")

evaluate_pii_dataset(my_dataset)

--- EVALUATION RESULTS ---
True Positives (Exact Matches): 1515
False Positives (Hallucinations/Over-tagging): 1183
False Negatives (Missed PII): 1348
--------------------------
Precision: 0.5615
Recall:    0.5292
F1-Score:  0.5449
