# Problem 3: Privacy-Preserving Document Processing

Building a multilingual PII detection and redaction system for English and French documents.

Approach:
1. Detection: Regex + spaCy NER
2. Redaction: Three strategies - Strict, Typed, Surrogate
3. Consistency: Maintain entity mapping for surrogate replacement

In [None]:
!pip install spacy faker langdetect
!python -m spacy download fr_core_news_md
!python -m spacy download en_core_web_lg

Collecting faker
  Downloading faker-40.1.2-py3-none-any.whl.metadata (16 kB)
Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading faker-40.1.2-py3-none-any.whl (2.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m48.9 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py) ... [?25l[?25hdone
  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993223 sha256=eb30766ee7a9cad6ebb7e67ccd358e71e205603d5fd73fc3cfcd9d18241918ba
  Stored in directory: /root/.cache/pip/wheels/c1/67/88/e844b5b022812e15a52e4eaa38a1e709e99f06f6639d7e3ba7
Successfully built langdetect
Installing collected packages: langdetect, faker
Successfully installed faker-40.1.2 lan

In [None]:
import re
import spacy
from faker import Faker
from collections import defaultdict
import json

nlp_fr = spacy.load("fr_core_news_md")
nlp_en = spacy.load("en_core_web_lg")

faker_fr = Faker('fr_FR')
faker_en = Faker('en_US')

print("All dependencies loaded")

✓ All dependencies loaded


In [None]:
# Test documents
french_medical_doc = """Dossier Médical: Patient Marie Dubois (née le 15/03/1978)
Sécurité Sociale: 2 78 03 75 116 025 43
Adresse: 42 Rue de la République, 59000 Lille
Tél: +33 6 12 34 56 78
Email: marie.dubois@example.fr
Diagnostic: Hypertension (ICD-10: I10)
Médecin: Dr. Jean Martin, ID médical: FR789456123
"""

french_legal_doc = """CONTRAT DE TRAVAIL
Entre l'entreprise TechCorp SAS, 15 Avenue des Champs-Élysées, 75008 Paris
Et Monsieur Pierre Lefebvre, né le 22/07/1985
Numéro de Sécurité Sociale: 1 85 07 75 238 045 12
Domicile: 28 Boulevard Voltaire, 75011 Paris
Téléphone: +33 1 42 56 78 90
Email: p.lefebvre@techcorp.fr
Signé à Paris, le 10/01/2024
Directeur RH: Mme Sophie Moreau, ID: FR456123789
"""

french_financial_doc = """RELEVÉ BANCAIRE - Banque de France
Titulaire: Mme Isabelle Bernard
Date de naissance: 18/11/1990
Sécurité Sociale: 2 90 11 92 145 678 23
Adresse: 67 Rue de Rivoli, 75001 Paris
Téléphone: 06 78 90 12 34
Email: isabelle.bernard@gmail.com
IBAN: FR76 3000 6000 0112 3456 7890 189
Conseiller: M. Laurent Petit, ID: FR234567890
Date: 15/12/2023
"""

french_edge_cases = """DOSSIER CONFIDENTIEL
Ancien nom: Marie-Claire Lefèvre-Dubois (née Lefèvre le 31/12/1995)
Nouveau nom après mariage: Marie-Claire Martin
SSN: 2 95 12 75 456 789 01
Téléphone portable: 0612345678
Téléphone fixe: +33 (0)1 23 45 67 89
Emails: marie.claire@example.fr, mc.martin@work.com
Adresse: 123 Rue du Faubourg Saint-Antoine, 75012 Paris
Contact d'urgence: Jean Martin (époux), 06-78-90-12-34
Médecin: Docteur Sophie Lefebvre, FR890123456
"""

consistency_test_doc = """SUIVI MÉDICAL
Première visite de Marie Dubois le 15/01/2024
Marie Dubois (SSN: 2 78 03 75 116 025 43) a consulté.
Contact: marie.dubois@example.fr ou +33 6 12 34 56 78

Deuxième visite de Marie Dubois le 22/01/2024
Marie Dubois a montré des améliorations.

Troisième visite de Marie Dubois le 05/02/2024
Le dossier de Marie Dubois est à jour.
"""

english_medical_doc = """Patient Name: John Smith (DOB: 03/15/1978)
SSN: 123-45-6789
Address: 123 Main Street, New York, NY 10001
Phone: (212) 555-1234
Email: john.smith@example.com
Diagnosis: Hypertension (ICD-10: I10)
Physician: Dr. Sarah Johnson, Medical ID: US987654321
"""

english_legal_doc = """EMPLOYMENT AGREEMENT
Between TechCorp Inc., 500 5th Avenue, New York, NY 10110
And Ms. Emily Davis, born 07/22/1985
Social Security Number: 987-65-4321
Address: 789 Broadway, Brooklyn, NY 11211
Phone: (646) 555-9876
Email: e.davis@techcorp.com
Executed in New York, 01/10/2024
HR Director: Mr. Michael Brown, ID: US123456789
"""

english_financial_doc = """BANK STATEMENT - First National Bank
Account Holder: Mr. Robert Williams
Date of Birth: 11/18/1990
SSN: 456-78-9012
Address: 456 Park Avenue, Manhattan, NY 10022
Phone: (917) 555-4567
Email: robert.williams@gmail.com
Account #: 1234567890
Advisor: Ms. Jennifer Lee, ID: US345678901
"""

english_edge_cases = """CONFIDENTIAL FILE
Former name: Mary-Jane Smith-Johnson (née Smith, DOB: 12/31/1995)
Current name: Mary-Jane Williams
SSN: 567-89-0123
Cell: 6465559876
Office: +1 (212) 555-4321
Emails: mary.jane@example.com, mj.williams@work.org
Address: 123 West 42nd Street, Apt 5B, New York, NY 10036
Emergency Contact: John Williams (spouse), 646-555-7890
Physician: Doctor Jennifer Smith, US901234567
"""

TEST_DOCUMENTS = {
    'french': {
        'medical': french_medical_doc,
        'legal': french_legal_doc,
        'financial': french_financial_doc,
        'edge_cases': french_edge_cases,
        'consistency': consistency_test_doc,
    },
    'english': {
        'medical': english_medical_doc,
        'legal': english_legal_doc,
        'financial': english_financial_doc,
        'edge_cases': english_edge_cases,
    }
}

print(f"✓ Test documents loaded:")
print(f"  French: {len(TEST_DOCUMENTS['french'])} documents")
print(f"  English: {len(TEST_DOCUMENTS['english'])} documents")
print(f"  Total: {sum(len(docs) for docs in TEST_DOCUMENTS.values())} documents")

✓ Test documents loaded:
  French: 5 documents
  English: 4 documents
  Total: 9 documents


In [None]:
# Language detection
from langdetect import detect as detect_lang

def detect_language(text: str) -> str:
    detected = detect_lang(text)
    if detected == 'fr':
        return 'fr'
    elif detected == 'en':
        return 'en'
    else:
        # Default to English for other languages
        return 'en'

In [None]:
# Regex patterns for PII
FRENCH_PHONE_PATTERNS = [
    r'\+33\s\d\s\d{2}\s\d{2}\s\d{2}\s\d{2}',
    r'\+33\d\s?\d{2}\s?\d{2}\s?\d{2}\s?\d{2}',
    r'0[1-9]\s\d{2}\s\d{2}\s\d{2}\s\d{2}',
    r'0[1-9][\.-]\d{2}[\.-]\d{2}[\.-]\d{2}[\.-]\d{2}',
    r'0[1-9]\d{8}',
]

ENGLISH_PHONE_PATTERNS = [
    r'\+?1?\s?\(?\d{3}\)?[\s\.-]?\d{3}[\s\.-]?\d{4}',
    r'\d{10}',
]

FRENCH_SSN_PATTERNS = [
    r'[12]\s?\d{2}\s?\d{2}\s?\d{2}\s?\d{3}\s?\d{3}\s?\d{2}',
]

US_SSN_PATTERNS = [
    r'\d{3}[-\s]?\d{2}[-\s]?\d{4}',
]

EMAIL_PATTERN = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
DATE_PATTERNS = [r'\b\d{2}[/-]\d{2}[/-]\d{4}\b']
FRENCH_POSTAL_PATTERN = r'\b\d{5}\b'
US_ZIPCODE_PATTERN = r'\b\d{5}(?:-\d{4})?\b'
MEDICAL_ID_PATTERN = r'\b[A-Z]{2}\d{9}\b'
IBAN_PATTERN = r'\b[A-Z]{2}\d{2}[\s]?(?:\d{4}[\s]?){4,7}\d{1,4}\b'

def detect_pii_regex(text, language=None):
    if language is None:
        language = detect_language(text)

    entities = []

    # Email
    for match in re.finditer(EMAIL_PATTERN, text):
        entities.append({
            'type': 'EMAIL',
            'text': match.group(),
            'start': match.start(),
            'end': match.end(),
            'method': 'regex'
        })

    # Phone numbers
    if language == 'fr':
        for pattern in FRENCH_PHONE_PATTERNS:
            for match in re.finditer(pattern, text):
                entities.append({
                    'type': 'FR_PHONE',
                    'text': match.group(),
                    'start': match.start(),
                    'end': match.end(),
                    'method': 'regex'
                })
    else:
        for pattern in ENGLISH_PHONE_PATTERNS:
            for match in re.finditer(pattern, text):
                matched_text = match.group()
                if len(matched_text) == 10 and matched_text.isdigit():
                    start = max(0, match.start() - 20)
                    end = min(len(text), match.end() + 20)
                    context = text[start:end].lower()
                    if not any(word in context for word in ['phone', 'tel', 'call', 'contact', 'cell', 'mobile']):
                        continue

                entities.append({
                    'type': 'US_PHONE',
                    'text': matched_text,
                    'start': match.start(),
                    'end': match.end(),
                    'method': 'regex'
                })

    # SSN
    if language == 'fr':
        for pattern in FRENCH_SSN_PATTERNS:
            for match in re.finditer(pattern, text):
                entities.append({
                    'type': 'FR_SSN',
                    'text': match.group(),
                    'start': match.start(),
                    'end': match.end(),
                    'method': 'regex'
                })
    else:
        for pattern in US_SSN_PATTERNS:
            for match in re.finditer(pattern, text):
                entities.append({
                    'type': 'US_SSN',
                    'text': match.group(),
                    'start': match.start(),
                    'end': match.end(),
                    'method': 'regex'
                })

    # Dates
    for match in re.finditer(DATE_PATTERNS[0], text):
        entities.append({
            'type': 'DATE',
            'text': match.group(),
            'start': match.start(),
            'end': match.end(),
            'method': 'regex'
        })

    # Postal codes
    if language == 'fr':
        for match in re.finditer(FRENCH_POSTAL_PATTERN, text):
            entities.append({
                'type': 'FR_POSTAL',
                'text': match.group(),
                'start': match.start(),
                'end': match.end(),
                'method': 'regex'
            })
    else:
        for match in re.finditer(US_ZIPCODE_PATTERN, text):
            entities.append({
                'type': 'US_ZIPCODE',
                'text': match.group(),
                'start': match.start(),
                'end': match.end(),
                'method': 'regex'
            })

    # Medical IDs
    for match in re.finditer(MEDICAL_ID_PATTERN, text):
        entities.append({
            'type': 'MEDICAL_ID',
            'text': match.group(),
            'start': match.start(),
            'end': match.end(),
            'method': 'regex'
        })

    # IBAN for French docs
    if language == 'fr':
        for match in re.finditer(IBAN_PATTERN, text):
            entities.append({
                'type': 'IBAN',
                'text': match.group(),
                'start': match.start(),
                'end': match.end(),
                'method': 'regex'
            })

    return entities

# Test
print("Testing Regex Detection:\n")
print("="*70)
print("FRENCH MEDICAL DOCUMENT")
print("="*70)
detected_lang = detect_language(french_medical_doc)
print(f"Detected Language: {detected_lang}\n")
fr_regex_entities = detect_pii_regex(french_medical_doc)
for entity in fr_regex_entities:
    print(f"  {entity['type']:15s}: {entity['text']}")

print("\n" + "="*70)
print("ENGLISH MEDICAL DOCUMENT")
print("="*70)
detected_lang = detect_language(english_medical_doc)
print(f"Detected Language: {detected_lang}\n")
en_regex_entities = detect_pii_regex(english_medical_doc)
for entity in en_regex_entities:
    print(f"  {entity['type']:15s}: {entity['text']}")

Testing Regex Detection:

FRENCH MEDICAL DOCUMENT
Detected Language: fr

  EMAIL          : marie.dubois@example.fr
  FR_PHONE       : +33 6 12 34 56 78
  FR_SSN         : 2 78 03 75 116 025 43
  DATE           : 15/03/1978
  FR_POSTAL      : 59000
  MEDICAL_ID     : FR789456123

ENGLISH MEDICAL DOCUMENT
Detected Language: en

  EMAIL          : john.smith@example.com
  US_PHONE       :  (212) 555-1234
  US_SSN         : 123-45-6789
  US_SSN         : 987654321
  DATE           : 03/15/1978
  US_ZIPCODE     : 10001
  MEDICAL_ID     : US987654321


In [None]:
# NER detection with spaCy
def detect_pii_ner(text):
    language = detect_language(text)
    nlp = nlp_fr if language == 'fr' else nlp_en
    doc = nlp(text)

    # print("got the doc like:",doc,"\n end of doc\nentities detected are:",doc.ents)
    entities = []

    pii_mapping = {
        'PER': 'PERSON',
        'PERSON': 'PERSON',
        'LOC': 'LOCATION',
        'GPE': 'LOCATION',
        'ORG': 'ORGANIZATION',
        'DATE': 'DATE',
    }

    for ent in doc.ents:
        if ent.label_ in pii_mapping:
            entities.append({
                'type': pii_mapping[ent.label_],
                'text': ent.text,
                'start': ent.start_char,
                'end': ent.end_char,
                'method': 'ner',
                'language': language
            })

    return entities

print("Testing NER Detection:\n")
print("="*70)
print("FRENCH MEDICAL DOCUMENT")
print("="*70)
fr_ner_entities = detect_pii_ner(french_medical_doc)
print(f"Detected Language: {fr_ner_entities[0]['language'] if fr_ner_entities else 'N/A'}\n")
for entity in fr_ner_entities:
    print(f"  {entity['type']:15s}: {entity['text']}")

print("\n" + "="*70)
print("ENGLISH MEDICAL DOCUMENT")
print("="*70)
en_ner_entities = detect_pii_ner(english_medical_doc)
print(f"Detected Language: {en_ner_entities[0]['language'] if en_ner_entities else 'N/A'}\n")
for entity in en_ner_entities:
    print(f"  {entity['type']:15s}: {entity['text']}")

Testing NER Detection:

FRENCH MEDICAL DOCUMENT
Detected Language: fr

  PERSON         : Marie Dubois
  ORGANIZATION   : Sécurité Sociale
  LOCATION       : Lille
  PERSON         : Médecin
  PERSON         : Jean Martin
  ORGANIZATION   : ID

ENGLISH MEDICAL DOCUMENT
Detected Language: en

  PERSON         : John Smith
  DATE           : 03/15/1978
  ORGANIZATION   : SSN
  LOCATION       : New York
  LOCATION       : NY
  PERSON         : Sarah Johnson


In [None]:
# Hybrid detection - combine regex and NER
def detect_pii_hybrid(text):
    regex_entities = detect_pii_regex(text)
    ner_entities = detect_pii_ner(text)

    all_entities = regex_entities + ner_entities

    # Remove duplicates based on overlapping positions
    unique_entities = []
    for entity in all_entities:
        overlap = False
        for existing in unique_entities:
            if (entity['start'] < existing['end'] and entity['end'] > existing['start']):
                if entity['method'] == 'regex':
                    overlap = True
                    break
        if not overlap:
            unique_entities.append(entity)

    return sorted(unique_entities, key=lambda x: x['start'])

print("FRENCH MEDICAL DOCUMENT - Hybrid Detection:")
fr_hybrid_entities = detect_pii_hybrid(french_medical_doc)
for entity in fr_hybrid_entities:
    print(f"  {entity['type']:15s}: {entity['text']:30s} [{entity['method']}]")

print("\n" + "="*50 + "\n")

print("ENGLISH MEDICAL DOCUMENT - Hybrid Detection:")
en_hybrid_entities = detect_pii_hybrid(english_medical_doc)
for entity in en_hybrid_entities:
    print(f"  {entity['type']:15s}: {entity['text']:30s} [{entity['method']}]")

FRENCH MEDICAL DOCUMENT - Hybrid Detection:
  PERSON         : Marie Dubois                   [ner]
  DATE           : 15/03/1978                     [regex]
  ORGANIZATION   : Sécurité Sociale               [ner]
  FR_SSN         : 2 78 03 75 116 025 43          [regex]
  FR_POSTAL      : 59000                          [regex]
  LOCATION       : Lille                          [ner]
  FR_PHONE       : +33 6 12 34 56 78              [regex]
  EMAIL          : marie.dubois@example.fr        [regex]
  PERSON         : Médecin                        [ner]
  PERSON         : Jean Martin                    [ner]
  ORGANIZATION   : ID                             [ner]
  MEDICAL_ID     : FR789456123                    [regex]


ENGLISH MEDICAL DOCUMENT - Hybrid Detection:
  PERSON         : John Smith                     [ner]
  DATE           : 03/15/1978                     [regex]
  DATE           : 03/15/1978                     [ner]
  ORGANIZATION   : SSN                            [ner]

In [None]:
# Redaction Strategy 1: Strict
def redact_strict(text, entities):
    sorted_entities = sorted(entities, key=lambda x: x['start'], reverse=True)
    redacted_text = text
    for entity in sorted_entities:
        redacted_text = redacted_text[:entity['start']] + '[REDACTED]' + redacted_text[entity['end']:]
    return redacted_text

print("Strict Redaction - French:")
print(redact_strict(french_medical_doc, fr_hybrid_entities))
print("\n" + "="*50 + "\n")
print("Strict Redaction - English:")
print(redact_strict(english_medical_doc, en_hybrid_entities))

Strict Redaction - French:
Dossier Médical: Patient [REDACTED] (née le [REDACTED])
[REDACTED]: [REDACTED]
Adresse: 42 Rue de la République, [REDACTED] [REDACTED]
Tél: [REDACTED]
Email: [REDACTED]
Diagnostic: Hypertension (ICD-10: I10)
[REDACTED]: Dr. [REDACTED], [REDACTED] médical: [REDACTED]



Strict Redaction - English:
Patient Name: [REDACTED] (DOB: [REDACTED])
[REDACTED]: [REDACTED]
Address: 123 Main Street, [REDACTED], [REDACTED] [REDACTED]
Phone:[REDACTED]
Email: [REDACTED]
Diagnosis: Hypertension (ICD-10: I10)
Physician: Dr. [REDACTED], Medical ID: US[REDACTED]



In [None]:
# Redaction Strategy 2: Typed
def redact_typed(text, entities):
    type_mapping = {
        'PERSON': 'NAME',
        'FR_SSN': 'SSN',
        'US_SSN': 'SSN',
        'FR_PHONE': 'PHONE',
        'US_PHONE': 'PHONE',
        'EMAIL': 'EMAIL',
        'LOCATION': 'ADDRESS',
        'DATE': 'DATE',
        'MEDICAL_ID': 'MEDICAL_ID',
        'FR_POSTAL': 'ZIPCODE',
        'US_ZIPCODE': 'ZIPCODE',
    }

    sorted_entities = sorted(entities, key=lambda x: x['start'], reverse=True)
    redacted_text = text
    for entity in sorted_entities:
        entity_type = type_mapping.get(entity['type'], entity['type'])
        redacted_text = (
            redacted_text[:entity['start']] +
            f'[{entity_type}]' +
            redacted_text[entity['end']:]
        )

    return redacted_text

print("Typed Redaction - French:")
print(redact_typed(french_medical_doc, fr_hybrid_entities))
print("\n" + "="*50 + "\n")
print("Typed Redaction - English:")
print(redact_typed(english_medical_doc, en_hybrid_entities))

Typed Redaction - French:
Dossier Médical: Patient [NAME] (née le [DATE])
[ORGANIZATION]: [SSN]
Adresse: 42 Rue de la République, [ZIPCODE] [ADDRESS]
Tél: [PHONE]
Email: [EMAIL]
Diagnostic: Hypertension (ICD-10: I10)
[NAME]: Dr. [NAME], [ORGANIZATION] médical: [MEDICAL_ID]



Typed Redaction - English:
Patient Name: [NAME] (DOB: [DATE]RGANIZATION]: [SSN]
Address: 123 Main Street, [ADDRESS], [ADDRESS] [ZIPCODE]
Phone:[PHONE]
Email: [EMAIL]
Diagnosis: Hypertension (ICD-10: I10)
Physician: Dr. [NAME], Medical ID: US[SSN]



In [None]:
# Redaction Strategy 3: Surrogate with Faker
# Maintains consistency - same entity always maps to same fake value

def generate_surrogate(entity_type, original_text, language, mapping):
    # Check existing mapping
    if original_text in mapping[entity_type]:
        return mapping[entity_type][original_text]

    faker = faker_fr if language == 'fr' else faker_en
    seed = hash(original_text) % (2**32)
    Faker.seed(seed)

    if entity_type == 'PERSON':
        surrogate = faker.name()
    elif entity_type in ['FR_SSN', 'US_SSN']:
        if language == 'fr':
            surrogate = f"{faker.random_int(1, 2)} {faker.random_int(10, 99)} {faker.random_int(10, 99)} {faker.random_int(10, 99)} {faker.random_int(100, 999)} {faker.random_int(100, 999)} {faker.random_int(10, 99)}"
        else:
            surrogate = faker.ssn()
    elif entity_type in ['FR_PHONE', 'US_PHONE']:
        surrogate = faker.phone_number()
    elif entity_type == 'EMAIL':
        surrogate = faker.email()
    elif entity_type == 'LOCATION':
        surrogate = faker.address().replace('\n', ', ')
    elif entity_type == 'DATE':
        surrogate = faker.date(pattern='%d/%m/%Y')
    elif entity_type == 'MEDICAL_ID':
        prefix = 'FR' if language == 'fr' else 'US'
        surrogate = f"{prefix}{faker.random_number(digits=9)}"
    elif entity_type in ['FR_POSTAL', 'US_ZIPCODE']:
        surrogate = faker.postcode()
    elif entity_type == 'ORGANIZATION':
        surrogate = faker.company()
    elif entity_type == 'IBAN':
        surrogate = faker.iban()
    else:
        surrogate = '[UNKNOWN]'

    mapping[entity_type][original_text] = surrogate
    return surrogate

def redact_surrogate(text, entities):
    language = detect_language(text)
    mapping = defaultdict(dict)

    sorted_entities = sorted(entities, key=lambda x: x['start'], reverse=True)
    redacted_text = text

    for entity in sorted_entities:
        surrogate = generate_surrogate(entity['type'], entity['text'], language, mapping)
        redacted_text = (
            redacted_text[:entity['start']] +
            surrogate +
            redacted_text[entity['end']:]
        )

    return redacted_text, mapping

print("Surrogate Redaction - French:")
fr_surrogate, fr_mapping = redact_surrogate(french_medical_doc, fr_hybrid_entities)
print(fr_surrogate)
print("\nMapping Stats:")
stats = {k: len(v) for k, v in fr_mapping.items()}
print(json.dumps(stats, indent=2))

print("\n" + "="*50 + "\n")

print("Surrogate Redaction - English:")
en_surrogate, en_mapping = redact_surrogate(english_medical_doc, en_hybrid_entities)
print(en_surrogate)
print("\nMapping Stats:")
stats = {k: len(v) for k, v in en_mapping.items()}
print(json.dumps(stats, indent=2))

Surrogate Redaction - French:
Dossier Médical: Patient Renée Lemaire de la Bigot (née le 30/12/1993)
Vidal: 1 55 25 89 175 905 38
Adresse: 42 Rue de la République, 07685 50, rue Anaïs Laurent, 93686 Daniel-les-Bains
Tél: 0480403479
Email: renee20@example.org
Diagnostic: Hypertension (ICD-10: I10)
Daniel-Jules François: Dr. Nicole de la Bouvier, Tanguy Berthelot SARL médical: FR609974142


Mapping Stats:
{
  "MEDICAL_ID": 1,
  "ORGANIZATION": 2,
  "PERSON": 3,
  "EMAIL": 1,
  "FR_PHONE": 1,
  "LOCATION": 1,
  "FR_POSTAL": 1,
  "FR_SSN": 1,
  "DATE": 1
}


Surrogate Redaction - English:
Patient Name: Laura Garcia (DOB: 09/04/2023)
Washington-Mcdonald: 230-63-9872
Address: 123 Main Street, 5780 Christina Fort, New Lancebury, DE 80467, 297 Cheryl Tunnel Suite 666, North Ricardoville, ID 43504 18006
Phone:328.345.4200x20380
Email: markfitzgerald@example.org
Diagnosis: Hypertension (ICD-10: I10)
Physician: Dr. David May, Medical ID: US039-76-7708


Mapping Stats:
{
  "US_SSN": 2,
  "PERSON":

In [None]:
# Test consistency - same entity should map to same fake value
test_consistency_doc = """Patient Marie Dubois visited on 15/03/2023.
Marie Dubois has email marie.dubois@example.fr.
Contact Marie Dubois at +33 6 12 34 56 78.
Marie Dubois's SSN: 2 78 03 75 116 025 43
"""

print("Original Document with Repeated Entities:")
print(test_consistency_doc)
print("\n" + "="*50 + "\n")

consistency_entities = detect_pii_hybrid(test_consistency_doc)
print("Detected Entities:")
for entity in consistency_entities:
    print(f"  {entity['type']:15s}: {entity['text']}")

print("\n" + "="*50 + "\n")

redacted_consistency, consistency_mapping = redact_surrogate(test_consistency_doc, consistency_entities)
print("Surrogate Redacted (with Consistency):")
print(redacted_consistency)

print("\n" + "="*50 + "\n")

print("Consistency Mapping:")
for entity_type, mappings in consistency_mapping.items():
    print(f"\n{entity_type}:")
    for original, fake in mappings.items():
        print(f"  {original} → {fake}")

Original Document with Repeated Entities:
Patient Marie Dubois visited on 15/03/2023.
Marie Dubois has email marie.dubois@example.fr.
Contact Marie Dubois at +33 6 12 34 56 78.
Marie Dubois's SSN: 2 78 03 75 116 025 43



Detected Entities:
  PERSON         : Marie Dubois
  DATE           : 15/03/2023
  PERSON         : Marie Dubois
  EMAIL          : marie.dubois@example.fr
  PERSON         : Marie Dubois
  PERSON         : Marie Dubois's
  ORGANIZATION   : SSN


Surrogate Redacted (with Consistency):
Patient Renée Lemaire de la Bigot visited on 26/07/2020.
Renée Lemaire de la Bigot has email renee20@example.org.
Contact Renée Lemaire de la Bigot at +33 6 12 34 56 78.
Madeleine Devaux-Pruvost Vallée: 2 78 03 75 116 025 43



Consistency Mapping:

ORGANIZATION:
  SSN → Vallée

PERSON:
  Marie Dubois's → Madeleine Devaux-Pruvost
  Marie Dubois → Renée Lemaire de la Bigot

EMAIL:
  marie.dubois@example.fr → renee20@example.org

DATE:
  15/03/2023 → 26/07/2020


In [None]:
# Evaluation metrics
def calculate_metrics_by_type_text_based(predicted_entities, ground_truth_entities):
    metrics_by_type = defaultdict(lambda: {'tp': 0, 'fp': 0, 'fn': 0})

    predicted_set = {(e['type'], e['text'].strip()) for e in predicted_entities}
    ground_truth_set = {(e['type'], e['text'].strip()) for e in ground_truth_entities}

    # True positives
    for entity_type, text in predicted_set & ground_truth_set:
        metrics_by_type[entity_type]['tp'] += 1

    # False positives
    for entity_type, text in predicted_set - ground_truth_set:
        metrics_by_type[entity_type]['fp'] += 1

    # False negatives
    for entity_type, text in ground_truth_set - predicted_set:
        metrics_by_type[entity_type]['fn'] += 1

    return dict(metrics_by_type)

# Test cases with ground truth
EVALUATION_TEST_SET = [
    {
        'id': 'test_fr_001',
        'text': "Rapport: Marie Dubois, né le 15/03/1985, SSN: 2 78 03 75 116 025 43, Tel: 06 12 34 56 78, Email: marie.dubois@gmail.com",
        'ground_truth': [
            {'type': 'PERSON', 'text': 'Marie Dubois'},
            {'type': 'DATE', 'text': '15/03/1985'},
            {'type': 'FR_SSN', 'text': '2 78 03 75 116 025 43'},
            {'type': 'FR_PHONE', 'text': '06 12 34 56 78'},
            {'type': 'EMAIL', 'text': 'marie.dubois@gmail.com'},
        ]
    },
    {
        'id': 'test_en_001',
        'text': "Patient: John Smith, DOB: 03/15/1990, SSN: 123-45-6789, Phone: (212) 555-1234, Email: john.smith@email.com",
        'ground_truth': [
            {'type': 'PERSON', 'text': 'John Smith'},
            {'type': 'DATE', 'text': '03/15/1990'},
            {'type': 'US_SSN', 'text': '123-45-6789'},
            {'type': 'US_PHONE', 'text': '(212) 555-1234'},
            {'type': 'EMAIL', 'text': 'john.smith@email.com'},
        ]
    }
]

print("="*70)
print("EVALUATION ON TEST SET (Text-based matching)")
print("="*70)

all_metrics = defaultdict(lambda: {'tp': 0, 'fp': 0, 'fn': 0})

for test_case in EVALUATION_TEST_SET:
    print(f"\n{test_case['id']}:")
    print(f"  Text: {test_case['text'][:80]}...")

    predicted = detect_pii_hybrid(test_case['text'])

    detected_str = ', '.join([f"{e['type']}:{e['text']}" for e in predicted])
    expected_str = ', '.join([f"{e['type']}:{e['text']}" for e in test_case['ground_truth']])

    print(f"  Detected: {detected_str}")
    print(f"  Expected: {expected_str}")

    doc_metrics = calculate_metrics_by_type_text_based(predicted, test_case['ground_truth'])

    for entity_type, metrics in doc_metrics.items():
        all_metrics[entity_type]['tp'] += metrics['tp']
        all_metrics[entity_type]['fp'] += metrics['fp']
        all_metrics[entity_type]['fn'] += metrics['fn']

print("\n" + "="*70)
print("RESULTS BY ENTITY TYPE")
print("="*70)
print(f"\n{'Entity Type':<15} {'Precision':<12} {'Recall':<12} {'F1 Score':<12} {'TP':<6} {'FP':<6} {'FN':<6}")
print("-"*70)

overall_tp = overall_fp = overall_fn = 0

for entity_type in sorted(all_metrics.keys()):
    counts = all_metrics[entity_type]
    tp, fp, fn = counts['tp'], counts['fp'], counts['fn']
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    print(f"{entity_type:<15} {precision:<12.3f} {recall:<12.3f} {f1:<12.3f} {tp:<6} {fp:<6} {fn:<6}")

    overall_tp += tp
    overall_fp += fp
    overall_fn += fn

print("-"*70)
overall_precision = overall_tp / (overall_tp + overall_fp) if (overall_tp + overall_fp) > 0 else 0
overall_recall = overall_tp / (overall_tp + overall_fn) if (overall_tp + overall_fn) > 0 else 0
overall_f1 = 2 * (overall_precision * overall_recall) / (overall_precision + overall_recall) if (overall_precision + overall_recall) > 0 else 0

print(f"{'OVERALL':<15} {overall_precision:<12.3f} {overall_recall:<12.3f} {overall_f1:<12.3f} {overall_tp:<6} {overall_fp:<6} {overall_fn:<6}")
print("="*70)

EVALUATION ON TEST SET (Text-based matching)

test_fr_001:
  Text: Rapport: Marie Dubois, né le 15/03/1985, SSN: 2 78 03 75 116 025 43, Tel: 06 12 ...
  Detected: PERSON:Marie Dubois, DATE:15/03/1985, FR_SSN:2 78 03 75 116 025 43, PERSON:Tel, FR_PHONE:06 12 34 56 78, EMAIL:marie.dubois@gmail.com
  Expected: PERSON:Marie Dubois, DATE:15/03/1985, FR_SSN:2 78 03 75 116 025 43, FR_PHONE:06 12 34 56 78, EMAIL:marie.dubois@gmail.com

test_en_001:
  Text: Patient: John Smith, DOB: 03/15/1990, SSN: 123-45-6789, Phone: (212) 555-1234, E...
  Detected: PERSON:John Smith, DATE:03/15/1990, ORGANIZATION:SSN, US_SSN:123-45-6789, US_PHONE: (212) 555-1234, PERSON:Email, EMAIL:john.smith@email.com
  Expected: PERSON:John Smith, DATE:03/15/1990, US_SSN:123-45-6789, US_PHONE:(212) 555-1234, EMAIL:john.smith@email.com

RESULTS BY ENTITY TYPE

Entity Type     Precision    Recall       F1 Score     TP     FP     FN    
----------------------------------------------------------------------
DATE            1.

In [None]:
# Complete pipeline function
def anonymize_document(text, strategy='surrogate'):
    language = detect_language(text)
    entities = detect_pii_hybrid(text)

    if strategy == 'strict':
        redacted_text = redact_strict(text, entities)
        mapping = None
    elif strategy == 'typed':
        redacted_text = redact_typed(text, entities)
        mapping = None
    elif strategy == 'surrogate':
        redacted_text, mapping = redact_surrogate(text, entities)
    else:
        raise ValueError(f"Unknown strategy: {strategy}")

    return {
        'original_text': text,
        'redacted_text': redacted_text,
        'entities_detected': entities,
        'num_entities': len(entities),
        'language': language,
        'strategy': strategy,
        'mapping': mapping
    }

# Test all documents with all strategies
print("="*70)
print("TESTING ALL DOCUMENTS WITH ALL STRATEGIES")
print("="*70)

for lang, documents in TEST_DOCUMENTS.items():
    print(f"\n{'='*70}")
    print(f"LANGUAGE: {lang.upper()}")
    print(f"{'='*70}")

    for doc_type, doc_text in documents.items():
        print(f"\n{'-'*70}")
        print(f"Document Type: {doc_type}")
        print(f"{'-'*70}")

        for strategy in ['strict', 'typed', 'surrogate']:
            print(f"\n\n{'-'*70}")
            print(f"Strategy: {strategy.upper()}")
            print(f"{'-'*70}")

            result = anonymize_document(doc_text, strategy)

            # print(f"got the doc like: {doc_text} \n end of doc")
            entities = result['entities_detected']
            # print(f"entities detected are: {tuple(e['text'] for e in entities)}")

            print(f"Detected language: {result['language']}")
            print(f"Entities detected: {result['num_entities']}")

            entity_counts = defaultdict(int)
            for entity in entities:
                entity_counts[entity['type']] += 1
            print(f"Entity breakdown: {dict(entity_counts)}")

            print(f"\nRedacted text:")
            print(result['redacted_text'])

            if strategy == 'surrogate' and result['mapping']:
                print(f"\nMappings created: {sum(len(v) for v in result['mapping'].values())} unique values")

print("\n" + "="*70)
print("TESTING COMPLETE")
print("="*70)

TESTING ALL DOCUMENTS WITH ALL STRATEGIES

LANGUAGE: FRENCH

----------------------------------------------------------------------
Document Type: medical
----------------------------------------------------------------------


----------------------------------------------------------------------
Strategy: STRICT
----------------------------------------------------------------------
Detected language: fr
Entities detected: 12
Entity breakdown: {'PERSON': 3, 'DATE': 1, 'ORGANIZATION': 2, 'FR_SSN': 1, 'FR_POSTAL': 1, 'LOCATION': 1, 'FR_PHONE': 1, 'EMAIL': 1, 'MEDICAL_ID': 1}

Redacted text:
Dossier Médical: Patient [REDACTED] (née le [REDACTED])
[REDACTED]: [REDACTED]
Adresse: 42 Rue de la République, [REDACTED] [REDACTED]
Tél: [REDACTED]
Email: [REDACTED]
Diagnostic: Hypertension (ICD-10: I10)
[REDACTED]: Dr. [REDACTED], [REDACTED] médical: [REDACTED]



----------------------------------------------------------------------
Strategy: TYPED
----------------------------------------------