# Data Masking Sensitive Attributes
Notebook for masking names and gender pronouns in bios data.

In [12]:
# Imports
import pandas as pd
import os

In [13]:
# Load data

data_path = "../data/bios/en/bias_in_bios.json"
train_split_path = os.path.join(os.path.dirname(data_path), 'train_split_balanced.csv')
test_split_path = os.path.join(os.path.dirname(data_path), 'test_split_balanced.csv')
train_df = pd.read_csv(train_split_path)
test_df = pd.read_csv(test_split_path)

## English

### Remove Gendered Pronouns

In [14]:
def remove_pronouns(text):
    """
    Remove gendered pronouns from text and replace with gender-neutral alternatives.

    Args:
        text (str): Input text containing potentially gendered pronouns
        
    Returns:
        str: Text with gendered pronouns replaced or removed
    """
    import re

    if not isinstance(text, str):
        return text
    
    # Apply replacements in specific order
    modified_text = text

    # 1. Handle reflexive pronouns first
    modified_text = re.sub(r'\bhimself\b', 'themselves', modified_text, flags=re.IGNORECASE)
    modified_text = re.sub(r'\bherself\b', 'themselves', modified_text, flags=re.IGNORECASE)
    
    # 2. Handle possessive pronouns
    modified_text = re.sub(r'\bhis\b', 'their', modified_text, flags=re.IGNORECASE)
    modified_text = re.sub(r'\bher\b(?=\s+[a-zA-Z]+)', 'their', modified_text, flags=re.IGNORECASE)
    modified_text = re.sub(r'\bhers\b', 'theirs', modified_text, flags=re.IGNORECASE)
    
    # 3. Handle subject pronouns WITH verb conjugation
    # Common verb patterns that need fixing after "they"
    verb_fixes = [
        # "he/she is" -> "they are"
        (r'\b(he|she)\s+is\b', r'they are', re.IGNORECASE),
        # "he/she was" -> "they were" 
        (r'\b(he|she)\s+was\b', r'they were', re.IGNORECASE),
        # "he/she has" -> "they have"
        (r'\b(he|she)\s+has\b', r'they have', re.IGNORECASE),
        # "he/she does" -> "they do"
        (r'\b(he|she)\s+does\b', r'they do', re.IGNORECASE),
        # "he/she goes" -> "they go" (remove 's' from third person singular)
        (r'\b(he|she)\s+(\w+)s\b', r'they \2', re.IGNORECASE),
    ]
    
    # Apply verb conjugation fixes
    for pattern, replacement, flags in verb_fixes:
        modified_text = re.sub(pattern, replacement, modified_text, flags=flags)
    
    # Handle remaining he/she (cases not caught by verb patterns)
    modified_text = re.sub(r'\bhe\b', 'they', modified_text, flags=re.IGNORECASE)
    modified_text = re.sub(r'\bshe\b', 'they', modified_text, flags=re.IGNORECASE)
    
    # 4. Handle object pronouns
    modified_text = re.sub(r'\bhim\b', 'them', modified_text, flags=re.IGNORECASE)
    modified_text = re.sub(r'\bher\b', 'them', modified_text, flags=re.IGNORECASE)

    return modified_text

In [15]:
# Test the function with some examples
def test_pronoun_removal():
    """Test the pronoun removal functions"""
    test_cases = [
        "He is a doctor. She works at the hospital.",
        "The surgeon completed his surgery. Her skills are excellent.",
        "He gave the book to her. She thanked him for his kindness.",
        "The nurse prepared herself for the long shift. He helped himself to coffee."
    ]
    
    print("=== REPLACEMENT VERSION ===")
    for i, text in enumerate(test_cases, 1):
        print(f"Original {i}: {text}")
        print(f"Modified {i}: {remove_pronouns(text)}")
        print()
    
# Run tests
test_pronoun_removal()

=== REPLACEMENT VERSION ===
Original 1: He is a doctor. She works at the hospital.
Modified 1: they are a doctor. they work at the hospital.

Original 2: The surgeon completed his surgery. Her skills are excellent.
Modified 2: The surgeon completed their surgery. their skills are excellent.

Original 3: He gave the book to her. She thanked him for his kindness.
Modified 3: they gave the book to them. they thanked them for their kindness.

Original 4: The nurse prepared herself for the long shift. He helped himself to coffee.
Modified 4: The nurse prepared themselves for the long shift. they helped themselves to coffee.



In [16]:
# Apply to the data and save results
# Apply pronoun removal to both splits and save masked versions
train_df['hard_text_masked'] = train_df['hard_text'].apply(remove_pronouns)
test_df['hard_text_masked'] = test_df['hard_text'].apply(remove_pronouns)

# Save masked CSVs next to original split files
train_masked_path = os.path.splitext(train_split_path)[0] + '_masked.csv'
test_masked_path = os.path.splitext(test_split_path)[0] + '_masked.csv'

train_df.to_csv(train_masked_path, index=False)
test_df.to_csv(test_masked_path, index=False)

print(f"Saved masked train to: {train_masked_path} ({len(train_df)} rows)")
print(f"Saved masked test  to: {test_masked_path} ({len(test_df)} rows)")

# Show a few examples to verify
train_df[['hard_text', 'hard_text_masked']].head(3)


Saved masked train to: ../data/bios/en/train_split_balanced_masked.csv (4480 rows)
Saved masked test  to: ../data/bios/en/test_split_balanced_masked.csv (1120 rows)


Unnamed: 0,hard_text,hard_text_masked
0,He is Advanced Jivamukti Certified and a mento...,they are Advanced Jivamukti Certified and a me...
1,Damien is a graduate of the University of Illi...,Damien is a graduate of the University of Illi...
2,"Having lost his dad at just 11 years old, he b...","Having lost their dad at just 11 years old, th..."


---
### Remove first and last names 

In [17]:
import spacy
nlp = spacy.load("en_core_web_sm")

def remove_names(text):
    """
    Remove names using Named Entity Recognition (more accurate but requires spacy).
    
    Args:
        text (str): Input text containing names
        
    Returns:
        str: Text with names removed
    """
    
    # Load English model (install with: python -m spacy download en_core_web_sm)
   
    
    doc = nlp(text)
    modified_text = text
    
    # Replace PERSON entities with [NAME]
    for ent in reversed(doc.ents):  # Reverse to maintain string indices
        if ent.label_ == "PERSON":
            modified_text = modified_text[:ent.start_char] + "[NAME]" + modified_text[ent.end_char:]
        
    return modified_text
        

In [18]:
def test_name_removal():
    """Test the name removal functions"""
    test_cases = [
        "John Smith is a surgeon at the hospital.",
        "Dr. Sarah Johnson completed her residency last year.",
        "The patient, Mary Williams, was admitted yesterday.",
        "Professor Michael Brown teaches at the university.",
        "Emily Davis and Robert Wilson work together."
    ]
    
    print("=== NER-BASED NAME REMOVAL ===")
    for i, text in enumerate(test_cases, 1):
        print(f"Original {i}: {text}")
        print(f"Modified {i}: {remove_names(text)}")
        print()

# Run tests
test_name_removal()

=== NER-BASED NAME REMOVAL ===
Original 1: John Smith is a surgeon at the hospital.
Modified 1: [NAME] is a surgeon at the hospital.

Original 2: Dr. Sarah Johnson completed her residency last year.
Modified 2: Dr. [NAME] completed her residency last year.

Original 3: The patient, Mary Williams, was admitted yesterday.
Modified 3: The patient, [NAME], was admitted yesterday.

Original 4: Professor Michael Brown teaches at the university.
Modified 4: Professor [NAME] teaches at the university.

Original 5: Emily Davis and Robert Wilson work together.
Modified 5: [NAME] and [NAME] work together.



In [19]:
from tqdm import tqdm

# Apply name removal to both splits with progress tracking
print("Applying name removal to training data...")
tqdm.pandas(desc="Processing train data")
train_df['hard_text_masked'] = train_df['hard_text_masked'].progress_apply(remove_names)

print("\nApplying name removal to test data...")
tqdm.pandas(desc="Processing test data")
test_df['hard_text_masked'] = test_df['hard_text_masked'].progress_apply(remove_names)

print("\nSaving masked CSV files...")

# Save masked CSVs next to original split files
train_masked_path = os.path.splitext(train_split_path)[0] + '_masked.csv'
test_masked_path = os.path.splitext(test_split_path)[0] + '_masked.csv'

print(f"Saving training data to: {train_masked_path}")
train_df.to_csv(train_masked_path, index=False)

print(f"Saving test data to: {test_masked_path}")
test_df.to_csv(test_masked_path, index=False)

print(f"\n✓ Saved masked train to: {train_masked_path} ({len(train_df)} rows)")
print(f"✓ Saved masked test to: {test_masked_path} ({len(test_df)} rows)")

# Show a few examples to verify
print("\n=== EXAMPLES OF MASKING ===")
print("Showing first 3 examples:")
for i in range(min(3, len(train_df))):
    print(f"\n--- Example {i+1} ---")
    print(f"Original:      {train_df['hard_text'].iloc[i][:100]}...")
    print(f"Masked:        {train_df['hard_text_masked'].iloc[i][:100]}...")
    print("-" * 80)

# Display the dataframe view
train_df[['hard_text', 'hard_text_masked']].head(3)

Applying name removal to training data...


Processing train data: 100%|██████████| 4480/4480 [00:39<00:00, 112.59it/s]



Applying name removal to test data...


Processing test data: 100%|██████████| 1120/1120 [00:10<00:00, 111.28it/s]


Saving masked CSV files...
Saving training data to: ../data/bios/en/train_split_balanced_masked.csv
Saving test data to: ../data/bios/en/test_split_balanced_masked.csv

✓ Saved masked train to: ../data/bios/en/train_split_balanced_masked.csv (4480 rows)
✓ Saved masked test to: ../data/bios/en/test_split_balanced_masked.csv (1120 rows)

=== EXAMPLES OF MASKING ===
Showing first 3 examples:

--- Example 1 ---
Original:      He is Advanced Jivamukti Certified and a mentor and teacher in the method since 2005. He is also a t...
Masked:        they are Advanced Jivamukti Certified and a mentor and teacher in the method since 2005. they are al...
--------------------------------------------------------------------------------

--- Example 2 ---
Original:      Damien is a graduate of the University of Illinois at Urbana-Champaign where he received a Bachelors...
Masked:        [NAME] is a graduate of the University of Illinois at Urbana-Champaign where they received a Bachelo...
------------




Unnamed: 0,hard_text,hard_text_masked
0,He is Advanced Jivamukti Certified and a mento...,they are Advanced Jivamukti Certified and a me...
1,Damien is a graduate of the University of Illi...,[NAME] is a graduate of the University of Illi...
2,"Having lost his dad at just 11 years old, he b...","Having lost their dad at just 11 years old, th..."
