### PII Removal using spaCy

In [None]:
import spacy
import re
import pandas as pd

# Download the spaCy models - note this is a one-time activity
# !python -m spacy download en_core_web_sm
# !python -m spacy download en_core_web_md
# !python -m spacy download en_core_web_lg

In [39]:
# Load the spaCy English pipeline a single time so every later call reuses it.
# "en_core_web_sm" and "en_core_web_md" are the alternative model names that
# can be passed here if one of those was downloaded instead.
nlp = spacy.load("en_core_web_lg")

# spaCy NER labels whose entities are treated as PII by the redaction step
PII_ENTITY_LABELS = {
    "DATE",
    "GPE",
    "LOC",
    "ORG",
    "PERSON",
}

In [40]:
# Regex-based PII detectors that complement the NER model.
# Any additional keywords/patterns that need redaction should be added here.
REGEX_PATTERNS = {
    # (?<!\w) instead of a leading \b so that a leading "+" or "(" is part of
    # the match: \b cannot match between a space and "(" / "+", which left
    # that character behind in the redacted text.
    "PHONE_NUMBER": r'(?<!\w)(?:\+?1[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}\b',
    "SSN": r'\b\d{3}-\d{2}-\d{4}\b',
    # TLD length is open-ended ({2,}) so addresses on long TLDs such as
    # ".technology" or ".online" are matched instead of being skipped.
    "EMAIL": r'\b[\w.-]+@[\w.-]+\.\w{2,}\b',
    "DEPARTMENT": r'\b(?:Human Resources|HR|Finance(?: Department)?|Accounting|Legal(?: Team)?|Marketing|Sales(?: Team)?|Customer Support|IT|Information Technology|Engineering|Operations|R&D|Research and Development|Admin(?:istration)?)\b'
}

In [41]:

def remove_pii_from_text(text: str) -> str:
    """Replace PII found in ``text`` with the literal ``"[REDACTED]"``.

    PII is detected in two ways:

    * spaCy named entities whose label is in ``PII_ENTITY_LABELS``;
    * matches of every pattern in ``REGEX_PATTERNS`` (applied with
      ``re.IGNORECASE``).

    All detections are handled as plain character ranges on the original
    string. Ranges that overlap are merged into one before replacement, so a
    region flagged by several detectors is redacted exactly once and the
    surrounding text is never corrupted. Ranges that merely touch are kept
    separate and each produces its own mask.

    Args:
        text: The text to clean. Non-string values (e.g. NaN in a DataFrame
            column) are returned unchanged.

    Returns:
        The text with every detected PII range replaced by ``"[REDACTED]"``.

    NOTE(review): because the patterns are matched case-insensitively, short
    acronyms in the DEPARTMENT pattern such as "IT" also match the ordinary
    word "it". Confirm whether that over-redaction is acceptable.
    """
    if not isinstance(text, str):
        return text

    doc = nlp(text)

    # Character ranges of the NER entities considered PII.
    ranges = [
        (ent.start_char, ent.end_char)
        for ent in doc.ents
        if ent.label_ in PII_ENTITY_LABELS
    ]

    # Regex hits are kept as raw character offsets. Converting them with
    # doc.char_span() would drop every match that does not line up with
    # spaCy's token boundaries (it returns None), leaving that PII in place.
    for pattern in REGEX_PATTERNS.values():
        for match in re.finditer(pattern, text, flags=re.IGNORECASE):
            if match.end() > match.start():
                ranges.append(match.span())

    # Merge overlapping ranges; sorting first means a range can only overlap
    # the most recently merged one.
    merged = []
    for start, end in sorted(ranges):
        if merged and start < merged[-1][1]:
            merged[-1][1] = max(merged[-1][1], end)
        else:
            merged.append([start, end])

    # Rebuild the text from the untouched pieces between the merged ranges.
    pieces = []
    cursor = 0
    for start, end in merged:
        pieces.append(text[cursor:start])
        pieces.append("[REDACTED]")
        cursor = end
    pieces.append(text[cursor:])

    return "".join(pieces)

In [42]:

# Small hand-written sample covering names, places, organisations,
# an e-mail address, a phone number, an SSN and a date
df = pd.Series(
    [
        "John Smith lives in New York and works at Google.",
        "Contact me at jane.doe@example.com or call 123-456-7890.",
        "His SSN is 987-65-4320 and birth date is Jan 5, 1985.",
    ],
    name="message",
).to_frame()

In [43]:

# Redact every message and keep the result next to the original text
df["clean_message"] = df["message"].map(remove_pii_from_text)

# Print the original and the redacted column side by side
columns_to_show = ["message", "clean_message"]
print(df[columns_to_show])

                                             message  \
0  John Smith lives in New York and works at Google.   
1  Contact me at jane.doe@example.com or call 123...   
2  His SSN is 987-65-4320 and birth date is Jan 5...   

                                       clean_message  
0  [REDACTED] lives in [REDACTED] and works at [R...  
1       Contact me at [REDACTED] or call [REDACTED].  
2  His [REDACTED] is [REDACTED] and birth date is...  


In [44]:
# Bare expression: the notebook renders the DataFrame as this cell's output
df

Unnamed: 0,message,clean_message
0,John Smith lives in New York and works at Google.,[REDACTED] lives in [REDACTED] and works at [R...
1,Contact me at jane.doe@example.com or call 123...,Contact me at [REDACTED] or call [REDACTED].
2,His SSN is 987-65-4320 and birth date is Jan 5...,His [REDACTED] is [REDACTED] and birth date is...


### PII removal using Hugging Face

In [45]:
from transformers import pipeline

# Load the Hugging Face NER pipeline once so every call below reuses it.
# aggregation_strategy="simple" replaces the deprecated grouped_entities=True
# flag: it groups token-level predictions into entity spans in the same way,
# without the deprecation warning.
ner_pipeline = pipeline(
    "ner",
    model="dslim/bert-base-NER",
    aggregation_strategy="simple",
)

# Entity groups from the pipeline output that are treated as PII
PII_ENTITY_GROUPS = {"PER", "ORG", "LOC", "MISC"}

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0


In [46]:

def remove_pii_hf(text: str, pii_tags=PII_ENTITY_GROUPS, mask="[REDACTED]") -> str:
    """Mask entities found by the Hugging Face NER pipeline.

    Args:
        text: The text to clean. Non-string values are returned unchanged.
        pii_tags: Collection of ``entity_group`` values that should be masked.
        mask: Replacement string inserted in place of each masked entity.

    Returns:
        ``text`` with every entity whose group is in ``pii_tags`` replaced by
        ``mask``.
    """
    if not isinstance(text, str):
        return text

    detected = ner_pipeline(text)

    # Go from the right-most entity to the left-most one: replacing a span
    # never shifts the offsets of the entities that come before it.
    for entity in sorted(detected, key=lambda item: item['start'], reverse=True):
        if entity["entity_group"] not in pii_tags:
            continue
        begin, finish = entity["start"], entity["end"]
        text = "".join((text[:begin], mask, text[finish:]))

    return text


In [47]:
# Single-string smoke test of the Hugging Face based redaction
text = (
    "John Smith lives in New York and works at Microsoft. "
    "His birthday is on July 5th."
)

clean_text = remove_pii_hf(text)
print(clean_text)

[REDACTED] lives in [REDACTED] and works at [REDACTED]. His birthday is on July 5th.


In [48]:
# Using the redaction function on a DataFrame column
df = pd.Series(
    [
        "Elon Musk is the CEO of Tesla and lives in Texas.",
        "Sundar Pichai was born in India and works at Google.",
    ],
    name="text",
).to_frame()

# Redact each row and store the result in a new column
df["clean_text"] = df["text"].map(remove_pii_hf)
print(df)

                                                text  \
0  Elon Musk is the CEO of Tesla and lives in Texas.   
1  Sundar Pichai was born in India and works at G...   

                                          clean_text  
0  [REDACTED][REDACTED] is the CEO of [REDACTED] ...  
1  [REDACTED] was born in [REDACTED] and works at...  
