In [73]:
import spacy
import re


In [74]:

# Load the spaCy pipeline that redact_pii() uses for named-entity recognition.
SPACY_MODEL_NAME = "en_core_web_sm"
nlp = spacy.load(SPACY_MODEL_NAME)


In [75]:

# Example text used to exercise the redaction code below. It contains a name,
# an email address, street addresses written with a full and an abbreviated
# state name, ZIP codes with and without a preceding "zip" cue, a 5-digit
# number that is not a ZIP code, a phone number, an SSN, a credit card number,
# a date of birth, a nationality and an IP address.
text = """
John Doe called in to report that he lost his credit card.
John Doe's email is john.doe@example.com. 
His address is 123 Main Street, Columbus, Ohio 43240. 
This can also be stated as 123 Main St., Columbus, OH 43241.
Zip code 43333.
Now lets see if it can find this zipcode 43701
Another Zip code is 43065-1234.
He has 43054 chickens and chickens are not zip codes.
You can reach him at 555-123-4567. 
His SSN is 123-45-6789. 
His Credit Card number is 1234-5678-1234-5678.
He was born on 01/01/1983. 
He is a U.S. citizen.
His ip address is 192.168.1.1.
"""

#print("Original text:" + text)

In [76]:
# Two-letter abbreviations of the 50 U.S. states mapped to the full state name.
# The insertion order is significant: it determines the order of the
# alternation in state_abbr_pattern below.
state_mapping = dict([
    ('AL', 'Alabama'), ('AK', 'Alaska'), ('AZ', 'Arizona'), ('AR', 'Arkansas'),
    ('CA', 'California'), ('CO', 'Colorado'), ('CT', 'Connecticut'), ('DE', 'Delaware'),
    ('FL', 'Florida'), ('GA', 'Georgia'), ('HI', 'Hawaii'), ('ID', 'Idaho'),
    ('IL', 'Illinois'), ('IN', 'Indiana'), ('IA', 'Iowa'), ('KS', 'Kansas'),
    ('KY', 'Kentucky'), ('LA', 'Louisiana'), ('ME', 'Maine'), ('MD', 'Maryland'),
    ('MA', 'Massachusetts'), ('MI', 'Michigan'), ('MN', 'Minnesota'), ('MS', 'Mississippi'),
    ('MO', 'Missouri'), ('MT', 'Montana'), ('NE', 'Nebraska'), ('NV', 'Nevada'),
    ('NH', 'New Hampshire'), ('NJ', 'New Jersey'), ('NM', 'New Mexico'), ('NY', 'New York'),
    ('NC', 'North Carolina'), ('ND', 'North Dakota'), ('OH', 'Ohio'), ('OK', 'Oklahoma'),
    ('OR', 'Oregon'), ('PA', 'Pennsylvania'), ('RI', 'Rhode Island'), ('SC', 'South Carolina'),
    ('SD', 'South Dakota'), ('TN', 'Tennessee'), ('TX', 'Texas'), ('UT', 'Utah'),
    ('VT', 'Vermont'), ('VA', 'Virginia'), ('WA', 'Washington'), ('WV', 'West Virginia'),
    ('WI', 'Wisconsin'), ('WY', 'Wyoming'),
])

# Regex matching an upper-case state abbreviation as a whole word when it is
# followed (without consuming it) by one of:
#   * whitespace and a 5-digit ZIP code, optionally with a -NNNN suffix,
#   * a comma,
#   * whitespace and a purely alphabetic word.
# NOTE(review): the last alternative accepts any following word, so upper-case
# tokens such as "OK", "IN", "OR" or "ME" in ordinary prose also match.
state_abbr_pattern = ''.join([
    r'\b(',
    '|'.join(state_mapping),
    r')\b',
    r'(?=\s+\d{5}(?:-\d{4})?|,|\s+\b[A-Za-z]+\b)',
])


In [77]:

# Expand two-letter state abbreviations into full state names
def replace_state_abbreviations(text):
    """Return ``text`` with matched state abbreviations spelled out.

    Every match of the module-level ``state_abbr_pattern`` is looked up in
    ``state_mapping``; a match with no entry in the mapping is left unchanged.

    Args:
        text: The string to process.

    Returns:
        A new string with the substitutions applied.
    """
    return re.sub(
        state_abbr_pattern,
        lambda found: state_mapping.get(found.group(0), found.group(0)),
        text,
    )

In [78]:
# Function to correctly find zip codes vs. other numbers
def is_likely_zip_code(text, match):
    """Determine if a matched number is likely a ZIP code based on its context.

    Only the (up to) five whitespace-separated words immediately before the
    match are inspected, case-insensitively. The number is treated as a ZIP
    code when one of those words is a "zip" cue (``zip``, ``zipcode``,
    ``zip_code``) or a street-type keyword (``street``, ``st``, ``ave``, ...).

    Args:
        text: The full string in which ``match`` was found.
        match: The regex match object for the candidate number; only its
            ``start()`` offset is used.

    Returns:
        True if a cue word precedes the number, otherwise False.
    """
    zip_variations = {'zip', 'zipcode', 'zip_code'}
    address_keywords = {
        'street', 'st', 'avenue', 'ave', 'boulevard', 'blvd', 'road', 'rd',
        'lane', 'ln', 'drive', 'dr', 'court', 'ct', 'place', 'pl', 'circle', 'cir',
    }

    # The last five words before the number form the context window.
    before_words = text[:match.start()].lower().split()[-5:]

    cleaned_before_words = set()
    for word in before_words:
        # Drop commas and periods anywhere in the word ("st.," -> "st") ...
        word = word.replace(',', '').replace('.', '')
        # ... then strip any other surrounding punctuation so that cues such as
        # "zip:" or "(zipcode:" are still recognised. Underscores are kept
        # because they are part of the "zip_code" variation.
        cleaned_before_words.add(re.sub(r'^\W+|\W+$', '', word))

    # A "zip" cue or a street-type keyword in the window marks a ZIP code;
    # anything else is assumed to be an ordinary number.
    return not cleaned_before_words.isdisjoint(zip_variations | address_keywords)

In [79]:

# Function to redact PII
def redact_pii(text):
    """Redact personally identifiable information from ``text``.

    Processing order:
      1. State abbreviations are expanded via ``replace_state_abbreviations``.
      2. ZIP-code candidates are redacted only where ``is_likely_zip_code``
         accepts their context.
      3. The remaining regex patterns are applied in dictionary order.
      4. spaCy entities (module-level ``nlp``) whose label is listed in
         ``ner_labels`` are redacted.

    Args:
        text: The string to redact.

    Returns:
        A new string in which detected items are replaced by placeholders of
        the form ``[<Label>_REDACTED_By_Pattern]`` or
        ``[<LABEL>_REDACTED_By_Spacy]``.
    """
    # Regex patterns for various PII items (comment out or remove to keep from redacting specific patterns)
    patterns = {
        'Zip_Code': r'\b\d{5}(?:-\d{4})?\b',
        # TLD class is letters only; a '|' inside [...] would be a literal pipe.
        'Email': r'[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}',
        'Phone': r'\b\d{3}[-.\s]?\d{3}[-.\s]?\d{4}\b',
        'SSN': r'\b\d{3}-\d{2}-\d{4}\b',
        'Credit_Card': r'\b(?:\d{4}[-.\s]?){3}\d{4}\b',
        'DOB': r'\b\d{2}[/-]\d{2}[/-]\d{4}\b',
        'IP_Address': r'\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b',
        # The street suffix must end at a word boundary so that e.g. "Stop" or
        # "Plaza" is not cut at "St"/"Pl". The optional unit and number only
        # consume whitespace when they are actually present, so the space
        # after an address is preserved.
        'Address': (
            r'\d+\s+\w+\s+'
            r'(?:Street|St|Avenue|Ave|Boulevard|Blvd|Road|Rd|Lane|Ln|Drive|Dr|Court|Ct|Circle|Cir|Place|Pl)\b\.?'
            r'(?:\s+(?:Apt|Apartment|Unit)\b\.?)?'
            r'(?:\s*\d+)?'
        ),
    }

    # Replace state abbreviations with full state names
    modified_text = replace_state_abbreviations(text)

    # Apply the Zip_Code pattern first with contextual logic. A substitution
    # callback evaluates every candidate at its own position, so repeated or
    # embedded numbers are never confused with an earlier occurrence.
    def redact_zip(zip_match):
        if is_likely_zip_code(zip_match.string, zip_match):
            return '[Zip_Code_REDACTED_By_Pattern]'
        return zip_match.group(0)

    # .get() keeps the function working when the Zip_Code entry is removed.
    zip_code_pattern = patterns.get('Zip_Code')
    if zip_code_pattern:
        modified_text = re.sub(zip_code_pattern, redact_zip, modified_text)

    # Apply each remaining pattern in the order they are defined in the dictionary
    for label, pattern in patterns.items():
        if label == 'Zip_Code':  # Already handled above with contextual logic
            continue
        modified_text = re.sub(pattern, f'[{label}_REDACTED_By_Pattern]', modified_text)

    # Spacy for NER (Names, Locations, etc.)
    doc = nlp(modified_text)

    # List of terms to exclude from redaction to avoid mistakenly redacting non-PII
    exclude_terms = ['SSN', 'Credit Card', 'Zip']

    # List of Spacy NER labels to redact (comment out or remove to keep from redacting specific labels)
    ner_labels = [
        'PERSON',        # People, including fictional characters
        'NORP',          # Nationalities, religious or political groups
        'FAC',           # Buildings, airports, highways, bridges, etc.
        'ORG',           # Companies, agencies, institutions, etc.
        'GPE',           # Countries, cities, states
        'LOC',           # Non-GPE locations, mountain ranges, bodies of water
        'PRODUCT',       # Objects, vehicles, foods, etc. (not services)
        'EVENT',         # Named events, historical or cultural moments
        'WORK_OF_ART',   # Titles of books, songs, paintings, etc.
        'LAW',           # Named legal documents or laws
        'LANGUAGE',      # Named languages
        'DATE',          # Absolute or relative dates or periods
        'TIME',          # Times smaller than a day
        'PERCENT',       # Percentage values
        'MONEY',         # Monetary values, including unit
        'QUANTITY',      # Measurements, as of weight or distance
        'ORDINAL'        # "First," "second," etc.
        #'CARDINAL'       # Numerals that do not fall under another type
    ]

    # Entities to redact: wanted label, not an excluded term, not blank.
    entities = [
        ent for ent in doc.ents
        if ent.label_ in ner_labels
        and ent.text not in exclude_terms
        and ent.text.strip()
    ]

    # Pass 1: redact every recognised entity at its exact character span.
    # Rebuilding the string from offsets (instead of str.replace) guarantees
    # that only the recognised span is touched.
    pieces = []
    cursor = 0
    label_by_text = {}
    for ent in entities:
        pieces.append(modified_text[cursor:ent.start_char])
        pieces.append(f'[{ent.label_}_REDACTED_By_Spacy]')
        cursor = ent.end_char
        label_by_text.setdefault(ent.text, ent.label_)
    pieces.append(modified_text[cursor:])
    modified_text = ''.join(pieces)

    # Pass 2: redact any remaining whole-word mentions of the same entity text
    # that the model did not tag. Longest texts go first so "John Doe" is
    # handled before "John"; the look-arounds stop the match from firing
    # inside a longer word or inside an already inserted placeholder.
    for ent_text in sorted(label_by_text, key=len, reverse=True):
        placeholder = f'[{label_by_text[ent_text]}_REDACTED_By_Spacy]'
        modified_text = re.sub(
            r'(?<!\w)' + re.escape(ent_text) + r'(?!\w)',
            lambda _found, replacement=placeholder: replacement,
            modified_text,
        )

    return modified_text



In [80]:

# Redact PII from the sample text and print the input next to the result
redacted_text = redact_pii(text)
print(f'Original text: {text}')
print(f'Redacted text: {redacted_text}')


Orginal text: 
John Doe called in to report that he lost his credit card.
John Doe's email is john.doe@example.com. 
His address is 123 Main Street, Columbus, Ohio 43240. 
This can also be stated as 123 Main St., Columbus, OH 43241.
Zip code 43333.
Now lets see if it can find this zipcode 43701
Another Zip code is 43065-1234.
He has 43054 chickens and chickens are not zip codes.
You can reach him at 555-123-4567. 
His SSN is 123-45-6789. 
His Credit Card number is 1234-5678-1234-5678.
He was born on 01/01/1983. 
He is a U.S. citizen.
His ip address is 192.168.1.1.

Redacted text: 
[PERSON_REDACTED_By_Spacy] called in to report that he lost his credit card.
[PERSON_REDACTED_By_Spacy]'s email is [Email_REDACTED_By_Pattern]. 
His address is [Address_REDACTED_By_Pattern], [GPE_REDACTED_By_Spacy], [GPE_REDACTED_By_Spacy] [Zip_Code_REDACTED_By_Pattern]. 
This can also be stated as [Address_REDACTED_By_Pattern], [GPE_REDACTED_By_Spacy], [GPE_REDACTED_By_Spacy] [Zip_Code_REDACTED_By_Pattern].
