In [11]:
import os
import pandas as pd
import numpy as np
import spacy
import pprint
import xlrd
import re
import unidecode
import unicodedata
from presidio_analyzer import AnalyzerEngine, RecognizerRegistry, PatternRecognizer
from presidio_analyzer.nlp_engine import NlpEngineProvider
from presidio_analyzer.pattern import Pattern
from presidio_anonymizer import AnonymizerEngine
from presidio_anonymizer.entities import RecognizerResult, AnonymizerConfig

## Preprocess Functions

In [2]:
# Text preprocessing helpers applied before anonymizing
def replace_accent(s): # apply this in analyzer
    """Return ``s`` with acute/grave accented vowels replaced by plain vowels.

    Only the twenty characters á à é è í ì ó ò ú ù (and their uppercase
    forms) are rewritten; every other character, including ``ñ`` and ``ü``,
    is returned untouched.

    Parameters
    ----------
    s : str
        Text to normalize.

    Returns
    -------
    str
        A new string of the same length with the accents removed.
    """
    # Position i of `accented` maps to position i of `plain`.
    accented = "áàéèíìóòúù" "ÁÀÉÈÍÌÓÒÚÙ"
    plain = "aaeeiioouu" "AAEEIIOOUU"
    return s.translate(str.maketrans(accented, plain))

## Custom Recognizers

# TEST

In [3]:
from anonymizer import Anonymizer

In [4]:
# Single Anonymizer instance reused by every example cell below.
anonymizer = Anonymizer()

### Phone

In [5]:
# Phone recognizer using a regex as input.
# The pattern is written as raw strings so that `\d` reaches the regex engine
# untouched instead of being parsed as an (invalid) Python string escape.
# Every alternative covers a full 9-digit number:
PHONE_REGEX = (
    r"\d{9}"                          # 963456678
    r"|(\d{3}[ ]){2}(\d{3})"          # 963 456 678 (was \d{1}, which left the last two digits exposed)
    r"|\d{3}[ ](\d{2}[ ]){2}(\d{2})"  # 963 45 66 78
)
phone_recognizer = anonymizer.recognizer_regex(PHONE_REGEX, "MY_PHONE", "es")

# Only MY_PHONE is requested, so the e-mail address is expected to stay as-is.
anonymizer.anonymize_text(
    "Mi telefono es 963456678 y mi email javispp2.0@gmail.com.",
    recognizers=[phone_recognizer],
    entities=["MY_PHONE"],
)

'Mi telefono es MY_PHONE y mi email javispp2.0@gmail.com.'

In [6]:
# Same phone example, this time through the DataFrame API.
# pandas is already imported (as `pd`) in the notebook's import cell, so it is
# not imported again here.
text = "Mi telefono es 963456678 y mi email javispp2.0@gmail.com."

# Keep the input frame under its own name so the anonymized result does not
# overwrite the data it was computed from.
text_df = pd.DataFrame(data=[text], columns=["text"])
df = anonymizer.anonymize_dataset(text_df, recognizers=[phone_recognizer], entities=["MY_PHONE"])
df

Unnamed: 0,text,has_PII
0,Mi telefono es MY_PHONE y mi email javispp2.0@...,True


### Surnames

In [7]:
# Build the surname deny list from the surname-frequency spreadsheet.
MIN_SURNAME_FREQUENCY = 2000  # parameter: keep surnames whose "Unnamed: 2" value is at least this
MIN_SURNAME_LENGTH = 4        # parameter: shorter entries are not treated as surnames

surname_ds = pd.read_excel('../data/apellidos_frecuencia.xls', sheet_name=0)
surname_ds = surname_ds.iloc[4:]  # skip the caption rows at the top of the sheet
surname_ds = surname_ds[surname_ds["Unnamed: 2"].astype("int") >= MIN_SURNAME_FREQUENCY]

# Candidate surnames, taken from the "Unnamed: 1" column.
surname_blacklist = surname_ds["Unnamed: 1"].to_list()

# Entries too short to be considered surnames.
not_meaningful_surnames = [
    surname for surname in surname_blacklist if len(surname) < MIN_SURNAME_LENGTH
]

# Surnames that double as everyday words and are therefore excluded.
unfortunate_surnames=["CALLE", "PUERTA", "CALLEJON", "CALZADA", "DE LA CALLE", "ABRIL", "ALCALDE", "ALEGRE",
                        "ALEGRIA", "BAJO", "BARRIO", "BAÑOS", "BUENO", "CALLE", "CALLEJON", "CALZADA", "CARO", "CASAS",
                        "CIUDAD", "DIAS", "GRACIA", "GRANDE", "IGUAL", "JUSTICIA", "LLAMAS", "MAS", "MAYO", "MERCADO", 
                        "MIRA", "SOLA", "ESCALERA", "CORTES"] # parameter

# Drop both exclusion groups from the candidates.
surname_blacklist = list(
    set(surname_blacklist) - set(not_meaningful_surnames) - set(unfortunate_surnames)
)

# Also add each surname with everything after its first character lowercased.
surname_blacklist += [surname[0] + surname[1:].lower() for surname in surname_blacklist]

In [8]:
# Deny-list recognizer built from the surname blacklist.
surname_recognizer = anonymizer.recognizer_deny_list(surname_blacklist, "SURNAMES", "es")

# replace_accent is passed as the preprocess step for the input text.
anonymizer.anonymize_text(
    "Mi nombre es Javier López Sanz",
    recognizers=[surname_recognizer],
    entities=["SURNAMES"],
    preprocess=replace_accent,
)

'Mi nombre es Javier SURNAMES SURNAMES'

## Using datasets

In [12]:
# One-row dataset containing both surnames and a phone number.
df = pd.DataFrame(data=["Javier López Sanz con tlf: 987654321"], columns=["text"])

# Run both custom recognizers over the dataset, with accent preprocessing.
anonymizer.anonymize_dataset(
    df,
    recognizers=[phone_recognizer, surname_recognizer],
    entities=["SURNAMES", "MY_PHONE"],
    preprocess=replace_accent,
)

Unnamed: 0,text,has_PII
0,Javier SURNAMES SURNAMES con tlf: MY_PHONE,True
