In [6]:
import os
import pandas as pd
import numpy as np
import spacy
import pprint
import xlrd
import re
import unidecode
import unicodedata
from presidio_analyzer import AnalyzerEngine, RecognizerRegistry, PatternRecognizer
from presidio_analyzer.nlp_engine import NlpEngineProvider
from presidio_analyzer.pattern import Pattern
from presidio_anonymizer import AnonymizerEngine
from presidio_anonymizer.entities import RecognizerResult, AnonymizerConfig

## Preprocess Functions

In [7]:
# Functions to preprocess text before anonymizing
def replace_accent(s):
    """Return ``s`` with acute/grave accented vowels replaced by plain vowels.

    Meant to be applied to text before it is analyzed. Only the vowels
    a, e, i, o, u (lower and upper case) carrying an acute or a grave accent
    are rewritten; every other character (for instance "ñ" or "ü") is
    returned unchanged.
    """
    accented_vowels = "áàéèíìóòúùÁÀÉÈÍÌÓÒÚÙ"
    plain_vowels = "aaeeiioouuAAEEIIOOUU"
    # Both strings are aligned position by position: accented_vowels[k] -> plain_vowels[k].
    accent_table = str.maketrans(accented_vowels, plain_vowels)
    return s.translate(accent_table)

## Custom Recognizers

# TEST

In [8]:
from anonymize import Anonymizer

In [9]:
# Start with an empty default entity list; the recognizers used in this
# notebook are registered explicitly in the cells that follow.
# NOTE(review): presumably `default_entities=[]` disables built-in detection - confirm in anonymize.py.
anonymizer = Anonymizer(default_entities=[])

### Phone

In [11]:
# Nine-digit phone numbers in three layouts:
#   987654321      -> \d{9}
#   987 654 321    -> (\d{3}[ ]){2}\d{3}
#   987 65 43 21   -> \d{3}[ ](\d{2}[ ]){2}\d{2}
# Raw string: "\d" inside a normal string literal is an invalid escape sequence.
# The second layout previously ended in \d{1}, which matched only "987 654 3"
# and left the last two digits of the number in the anonymized text.
PHONE_REGEX = r"\d{9}|(\d{3}[ ]){2}\d{3}|\d{3}[ ](\d{2}[ ]){2}\d{2}"
anonymizer.add_recognizer_regex(PHONE_REGEX, "MY_PHONE")

In [12]:
# Quick check of the MY_PHONE recognizer on a single string. The recorded output
# is a tuple: the text with the number replaced by <MY_PHONE>, and True
# (presumably the "PII was found" flag - confirm in anonymize.py).
anonymizer.anonymize_text("Mi tlf es 987654321")

('Mi tlf es <MY_PHONE>', True)

In [13]:
# Same check through the DataFrame API (pandas is already imported as `pd`
# in the notebook's first cell, so it is not re-imported here).
# Only the MY_PHONE recognizer is registered at this point; the recorded output
# shows the phone number replaced while the e-mail address is left in the text.
text = "Mi telefono es 963456678 y mi email javispp2.0@gmail.com."
df = pd.DataFrame({"text": [text]})
df = anonymizer.anonymize_dataset(df)
df

Unnamed: 0,text,has_PII
0,Mi telefono es <MY_PHONE> y mi email javispp2....,True


### Surnames List

In [14]:
# "Unfortunate" surnames: upper-case surnames used as the deny list for the
# SURNAMES recognizer. Entries are written without accents (but "Ñ" is kept),
# so accented input has to go through `replace_accent` to match.
# Each surname is listed once; the order of first appearance is preserved.
unfortunate_surnames = [
    "CALLE", "PUERTA", "CALLEJON", "CALZADA", "DE LA CALLE", "ABRIL", "ALCALDE", "ALEGRE",
    "ALEGRIA", "BAJO", "BARRIO", "BAÑOS", "BUENO", "CARO", "CASAS",
    "CIUDAD", "DIAS", "GRACIA", "GRANDE", "IGUAL", "JUSTICIA", "LLAMAS", "MAS", "MAYO", "MERCADO",
    "MIRA", "SOLA", "ESCALERA", "CORTES",
]  # parameter

In [15]:
# Surname recognizer with deny_list: registers the words in `unfortunate_surnames`
# under the entity label "SURNAMES" (rendered as <SURNAMES> in the recorded outputs).
anonymizer.add_recognizer_deny_list(unfortunate_surnames, "SURNAMES")

In [16]:
# The input spells the surname with an accent ("ALEGRÍA") while the deny list
# holds "ALEGRIA"; `replace_accent` is passed as `preprocess` so the two can match
# (presumably it is applied to the text before analysis - confirm in anonymize.py).
# The recorded output shows both the phone number and the surname replaced.
anonymizer.anonymize_text("Mi tlf es 987654321 y mi apellido ALEGRÍA", preprocess=replace_accent)

('Mi tlf es <MY_PHONE> y mi apellido <SURNAMES>', True)

In [17]:
# Same sentence as the single-string example, this time run through the
# DataFrame API with accent stripping enabled.
text = "Mi tlf es 987654321 y mi apellido ALEGRÍA"
df = pd.DataFrame({"text": [text]})
df = anonymizer.anonymize_dataset(df, preprocess=replace_accent)
df

Unnamed: 0,text,has_PII
0,Mi tlf es <MY_PHONE> y mi apellido <SURNAMES>,True
