In [1]:
import os
import pandas as pd
import numpy as np
import spacy
import pprint
import xlrd
import re
from presidio_analyzer import AnalyzerEngine, RecognizerRegistry, PatternRecognizer
from presidio_analyzer.nlp_engine import NlpEngineProvider
from presidio_analyzer.pattern import Pattern
from presidio_anonymizer import AnonymizerEngine
from presidio_anonymizer.entities.engine import OperatorConfig
from presidio_analyzer.recognizer_result import RecognizerResult

The history saving thread hit an unexpected error (DatabaseError('database disk image is malformed',)).History will not be written to the database.


# Preprocess Functions

In [2]:
# Functions to preprocess text before anonymizing
def replace_accent(s): # apply this in analyzer
    """Return ``s`` with acute- and grave-accented vowels replaced by plain vowels.

    Only the twenty precomposed characters listed below are mapped (lower and
    upper case a/e/i/o/u with an acute or grave accent). Every other character,
    including "ñ"/"Ñ" and "ü", passes through untouched. Each accented
    character maps to exactly one plain character, so the result always has
    the same length as the input.

    Args:
        s: Text to normalise (a ``str``).

    Returns:
        A new string with the substitutions applied.
    """
    # The two strings are aligned position by position: accented[k] -> plain[k].
    accented = "áàéèíìóòúùÁÀÉÈÍÌÓÒÚÙ"
    plain = "aaeeiioouuAAEEIIOOUU"
    return s.translate(str.maketrans(accented, plain))

## Test

In [3]:
from anonymize import Anonymizer

In [4]:
# Build the Anonymizer wrapper imported from the local `anonymize` module.
# "es_core_news_md" is the same model name later passed to spacy.load();
# IP_ADDRESS is the only entity enabled up front — the custom recognizers
# (MY_PHONE, SURNAMES) are registered in the cells below.
anonymizer = Anonymizer(
    model_name="es_core_news_md",
    default_entities=["IP_ADDRESS"],
)

### Phone

In [5]:
# Register a regex recognizer reported as entity "MY_PHONE".
# The pattern matches nine-digit numbers whose first digit is 9, 8, 7 or 6:
#   (?<= )                 the number must be preceded by a space, so a number
#                          at the very start of the text is not matched
#   [9876]                 leading digit
#   (\d[\s-]*){7}\d        eight more digits; whitespace or hyphens may follow
#                          each of digits 2-8
#   (?=[^a-zA-Z0-9]+|$)    followed by a non-alphanumeric character or end of text
# Written as one raw string: the original middle fragment was a normal string
# literal, where "\d" and "\s" are invalid escape sequences.
anonymizer.add_recognizer_regex(
    r"(?<= )([9876](\d[\s-]*){7}\d)(?=[^a-zA-Z0-9]+|$)",
    "MY_PHONE",
)

In [6]:
# Smoke test for the MY_PHONE recognizer on a single string. The recorded
# output is a (anonymized_text, flag) tuple with the number replaced by the
# entity name and the flag set to True.
anonymizer.anonymize_text("Mi tlf es 987654321")

('Mi tlf es MY_PHONE', True)

In [7]:
# Same check through the DataFrame API, now with a phone number and an IP
# address (IP_ADDRESS is the entity enabled when the anonymizer was built).
# pandas is already imported as `pd` in the notebook's first cell, so it is
# not re-imported here; the one-row input frame is built inline so `df` is
# only ever bound to the anonymized result.
text = "Mi telefono es 963456678 y mi IP 140.184.234.132"
df = anonymizer.anonymize_dataset(pd.DataFrame({"text": [text]}))
df

Unnamed: 0,text,has_PII
0,Mi telefono es MY_PHONE y mi IP IP_ADDRESS,True


### Surnames List

In [8]:
# Unfortunate surnames: surnames that are also ordinary Spanish words, used as
# the deny list for the SURNAMES recognizer. Entries are upper-case and carry
# no acute/grave accents, so accented input (e.g. "ALEGRÍA") only matches after
# being passed through `replace_accent`. Each surname is listed exactly once,
# in order of first appearance.
unfortunate_surnames = [
    "CALLE", "PUERTA", "CALLEJON", "CALZADA", "DE LA CALLE", "ABRIL", "ALCALDE", "ALEGRE",
    "ALEGRIA", "BAJO", "BARRIO", "BAÑOS", "BUENO", "CARO", "CASAS",
    "CIUDAD", "DIAS", "GRACIA", "GRANDE", "IGUAL", "JUSTICIA", "LLAMAS", "MAS", "MAYO", "MERCADO",
    "MIRA", "SOLA", "ESCALERA", "CORTES",
] # parameter

In [9]:
# Surname recognizer with deny_list: register the words in
# `unfortunate_surnames` under the entity name "SURNAMES". The recorded
# outputs of the following cells show matches replaced by that name.
anonymizer.add_recognizer_deny_list(unfortunate_surnames, "SURNAMES")

In [10]:
# `preprocess=replace_accent` removes the accent from "ALEGRÍA" so it can match
# the unaccented deny-list entry "ALEGRIA"; the recorded output shows both the
# phone number and the surname replaced by their entity names.
anonymizer.anonymize_text("Mi tlf es 987654321 y mi apellido ALEGRÍA", preprocess=replace_accent)

('Mi tlf es MY_PHONE y mi apellido SURNAMES', True)

In [11]:
# Same accent-insensitive surname check, this time through the DataFrame API
# with a one-row frame holding the sample sentence in a "text" column.
text = "Mi tlf es 987654321 y mi apellido ALEGRÍA"
df = anonymizer.anonymize_dataset(
    pd.DataFrame({"text": [text]}),
    preprocess=replace_accent,
)
df

Unnamed: 0,text,has_PII
0,Mi tlf es MY_PHONE y mi apellido SURNAMES,True


### Improving surnames with spaCy

In [12]:
def contained(start, end, interval_list):
    """Tell whether ``[start, end]`` lies inside any interval of ``interval_list``.

    Args:
        start: Start offset of the span to test.
        end: End offset of the span to test.
        interval_list: Iterable of indexable records whose item ``1`` is the
            interval start and item ``2`` the interval end, e.g. the
            ``(text, start_char, end_char, label)`` tuples built from spaCy
            entities in this notebook.

    Returns:
        ``True`` if at least one record satisfies
        ``record[1] <= start`` and ``end <= record[2]``; ``False`` otherwise,
        including when ``interval_list`` is empty.
    """
    # Generator instead of a list: stop at the first enclosing interval rather
    # than evaluating every record up front.
    return any(start >= entry[1] and end <= entry[2] for entry in interval_list)

In [13]:
# Sample text for the spaCy experiment: it contains the deny-listed surname
# "CASAS" directly after a first name.
texto="Y pues eso, que se cayo de la cama mi hijo Pepe CASAS. Olesa de Montserrat-Viladecavalls dice lo contrario"

In [14]:
# Run the spaCy pipeline over the sample text. `spacy` is already imported in
# the notebook's first cell, so it is not re-imported here.
nlp = spacy.load("es_core_news_md")
doc = nlp(texto)

# Document-level entities: keep only the spans labelled PER, stored as
# (text, start_char, end_char, label) tuples — `contained` reads items 1 and 2.
spacy_ents = [
    (ent.text, ent.start_char, ent.end_char, ent.label_)
    for ent in doc.ents
    if ent.label_ == "PER"
]

In [15]:
# Inspect the PER spans and their character offsets.
spacy_ents

[('Pepe CASAS', 43, 53, 'PER'),
 ('Olesa de Montserrat-Viladecavalls', 55, 88, 'PER')]

In [16]:
# Run the anonymizer's underlying analyzer restricted to the SURNAMES entity,
# to get the raw deny-list hits (with offsets) before any replacement happens.
analyzer_res = anonymizer.analyzer.analyze(
    texto,
    language=anonymizer.language,
    entities=["SURNAMES"],
)
analyzer_res

[type: SURNAMES, start: 48, end: 53, score: 1]

In [17]:
# Keep a deny-list hit only when its span falls inside a span that spaCy
# labelled as a person (PER); each kept hit is re-wrapped as a SURNAMES result
# with score 1.
surnames_spacy = [
    RecognizerResult(entity_type="SURNAMES", start=hit.start, end=hit.end, score=1)
    for hit in analyzer_res
    if contained(hit.start, hit.end, spacy_ents)
]

In [18]:
# Anonymize the sample text using only the spaCy-confirmed surname hits, with
# the operator configuration held by the wrapper; `.text` takes the resulting
# string from the returned result object.
anonymizer.anonymizer.anonymize(text=texto,
                                analyzer_results=surnames_spacy,
                                operators=anonymizer.anonymizers_config).text

'Y pues eso, que se cayo de la cama mi hijo Pepe SURNAMES. Olesa de Montserrat-Viladecavalls dice lo contrario'