# Install

In [3]:
%pip install presidio-analyzer
%pip install presidio-anonymizer
!python -m spacy download en_core_web_lg

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Note: you may need to restart the kernel to use updated packages.
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Note: you may need to restart the kernel to use updated packages.
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting en-core-web-lg==3.6.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.6.0/en_core_web_lg-3.6.0-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m68.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.6.0
✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_lg')


# Import

In [1]:
from presidio_analyzer import AnalyzerEngine
from presidio_anonymizer import AnonymizerEngine

# Detection

### Name / Email / URL

In [13]:
# Sample sentence containing a person's name and an e-mail address.
text = "I suspect Professor Plum, in the Dining Room, with the candlestick, whose email address is plum@hot.com."

# Build the analyzer once; the cells below reuse this `analyzer` instance.
analyzer = AnalyzerEngine()

# Run every available recognizer (no `entities` filter) on the English text.
analyzer_results = analyzer.analyze(
    text=text,
    language="en",
)

# Each finding is printed as: entity type, start/end character offsets, score.
print(analyzer_results)

[type: EMAIL_ADDRESS, start: 91, end: 103, score: 1.0, type: PERSON, start: 20, end: 24, score: 0.85, type: URL, start: 96, end: 103, score: 0.5]


### Phone Number

In [12]:
import pprint

text = "His name is Mr. Jones and his phone number is 212-555-5555"

# Ask the analyzer to also record how each finding was reached; the loop below
# reads that record from each result's `analysis_explanation` attribute.
analyzer_results = analyzer.analyze(text=text, language="en", return_decision_process=True)

print(analyzer_results)

pp = pprint.PrettyPrinter()
print("Decision process output:\n")
# Iterate over every finding instead of indexing [0] and [1]: fixed indices
# raise IndexError when fewer than two entities are detected and silently skip
# any additional ones.
for result in analyzer_results:
    pp.pprint(result.analysis_explanation.__dict__)

[type: PERSON, start: 16, end: 21, score: 0.85, type: PHONE_NUMBER, start: 46, end: 58, score: 0.75]
Decision process output:

{'original_score': 0.85,
 'pattern': None,
 'pattern_name': None,
 'recognizer': 'SpacyRecognizer',
 'score': 0.85,
 'score_context_improvement': 0,
 'supportive_context_word': '',
 'textual_explanation': "Identified as PERSON by Spacy's Named Entity "
                        'Recognition',
 'validation_result': None}
{'original_score': 0.4,
 'pattern': None,
 'pattern_name': None,
 'recognizer': 'ABCMeta',
 'score': 0.75,
 'score_context_improvement': 0.35,
 'supportive_context_word': 'phone',
 'textual_explanation': 'Recognized as US region phone number, using '
                        'PhoneRecognizer',
 'validation_result': None}


### Postcode

In [8]:
# Try a US zip code; in the recorded run no entity was reported for this
# sentence (the printed list is empty).
results = analyzer.analyze(
    text="My zip code is 90210",
    language="en",
)

print(f"Result:\n {results}")

Result:
 []


In [25]:
# Try a sentence whose only candidate entity is a first name; in the recorded
# run the analyzer returned no findings for it.
results = analyzer.analyze(
    text="my name is Baptiste",
    language="en",
)

print(f"Result:\n {results}")

Result:
 []


# Anonymization

In [5]:
text = "My phone number is 212-555-5555"

# Set up the engine, loads the NLP module (spaCy model by default)
# and other PII recognizers
analyzer = AnalyzerEngine()

# Restrict detection to phone numbers only
results = analyzer.analyze(text=text, entities=["PHONE_NUMBER"], language="en")
print(f"Analysis: {results}")

# Analyzer results are passed to the AnonymizerEngine for anonymization.
# No `operators` are given here; in the recorded run the finding is replaced
# by its entity type in angle brackets (<PHONE_NUMBER>).
anonymizer = AnonymizerEngine()

anonymized_text = anonymizer.anonymize(text=text, analyzer_results=results)

# Label corrected from the misspelled "Anomymize".
print(f"Anonymized: {anonymized_text}")

Analysis: [type: PHONE_NUMBER, start: 19, end: 31, score: 0.75]
Anomymize: text: My phone number is <PHONE_NUMBER>
items:
[
    {'start': 19, 'end': 33, 'entity_type': 'PHONE_NUMBER', 'text': '<PHONE_NUMBER>', 'operator': 'replace'}
]



In [29]:
from presidio_anonymizer.entities import (
    OperatorConfig,
    RecognizerResult,
)

# Text to analyze and then anonymize
text_to_anonymize = "His name is Mr. Jones and his phone number is 212-555-5555"

# Detect entities with the analyzer created in an earlier cell
analyzer_results = analyzer.analyze(
    text=text_to_anonymize,
    language='en',
)
print(analyzer_results)

# One operator per entity type; "DEFAULT" is the catch-all entry
# (in the recorded run the PERSON finding is rendered as <ANONYMIZED>).
operators = {
    # Replace the finding with a fixed placeholder
    "DEFAULT": OperatorConfig("replace", {"new_value": "<ANONYMIZED>"}),
    # Mask 12 characters with "*", counting from the end of the finding
    "PHONE_NUMBER": OperatorConfig(
        "mask",
        {"type": "mask", "masking_char": "*", "chars_to_mask": 12, "from_end": True},
    ),
    # Redact TITLE findings (none appear in the recorded analyzer output)
    "TITLE": OperatorConfig("redact", {}),
}

# Apply the operators using the anonymizer created in an earlier cell
anonymized_results = anonymizer.anonymize(
    text=text_to_anonymize,
    analyzer_results=analyzer_results,
    operators=operators,
)

print(f"text: {anonymized_results.text}")


[type: PERSON, start: 16, end: 21, score: 0.85, type: PHONE_NUMBER, start: 46, end: 58, score: 0.75]
text: His name is Mr. <ANONYMIZED> and his phone number is ************
