# Use named entity recognition (NER) to detect PII in a prompt or response


<img src="../assets/NER.png" width="1150" align="center">

In [None]:
!pip install pip-system-certs -q
!pip install spacy presidio_anonymizer presidio_analyzer -q
!python -m spacy download en_core_web_lg

In [6]:
import os
# NOTE(review): an empty CURL_CA_BUNDLE is presumably here so that HTTP clients
# honoring this variable skip TLS certificate verification (e.g. behind an
# intercepting proxy). Confirm it is still required -- it weakens transport
# security for every download made from this kernel.
os.environ['CURL_CA_BUNDLE'] = ''
import warnings
# Install a blanket "ignore" filter so warnings do not clutter cell output.
# This is done before pandas is imported, so the order of these lines matters.
warnings.filterwarnings('ignore')
import pandas as pd
# Show full column contents instead of truncating long strings in DataFrames.
pd.set_option('display.max_colwidth', None)

In [7]:
import spacy
from spacy import displacy

# Load the large English spaCy pipeline (downloaded in the install cell).
nlp = spacy.load("en_core_web_lg")

# Adjacent string literals are concatenated by the parser, so the sentence can
# span two source lines without the continuation line's indentation ending up
# inside the text (which is what a backslash *inside* the quotes does).
text = (
    "Applicant's name is John Doe and he lives in Silver St. "
    "and his phone number is 555-123-1290"
)
doc = nlp(text)

# Render the recognized entities inline in the notebook output.
displacy.render(doc, style="ent", jupyter=True)

# Plain-text alternative to the rendered view:
# for ent in doc.ents:
#     print(ent.text, ent.start_char, ent.end_char, ent.label_)

In [8]:
from presidio_analyzer import AnalyzerEngine
from presidio_anonymizer import AnonymizerEngine

# Sample input. The triple-quoted literal keeps a leading and a trailing
# newline, so every reported offset is relative to that leading newline.
text = """
Applicant's name is John Doe and he lives in Silver St. and his phone number is 555-123-1290.
"""

# Build the analyzer; this loads the NLP module (spaCy model by default)
# together with the other PII recognizers.
analyzer = AnalyzerEngine()

# Run detection on the sample text. To restrict detection to specific entity
# types, additionally pass e.g. entities=["PHONE_NUMBER"].
results = analyzer.analyze(text=text, language="en")

# One line per finding: entity type, character span, confidence score.
for result in results:
    print(
        f"PII Type={result.entity_type}, "
        f"Start={result.start}, "
        f"End={result.end}, "
        f"Score={result.score}"
    )


PII Type=PERSON, Start=21, End=29, Score=0.85
PII Type=LOCATION, Start=46, End=56, Score=0.85
PII Type=PHONE_NUMBER, Start=81, End=93, Score=0.75


In [9]:
from IPython.display import display, HTML
import hashlib

class Result:
    """Plain record describing one detected entity span.

    Attributes:
        entity_type: Label of the detected entity (e.g. ``"PERSON"``).
        start: Index in the text where the span begins.
        end: Index in the text where the span ends.
        score: Confidence value attached to the detection.
    """

    def __init__(
        self,
        entity_type: str,
        start: int,
        end: int,
        score: float,
    ) -> None:
        # Store the four fields verbatim; no validation is performed.
        self.entity_type = entity_type
        self.start = start
        self.end = end
        self.score = score

def type_to_color(entity_type):
    """Map an entity type name to a stable ``#rrggbb`` color string.

    The color is the first six hex digits of the MD5 digest of the name, so
    the same entity type always produces the same color.
    """
    digest = hashlib.md5(entity_type.encode()).hexdigest()
    return f"#{digest[:6]}"

def annotate_text(text, results):
    """Return ``text`` as an HTML string with each detected span highlighted.

    Args:
        text: The raw text that was analyzed.
        results: Iterable of objects exposing ``entity_type``, ``start``,
            ``end`` and ``score``. It is not modified.

    Returns:
        An HTML fragment in which every detected span is wrapped in a
        ``<mark>`` element followed by a ``[ ENTITY_TYPE ]`` label, with the
        score available as a tooltip.

    Notes:
        * All text and labels are HTML-escaped, so characters such as ``<``
          or ``&`` in the input are displayed literally instead of being
          interpreted as markup.
        * Spans that overlap an already-emitted span are skipped so that no
          part of the source text is duplicated in the output.
    """
    import html  # local import keeps this cell self-contained

    mark_style = (
        "display: inline-block; border-radius: 0.25em; padding: 0.25em 0.5em; "
        "margin: 0 0.25em; line-height: 1; background-color: cyan; "
        "border: 1px solid #bbb;"
    )
    label_style = "font-size: 0.8em; font-weight: bold;"

    # Work on a sorted copy so the caller's list keeps its order. For spans
    # that start at the same position, the widest one comes first.
    ordered = sorted(results, key=lambda r: (r.start, -r.end))

    pieces = []
    prev_end = 0
    for result in ordered:
        if result.start < prev_end:
            # Overlaps the previous highlight; emitting it would repeat text.
            continue

        # Plain text between the previous entity and this one.
        pieces.append(html.escape(text[prev_end:result.start], quote=False))

        entity = html.escape(text[result.start:result.end], quote=False)
        label = html.escape(str(result.entity_type), quote=False)
        score = html.escape(str(result.score))  # attribute value: escape quotes too
        pieces.append(
            f"<mark style='{mark_style}' title='Score: {score}'>{entity} "
            f"<span style='{label_style}'>[ {label} ]</span></mark>"
        )
        prev_end = result.end

    # Whatever follows the last entity.
    pieces.append(html.escape(text[prev_end:], quote=False))
    return "".join(pieces)

In [10]:
# Build the highlighted HTML for the analyzer findings and render it inline.
annotated = annotate_text(text=text, results=results)
display(HTML(data=annotated))

In [11]:
from presidio_anonymizer import AnonymizerEngine

# With no operators supplied, each detected span is replaced by its entity
# type in angle brackets (see the output below).
anonymizer = AnonymizerEngine()
anonymized_text = anonymizer.anonymize(
    text=text,
    analyzer_results=results,
)
print(anonymized_text.text)


Applicant's name is <PERSON> and he lives in <LOCATION> and his phone number is <PHONE_NUMBER>.



In [12]:
from presidio_anonymizer import AnonymizerEngine
from presidio_anonymizer.entities import OperatorConfig

# Operators are keyed by entity type, not by individual finding. Using the
# length of whichever span happens to come last would leave a longer entity of
# the same type only partially masked, so record the longest span per type.
# NOTE(review): this relies on the mask operator capping `chars_to_mask` at the
# length of the entity being masked -- confirm against the Presidio version in use.
max_span_by_type = {}
for result in results:
    span_length = result.end - result.start
    max_span_by_type[result.entity_type] = max(
        span_length, max_span_by_type.get(result.entity_type, 0)
    )

# One "mask" operator per entity type: overwrite characters with "*",
# starting from the beginning of the entity.
operators = {
    entity_type: OperatorConfig(
        "mask",
        {"chars_to_mask": longest, "masking_char": "*", "from_end": False},
    )
    for entity_type, longest in max_span_by_type.items()
}

anonymizer = AnonymizerEngine()
anonymized_results = anonymizer.anonymize(
    text=text, analyzer_results=results, operators=operators
)

print(anonymized_results.text)



Applicant's name is ******** and he lives in ********** and his phone number is ************.

