# NLP PII Detector

In [1]:
#import necessary python library
import re       
import spacy          
import pandas as pd

In [2]:
# Load small English NLP model (no deep learning fine-tuning)
nlp = spacy.load("en_core_web_sm")

# sample ServiceNow ticket dataset (randomized)
Tickets = [
    {"id": 101, "description": "Customer Aravind Gunasekaran reported an issue. Contact: aravind123@example.com or +1-202-555-0147"},
    {"id": 102, "description": "Policy 9988776655 for user Ashok G is pending verification."},
    {"id": 103, "description": "Call received from 9820345678. Customer mentioned policy# POL-1234-A."},
    {"id": 104, "description": "No sensitive info, just system reboot and memory alert."},
    {"id": 105, "description": "Ticket created by agent: rajesh.sharma@tcs.com regarding claim number CLM908765."},
]

# regex patterns for PII
Patterns = {
    "Email": r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+",
    "Phone": r"(\+?\d{1,3}[-.\s]?)?\d{10}",
    "Policy_Number": r"\b[A-Z]{2,4}[-]?\d{4,}\b",
    "Account_Number": r"\b\d{8,12}\b"
}



In [3]:
def detect_pii(text):
    findings = []

    # Regex-based detection
    for label, pattern in Patterns.items():
        matches = re.findall(pattern, text)
        if matches:
            findings.extend([(label, m) for m in matches])

    # NLP-based Named Entity Recognition
    doc = nlp(text)
    for ent in doc.ents:
        if ent.label_ in ["PERSON", "ORG", "GPE"]:
            findings.append((ent.label_, ent.text))
    
    # Remove duplicates
    findings = list(set(findings))
    return findings



In [4]:
# Analyze all tickets
results = []
for t in Tickets:
    pii_data = detect_pii(t["description"])
    risk_level = "High" if len(pii_data) > 2 else ("Medium" if pii_data else "Low")
    results.append({
        "Ticket_ID": t["id"],
        "Description": t["description"],
        "Detected_PII": pii_data,
        "Risk_Level": risk_level
    })


In [6]:
pd.set_option('display.max_colwidth', None)
df = pd.DataFrame(results)
df

Unnamed: 0,Ticket_ID,Description,Detected_PII,Risk_Level
0,101,Customer Aravind Gunasekaran reported an issue. Contact: aravind123@example.com or +1-202-555-0147,"[(Email, aravind123@example.com), (PERSON, Aravind Gunasekaran)]",Medium
1,102,Policy 9988776655 for user Ashok G is pending verification.,"[(Account_Number, 9988776655), (PERSON, Ashok G), (Phone, )]",High
2,103,Call received from 9820345678. Customer mentioned policy# POL-1234-A.,"[(PERSON, POL-1234-A.), (ORG, Customer), (Phone, ), (Account_Number, 9820345678), (Policy_Number, POL-1234)]",High
3,104,"No sensitive info, just system reboot and memory alert.",[],Low
4,105,Ticket created by agent: rajesh.sharma@tcs.com regarding claim number CLM908765.,"[(Email, rajesh.sharma@tcs.com), (Policy_Number, CLM908765), (ORG, CLM908765)]",High
