# NLP PII Detector

In [1]:
# Import the required Python libraries
import re       
import spacy          
import pandas as pd

In [2]:
# Load the spaCy "en_core_web_sm" pipeline once; detect_pii() runs it over each
# ticket description and keeps the PERSON and ORG entities it reports.
# NOTE(review): this presumably requires the model package to be installed in
# the kernel's environment beforehand — confirm the setup instructions.
nlp = spacy.load("en_core_web_sm")

# Sample tickets shaped like ServiceNow records (the first is the Aravind example).
# Each entry holds a numeric "id" and a free-text "description" to be scanned.
tickets = [
    dict(id=101, description="Customer Aravind Gunasekaran reported an issue. Contact: aravind123@example.com or +91-202-555-0147. SSN: 123-45-6789"),
    dict(id=102, description="Policy 9988776655 for user Asha K is pending verification."),
    dict(id=103, description="Call received from 9820345678. Customer mentioned policy# POL-1234-A."),
    dict(id=104, description="No sensitive info, just system reboot and memory alert."),
    dict(id=105, description="Ticket created by agent: rajesh.sharma@tcs.com regarding claim number CLM908765."),
]

# Regex patterns keyed by PII label. Every group is non-capturing, so a match's
# group(0) is always the complete hit.
patterns = {
    "Email": r"\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}\b",
    # Accept formats like +1-202-555-0147, (202) 555-0147, 9820345678, 202.555.0147.
    # The (?<!\d) / (?!\d) guards stop the pattern from carving a "phone number"
    # out of a longer digit run (e.g. the first 13 digits of a 16-digit card number).
    "Phone": r"(?<!\d)(?:\+?\d{1,3}[-.\s]?)?(?:\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4})(?!\d)",
    "SSN": r"\b\d{3}-\d{2}-\d{4}\b",
    "Policy_Number": r"\b[A-Z]{2,4}-?\d{4,}\b",
    # NOTE: a bare run of 10-12 digits satisfies both this pattern and "Phone",
    # so such a number is reported under both labels.
    "Account_Number": r"\b\d{8,12}\b"
}



In [3]:
def normalize_phone(raw):
    """Reduce a matched phone string to its digits, keeping a leading "+".

    Args:
        raw: The text matched by the phone pattern.

    Returns:
        The digits of ``raw`` as one string, prefixed with "+" when ``raw``
        (ignoring leading whitespace) starts with "+". Returns ``None`` when
        the digit count is outside the 10-15 range.
    """
    digit_string = "".join(re.findall(r"\d", raw))
    # Fewer than 10 or more than 15 digits is not treated as a phone number.
    if not 10 <= len(digit_string) <= 15:
        return None
    # A "+" on the original text marks an explicit country code; preserve it.
    prefix = "+" if raw.lstrip().startswith("+") else ""
    return prefix + digit_string



In [4]:
def clean_and_validate(match_text, label):
    """Tidy a raw regex hit and decide whether it is worth keeping.

    Args:
        match_text: The text matched by one of the PII patterns.
        label: The pattern's label; "Phone" hits go through normalize_phone().

    Returns:
        The cleaned value, or ``None`` when the hit should be discarded.
    """
    candidate = match_text.strip()
    if label != "Phone":
        # Anything shorter than three characters is treated as noise.
        return candidate if len(candidate) >= 3 else None
    return normalize_phone(candidate)



In [5]:
def detect_pii(text):
    """Collect PII candidates from ``text`` using the regex table and spaCy NER.

    Args:
        text: The free-text string to scan.

    Returns:
        A sorted list of unique ``(label, value)`` tuples. Regex hits use the
        keys of ``patterns`` as labels; spaCy hits use "PERSON" or "ORG".
    """
    # Regex pass: take each full match, clean it, and keep only truthy results.
    regex_hits = set()
    for label, pattern in patterns.items():
        cleaned_values = (
            clean_and_validate(hit.group(0), label)
            for hit in re.finditer(pattern, text)
        )
        regex_hits.update((label, value) for value in cleaned_values if value)

    # NER pass: only person and organisation entities are kept.
    entity_hits = {
        (ent.label_, ent.text.strip())
        for ent in nlp(text).ents
        if ent.label_ in ("PERSON", "ORG")
    }

    # Sorting makes the result order deterministic.
    return sorted(regex_hits | entity_hits)



In [6]:
def classify_risk(pii_items):
    """Map the number of detected PII items to a risk label.

    Args:
        pii_items: A sized collection of detected PII entries.

    Returns:
        "Low" for no items, "Medium" for one or two, "High" for three or more.
    """
    item_count = len(pii_items)
    if item_count == 0:
        return "Low"
    return "High" if item_count >= 3 else "Medium"

# Scan every ticket and collect one result record per ticket.
output = []
for t in tickets:
    description = t["description"]
    pii = detect_pii(description)
    risk = classify_risk(pii)
    # Only the detected values are kept in the record; the labels are dropped.
    row = dict(
        Ticket_ID=t["id"],
        Description=description,
        Detected_PII=[value for _label, value in pii],
        Risk_Level=risk,
    )
    output.append(row)


In [7]:
# Lift the column-width limit so long ticket descriptions are shown in full
# instead of being truncated when the frame is displayed.
pd.set_option('display.max_colwidth', None)

In [8]:
# Turn the per-ticket result records into a table (one row per ticket) and
# show it as the cell's output.
df = pd.DataFrame(output)
df

Unnamed: 0,Ticket_ID,Description,Detected_PII,Risk_Level
0,101,Customer Aravind Gunasekaran reported an issue. Contact: aravind123@example.com or +91-202-555-0147. SSN: 123-45-6789,"[aravind123@example.com, SSN, Aravind Gunasekaran, +912025550147, 123-45-6789]",High
1,102,Policy 9988776655 for user Asha K is pending verification.,"[9988776655, Asha K, 9988776655]",High
2,103,Call received from 9820345678. Customer mentioned policy# POL-1234-A.,"[9820345678, Customer, POL-1234-A., 9820345678, POL-1234]",High
3,104,"No sensitive info, just system reboot and memory alert.",[],Low
4,105,Ticket created by agent: rajesh.sharma@tcs.com regarding claim number CLM908765.,"[rajesh.sharma@tcs.com, CLM908765, CLM908765]",High


In [12]:
# Persist the results. index=False keeps the default 0..n-1 row index out of the
# file; otherwise it is written as an unnamed leading column that shows up as
# "Unnamed: 0" when the CSV is read back.
df.to_csv("output.csv", index=False)