# Advanced PII Detection Agent Demo (Standalone)
## Comprehensive PII Detection using NER, Proximity Analysis, and Graph Theory

This notebook demonstrates the advanced PII detection agent that combines:
- **Named Entity Recognition (NER)** using spaCy
- **Proximity Analysis** for contextual risk assessment
- **Graph Theory** for relationship mapping
- **Risk Assessment** with multi-tier classification

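All detection code is defined directly in this notebook, so no project-local modules need to be imported. The third-party packages it depends on (spaCy, pandas, NetworkX, and others) are installed in the first code cell.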

---

## 1. Setup and Core Dependencies

First, let's install and import all necessary libraries.

In [None]:
# Install required packages (run once)
!pip install spacy pandas numpy networkx matplotlib plotly python-dotenv tqdm -q
!python -m spacy download en_core_web_sm -q

print("✅ Dependencies installed")

In [None]:
# Import libraries
import json
import logging
import re
import warnings
from collections import defaultdict
from pathlib import Path
from typing import Dict, List, Tuple, Optional, Set, Any, Union
from dataclasses import dataclass, asdict
from enum import Enum
import hashlib
import time
from functools import lru_cache

import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

# Suppress warnings for cleaner output
# NOTE: with no category argument this silences every warning raised after
# this point, including deprecation notices from the libraries imported above.
warnings.filterwarnings('ignore')

# Configure logging
# INFO level with a timestamp / logger name / level prefix on each record.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
# Module-level logger shared by the classes defined in later cells.
logger = logging.getLogger(__name__)

print("✅ All libraries imported successfully")

## 2. Core Data Structures and Enums

Define the core data structures for PII detection.

In [None]:
class PIIType(Enum):
    """Categories used to label detected PII entities.

    Each member's value is the snake_case string emitted when an entity is
    serialized (``PIIEntity.to_dict``) and, upper-cased, inside redaction
    tokens. The comment headings below are a thematic grouping only: the
    enum itself carries no severity information. Not every member has a
    detector in the code visible in this notebook.
    """
    # Critical PII - high re-identification risk
    SSN = "social_security_number"
    CREDIT_CARD = "credit_card"
    BANK_ACCOUNT = "bank_account"
    DRIVERS_LICENSE = "drivers_license"
    PASSPORT = "passport"
    
    # Personal Identifiers
    PERSON = "person_name"
    EMAIL = "email_address"
    PHONE = "phone_number"
    DATE_OF_BIRTH = "date_of_birth"
    
    # Location Data
    ADDRESS = "physical_address"
    ZIP_CODE = "zip_code"
    LOCATION = "location"
    GPS_COORD = "gps_coordinates"
    
    # Organizational
    ORGANIZATION = "organization"
    WEBSITE = "website_url"
    
    # Technical
    IP_ADDRESS = "ip_address"
    MAC_ADDRESS = "mac_address"
    API_KEY = "api_key"
    PASSWORD = "password"
    
    # Financial
    IBAN = "iban"
    BITCOIN_ADDRESS = "bitcoin_address"
    
    # Medical
    MEDICAL_ID = "medical_identifier"
    
    # Other
    USERNAME = "username"
    OTHER = "other_pii"

class RiskLevel(Enum):
    """Four-tier risk classification attached to each detected entity.

    Members are listed from least to most severe; the values are the
    lowercase strings used in serialized output.
    """
    LOW = "low"
    MEDIUM = "medium"
    HIGH = "high"
    CRITICAL = "critical"

@dataclass
class PIIEntity:
    """One detected PII span together with where and how it was found."""
    # Matched text exactly as it appears in the input.
    text: str
    pii_type: PIIType
    # Character offsets of the match within the text it was detected in.
    start_pos: int
    end_pos: int
    # Detector-assigned confidence score.
    confidence: float
    # Snippet of surrounding text captured at detection time.
    context: str
    # Table coordinates when the text came from a DataFrame cell.
    row_index: Optional[int] = None
    column_name: Optional[str] = None
    risk_level: RiskLevel = RiskLevel.LOW
    detection_method: str = "regex"  # "regex", "ner", "proximity", "graph"
    # None is only a placeholder default; __post_init__ swaps it for a fresh
    # list so that instances never share one mutable default.
    related_entities: Optional[List[str]] = None
    
    def __post_init__(self) -> None:
        """Replace the ``None`` placeholder with a per-instance empty list."""
        if self.related_entities is None:
            self.related_entities = []
    
    def to_dict(self) -> Dict[str, Any]:
        """Convert to a dictionary whose enum fields are plain strings.

        ``asdict`` leaves the enum members in place, so both enum fields are
        overwritten with their ``.value`` for JSON serialization.
        """
        result = asdict(self)
        result['pii_type'] = self.pii_type.value
        result['risk_level'] = self.risk_level.value
        return result

print("✅ Data structures defined")

## 3. PII Detection Patterns

Define regex patterns for various PII types.

In [None]:
class PIIPatterns:
    """Compiled regex patterns for structured PII.

    All patterns are deliberately permissive: for a masking tool a false
    positive (over-redaction) is cheaper than a missed identifier. None of
    them validates checksums or value ranges.
    """

    # local-part @ domain . TLD; the TLD is letters only (two or more).
    EMAIL = re.compile(
        r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b',
        re.IGNORECASE
    )

    # North-American numbers: optional +1/1 prefix, optional parentheses
    # around the area code, and "-", "." or whitespace as separators.
    PHONE_US = re.compile(
        r'\b(?:\+?1[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}\b'
    )

    # Dashed 3-2-4 form, or any bare run of exactly nine digits.
    SSN = re.compile(
        r'\b\d{3}-\d{2}-\d{4}\b|\b\d{9}\b'
    )

    # Four groups of four digits, optionally separated by "-" or whitespace.
    CREDIT_CARD = re.compile(
        r'\b(?:\d{4}[-\s]?){3}\d{4}\b'
    )

    # Dotted quad; octets are not range-checked (999.999.999.999 matches).
    IP_ADDRESS = re.compile(
        r'\b(?:\d{1,3}\.){3}\d{1,3}\b'
    )

    # A key/token label followed by at least 20 word or "-" characters.
    API_KEY = re.compile(
        r'\b(?:api[_-]?key|apikey|access[_-]?token)[\s:=]+[\w-]{20,}\b',
        re.IGNORECASE
    )

    # A password label followed by the whole non-whitespace token. There is
    # intentionally no trailing \b: it would make the match stop before
    # trailing punctuation and leave part of the secret unmasked.
    PASSWORD = re.compile(
        r'\b(?:password|passwd|pwd)[\s:=]+\S+',
        re.IGNORECASE
    )

    # Five digits with an optional "-dddd" extension (ZIP+4).
    ZIP_CODE = re.compile(
        r'\b\d{5}(?:-\d{4})?\b'
    )

    # Any numeric date in D/M/Y-style or Y-M-D-style order using "/" or "-";
    # the pattern cannot tell a birth date from any other date.
    DATE_OF_BIRTH = re.compile(
        r'\b(?:\d{1,2}[/-]\d{1,2}[/-]\d{2,4}|\d{4}[/-]\d{1,2}[/-]\d{1,2})\b'
    )

print("✅ PII patterns compiled")

## 4. NER-based PII Detector

Implementation of Named Entity Recognition based PII detection.

In [None]:
class PIINERDetector:
    """
    Detect PII in free text by combining regex patterns with spaCy NER.

    Regex patterns (from ``PIIPatterns``) cover structured identifiers such
    as emails, phone numbers and SSNs; the spaCy pipeline contributes person,
    organization and location entities. Overlapping hits are resolved in
    favour of the higher-confidence one.
    """
    
    def __init__(self, model_name: str = "en_core_web_sm"):
        """
        Initialize the detector and eagerly load its spaCy model.
        
        Args:
            model_name: spaCy model to use for NER
            
        Raises:
            ImportError: spaCy is not installed.
            OSError: the requested model is not available.
        """
        self.model_name = model_name
        self.nlp = None
        self._load_model()
        self._setup_patterns()
        
    def _load_model(self):
        """Load the spaCy model, logging and re-raising on failure."""
        try:
            # Imported lazily so that merely defining the class does not
            # require spaCy to be installed.
            import spacy
            self.nlp = spacy.load(self.model_name)
            logger.info(f"Loaded spaCy model: {self.model_name}")
        except ImportError as exc:
            logger.error("spaCy not installed. Run: pip install spacy")
            # Chain the cause so the original import failure stays visible.
            raise ImportError("spaCy is required for NER functionality") from exc
        except OSError as exc:
            logger.error(f"spaCy model '{self.model_name}' not found. Run: python -m spacy download {self.model_name}")
            raise OSError(f"spaCy model '{self.model_name}' not available") from exc
    
    def _setup_patterns(self):
        """Map each regex-detectable PII type to its compiled pattern."""
        self.patterns = {
            PIIType.EMAIL: PIIPatterns.EMAIL,
            PIIType.PHONE: PIIPatterns.PHONE_US,
            PIIType.SSN: PIIPatterns.SSN,
            PIIType.CREDIT_CARD: PIIPatterns.CREDIT_CARD,
            PIIType.ZIP_CODE: PIIPatterns.ZIP_CODE,
            PIIType.IP_ADDRESS: PIIPatterns.IP_ADDRESS,
            PIIType.DATE_OF_BIRTH: PIIPatterns.DATE_OF_BIRTH,
            PIIType.API_KEY: PIIPatterns.API_KEY,
            PIIType.PASSWORD: PIIPatterns.PASSWORD,
        }
    
    def detect_pii_in_text(self, text: str, context: Optional[Dict] = None) -> List[PIIEntity]:
        """
        Detect PII entities in text using both NER and regex patterns
        
        Args:
            text: Input text to analyze
            context: Optional mapping; its 'row_index' and 'column_name'
                entries, when present, are copied onto every entity
            
        Returns:
            Non-overlapping entities ordered by start position. Empty when
            ``text`` is empty or not a string.
        """
        if not text or not isinstance(text, str):
            return []
        
        context = context or {}
        
        # Regex hits first, then NER hits (only when a model is loaded)
        entities = list(self._detect_regex_pii(text, context))
        if self.nlp:
            entities.extend(self._detect_ner_pii(text, context))
        
        # Remove duplicates and overlaps
        return self._deduplicate_entities(entities)
    
    def _detect_regex_pii(self, text: str, context: Dict) -> List[PIIEntity]:
        """Return one entity per match of every configured regex pattern."""
        row_index = context.get('row_index')
        column_name = context.get('column_name')
        
        return [
            PIIEntity(
                text=match.group(),
                pii_type=pii_type,
                start_pos=match.start(),
                end_pos=match.end(),
                confidence=0.9,  # High confidence for regex matches
                context=self._extract_context(text, match.start(), match.end()),
                row_index=row_index,
                column_name=column_name,
                detection_method="regex"
            )
            for pii_type, pattern in self.patterns.items()
            for match in pattern.finditer(text)
        ]
    
    def _detect_ner_pii(self, text: str, context: Dict) -> List[PIIEntity]:
        """Return entities for spaCy NER hits whose label maps to a PII type.

        NER is treated as best-effort: any failure inside the pipeline is
        logged as a warning and the entities collected so far are returned.
        """
        entities = []
        
        try:
            doc = self.nlp(text)
            
            for ent in doc.ents:
                pii_type = self._map_ner_label_to_pii_type(ent.label_)
                if not pii_type:
                    continue  # label is not considered PII
                entities.append(PIIEntity(
                    text=ent.text,
                    pii_type=pii_type,
                    start_pos=ent.start_char,
                    end_pos=ent.end_char,
                    confidence=0.8,  # NER confidence can vary
                    context=self._extract_context(text, ent.start_char, ent.end_char),
                    row_index=context.get('row_index'),
                    column_name=context.get('column_name'),
                    detection_method="ner"
                ))
        
        except Exception as e:
            logger.warning(f"NER processing error: {e}")
        
        return entities
    
    def _map_ner_label_to_pii_type(self, label: str) -> Optional[PIIType]:
        """Map a spaCy NER label to a PII type, or None when it is not PII.

        Labels absent from the table are also treated as not PII.
        """
        label_mapping = {
            "PERSON": PIIType.PERSON,
            "ORG": PIIType.ORGANIZATION,
            "GPE": PIIType.LOCATION,  # Geopolitical entity
            "LOC": PIIType.LOCATION,
            "FAC": PIIType.LOCATION,  # Facility
            "DATE": None,  # Generally not PII unless specific format
            "TIME": None,
            "MONEY": None,
            "PERCENT": None,
            "ORDINAL": None,
            "CARDINAL": None,
        }
        return label_mapping.get(label)
    
    def _extract_context(self, text: str, start: int, end: int, window: int = 50) -> str:
        """Return ``text[start:end]`` padded by up to ``window`` characters
        on each side, clamped to the bounds of ``text``."""
        context_start = max(0, start - window)
        context_end = min(len(text), end + window)
        return text[context_start:context_end]
    
    def _deduplicate_entities(self, entities: List[PIIEntity]) -> List[PIIEntity]:
        """Drop overlapping entities, keeping the higher-confidence one.

        Entities are visited in start-position order. When a candidate
        overlaps an already-kept entity it replaces that entity only if its
        confidence is strictly higher; on a tie the earlier one wins. The
        input list is not modified.

        Returns:
            The surviving entities sorted by start position (the input
            itself when it is empty).
        """
        if not entities:
            return entities
        
        # sorted() rather than list.sort() so the caller's list is untouched
        ordered = sorted(entities, key=lambda x: x.start_pos)
        
        kept = []
        for entity in ordered:
            # Index of the first kept entity whose span intersects this one
            clash_idx = next(
                (
                    idx for idx, existing in enumerate(kept)
                    if entity.start_pos < existing.end_pos
                    and entity.end_pos > existing.start_pos
                ),
                None,
            )
            
            if clash_idx is None:
                kept.append(entity)
            elif entity.confidence > kept[clash_idx].confidence:
                del kept[clash_idx]
                kept.append(entity)
        
        return sorted(kept, key=lambda x: x.start_pos)

print("✅ NER Detector class defined")

## 5. Proximity Analyzer

Analyze proximity relationships between PII entities to assess risk.

In [None]:
class ProximityAnalyzer:
    """
    Raise the risk level of PII entities that appear close to one another.

    Entities whose start offsets fall within ``window_size`` characters of a
    seed entity form a group; each grouped entity's risk is its type's base
    score plus half of the strongest pairwise score from the risk matrix.

    NOTE(review): offsets are compared as raw integers, so the result is
    only meaningful when every entity was detected in the same text.
    """
    
    def __init__(self, window_size: int = 100, risk_threshold: float = 0.7):
        """
        Initialize proximity analyzer
        
        Args:
            window_size: Character window size for proximity analysis
            risk_threshold: Threshold for high-risk proximity relationships.
                Stored on the instance but not read by any method of this
                class.
        """
        self.window_size = window_size
        self.risk_threshold = risk_threshold
        
        # Define risk relationships between PII types
        self.risk_matrix = self._build_risk_matrix()
    
    def _build_risk_matrix(self) -> Dict[Tuple[PIIType, PIIType], float]:
        """Build a symmetric table of risk scores for pairs of PII types.

        High-risk pairs score 0.9 and medium-risk pairs 0.6; pairs missing
        from the table are treated as 0.3 by the callers.
        """
        high_risk_pairs = [
            (PIIType.PERSON, PIIType.SSN),
            (PIIType.PERSON, PIIType.DATE_OF_BIRTH),
            (PIIType.PERSON, PIIType.ADDRESS),
            (PIIType.EMAIL, PIIType.PHONE),
            (PIIType.PERSON, PIIType.PHONE),
            (PIIType.ORGANIZATION, PIIType.EMAIL),
            (PIIType.ZIP_CODE, PIIType.ADDRESS),
            (PIIType.PERSON, PIIType.CREDIT_CARD),
        ]
        
        medium_risk_pairs = [
            (PIIType.PERSON, PIIType.ORGANIZATION),
            (PIIType.EMAIL, PIIType.ORGANIZATION),
            (PIIType.PHONE, PIIType.ADDRESS),
            (PIIType.LOCATION, PIIType.ZIP_CODE),
            (PIIType.PERSON, PIIType.EMAIL),
            (PIIType.PERSON, PIIType.LOCATION),
        ]
        
        risk_matrix = {}
        
        # NOTE(review): ORGANIZATION/EMAIL appears in both lists. Medium is
        # written last, so that pair effectively scores 0.6 - confirm which
        # tier was intended.
        for pairs, score in ((high_risk_pairs, 0.9), (medium_risk_pairs, 0.6)):
            for first, second in pairs:
                risk_matrix[(first, second)] = score
                risk_matrix[(second, first)] = score  # Symmetric
        
        return risk_matrix
    
    def analyze_proximity(self, entities: List[PIIEntity], text: str) -> List[PIIEntity]:
        """
        Analyze proximity relationships and update entity risk levels
        
        Entities are updated in place: ``risk_level`` always, and
        ``related_entities`` / ``detection_method`` for grouped entities.
        
        Args:
            entities: List of detected PII entities
            text: Original text for context analysis (currently only passed
                through to the group analysis, which does not read it)
            
        Returns:
            The entities with proximity-based risk assessments
        """
        if len(entities) < 2:
            # No pair can exist, but a lone entity must still receive the
            # base risk of its type instead of keeping the dataclass default.
            for entity in entities:
                entity.risk_level = self._get_base_risk_level(entity.pii_type)
            return entities
        
        proximity_groups = self._create_proximity_groups(entities)
        
        for group in proximity_groups:
            self._analyze_group_risk(group, text)
        
        return [
            self._update_entity_risk(entity, proximity_groups)
            for entity in entities
        ]
    
    def _create_proximity_groups(self, entities: List[PIIEntity]) -> List[List[PIIEntity]]:
        """Group entities that lie within the proximity window of a seed.

        Each not-yet-grouped entity becomes a seed and collects every later
        ungrouped entity that is proximate to the seed itself (membership is
        not transitive). An entity joins at most one group, and groups of
        one are discarded.
        """
        groups = []
        claimed = set()
        
        for seed_idx, seed in enumerate(entities):
            if seed_idx in claimed:
                continue
            claimed.add(seed_idx)
            
            group = [seed]
            for other_idx in range(seed_idx + 1, len(entities)):
                if other_idx in claimed:
                    continue
                candidate = entities[other_idx]
                if self._are_proximate(seed, candidate):
                    group.append(candidate)
                    claimed.add(other_idx)
            
            if len(group) > 1:  # Only include groups with multiple entities
                groups.append(group)
        
        return groups
    
    def _are_proximate(self, entity1: PIIEntity, entity2: PIIEntity) -> bool:
        """Check whether two entities start within ``window_size`` characters
        of each other (inclusive)."""
        return abs(entity1.start_pos - entity2.start_pos) <= self.window_size
    
    def _analyze_group_risk(self, group: List[PIIEntity], text: str):
        """Record on every member of a group which other members it is near.

        Each member's ``related_entities`` becomes a list of
        ``"<type value>:<first 20 chars of text>..."`` labels, one per other
        member. ``text`` is accepted for interface compatibility and unused.
        """
        for entity in group:
            entity.related_entities = [
                f"{other.pii_type.value}:{other.text[:20]}..."
                # Identity, not dataclass equality: a distinct entity with
                # identical field values is still a related entity.
                for other in group if other is not entity
            ]
    
    def _update_entity_risk(self, entity: PIIEntity, proximity_groups: List[List[PIIEntity]]) -> PIIEntity:
        """Set an entity's risk level from its type and its proximity group.

        Ungrouped entities get the base level of their type. Grouped
        entities get ``base score + 0.5 * strongest pairwise score`` (capped
        at 1.0) converted to a level, and are re-labelled as detected by
        "proximity".
        """
        entity_group = next(
            (
                group for group in proximity_groups
                if any(member is entity for member in group)
            ),
            None,
        )
        
        if not entity_group or len(entity_group) == 1:
            # No proximity relationships
            entity.risk_level = self._get_base_risk_level(entity.pii_type)
            return entity
        
        # Strongest relationship between this entity and any group member
        max_proximity_risk = max(
            (
                self.risk_matrix.get((entity.pii_type, other.pii_type), 0.3)
                for other in entity_group if other is not entity
            ),
            default=0.0,
        )
        
        base_risk = self._get_base_risk_score(entity.pii_type)
        # The tier thresholds sit exactly on base + 0.15. Binary floating
        # point makes 0.3 + 0.15 come out as 0.44999999999999996, so round
        # before comparing or low-base entities alone would miss their tier.
        combined_risk = round(min(1.0, base_risk + max_proximity_risk * 0.5), 6)
        
        entity.risk_level = self._score_to_risk_level(combined_risk)
        entity.detection_method = "proximity"
        
        return entity
    
    def _get_base_risk_level(self, pii_type: PIIType) -> RiskLevel:
        """Get the standalone risk level of a PII type (LOW when unlisted)."""
        critical_types = {PIIType.SSN, PIIType.CREDIT_CARD, PIIType.PASSPORT, 
                         PIIType.DRIVERS_LICENSE, PIIType.BANK_ACCOUNT}
        high_types = {PIIType.DATE_OF_BIRTH, PIIType.ADDRESS, PIIType.MEDICAL_ID}
        medium_types = {PIIType.PERSON, PIIType.EMAIL, PIIType.PHONE}
        
        if pii_type in critical_types:
            return RiskLevel.CRITICAL
        if pii_type in high_types:
            return RiskLevel.HIGH
        if pii_type in medium_types:
            return RiskLevel.MEDIUM
        return RiskLevel.LOW
    
    def _get_base_risk_score(self, pii_type: PIIType) -> float:
        """Get the numeric score matching a PII type's base risk level."""
        score_mapping = {
            RiskLevel.CRITICAL: 0.9,
            RiskLevel.HIGH: 0.7,
            RiskLevel.MEDIUM: 0.5,
            RiskLevel.LOW: 0.3
        }
        return score_mapping[self._get_base_risk_level(pii_type)]
    
    def _score_to_risk_level(self, score: float) -> RiskLevel:
        """Convert a risk score to a level using inclusive lower bounds
        (0.85 critical, 0.65 high, 0.45 medium, otherwise low)."""
        if score >= 0.85:
            return RiskLevel.CRITICAL
        if score >= 0.65:
            return RiskLevel.HIGH
        if score >= 0.45:
            return RiskLevel.MEDIUM
        return RiskLevel.LOW

print("✅ Proximity Analyzer class defined")

## 6. Graph Builder for Entity Relationships

Build and analyze entity relationship graphs using NetworkX.

In [None]:
class PIIGraphBuilder:
    """
    Builds and analyzes entity graphs using networkx to identify PII clusters
    and relationships that may increase re-identification risks

    Nodes are entities; edges come from three sources applied in order:
    positional proximity, a fixed table of semantically related PII types,
    and co-occurrence in the same row or column.
    """
    
    def __init__(self, min_edge_weight: float = 0.1, proximity_window: int = 100):
        """
        Initialize graph builder
        
        Args:
            min_edge_weight: Minimum weight for edges to be included in graph
            proximity_window: Maximum distance, in characters, between the
                start offsets of two entities for a proximity edge
                
        Raises:
            ValueError: ``proximity_window`` is not positive.
        """
        if proximity_window <= 0:
            raise ValueError("proximity_window must be positive")
        self.min_edge_weight = min_edge_weight
        self.proximity_window = proximity_window
        self.graph = nx.Graph()
        self.entity_metadata = {}
    
    @staticmethod
    def _node_id(entity: PIIEntity, index: int) -> str:
        """Return the graph node id of the entity at ``index`` in the list."""
        return f"{entity.pii_type.value}_{index}"
    
    def build_graph(self, entities: List[PIIEntity], text: Optional[str] = None) -> nx.Graph:
        """
        Build entity relationship graph
        
        Any previously built graph is discarded first.
        
        Args:
            entities: List of PII entities
            text: Original text for context (optional, currently unused)
            
        Returns:
            NetworkX graph with entities as nodes and relationships as edges
        """
        self.graph.clear()
        self.entity_metadata.clear()
        
        if not entities:
            return self.graph
        
        # Add nodes
        for i, entity in enumerate(entities):
            node_id = self._node_id(entity, i)
            
            self.graph.add_node(
                node_id,
                pii_type=entity.pii_type.value,
                text=entity.text[:50],  # Truncate for privacy
                confidence=entity.confidence,
                risk_level=entity.risk_level.value,
                row_index=entity.row_index,
                column_name=entity.column_name,
                detection_method=entity.detection_method
            )
            
            self.entity_metadata[node_id] = entity
        
        # Add edges based on relationships
        self._add_proximity_edges(entities)
        self._add_semantic_edges(entities)
        self._add_co_occurrence_edges(entities)
        
        return self.graph
    
    def _add_proximity_edges(self, entities: List[PIIEntity]):
        """Add edges between entities whose start offsets are close.

        The weight falls linearly from 1.0 at distance 0 to a floor of 0.1
        at the edge of the window.
        """
        window = self.proximity_window
        
        for i, entity1 in enumerate(entities):
            node1_id = self._node_id(entity1, i)
            
            for j in range(i + 1, len(entities)):
                entity2 = entities[j]
                
                distance = abs(entity1.start_pos - entity2.start_pos)
                if distance > window:
                    continue
                
                # Inverse distance weighting
                weight = max(0.1, 1.0 - (distance / window))
                if weight >= self.min_edge_weight:
                    self.graph.add_edge(
                        node1_id, self._node_id(entity2, j),
                        weight=weight,
                        edge_type="proximity",
                        distance=distance
                    )
    
    def _add_semantic_edges(self, entities: List[PIIEntity]):
        """Add edges between entities whose PII types are semantically related.

        NOTE(review): adding an edge that already exists updates its
        attributes, so a semantic edge replaces the weight and type of a
        proximity edge between the same two nodes - confirm this is intended.
        """
        semantic_relationships = {
            (PIIType.PERSON, PIIType.EMAIL): 0.8,
            (PIIType.PERSON, PIIType.PHONE): 0.8,
            (PIIType.PERSON, PIIType.ADDRESS): 0.9,
            (PIIType.ORGANIZATION, PIIType.EMAIL): 0.7,
            (PIIType.ORGANIZATION, PIIType.ADDRESS): 0.8,
            (PIIType.ADDRESS, PIIType.ZIP_CODE): 0.9,
            (PIIType.EMAIL, PIIType.USERNAME): 0.6,
        }
        
        for i, entity1 in enumerate(entities):
            node1_id = self._node_id(entity1, i)
            
            for j in range(i + 1, len(entities)):
                entity2 = entities[j]
                
                # The table lists each pair once; try both orientations
                weight = (
                    semantic_relationships.get((entity1.pii_type, entity2.pii_type))
                    or semantic_relationships.get((entity2.pii_type, entity1.pii_type))
                )
                
                if weight and weight >= self.min_edge_weight:
                    self.graph.add_edge(
                        node1_id, self._node_id(entity2, j),
                        weight=weight,
                        edge_type="semantic"
                    )
    
    def _add_co_occurrence_edges(self, entities: List[PIIEntity]):
        """Add edges between entities that co-occur in the same row/column.

        Existing edges are left untouched; row edges are added before
        column edges.
        """
        # Group node ids by row and by column
        row_groups = defaultdict(list)
        col_groups = defaultdict(list)
        
        for i, entity in enumerate(entities):
            node_id = self._node_id(entity, i)
            
            if entity.row_index is not None:
                row_groups[entity.row_index].append(node_id)
            
            if entity.column_name is not None:
                col_groups[entity.column_name].append(node_id)
        
        # Column groups get the lighter weight
        for groups, weight, edge_type in (
            (row_groups, 0.3, "row_co_occurrence"),
            (col_groups, 0.2, "column_co_occurrence"),
        ):
            for members in groups.values():
                for i, node1 in enumerate(members):
                    for node2 in members[i+1:]:
                        if not self.graph.has_edge(node1, node2):
                            self.graph.add_edge(
                                node1, node2,
                                weight=weight,
                                edge_type=edge_type
                            )
    
    def analyze_graph(self) -> Dict[str, Any]:
        """
        Perform comprehensive graph analysis
        
        Returns:
            Dictionary with graph analysis results, or a dictionary holding
            only an "error" message when the graph has no nodes. Centrality
            measures are included only when there is more than one node.
        """
        if not self.graph.nodes():
            return {"error": "Empty graph - no entities to analyze"}
        
        analysis = {}
        
        # Basic graph metrics
        analysis["basic_metrics"] = {
            "num_nodes": self.graph.number_of_nodes(),
            "num_edges": self.graph.number_of_edges(),
            "density": nx.density(self.graph),
            "is_connected": nx.is_connected(self.graph)
        }
        
        # Connected components analysis
        components = list(nx.connected_components(self.graph))
        sizes = [len(comp) for comp in components]
        analysis["connected_components"] = {
            "count": len(components),
            "sizes": sizes,
            "largest_component_size": max(sizes, default=0)
        }
        
        # Centrality measures
        if self.graph.number_of_nodes() > 1:
            analysis["centrality"] = {
                "degree_centrality": dict(nx.degree_centrality(self.graph)),
                "betweenness_centrality": dict(nx.betweenness_centrality(self.graph)),
                "closeness_centrality": dict(nx.closeness_centrality(self.graph))
            }
        
        # Risk cluster identification
        analysis["risk_clusters"] = self._identify_risk_clusters(components)
        
        # Edge analysis
        analysis["edge_analysis"] = self._analyze_edges()
        
        return analysis
    
    def _identify_risk_clusters(self, components: List[Set]) -> List[Dict]:
        """Describe every connected component of two or more entities.

        Returns:
            One dictionary per cluster, sorted by descending overall risk.
            ``cluster_id`` is the component's index in ``components``.
        """
        risk_clusters = []
        
        for i, component in enumerate(components):
            if len(component) < 2:
                continue
            
            cluster_entities = [self.entity_metadata[node_id] for node_id in component]
            risk_scores = [self._get_risk_score(entity.risk_level) for entity in cluster_entities]
            avg_risk = np.mean(risk_scores)
            max_risk = max(risk_scores)
            
            # Share of distinct PII types among the cluster's entities
            pii_types = {entity.pii_type for entity in cluster_entities}
            type_diversity = len(pii_types) / len(cluster_entities)
            
            risk_clusters.append({
                "cluster_id": i,
                "size": len(component),
                "entities": [
                    {
                        "type": entity.pii_type.value,
                        "text_preview": entity.text[:20] + "..." if len(entity.text) > 20 else entity.text,
                        "risk_level": entity.risk_level.value,
                        "confidence": entity.confidence
                    }
                    for entity in cluster_entities
                ],
                "average_risk_score": avg_risk,
                "max_risk_score": max_risk,
                "type_diversity": type_diversity,
                "overall_risk": self._calculate_cluster_risk(avg_risk, max_risk, type_diversity, len(component))
            })
        
        # Sort by overall risk
        risk_clusters.sort(key=lambda x: x["overall_risk"], reverse=True)
        
        return risk_clusters
    
    def _get_risk_score(self, risk_level: RiskLevel) -> float:
        """Convert risk level to numeric score"""
        score_mapping = {
            RiskLevel.LOW: 0.25,
            RiskLevel.MEDIUM: 0.5,
            RiskLevel.HIGH: 0.75,
            RiskLevel.CRITICAL: 1.0
        }
        return score_mapping[risk_level]
    
    def _calculate_cluster_risk(self, avg_risk: float, max_risk: float, 
                               type_diversity: float, cluster_size: int) -> float:
        """Calculate overall cluster risk score, capped at 1.0.

        Weighted sum: 40% average risk, 30% maximum risk, 20% type
        diversity and 10% size, where size saturates at five entities.
        """
        size_factor = min(1.0, cluster_size / 5.0)  # Larger clusters are riskier
        
        overall_risk = (
            0.4 * avg_risk +
            0.3 * max_risk +
            0.2 * type_diversity +  # More diverse types increase risk
            0.1 * size_factor
        )
        
        return min(1.0, overall_risk)
    
    def _analyze_edges(self) -> Dict[str, Any]:
        """Summarize the graph's edges by type and by weight.

        Returns:
            ``{"total_edges": 0}`` for an edgeless graph; otherwise the edge
            count, a count per edge type and mean/std/min/max of the weights.
        """
        type_counts = {}
        weights = []
        
        # Single pass: tally edge types and collect weights together
        for _, _, data in self.graph.edges(data=True):
            edge_type = data.get("edge_type", "unknown")
            type_counts[edge_type] = type_counts.get(edge_type, 0) + 1
            weights.append(data.get("weight", 0))
        
        if not weights:
            return {"total_edges": 0}
        
        return {
            "total_edges": len(weights),
            "edge_type_distribution": type_counts,
            "weight_statistics": {
                "mean": np.mean(weights),
                "std": np.std(weights),
                "min": np.min(weights),
                "max": np.max(weights)
            }
        }

print("✅ Graph Builder class defined")

## 7. Create Sample Data

Let's create various sample datasets to demonstrate PII detection.

In [None]:
# Create sample customer data with various PII types
# Five synthetic rows; every column list must therefore hold five values.
customer_data = pd.DataFrame({
    'customer_id': ['CUST001', 'CUST002', 'CUST003', 'CUST004', 'CUST005'],
    'full_name': ['John Doe', 'Jane Smith', 'Robert Johnson', 'Maria Garcia', 'David Lee'],
    'email': ['john.doe@example.com', 'jane.smith@company.org', 'rjohnson@email.net', 
              'maria.g@domain.com', 'dlee@business.co'],
    # Phone numbers deliberately use a different separator style per row.
    'phone': ['555-123-4567', '(555) 987-6543', '+1 555-555-5555', 
              '555.444.3333', '1-555-222-1111'],
    'ssn': ['123-45-6789', '987-65-4321', '456-78-9123', '321-54-9876', '789-12-3456'],
    # NOTE: the third value has only 15 digits (4-4-4-3), so a pattern that
    # requires four groups of four digits will not match it.
    'credit_card': ['4111-1111-1111-1111', '5555-4444-3333-2222', '3782-8224-6310-005',
                   '6011-1111-1111-1117', '4532-1234-5678-9010'],
    'address': ['123 Main St, New York, NY 10001', '456 Oak Ave, Los Angeles, CA 90001',
               '789 Pine Rd, Chicago, IL 60601', '321 Elm St, Houston, TX 77001',
               '654 Maple Dr, Phoenix, AZ 85001'],
    # Free-text notes; only the last one contains a person's name.
    'notes': ['VIP customer since 2020', 'Prefers email communication', 
             'Contact after 5pm only', 'Spanish speaking preferred', 
             'Account manager: Sarah Wilson']
})

print("Sample customer data created:")
print(f"Shape: {customer_data.shape}")
print(f"Columns: {list(customer_data.columns)}")
# `display` is not imported in this notebook; presumably the IPython/Jupyter
# built-in - outside a notebook use print(customer_data.head()) instead.
display(customer_data.head())

## 8. Demonstrate PII Detection Pipeline

Now let's demonstrate the complete PII detection pipeline.

In [None]:
# Build the three pipeline stages; the analyzer uses a 50-character window here
ner_detector = PIINERDetector()
proximity_analyzer = ProximityAnalyzer(window_size=50)
graph_builder = PIIGraphBuilder()

# Separator lines reused by every section header below
section_rule = "=" * 60
sub_rule = "-" * 30

# Free-text sample mixing a name, email, phone number, SSN and employer
test_text = """John Doe (john.doe@example.com) called from 555-123-4567 regarding 
his account. His SSN is 123-45-6789 and he wants to update his credit card 
ending in 1111. He works at Acme Corporation in New York."""

print("Test Text:")
print(test_text)
print(f"\n{section_rule}\n")

# Step 1: run regex + NER detection and list every hit
print("Step 1: PII Detection")
print(sub_rule)
entities = ner_detector.detect_pii_in_text(test_text)
print(f"Found {len(entities)} PII entities:\n")
for entity in entities:
    span = f"[{entity.start_pos}:{entity.end_pos}]"
    print(f"  📍 {entity.pii_type.value:20} | '{entity.text}'")
    print(f"     Position: {span} | Confidence: {entity.confidence:.2f}")

# Step 2: re-score entities by what they sit next to; show the grouped ones
print(f"\n{section_rule}")
print("Step 2: Proximity Analysis")
print(sub_rule)
analyzed_entities = proximity_analyzer.analyze_proximity(entities, test_text)
print("Entities with elevated risk due to proximity:\n")
for entity in analyzed_entities:
    if not entity.related_entities:
        continue  # entity was not part of any proximity group
    print(f"  🔗 {entity.pii_type.value}: '{entity.text}'")
    print(f"     Risk Level: {entity.risk_level.value.upper()}")
    print(f"     Related to: {len(entity.related_entities)} other entities")

# Step 3: build the relationship graph and report its headline metrics
print(f"\n{section_rule}")
print("Step 3: Graph Analysis")
print(sub_rule)
graph = graph_builder.build_graph(analyzed_entities)
analysis = graph_builder.analyze_graph()

basic_metrics = analysis['basic_metrics']
print("Graph Metrics:")
print(f"  • Nodes: {basic_metrics['num_nodes']}")
print(f"  • Edges: {basic_metrics['num_edges']}")
print(f"  • Density: {basic_metrics['density']:.3f}")
print(f"  • Connected Components: {analysis['connected_components']['count']}")

# Only the two riskiest clusters are shown
if analysis.get('risk_clusters'):
    print("\nHigh-Risk Clusters:")
    for cluster in analysis['risk_clusters'][:2]:
        print(f"  • Cluster {cluster['cluster_id']}: {cluster['size']} entities")
        print(f"    Overall Risk: {cluster['overall_risk']:.2f}")

## 9. Process CSV Data

Process the sample customer DataFrame cell by cell and generate a masked copy.

In [None]:
def _mask_cell_text(text: str, entities: List[Any]) -> str:
    """Return ``text`` with every detected span replaced by a redaction token.

    Each entity must expose ``start_pos``, ``end_pos`` and ``pii_type.value``.
    Spans that overlap (for example the same substring reported twice, or one
    detection nested inside another) are merged into a single span first and
    masked once, labelled with the type of the widest entity in the group.
    Replacing overlapping spans one by one would slice into mask text that has
    already been inserted and corrupt the result. For non-overlapping spans
    the output is one ``[<TYPE>_REDACTED]`` token per entity.
    """
    # Left to right; for equal starts the widest span comes first.
    ordered = sorted(entities, key=lambda e: (e.start_pos, -e.end_pos))

    merged = []  # items are [start, end, entity_that_supplies_the_label]
    for entity in ordered:
        if merged and entity.start_pos < merged[-1][1]:
            group = merged[-1]
            group[1] = max(group[1], entity.end_pos)
            widest = group[2]
            if entity.end_pos - entity.start_pos > widest.end_pos - widest.start_pos:
                group[2] = entity
        else:
            merged.append([entity.start_pos, entity.end_pos, entity])

    pieces = []
    cursor = 0
    for start, end, label_entity in merged:
        pieces.append(text[cursor:start])
        pieces.append(f"[{label_entity.pii_type.value.upper()}_REDACTED]")
        cursor = end
    pieces.append(text[cursor:])
    return "".join(pieces)


def process_dataframe(df: pd.DataFrame) -> Dict[str, Any]:
    """Detect PII in every non-empty cell of ``df`` and build a masked copy.

    Args:
        df: Input table. It is not modified; masking is applied to a copy.

    Returns:
        A dict with four keys:
          * ``'summary'``: ``total_entities`` (int), ``pii_types_detected``
            (sorted list of PII type values) and ``risk_distribution``
            (risk level value -> entity count).
          * ``'masked_df'``: copy of ``df`` in which each cell containing PII
            holds the cell text with ``[<TYPE>_REDACTED]`` tokens.
          * ``'entities'``: all detected entities, after proximity analysis
            when at least one entity was found.
          * ``'graph_analysis'``: result of ``PIIGraphBuilder.analyze_graph()``
            or ``{}`` when nothing was detected.
    """
    detector = PIINERDetector()
    analyzer = ProximityAnalyzer()
    builder = PIIGraphBuilder()

    all_entities = []
    masked_df = df.copy()

    # Process each cell
    for row_idx, row in df.iterrows():
        for col_name, cell_value in row.items():
            if pd.isna(cell_value) or cell_value == '':
                continue

            cell_str = str(cell_value)
            context = {
                'row_index': row_idx,
                'column_name': col_name
            }

            # Detect PII in cell; offsets are relative to cell_str.
            entities = detector.detect_pii_in_text(cell_str, context)
            if not entities:
                continue

            all_entities.extend(entities)
            masked_df.at[row_idx, col_name] = _mask_cell_text(cell_str, entities)

    # Analyze relationships
    # NOTE(review): offsets are per cell, yet all entities are analysed
    # together with an empty text — confirm ProximityAnalyzer relies on the
    # row/column context rather than on raw offsets here.
    if all_entities:
        all_entities = analyzer.analyze_proximity(all_entities, "")
        # build_graph is called for its effect on the builder; the returned
        # graph object is not needed here.
        builder.build_graph(all_entities)
        graph_analysis = builder.analyze_graph()
    else:
        graph_analysis = {}

    # Calculate risk distribution
    risk_distribution = defaultdict(int)
    for entity in all_entities:
        risk_distribution[entity.risk_level.value] += 1

    # Sorted so that reports are reproducible from run to run.
    pii_types_detected = sorted({e.pii_type.value for e in all_entities})

    results = {
        'summary': {
            'total_entities': len(all_entities),
            'pii_types_detected': pii_types_detected,
            'risk_distribution': dict(risk_distribution)
        },
        'masked_df': masked_df,
        'entities': all_entities,
        'graph_analysis': graph_analysis
    }

    return results

# Run the full pipeline on the sample customer table and show the outcome.
print("Processing customer data...")
results = process_dataframe(customer_data)

run_summary = results['summary']
banner = "=" * 60

print(f"\n{banner}\nPROCESSING RESULTS\n{banner}")
print("\n📊 Summary:")
print(f"  Total PII entities detected: {run_summary['total_entities']}")
types_found = ', '.join(run_summary['pii_types_detected'])
print(f"  PII types found: {types_found}")
risk_json = json.dumps(run_summary['risk_distribution'], indent=4)
print(f"  Risk distribution: {risk_json}")

# Side-by-side look at the first row before and after masking.
print("\n📄 Original vs Masked Data:")
print("\nOriginal (first row):")
display(customer_data.head(1))

print("\nMasked (first row):")
display(results['masked_df'].head(1))

## 10. Visualize Results

Create visualizations of the PII detection results.

In [None]:
# Four-panel dashboard built from the entities in `results`:
# counts per PII type, share per risk level, counts per column and
# counts per detection method.
pii_types = [entity.pii_type.value for entity in results['entities']]
pii_counts = pd.Series(pii_types).value_counts()

fig, axes = plt.subplots(2, 2, figsize=(14, 10))
(ax1, ax2), (ax3, ax4) = axes

# Plot 1: PII Types Distribution
type_positions = range(len(pii_counts))
bars = ax1.bar(type_positions, pii_counts.values)
ax1.set_xticks(type_positions)
ax1.set_xticklabels(pii_counts.index, rotation=45, ha='right')
ax1.set(xlabel='PII Type', ylabel='Count', title='PII Types Detected')
ax1.grid(axis='y', alpha=0.3)

# Bar colour by PII type: red for SSN / credit card, orange for email /
# phone, yellow for everything else.
type_colors = {
    'social_security_number': 'red',
    'credit_card': 'red',
    'email_address': 'orange',
    'phone_number': 'orange',
}
colors = [type_colors.get(pii_type, 'yellow') for pii_type in pii_counts.index]
for bar, color in zip(bars, colors):
    bar.set_color(color)

# Plot 2: Risk Level Distribution
risk_levels = [entity.risk_level.value for entity in results['entities']]
risk_counts = pd.Series(risk_levels).value_counts()
colors_risk = {'critical': '#FF0000', 'high': '#FF6600', 'medium': '#FFAA00', 'low': '#FFFF00'}
# Levels missing from the palette fall back to grey.
wedge_colors = [colors_risk.get(level, '#CCCCCC') for level in risk_counts.index]
wedges, texts, autotexts = ax2.pie(
    risk_counts.values,
    labels=risk_counts.index,
    autopct='%1.1f%%',
    colors=wedge_colors,
    startangle=90,
)
ax2.set_title('Risk Level Distribution')

# Plot 3: PII by Column
column_counts = defaultdict(int)
for entity in results['entities']:
    column = entity.column_name
    if not column:
        # Entities without a column name are left out of this panel.
        continue
    column_counts[column] += 1
column_df = pd.Series(column_counts)
column_positions = range(len(column_df))
ax3.bar(column_positions, column_df.values, color='steelblue')
ax3.set_xticks(column_positions)
ax3.set_xticklabels(column_df.index, rotation=45, ha='right')
ax3.set(xlabel='Column', ylabel='PII Count', title='PII Distribution by Column')
ax3.grid(axis='y', alpha=0.3)

# Plot 4: Detection Method Distribution
methods = [entity.detection_method for entity in results['entities']]
method_counts = pd.Series(methods).value_counts()
method_positions = range(len(method_counts))
ax4.bar(method_positions, method_counts.values, color='coral')
ax4.set_xticks(method_positions)
ax4.set_xticklabels(method_counts.index)
ax4.set(xlabel='Detection Method', ylabel='Count', title='Detection Methods Used')
ax4.grid(axis='y', alpha=0.3)

plt.suptitle('PII Detection Analysis Dashboard', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

## 11. Generate Compliance Report

Create a compliance-focused report for regulatory requirements.

In [None]:
def generate_compliance_report(results: Dict) -> Dict:
    """Build a regulatory compliance report from PII detection results.

    Args:
        results: Mapping with a ``'summary'`` entry that provides
            ``'pii_types_detected'`` (list of PII type strings),
            ``'total_entities'`` (int) and, optionally,
            ``'risk_distribution'`` (risk level -> count).

    Returns:
        Dict with ``timestamp`` (ISO string of the current time),
        ``total_pii_entities``, ``affected_regulations`` (one entry per
        regulation that covers at least one detected type),
        ``recommendations`` (list of strings) and ``risk_score``
        (0-100, weighted average of the risk levels).
    """
    overview = results['summary']
    detected_types = overview['pii_types_detected']
    entity_total = overview['total_entities']

    # Regulation -> PII types it covers and a one-line summary of its demands.
    regulation_catalog = {
        'GDPR': {
            'affected_types': ['person_name', 'email_address', 'phone_number', 'physical_address', 'ip_address'],
            'requirements': 'Requires explicit consent, right to erasure, data portability'
        },
        'HIPAA': {
            'affected_types': ['person_name', 'social_security_number', 'medical_identifier', 'date_of_birth'],
            'requirements': 'Requires encryption, access controls, audit logs'
        },
        'PCI_DSS': {
            'affected_types': ['credit_card', 'bank_account'],
            'requirements': 'Requires tokenization, network segmentation, regular security scans'
        },
        'CCPA': {
            'affected_types': ['person_name', 'email_address', 'phone_number', 'ip_address', 'physical_address'],
            'requirements': 'Requires opt-out mechanism, data disclosure, non-discrimination'
        }
    }

    # A regulation applies as soon as one detected type is on its list.
    applicable = []
    for regulation, details in regulation_catalog.items():
        matched = [kind for kind in detected_types if kind in details['affected_types']]
        if not matched:
            continue
        applicable.append({
            'regulation': regulation,
            'affected_pii_types': matched,
            'requirements': details['requirements']
        })

    # Score = weighted share of entities per risk level, scaled to 0-100.
    # Levels without a weight contribute nothing.
    distribution = overview.get('risk_distribution', {})
    level_weights = {'critical': 1.0, 'high': 0.7, 'medium': 0.4, 'low': 0.1}
    weighted_total = sum(
        distribution.get(level, 0) * weight
        for level, weight in level_weights.items()
    )
    if entity_total > 0:
        score = weighted_total / entity_total * 100
    else:
        score = 0

    # Recommendation tiers, checked from the highest threshold down.
    tiers = (
        (70, [
            "CRITICAL: Immediate remediation required",
            "Implement data encryption at rest and in transit",
            "Review and restrict data access permissions",
        ]),
        (40, [
            "HIGH: Significant PII exposure detected",
            "Implement data masking for sensitive fields",
            "Enable audit logging for all data access",
        ]),
    )
    for threshold, advice in tiers:
        if score > threshold:
            recommendations = advice
            break
    else:
        recommendations = [
            "MODERATE: Standard security measures recommended",
            "Regular security reviews recommended",
        ]

    return {
        'timestamp': pd.Timestamp.now().isoformat(),
        'total_pii_entities': entity_total,
        'affected_regulations': applicable,
        'recommendations': recommendations,
        'risk_score': score
    }

# Build the compliance report from the processing results and print it.
compliance_report = generate_compliance_report(results)

header_lines = [
    "COMPLIANCE REPORT",
    "=" * 60,
    f"Generated: {compliance_report['timestamp']}",
    f"\nRisk Score: {compliance_report['risk_score']:.1f}/100",
    f"Total PII Entities: {compliance_report['total_pii_entities']}",
]
print("\n".join(header_lines))

# One entry per regulation that covers at least one detected PII type.
print("\nAffected Regulations:")
for regulation in compliance_report['affected_regulations']:
    affected = ', '.join(regulation['affected_pii_types'])
    print(f"\n  📋 {regulation['regulation']}")
    print(f"     Affected PII Types: {affected}")
    print(f"     Requirements: {regulation['requirements']}")

print("\nRecommendations:")
for number, recommendation in enumerate(compliance_report['recommendations'], start=1):
    print(f"  {number}. {recommendation}")

## 12. Summary

This notebook has demonstrated a comprehensive PII detection system using:

1. **Named Entity Recognition (NER)** - Detecting person names, organizations, and locations
2. **Regex Pattern Matching** - Finding structured PII like SSNs, emails, and phone numbers
3. **Proximity Analysis** - Identifying high-risk PII combinations based on spatial relationships
4. **Graph Theory** - Building entity relationship networks to find PII clusters
5. **Risk Assessment** - Multi-tier classification of PII exposure risk
6. **Compliance Reporting** - Automated regulatory compliance assessment

The system provides enterprise-grade PII detection and protection capabilities suitable for production deployment in privacy-sensitive environments.

---

**Built with ❤️ for data privacy and security**