In [1]:
import re
from typing import List, Dict, Any


In [2]:
# Keywords that mark a column as *direct* PII. detect_pii_in_schema tests each
# keyword as a substring of the lower-cased column name.
DIRECT_PII_FIELDS = [
    # personal names
    "name",
    "full_name",
    "first_name",
    "last_name",
    # contact details
    "email",
    "phone",
    "mobile",
    "address",
    # government-issued identifiers
    "national_id",
    "ssn",
    "passport",
]


In [3]:
# Keywords that mark a column as a *quasi-identifier*. detect_pii_in_schema
# tests each keyword as a substring of the lower-cased column name, and only
# for columns that were not already flagged as direct PII.
QUASI_PII_HINTS = [
    "postcode",
    "zip",
    "birth",
    "age",
    "gender",
    "occupation",
    "salary",
    "income",
    "ethnicity",
    "education",
    "location",
]


In [4]:
# Address-related keywords; detect_pii_in_schema treats a column whose
# lower-cased name contains one of these as a quasi-identifier.
ADDRESS_HINTS = [
    "street",
    "road",
    "ave",
    "avenue",
    "city",
    "state",
    "country",
]

# Email shape: local part, "@", domain, and a TLD of at least two letters.
EMAIL_RE = re.compile(
    r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}"
)

# Phone shape: optional "+" country code, optional (possibly parenthesised)
# area code, then 3-4 digits and 4 digits, with optional "-", "." or
# whitespace separators between the groups.
PHONE_RE = re.compile(
    r"(?:\+?\d{1,3}[-.\s]?)?(?:\(?\d{2,4}\)?[-.\s]?)?\d{3,4}[-.\s]?\d{4}"
)


In [5]:
def detect_pii_in_schema(columns: List[str]) -> Dict[str, List[str]]:
    """Classify column names as direct PII or quasi-identifiers.

    Every name is lower-cased and then checked with keyword heuristics:

    * direct PII  -- the name contains any ``DIRECT_PII_FIELDS`` keyword as a
      substring, or ``EMAIL_RE`` / ``PHONE_RE`` finds a match in the name.
    * quasi PII   -- otherwise, the name contains any ``QUASI_PII_HINTS`` or
      ``ADDRESS_HINTS`` keyword as a substring.

    Names that match neither rule are left out of the result.

    Note that the keyword test is a plain substring test, so a name such as
    ``"page_views"`` is reported as a quasi-identifier because it contains
    ``"age"``.

    Args:
        columns: Column names of the dataset schema.

    Returns:
        A dict with keys ``"direct_pii"`` and ``"quasi_pii"``; each value is a
        sorted, de-duplicated list of the *lower-cased* matching names.
    """
    direct_hits = set()
    quasi_hits = set()

    for raw_name in columns:
        name = raw_name.lower()

        looks_direct = (
            any(keyword in name for keyword in DIRECT_PII_FIELDS)
            or EMAIL_RE.search(name) is not None
            or PHONE_RE.search(name) is not None
        )
        if looks_direct:
            # Direct PII wins: the name is never also reported as quasi.
            direct_hits.add(name)
            continue

        looks_quasi = (
            any(hint in name for hint in QUASI_PII_HINTS)
            or any(hint in name for hint in ADDRESS_HINTS)
        )
        if looks_quasi:
            quasi_hits.add(name)

    return {"direct_pii": sorted(direct_hits), "quasi_pii": sorted(quasi_hits)}



In [6]:
def assess_risk(meta: Dict[str, Any], pii: Dict[str, List[str]]) -> Dict[str, Any]:
    """Turn PII detection results and dataset metadata into a risk score.

    Points are additive:

    * +3 when ``pii["direct_pii"]`` is non-empty
    * +1 when ``pii["quasi_pii"]`` is non-empty
    * +2 when ``meta["domain"]`` is ``"health"`` or ``"finance"``
    * +1 when ``meta["open_model_access"]`` is truthy (e.g., public API)

    Args:
        meta: Dataset/model metadata; the optional keys ``"domain"`` and
            ``"open_model_access"`` are read.
        pii: Mapping with the keys ``"direct_pii"`` and ``"quasi_pii"``.

    Returns:
        ``{"risk_score": <int>, "findings": [<str>, ...]}`` with one finding
        per signal that contributed to the score, in the order listed above.
    """
    # Each entry pairs the points a signal contributes with its message.
    scored = []

    direct_cols = pii["direct_pii"]
    if direct_cols:
        scored.append((3, "Direct PII present: " + ", ".join(direct_cols)))

    quasi_cols = pii["quasi_pii"]
    if quasi_cols:
        scored.append((1, "Quasi-identifiers present: " + ", ".join(quasi_cols)))

    domain = meta.get("domain")
    if domain in {"health", "finance"}:
        scored.append((2, f"Sensitive domain: {domain}"))

    if meta.get("open_model_access"):
        scored.append(
            (1, "Model exposed via public interface (consider output filtering)")
        )

    return {
        "risk_score": sum(points for points, _ in scored),
        "findings": [message for _, message in scored],
    }


In [7]:
def recommend_actions(pii: Dict[str, List[str]], risk_report: Dict[str, Any]) -> List[str]:
    """Build the list of mitigation recommendations for a privacy report.

    Args:
        pii: Mapping with the keys ``"direct_pii"`` and ``"quasi_pii"``.
        risk_report: Mapping with a numeric ``"risk_score"`` entry.

    Returns:
        Recommendation strings in a fixed order: direct-PII advice,
        quasi-identifier advice, high-risk advice (score of 4 or more), and
        finally a documentation reminder that is always included.
    """
    actions = []

    if pii["direct_pii"]:
        actions.extend([
            "Apply removal/masking for direct PII fields; prefer irreversible hashing for identifiers where feasible.",
            "Evaluate necessity (GDPR data minimization) and drop non-essential direct PII columns.",
        ])

    if pii["quasi_pii"]:
        actions.append(
            "Generalize or bucket quasi-identifiers (e.g., age→age_range, zipcode→3-digit)."
        )

    is_high_risk = risk_report["risk_score"] >= 4
    if is_high_risk:
        actions.extend([
            "Consider Differential Privacy for training (ε,budget selection) and output noise for analytics.",
            "Add input/output filters with PII NER redaction before/after LLM calls.",
        ])

    # Always emitted, regardless of what was detected.
    actions.append(
        "Document purpose limitation and retention policy; align with lawful basis."
    )
    return actions


In [8]:
def run_privacy_check(columns: List[str], meta: Dict[str, Any]) -> Dict[str, Any]:
    """Run the full privacy check: detection, risk scoring, recommendations.

    Args:
        columns: Column names of the dataset schema, passed to
            ``detect_pii_in_schema``.
        meta: Dataset/model metadata, passed to ``assess_risk``.

    Returns:
        A dict with the keys ``"pii_detection"``, ``"risk_assessment"`` and
        ``"recommendations"`` holding the output of each stage.
    """
    detected = detect_pii_in_schema(columns)
    risk_report = assess_risk(meta, detected)
    return {
        "pii_detection": detected,
        "risk_assessment": risk_report,
        "recommendations": recommend_actions(detected, risk_report),
    }


In [9]:
if __name__ == "__main__":
    import json

    # Demo: a small health-domain schema behind a publicly exposed model.
    demo_cols = [
        "user_id",
        "full_name",
        "email",
        "age",
        "zipcode",
        "diagnosis_text",
    ]
    demo_meta = {"domain": "health", "open_model_access": True}

    report = run_privacy_check(columns=demo_cols, meta=demo_meta)

    # Pretty-print the complete report as JSON.
    print(json.dumps(report, indent=2))

{
  "pii_detection": {
    "direct_pii": [
      "email",
      "full_name"
    ],
    "quasi_pii": [
      "age",
      "zipcode"
    ]
  },
  "risk_assessment": {
    "risk_score": 7,
    "findings": [
      "Direct PII present: email, full_name",
      "Quasi-identifiers present: age, zipcode",
      "Sensitive domain: health",
      "Model exposed via public interface (consider output filtering)"
    ]
  },
  "recommendations": [
    "Apply removal/masking for direct PII fields; prefer irreversible hashing for identifiers where feasible.",
    "Evaluate necessity (GDPR data minimization) and drop non-essential direct PII columns.",
    "Generalize or bucket quasi-identifiers (e.g., age\u2192age_range, zipcode\u21923-digit).",
    "Consider Differential Privacy for training (\u03b5,budget selection) and output noise for analytics.",
    "Add input/output filters with PII NER redaction before/after LLM calls.",
    "Document purpose limitation and retention policy; align with lawful 