In [None]:
# Setup
!pip install --upgrade pip
!pip install pyspark==3.4.1
!pip install spark-nlp==5.1.4
!pip install -q transformers presidio-analyzer presidio-anonymizer

# Spark + Spark NLP Setup
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf, explode, regexp_replace, lit, collect_list, expr, avg, count, max as spark_max
from pyspark.sql.types import ArrayType, StructType, StructField, StringType, FloatType
from sparknlp.base import DocumentAssembler
from sparknlp.annotator import Tokenizer, WordEmbeddingsModel, NerDLModel, NerConverter
from pyspark.ml import Pipeline
from transformers import pipeline as hf_pipeline
from presidio_analyzer import AnalyzerEngine, PatternRecognizer, Pattern
from presidio_anonymizer import AnonymizerEngine
import logging
import re

# Initialize Spark session
# The Spark NLP Maven artifact must be the same release as the `spark-nlp`
# Python wheel installed in the setup step (5.1.4); pairing the 5.x Python
# wrapper with a 4.2.8 jar mixes incompatible versions.
spark = SparkSession.builder \
    .appName("Indian_Hospital_PII_Redaction") \
    .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:5.1.4") \
    .getOrCreate()

# Sample Indian hospital text

data = [
    ("OUT PATIENT RECORD - SURGICAL ONCOLOGY. ",),
    ("Patient Name: Mrs Sandhya PUVVALA, Age/Sex: 43 Yrs / Female, Episode No.: 00000851378. ",),
    ("Hospital No.: MH000731649, Date: 2022-06-08 10:52, Patient Mob. No.: 9611827350, ",),
    ("E-mail ID: sandhya.puvvala@gmail.com. ",),
    ("Consultant Name: DR. SHABBER ZAVERI (MBBS-92, MS-97, Mch.(Surgical Oncology)-01). ",),
    ("KMC Reg No.: 33660, Dept: ONCOLOGY SURGICAL MHB. ",),
    ("Doctor's Notes: K/c/o IDC (E) breast C Tz-m/fo, stage IIB. ",),
    ("PE: 916 HKI-mv-ng C14 Hypothermia K/G 7-50 l. ",),
    ("S/P (E) RCs + Subpec + ALND + IIGAP flap encapsab C Tz PMG 22/10 41, H161Gr Cn 12/12/2020. ",),
    ("s/P Adj-chemo - AC x 4 pack x q - test 14/3/2020 Leq: 2y-2012-dnon-acid 22/12/21. ",),
    ("s/P Adj - RT - Goby 15# - test 14/19/2020. Started Aromatin from 15/8/2020. ",),
    ("14/3/22 B/t Mammogram - B/RADS 2 on visit breast. ",),
    ("USG - Med a pellu -> uterine fibroid 2.7 x 2.1 cm on follow up. ",),
    ("PE: ESR-0 No ccn (B) Breast - Nk-NAD Lungs-thy due health. No polyoble lung. ",),
    ("(B) Axilla - NAD No (B) upper limb lymphadenism. ",),
    ("T - Aromatin 2.5 OD, T - Thyronorm 50mg OD, Tablet D3 Zoledronic Acid 4mg today. ",),
    ("Manipal Hospital, HAL Airport Road, #98 HAL Airport Road, Bangalore 560017. ",),
    ("Website: www.manipalhospitals.com, Email-Id: info@manipalhospitals.com. ",),
    ("Phone: For booking an appointment/enquiry, call on: 1800 102 5555 / 1800 102 3222. ",),
    ("For Home Care, call on: 1800 102 6070. ",),

    ("OUT PATIENT RECORD - CARDIOLOGY. ",),
    ("Patient Name: Mr Rajesh KUMAR, Age/Sex: 58 Yrs / Male, Episode No.: 00000852147. ",),
    ("Hospital No.: MH000732891, Date: 2022-07-15 14:30, Patient Mob. No.: 9876543210, ",),
    ("E-mail ID: rajesh.kumar@hotmail.com. ",),
    ("Consultant Name: DR. PRIYA SHARMA (MBBS-88, MD-93, DM.(Cardiology)-96). ",),
    ("KMC Reg No.: 41257, Dept: CARDIOLOGY MHB. ",),
    ("Doctor's Notes: K/c/o CAD with STEMI, s/p PCI to LAD. ",),
    ("PE: BP 140/90 mmHg, HR 78/min, SpO2 98% RA. No acute distress. ",),
    ("S/P Primary PCI to LAD with DES implantation on 12/7/2022. EF: 45% on Echo. ",),
    ("Currently on dual antiplatelet therapy - Aspirin 75mg + Clopidogrel 75mg OD. ",),
    ("s/P Cardiac rehab program initiated. Lipid profile normal on statins. ",),
    ("Follow up ECG - NSR, no new ST changes. Troponin levels normalized. ",),
    ("2D Echo - RWMA in anterior wall, EF 45%, no regional wall motion abnormalities. ",),
    ("PE: CVS - S1S2 heard, no murmurs. Chest clear. No pedal edema. ",),
    ("Peripheral pulses palpable. No signs of heart failure. ",),
    ("T - Metoprolol 50mg BD, T - Atorvastatin 40mg OD, T - Aspirin 75mg OD, T - Clopidogrel 75mg OD. ",),
    ("Apollo Hospital, Bannerghatta Road, #154/11 Bannerghatta Road, Bangalore 560076. ",),
    ("Website: www.apollohospitals.com, Email-Id: info@apollobangalore.com. ",),
    ("Phone: For booking an appointment/enquiry, call on: 1860 500 1066 / 080 2630 2630. ",),
    ("For Emergency services, call on: 1066. ",),

    ("OUT PATIENT RECORD - ORTHOPEDICS. ",),
    ("Patient Name: Mrs Lakshmi NAIR, Age/Sex: 65 Yrs / Female, Episode No.: 00000853299. ",),
    ("Hospital No.: MH000733672, Date: 2022-08-22 09:15, Patient Mob. No.: 9445612378, ",),
    ("E-mail ID: lakshmi.nair@yahoo.in. ",),
    ("Consultant Name: DR. MOHAN RAO (MBBS-89, MS.(Ortho)-94, DNB-98). ",),
    ("KMC Reg No.: 38924, Dept: ORTHOPEDICS MHB. ",),
    ("Doctor's Notes: K/c/o Osteoarthritis (B) knees, Grade III changes. ",),
    ("PE: Gait antalgic, uses walking stick. (B) knee effusion present. ",),
    ("S/P Conservative management with physiotherapy and analgesics for 2 years. ",),
    ("Recent X-ray shows progression of joint space narrowing and osteophyte formation. ",),
    ("s/P Intra-articular steroid injection (R) knee 3 months back with temporary relief. ",),
    ("MRI (B) knees - Severe cartilage loss, subchondral sclerosis, meniscal tears. ",),
    ("Patient counseled regarding Total Knee Replacement surgery options. ",),
    ("PE: ROM (R) knee 0-90 degrees, (L) knee 0-95 degrees. Crepitus present. ",),
    ("No signs of infection. Distal pulses palpable. Neurologically intact. ",),
    ("T - Paracetamol 650mg TDS, T - Glucosamine 750mg BD, Cap - Calcium+D3 OD. ",),
    ("Fortis Hospital, Cunningham Road, #14 Cunningham Road, Bangalore 560052. ",),
    ("Website: www.fortishealthcare.com, Email-Id: info@fortisbangalore.com. ",),
    ("Phone: For booking an appointment/enquiry, call on: 0804 6794 6794 / 1800 102 6767. ",),
    ("For Ambulance services, call on: 102. ",),

    ("OUT PATIENT RECORD - ENDOCRINOLOGY. ",),
    ("Patient Name: Mr Suresh REDDY, Age/Sex: 52 Yrs / Male, Episode No.: 00000854125. ",),
    ("Hospital No.: MH000734458, Date: 2022-09-10 11:45, Patient Mob. No.: 9123456789, ",),
    ("E-mail ID: suresh.reddy@gmail.com. ",),
    ("Consultant Name: DR. KAVITHA MENON (MBBS-91, MD-96, DM.(Endocrinology)-00). ",),
    ("KMC Reg No.: 42831, Dept: ENDOCRINOLOGY MHB. ",),
    ("Doctor's Notes: K/c/o T2DM since 10 years, HTN since 5 years, Dyslipidemia. ",),
    ("PE: BMI 28.5 kg/m2, BP 150/95 mmHg, FBS 145 mg/dl, PPBS 210 mg/dl. ",),
    ("S/P Multiple antidiabetic medications over years. Recent HbA1c 8.2%. ",),
    ("Diabetic retinopathy screening - Mild NPDR (B) eyes. Microalbuminuria present. ",),
    ("s/P Dietary counseling and lifestyle modifications advised repeatedly. ",),
    ("Recent lipid profile - TC 220, TG 180, HDL 35, LDL 145 mg/dl. ",),
    ("Thyroid function tests - TSH 3.2 mIU/L, T3 T4 normal. eGFR 75 ml/min. ",),
    ("PE: Fundus - Mild dot hemorrhages (B) eyes. Feet examination normal. ",),
    ("No diabetic foot ulcers. Peripheral neuropathy assessment negative. ",),
    ("T - Metformin 1000mg BD, T - Glimepiride 2mg OD, T - Amlodipine 5mg OD, T - Atorvastatin 20mg OD. ",),
    ("Columbia Asia Hospital, Hebbal, #Kirloskar Business Park, Hebbal, Bangalore 560024. ",),
    ("Website: www.columbiaasia.com, Email-Id: info@columbiaasia.com. ",),
    ("Phone: For booking an appointment/enquiry, call on: 080 6132 0000 / 1800 103 4530. ",),
    ("For Health Check packages, call on: 080 6132 0000. ",),

    ("OUT PATIENT RECORD - GASTROENTEROLOGY. ",),
    ("Patient Name: Mrs Priya SHARMA, Age/Sex: 38 Yrs / Female, Episode No.: 00000855367. ",),
    ("Hospital No.: MH000735291, Date: 2022-10-05 16:20, Patient Mob. No.: 9987654321, ",),
    ("E-mail ID: priya.sharma@outlook.com. ",),
    ("Consultant Name: DR. ARUN KUMAR (MBBS-93, MD-98, DM.(Gastroenterology)-02). ",),
    ("KMC Reg No.: 45672, Dept: GASTROENTEROLOGY MHB. ",),
    ("Doctor's Notes: K/c/o GERD, Chronic gastritis, H.pylori positive. ",),
    ("PE: Epigastric tenderness present. No hepatosplenomegaly. Bowel sounds normal. ",),
    ("S/P Upper GI Endoscopy - Grade B esophagitis, antral gastritis, H.pylori positive. ",),
    ("Triple therapy for H.pylori eradication completed 4 weeks back. ",),
    ("s/P PPI therapy ongoing. Lifestyle modifications counseled regarding diet. ",),
    ("Recent H.pylori stool antigen test - Negative (post-treatment). ",),
    ("USG Abdomen - Normal liver, GB, pancreas. No focal lesions identified. ",),
    ("PE: Abdomen soft, non-tender. No masses palpable. No ascites. ",),
    ("Patient reports significant improvement in symptoms post H.pylori treatment. ",),
    ("T - Pantoprazole 40mg OD, T - Domperidone 10mg TDS, Syr - Sucralfate 10ml TDS. ",),
    ("Narayana Health City, Bommasandra, #258/A Bommasandra Industrial Area, Bangalore 560099. ",),
    ("Website: www.narayanahealth.org, Email-Id: info@narayanahealth.org. ",),
    ("Phone: For booking an appointment/enquiry, call on: 080 7122 4444 / 1800 102 9999. ",),
    ("For International patients, call on: +91 80 7122 4567. ",),
]

df = spark.createDataFrame(data, ["text"])

# Spark NLP Pipeline
# Stage 1: wrap the raw "text" column into Spark NLP documents.
document_assembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Stage 2: split documents into tokens.
tokenizer = (
    Tokenizer()
    .setInputCols(["document"])
    .setOutputCol("token")
)

# Stage 3: pretrained GloVe word embeddings consumed by the NER model.
embeddings = (
    WordEmbeddingsModel.pretrained("glove_100d")
    .setInputCols(["document", "token"])
    .setOutputCol("embeddings")
)

# Stage 4: pretrained NER tagger; confidence is requested so that later
# steps can threshold on it.
ner_model = (
    NerDLModel.pretrained("ner_dl", "en")
    .setInputCols(["document", "token", "embeddings"])
    .setOutputCol("ner")
    .setIncludeConfidence(True)
)

# Stage 5: merge token-level NER tags into entity chunks.
ner_converter = (
    NerConverter()
    .setInputCols(["document", "token", "ner"])
    .setOutputCol("entities")
)

pipeline = Pipeline(
    stages=[
        document_assembler,
        tokenizer,
        embeddings,
        ner_model,
        ner_converter,
    ]
)
model = pipeline.fit(df)
# Cached because several downstream DataFrames are derived from it.
processed_df = model.transform(df).cache()

# Define context keywords for ORG to be remapped to ADDRESS
ORG_CONTEXT_TERMS = ["hospital", "clinic", "labs", "insurance", "ltd", "limited", "centre", "center"]

# Modified UDF to remap and filter entities
def custom_entity_filter(entities):
    """Relabel and filter NER chunks into ``(word, label, confidence)`` tuples.

    Args:
        entities: Iterable of annotation-like objects exposing ``.result``
            (the chunk text) and ``.metadata`` (a mapping read for the
            ``"entity"`` and ``"confidence"`` keys). ``None`` or an empty
            sequence is accepted, because a null array column reaches a
            Python UDF as ``None``.

    Returns:
        A list of ``(word, label, confidence)`` tuples where:
          * ``MISC`` chunks are dropped,
          * ``PER`` becomes ``NAME`` and ``LOC`` becomes ``ADDRESS``,
          * ``ORG`` becomes ``ADDRESS`` only when the chunk text contains one
            of ``ORG_CONTEXT_TERMS`` (case-insensitive); other ``ORG`` chunks
            are dropped,
          * any other label is passed through unchanged.
    """
    # Guard: iterating None would raise TypeError inside the Spark worker.
    if not entities:
        return []

    results = []
    for ent in entities:
        word = ent.result
        # Labels may arrive wrapped in angle brackets; normalise to bare upper case.
        label = ent.metadata.get("entity", "").strip("<>").upper()
        confidence = float(ent.metadata.get("confidence", 0.0))

        # Filter out MISC
        if label == "MISC":
            continue

        # Remap PER → NAME
        if label == "PER":
            label = "NAME"

        # Remap LOC → ADDRESS
        elif label == "LOC":
            label = "ADDRESS"

        # Remap ORG → ADDRESS only if context matches
        elif label == "ORG":
            lowered = word.lower()
            if not any(term in lowered for term in ORG_CONTEXT_TERMS):
                continue  # skip this ORG if it doesn't match context
            label = "ADDRESS"

        results.append((word, label, confidence))
    return results

# Return type of the filter UDF: an array of (word, entity, confidence) structs.
schema = ArrayType(
    StructType()
    .add("word", StringType())
    .add("entity", StringType())
    .add("confidence", FloatType())
)

# Wrap the Python filter as a Spark UDF, explode its output to one row per
# entity, and keep only entities scored at 0.70 or above.
filtered_udf = udf(custom_entity_filter, schema)
entity_df = (
    processed_df
    .withColumn("entity_info", filtered_udf("entities"))
    .withColumn("entity", explode("entity_info"))
)
entity_df = entity_df.filter(col("entity.confidence") >= 0.70)

# Final table of entities: distinct (text, type, score) rows, best score first.
pii_columns = [
    col("entity.word").alias("detected_pii"),
    col("entity.entity").alias("entity_type"),
    col("entity.confidence").alias("confidence_score"),
]
pii_table = (
    entity_df
    .select(*pii_columns)
    .distinct()
    .orderBy(col("confidence_score").desc())
)

# Entity summary metrics: per-label count, mean and max confidence,
# most frequent label first.
entity_summary = (
    entity_df
    .groupBy("entity.entity")
    .agg(
        count("*").alias("count"),
        avg("entity.confidence").alias("avg_confidence"),
        spark_max("entity.confidence").alias("max_confidence"),
    )
    .orderBy("count", ascending=False)
)


print("\n🛡️ Presidio Redaction Layer")
from presidio_analyzer import AnalyzerEngine, RecognizerResult, EntityRecognizer, AnalysisExplanation

# Ordered list of (pattern, replacement tag) pairs. Order matters to any
# consumer that applies them sequentially: earlier rules rewrite the text
# that later rules see.
regex_redactions = [
    # Aadhaar - flexible spacing
    (r"(?i)\b(?:aadhaar(?:\s*(?:no|number|num)\.?\s*[:\-]?)?\s*)?((?:\d{4}[\s\-]*){3}|\d{12})\b", "[AADHAAR]"),

    # Phone numbers - flexible spacing and formats
    (r"(?i)\b(?:phone|mobile|mob\.?|contact)[\s:.-]*\+?91[-\s]?[6-9]\d{9}\b|\b[6-9]\d{9}\b", "[PHONE]"),

    # Email - case insensitive
    (r"(?i)\b[\w\.-]+@[\w\.-]+\.\w+\b", "[EMAIL]"),

    # Patient IDs - case insensitive, flexible spacing and separators
    (r"(?i)\b(?:MRD|UHID|Reg\.?\s*No\.?|Episode\s*No\.?|Hospital\s*No\.?)\s*[:\-]?\s*\w*\d+\b", "[ID]"),

    # Dates - various formats
    # The leading field is either a 4-digit year or a 1-2 digit day/month.
    # (The previous `\d{1,2,4}` is not a valid quantifier; `re` treated the
    # braces as literal text, so the rule could never match a real date.)
    (r"(?i)\b(?:\d{4}|\d{1,2})[\s\-\/]\d{1,2}[\s\-\/]\d{2,4}\b", "[DOB]"),
    (r"(?i)\b\d{1,2}(st|nd|rd|th)?[-\s]?(january|february|march|april|may|june|july|august|september|october|november|december|jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)[-\s]?\d{2,4}\b", "[DOB]"),

    # Age (standalone or with label variations)
    # NOTE(review): the label and the unit are both optional, so this also
    # matches bare 1-3 digit numbers - confirm that is intended.
    (r"(?i)\b(age|ag|age/sex|ag/sex)?\s*[:\-]?\s*\d{1,3}\s*(yrs?|years)?\b", "[AGE]"),

    # Gender (standalone or in combo)
    (r"(?i)\b(sex|gender|age/sex|ag/sex)?\s*[:\-]?\s*(male|female|transgender|m|f)\b", "[GENDER]"),

    # PIN codes
    #(r"\b\d{6}\b", "[PIN_CODE]"),

    # Names with titles - case insensitive, flexible spacing
    (r"(?i)\b(?:dr\.?|mr\.?|mrs\.?|ms\.?|col\.?|prof\.?|shri|smt|miss|master|md\.?)\s+[a-z]+(?:[\s\-]+[a-z]+)*\b", "[NAME]"),

    # PAN - case insensitive
    (r"(?i)\b[a-z]{5}[\s\-]*\d{4}[\s\-]*[a-z]\b", "[PAN]"),

    # Passport - case insensitive
    (r"(?i)\b[a-pr-wy]\d{7}\b", "[PASSPORT]"),

    # Assorted identifier shapes (verbose mode: whitespace inside the
    # pattern is ignored by the regex engine)
    (
        r"""(?ix)
        \b(
            (mr[dn]|uhid|reg\.?\s*no\.?|hospital\s*no\.?|episode\s*no\.?)
            [-:\s]*\w*\d+
            | [a-z]{4}0[a-z0-9]{6}
            | [a-z]{2}[\s\-]*\d{6,}
            | [a-z]{3}[\s\-]*\d{7}
            | [a-z]{2}[\s\-]*\d{2}[\s\-]*\d{11}
            | \w+@uidai
            | \b\d{2,4}[\s\-]*\d{2,4}[\s\-]*\d{2,4}[\s\-]*\d{2,4}\b
            | [a-z]{2,5}[\s\-]*\d{4,8}
        )\b
        """,
        "[ID]"
    ),

    # IP Address
    (r"\b(?:\d{1,3}\.){3}\d{1,3}\b", "[IP_ADDRESS]"),

    # URLs - case insensitive
    (r"(?i)https?:\/\/(?:www\.)?[-\w\.-]+\.[a-z]{2,6}\b|www\.[-\w\.-]+\.[a-z]{2,6}\b", "[URL]"),

    # Bank Account
    (r"\b\d{2,4}[\s\-]*\d{2,4}[\s\-]*\d{2,4}[\s\-]*\d{2,4}\b", "[BANK_ACCOUNT]"),

    # General address components (street, sector, block, etc.) with flexible spacing
    (
        r"(?i)\b(?:flat|house|h\.?\s*no\.?|quarter|apt|apartment|floor|block|sector|plot|cantonment|"
        r"lane|road|street|st|nagar|residency|chowk|galli|bazar|mandal|ward|taluka|tehsil|mohalla|"
        r"vihar|bhawan|gram|samiti|line|layout|colony|avenue|enclave|bypass|cross|expressway)"
        r"\s*[\w\s,/-]{0,100}",
        "[ADDRESS]"
    ),

    # Addresses ending with 6-digit PIN codes with optional whitespace
    (
        r"(?i)(?:house|flat|h\.?\s*no\.?|apt|sector|block|plot|lane|street|road|nagar|colony|area)"
        r"\s*[\w\s,/-]{0,100}?\s*\d{6}\b",
        "[ADDRESS]"
    ),

    # Addresses prefixed with "address:" or similar labels, with flexible spacing
    (
        r"(?i)\b(?:address|addr|residence)\s*[:\-]?\s*[\w\s,/-]{10,100}?\s*\d{6}\b",
        "[ADDRESS]"
    )
]

# Presidio fallback with custom recognizer
logging.info("\n🛡️ Presidio Redaction Layer")
class UHIDRecognizer(EntityRecognizer):
    """Context-gated regex recognizer for UHID-style identifiers.

    A candidate is two uppercase letters followed by exactly six digits. It is
    reported only when one of ``self.context`` occurs (case-insensitively)
    within 20 characters on either side of the match.

    Results are emitted under the entity type this recognizer declares
    (``"UHID"``) so that downstream filtering keyed on the declared entity
    names keeps them.
    """

    # Compiled once per class rather than on every analyze() call.
    _PATTERN = re.compile(r"\b[A-Z]{2}\d{6}\b")
    _SCORE = 0.85

    def __init__(self):
        super().__init__(supported_entities=["UHID"])
        self.context = ["uhid", "hospital id", "patient id"]

    def load(self):
        pass  # No model to load

    def analyze(self, text, entities, nlp_artifacts):
        """Return a ``RecognizerResult`` for every context-supported match.

        Args:
            text: The text to scan.
            entities: Requested entity names (not consulted here).
            nlp_artifacts: NLP engine output (not consulted here).

        Returns:
            List of ``RecognizerResult`` with ``entity_type="UHID"`` and a
            fixed score of 0.85; empty when nothing matches.
        """
        results = []

        for match in self._PATTERN.finditer(text):
            # Context window of ±20 characters
            window = text[max(0, match.start() - 20):match.end() + 20].lower()

            # Check context presence
            if not any(ctx in window for ctx in self.context):
                continue

            explanation = AnalysisExplanation(
                recognizer="CustomUHIDRecognizer",
                original_score=self._SCORE,
                textual_explanation="UHID detected using regex pattern"
            )
            results.append(
                RecognizerResult(
                    # Must match supported_entities; the previous "ID" label
                    # was not a declared entity of this recognizer.
                    entity_type="UHID",
                    start=match.start(),
                    end=match.end(),
                    score=self._SCORE,
                    analysis_explanation=explanation
                )
            )

        return results
class HospitalNoRecognizer(EntityRecognizer):
    """Context-gated regex recognizer for hospital numbers.

    A candidate is two uppercase letters followed by six or more digits. It is
    reported only when one of ``self.context`` occurs (case-insensitively)
    within 20 characters on either side of the match.

    Results are emitted under the entity type this recognizer declares
    (``"HOSPITAL_NO"``).
    """

    # Compiled once per class rather than on every analyze() call.
    _PATTERN = re.compile(r"\b[A-Z]{2}\d{6,}\b")
    _SCORE = 0.85

    def __init__(self):
        super().__init__(supported_entities=["HOSPITAL_NO"])
        self.context = ["hospital no", "record id"]

    def load(self):
        pass  # No model to load

    def analyze(self, text, entities, nlp_artifacts):
        """Return a ``RecognizerResult`` for every context-supported match.

        Args:
            text: The text to scan.
            entities: Requested entity names (not consulted here).
            nlp_artifacts: NLP engine output (not consulted here).

        Returns:
            List of ``RecognizerResult`` with ``entity_type="HOSPITAL_NO"``
            and a fixed score of 0.85; empty when nothing matches.
        """
        results = []

        for match in self._PATTERN.finditer(text):
            # Context window of ±20 characters around the candidate.
            window = text[max(0, match.start() - 20):match.end() + 20].lower()

            if not any(ctx in window for ctx in self.context):
                continue

            explanation = AnalysisExplanation(
                recognizer="HospitalNoRecognizer",
                original_score=self._SCORE,
                textual_explanation="Hospital number detected using regex with context"
            )
            results.append(
                RecognizerResult(
                    # Must match supported_entities; the previous "ID" label
                    # was not a declared entity of this recognizer.
                    entity_type="HOSPITAL_NO",
                    start=match.start(),
                    end=match.end(),
                    score=self._SCORE,
                    analysis_explanation=explanation
                )
            )

        return results

class MRNRecognizer(EntityRecognizer):
    """Context-gated regex recognizer for medical record numbers.

    A candidate is ``MRN`` or ``MRD`` (any case), an optional ``-`` or ``:``,
    and one or more digits. It is reported only when one of ``self.context``
    occurs (case-insensitively) within 20 characters on either side of the
    match.

    Results are emitted under the entity type this recognizer declares
    (``"MRN"``).
    """

    # `[ND]`, not `[N|D]`: inside a character class `|` is a literal pipe,
    # so the old pattern also accepted "MR|123".
    _PATTERN = re.compile(r"\bMR[ND][-:]?\d+\b", re.IGNORECASE)
    _SCORE = 0.85

    def __init__(self):
        super().__init__(supported_entities=["MRN"])
        self.context = ["mrn", "mrd", "med rec number"]

    def load(self):
        pass  # No model to load

    def analyze(self, text, entities, nlp_artifacts):
        """Return a ``RecognizerResult`` for every context-supported match.

        Args:
            text: The text to scan.
            entities: Requested entity names (not consulted here).
            nlp_artifacts: NLP engine output (not consulted here).

        Returns:
            List of ``RecognizerResult`` with ``entity_type="MRN"`` and a
            fixed score of 0.85; empty when nothing matches.
        """
        results = []

        for match in self._PATTERN.finditer(text):
            # Context window of ±20 characters around the candidate.
            window = text[max(0, match.start() - 20):match.end() + 20].lower()

            if not any(ctx in window for ctx in self.context):
                continue

            explanation = AnalysisExplanation(
                recognizer="MRNRecognizer",
                original_score=self._SCORE,
                textual_explanation="MRN detected using regex with context"
            )
            results.append(
                RecognizerResult(
                    # Must match supported_entities; the previous "ID" label
                    # was not a declared entity of this recognizer.
                    entity_type="MRN",
                    start=match.start(),
                    end=match.end(),
                    score=self._SCORE,
                    analysis_explanation=explanation
                )
            )

        return results


class NameRecognizer(EntityRecognizer):
    """Regex recognizer for personal names introduced by an honorific.

    A candidate is a title (Dr, Mr, Mrs, Ms, Prof, Col, Maj - any case, with
    or without a trailing dot) followed by one or more words that start with
    an uppercase letter. Both Title-case and ALL-CAPS name words are accepted,
    so ``Mrs Sandhya PUVVALA`` and ``DR. SHABBER ZAVERI`` match as well as
    ``Dr. Priya Sharma``.

    A candidate is reported only when one of ``self.context`` occurs
    (case-insensitively) within 20 characters on either side of the match.

    Results are emitted under the entity type this recognizer declares
    (``"NAME"``).
    """

    # `(?i:...)` makes only the title case-insensitive; the name words still
    # have to start with a capital so ordinary lowercase prose is not matched.
    _PATTERN = re.compile(
        r"\b(?i:Dr|Mr|Mrs|Ms|Prof|Col|Maj)\.?\s+[A-Z][A-Za-z]+(?:\s+[A-Z][A-Za-z]+)*\b"
    )
    _SCORE = 0.88

    def __init__(self):
        super().__init__(supported_entities=["NAME"])
        self.context = ["mr", "mrs", "dr", "patient", "name"]

    def load(self):
        pass  # No model to load

    def analyze(self, text, entities, nlp_artifacts):
        """Return a ``RecognizerResult`` for every context-supported match.

        Args:
            text: The text to scan.
            entities: Requested entity names (not consulted here).
            nlp_artifacts: NLP engine output (not consulted here).

        Returns:
            List of ``RecognizerResult`` with ``entity_type="NAME"`` and a
            fixed score of 0.88; empty when nothing matches.
        """
        results = []

        for match in self._PATTERN.finditer(text):
            # Context window of ±20 characters around the candidate.
            window = text[max(0, match.start() - 20):match.end() + 20].lower()

            if not any(ctx in window for ctx in self.context):
                continue

            explanation = AnalysisExplanation(
                recognizer="NameRecognizer",
                original_score=self._SCORE,
                textual_explanation="Name with prefix detected using regex"
            )
            results.append(
                RecognizerResult(
                    # A name must not be reported as "ID"; use the entity
                    # this recognizer declares in supported_entities.
                    entity_type="NAME",
                    start=match.start(),
                    end=match.end(),
                    score=self._SCORE,
                    analysis_explanation=explanation
                )
            )

        return results
from presidio_analyzer import AnalyzerEngine, RecognizerResult, AnalysisExplanation
from presidio_anonymizer import AnonymizerEngine

# Build the Presidio analyzer and make sure the predefined recognizers are
# registered.
# NOTE(review): a default AnalyzerEngine() may already populate its registry,
# in which case this explicit load is redundant - confirm against the
# installed presidio-analyzer version before removing it.
analyzer = AnalyzerEngine()
analyzer.registry.load_predefined_recognizers()

# Add your custom recognizers
for custom_recognizer in (
    UHIDRecognizer(),
    HospitalNoRecognizer(),
    MRNRecognizer(),
    NameRecognizer(),
):
    analyzer.registry.add_recognizer(custom_recognizer)

anonymizer = AnonymizerEngine()

# Reduce logging: only errors from the analyzer's logger are shown.
presidio_logger = logging.getLogger("presidio-analyzer")
presidio_logger.setLevel(logging.ERROR)

# Step 2: Define allowed PII and their replacement tags.
# Keys are analyzer entity types; values are the tag names written into the
# redacted text (wrapped in square brackets by the redaction step).
PII_REPLACEMENTS = {
    # Built-in analyzer entities
    "CREDIT_CARD": "CREDIT_CARD",
    "DATE_TIME": "DOB",
    "EMAIL_ADDRESS": "EMAIL",
    "IN_AADHAAR": "AADHAAR",
    "IN_PAN": "PAN",
    "IN_PASSPORT": "PASSPORT",
    "IN_VEHICLE_REGISTRATION": "ID",
    "IN_VOTER": "ID",
    "IP_ADDRESS": "IP_ADDRESS",
    "LOCATION": "ADDRESS",
    "MEDICAL_LICENSE": "ID",
    "PERSON": "NAME",
    "PHONE_NUMBER": "PHONE",
    "URL": "URL",
    # Entities declared by the custom recognizers in this notebook
    "UHID": "ID",
    "HOSPITAL_NO": "ID",
    "MRN": "ID",
    "NAME": "NAME",
}

# Only analyzer results whose entity type is listed above are redacted.
ALLOWED_PII_TYPES = set(PII_REPLACEMENTS)
from presidio_anonymizer.entities import OperatorConfig
# Step 3: Prepare Spark data
# Presidio runs on the driver, so the texts are pulled into a local list.
text_list = df.select("text").rdd.flatMap(lambda x: x).collect()

presidio_results = []
redacted_outputs = []

# The operator mapping does not depend on the text, so build it once
# instead of on every loop iteration.
operators = {
    entity: OperatorConfig("replace", {"new_value": f"[{tag}]"})
    for entity, tag in PII_REPLACEMENTS.items()
}

# Step 4: Loop over texts and redact with custom tags
for text in text_list:
    pres_results = analyzer.analyze(text=text, language="en")

    # Filter only allowed types, and only reasonably confident hits
    filtered_results = [
        r for r in pres_results
        if r.entity_type in ALLOWED_PII_TYPES and r.score >= 0.6
    ]

    # Redact using the filtered results and the mapping
    redacted = anonymizer.anonymize(text, filtered_results, operators)
    presidio_results.append((text, redacted.text))

# Step 5: Build the Spark DataFrame.
# presidio_results already pairs every input text with its redaction, so it
# is used directly. Joining it back onto `df` by text added nothing, shuffled
# the row order (which the positional evaluation relies on) and would square
# the row count for any repeated text.
presidio_df = spark.createDataFrame(presidio_results, ["text", "presidio_redacted"])

final_df = presidio_df.withColumn("redacted_text", col("presidio_redacted"))

# UDF to apply all regex redactions
def redact_text(text):
    """Apply every rule in ``regex_redactions`` to ``text``, in list order.

    Each rule's output is the next rule's input. Falsy input (``None`` or the
    empty string) is returned unchanged.
    """
    if text:
        for regex, tag in regex_redactions:
            text = re.sub(regex, tag, text)
    return text

redact_udf = udf(redact_text, StringType())

# Apply UDF
# NOTE(review): the result is stored in "final_redacted", but the output step
# in this notebook reads "redacted_text", so the regex layer does not reach
# the written file - confirm which column is meant to be exported.
regex_redacted_col = redact_udf(col("redacted_text"))
final_df = final_df.withColumn("final_redacted", regex_redacted_col)


# Dynamic redaction of high-confidence Spark NLP entities.
# custom_entity_filter has already relabelled PER as NAME, so entity_df never
# contains "PER"; filtering on "PER" always produced an empty list.
name_entities = entity_df.filter(
    (col("entity.entity") == "NAME") & (col("entity.confidence") >= 0.75)
).select("entity.word").distinct().rdd.flatMap(lambda x: x).collect()

# Drop blank chunks (an empty alternative would match at every word boundary)
# and put longer names first so a full name wins over one of its parts.
name_entities = sorted(
    {name for name in name_entities if name and name.strip()},
    key=lambda name: (-len(name), name),
)

if name_entities:
    # One combined pattern means one regexp_replace, instead of stacking a
    # withColumn per name onto the query plan.
    name_pattern = r"\b(?:" + "|".join(re.escape(name) for name in name_entities) + r")\b"
    final_df = final_df.withColumn(
        "redacted_text",
        regexp_replace("redacted_text", name_pattern, "[NAME]"),
    )

from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline as hf_pipeline
from huggingface_hub import login
#login()

# Fallback to IndicNER for low-confidence
# Texts containing at least one Spark NLP entity scored below 0.75 are
# re-tagged with the IndicNER token-classification model.
low_conf_df = entity_df.filter(col("entity.confidence") < 0.75).select("text").distinct()

# Collect once; the list doubles as the emptiness check, which avoids
# running a separate Spark job just to test isEmpty().
low_texts = low_conf_df.rdd.flatMap(lambda x: x).collect()

if low_texts:
    tokenizer = AutoTokenizer.from_pretrained("ai4bharat/IndicNER")
    model = AutoModelForTokenClassification.from_pretrained("ai4bharat/IndicNER")
    indic_ner = hf_pipeline("token-classification", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

    logging.info("\n🔁 IndicNER fallback for low-confidence:")
    indic_results = []

    for text in low_texts:
        logging.info(f"\nText: {text}")
        for ent in indic_ner(text):
            logging.info(f"→ {ent['word']} ({ent['entity_group']}, {ent['score']:.2f})")
            indic_results.append((text, ent['word'], ent['entity_group'], float(ent['score'])))

    indic_schema = StructType([
        StructField("text", StringType()),
        StructField("word", StringType()),
        StructField("entity", StringType()),
        StructField("confidence", FloatType())
    ])
    indic_df = spark.createDataFrame(indic_results, indic_schema)

    # Replace each distinct (word, label) pair once across all rows.
    applied = set()
    for _, word, entity_group, _ in indic_results:
        clean_label = entity_group.strip("<>").upper()
        # An empty word would give the pattern `\b\b`, which matches at every
        # word boundary and would scatter tags through the whole text.
        if not word or not word.strip() or (word, clean_label) in applied:
            continue
        applied.add((word, clean_label))

        escaped = re.escape(word)
        final_df = final_df.withColumn(
            "redacted_text",
            regexp_replace("redacted_text", rf"\b{escaped}\b", f"[{clean_label}]")
        )

entity_summary.show(truncate=False)

# Output final results
# Collect the redacted column once and reuse it for both printing and the
# file; collecting twice re-runs the whole uncached lineage.
redacted_texts = [row["redacted_text"] for row in final_df.select("redacted_text").collect()]

for line in redacted_texts:
    print(line)
for row in pii_table.collect():
    print(row)

with open("redacted_output.txt", "w", encoding="utf-8") as f:
    for line in redacted_texts:
        f.write(line + "\n")

# The browser download only exists inside Google Colab. Elsewhere the file
# simply stays on local disk instead of the cell failing on the import.
try:
    from google.colab import files
except ImportError:
    files = None

if files is not None:
    files.download("redacted_output.txt")


Collecting pip
  Downloading pip-25.1.1-py3-none-any.whl.metadata (3.6 kB)
Downloading pip-25.1.1-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m24.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-25.1.1
Collecting pyspark==3.4.1
  Downloading pyspark-3.4.1.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m21.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
[33m  DEPRECATION: Building 'pyspark' using the legacy setup.py bdist_wheel mechanism, which will be removed in a future version. pip 25.3 will enforce this behaviour change. A possible replacement is to use the standardized build interface



[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/346 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/872k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.72M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/667M [00:00<?, ?B/s]

Device set to use cuda:0


model.safetensors:   0%|          | 0.00/667M [00:00<?, ?B/s]

+-------+-----+------------------+--------------+
|entity |count|avg_confidence    |max_confidence|
+-------+-----+------------------+--------------+
|NAME   |20   |0.9032774984836578|0.99775004    |
|ADDRESS|17   |0.8612117627087761|0.9922        |
+-------+-----+------------------+--------------+

s/P Adj-chemo - AC x 4 pack x q - test [DOB] Leq: 2y-2012-dnon-acid [DOB]. 
[DOB] B/t Mammogram - B/RADS 2 on visit breast. 
Phone: For booking an appointment/enquiry, call on: [DOB]. 
Website: [URL], [NAME]: [EMAIL]. 
s/P Cardiac rehab program initiated. Lipid profile normal on statins. 
[DOB] Echo - RWMA in anterior wall, EF 45%, no regional wall motion abnormalities. 
Hospital No.: MH000732891, Date: [DOB], Patient Mob. No.: 9876543210, 
Hospital No.: MH000733672, Date: [DOB], Patient Mob. No.: 9445612378, 
OUT PATIENT RECORD - SURGICAL ONCOLOGY. 
PE: CVS - S1S2 heard, no murmurs. Chest clear. No pedal edema. 
E-mail ID: [EMAIL]. 
S/P (E) RCs + Subpec + ALND + IIGAP flap encapsab C Tz PM

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import re
from collections import Counter
from sklearn.metrics import precision_score, recall_score, f1_score

data = [
    ("OUT PATIENT RECORD - SURGICAL ONCOLOGY. ",),
    ("Patient Name: [NAME], [AGE] / [GENDER], [ID]. ",),
    ("Hospital No.: [ID], Date: [DOB] [TIME], Patient Mob. No.: [PHONE], ",),
    ("E-mail ID: [EMAIL]. ",),
    ("Consultant Name: [NAME] (MBBS-92, MS-97, Mch.(Surgical Oncology)-01). ",),
    ("KMC Reg No.: [PIN_CODE], Dept: ONCOLOGY SURGICAL MHB. ",),
    ("Doctor's Notes: K/c/o IDC (E) breast C Tz-m/fo, stage IIB. ",),
    ("PE: 916 HKI-mv-ng C14 Hypothermia K/G 7-50 l. ",),
    ("S/P (E) RCs + Subpec + ALND + IIGAP flap encapsab C Tz PMG 22/10 41, H161Gr Cn [DOB]. ",),
    ("s/P Adj-chemo - AC x 4 pack x q - test [DOB] Leq: 2y-2012-dnon-acid [DOB]. ",),
    ("s/P Adj - RT - Goby 15# - test 14/19/2020. Started Aromatin from [DOB]. ",),
    ("[DOB] B/t Mammogram - B/RADS 2 on visit breast. ",),
    ("USG - Med a pellu -> uterine fibroid 2.7 x 2.1 cm on follow up. ",),
    ("PE: ESR-0 No ccn (B) Breast - Nk-NAD Lungs-thy due health. No polyoble lung. ",),
    ("(B) Axilla - NAD No (B) upper limb lymphadenism. ",),
    ("T - Aromatin 2.5 OD, T - Thyronorm 50mg OD, Tablet D3 Zoledronic Acid 4mg today. ",),
    ("Manipal Hospital, HAL Airport Road, #98 HAL Airport Road, Bangalore [PIN_CODE]. ",),
    ("Website: [URL], Email-Id: [EMAIL]. ",),
    ("Phone: For booking an appointment/enquiry, call on: [PHONE] / [PHONE]. ",),
    ("For Home Care, call on: [PHONE].",),
    ("Patient Name: [NAME] visited our cardiology department on [DOB]. ",),
    ("[ID], [DOB], Contact: [PHONE], Email: [EMAIL]. ",),
    ("Emergency contact: Husband [NAME] ([PHONE]). [ADDRESS], ",),
    ("Noida, UP - [PIN_CODE]. [AADHAAR]. Insurance Policy: Star Health - [ID]. ",),
    ("Patient complained of chest pain and was referred by [NAME] from Apollo Hospital. ",),
    ("Lab reports show elevated troponin levels. Prescription includes Atorvastatin 20mg and Metoprolol 50mg twice daily.",),
    ("Admission Report: [NAME], [AGE], [ID], admitted on [DOB] with diabetes complications. ",),
    ("Phone: [PHONE], [PAN]. Guardian: Son [NAME] ([PHONE]). ",),
    ("[ADDRESS], Jaipur, Rajasthan - [PIN_CODE]. [ID]. ",),
    ("Patient's blood glucose levels were 280 mg/dl on admission. [NAME], Endocrinologist, ",),
    ("prescribed Insulin Glargine 24 units bedtime and Metformin 500mg BD. Follow-up scheduled with [NAME] on [DOB].",),
    ("Pediatric Case: [NAME], [DOB], brought by mother [NAME] (Mobile: [PHONE]) ",),
    ("for vaccination. Father: [NAME], Army Medical Corps. [ADDRESS], ",),
    ("Cantonment Area, Pune - [PIN_CODE]. [AADHAAR], [ID]. ",),
    ("Administered DPT booster and MMR vaccine. Next appointment with [NAME] scheduled for [DOB]. ",),
    ("Emergency contact: Grandfather [NAME] - [PHONE].",),
    ("Patient: [NAME], [AGE], [ID], discharged on [DOB] ",),
    ("after successful appendectomy. Contact details: [PHONE], WhatsApp: Same number. ",),
    ("Email: [EMAIL]. [ADDRESS], ",),
    ("Bangalore - [PIN_CODE]. Insurance: Mediclaim Policy No. [BANK_ACCOUNT] (Oriental Insurance). ",),
    ("Surgeon: [NAME], Anesthesiologist: [NAME]. Post-operative instructions given. ",),
    ("Follow-up with [NAME] in 2 weeks. Emergency contact: Wife [NAME] ([PHONE]).",),
    ("Patient: [NAME] ([ID]), [DOB], Phone: [PHONE], ",),
    ("Email: [EMAIL], [ADDRESS], Bengaluru, ",),
    ("[AADHAAR], [ID]",)
]

# Maps alternative tag spellings onto the canonical tag used for scoring.
tag_aliases = {
    "PER": "NAME",
    "PERSON": "NAME",
    "DATE_TIME": "DOB",
    "DOB": "DOB",
    "EMAIL_ADDRESS": "EMAIL",
    "PHONE_NUMBER": "PHONE",
    "LOC": "ADDRESS",
    "LOCATION": "ADDRESS"
    # Add more aliases here if needed
}

def normalize_tag(tag):
    """Return the canonical, upper-cased form of ``tag``.

    The tag is upper-cased first; if that form is a key of ``tag_aliases``
    its mapped value is returned, otherwise the upper-cased tag itself.
    """
    canonical = tag.upper()
    return tag_aliases.get(canonical, canonical)

# Step 1: Read the redacted output file, one whitespace-stripped string per line.
with open("redacted_output.txt", "r", encoding="utf-8") as f:
    predicted_lines = [raw_line.strip() for raw_line in f]

# Step 2: Ground truth is the first element of every `data` record (already
# in memory), stripped the same way.
ground_truth_lines = [record[0].strip() for record in data]

# Step 3: Function to extract all [TAGS] from a line
def extract_tags(text):
    """Return the normalized tags found in ``text``, in order of appearance.

    A tag is a run of uppercase letters/underscores opened by ``[`` or ``<``
    and closed by ``]`` or ``>``; each one is passed through ``normalize_tag``.
    """
    raw_tags = re.findall(r"[\[<]([A-Z_]+)[\]>]", text)
    return list(map(normalize_tag, raw_tags))

# Step 4: Flatten tag lists
# Scoring below is purely count-based per tag (there is no line-by-line
# alignment), so every tag from both sources has to be counted. Pairing the
# two line lists with zip() would stop at the shorter list and silently drop
# the remaining tags, which skews precision/recall.
from collections import Counter

if len(predicted_lines) != len(ground_truth_lines):
    print(
        f"Warning: {len(predicted_lines)} predicted lines vs "
        f"{len(ground_truth_lines)} ground-truth lines; counting tags from all of them."
    )

predicted_tags = [tag for line in predicted_lines for tag in extract_tags(line)]
ground_truth_tags = [tag for line in ground_truth_lines for tag in extract_tags(line)]

# Step 5: Count per-tag true positives (TP), false positives (FP), false negatives (FN)
gt_counter = Counter(ground_truth_tags)
pred_counter = Counter(predicted_tags)

# All unique tags in either list
all_tags = set(gt_counter) | set(pred_counter)

# Step 6: Compute per-entity metrics
print(f"{'ENTITY':<15} {'PRECISION':<10} {'RECALL':<10} {'F1':<10}")
print("="*45)

# Running totals for the micro-average, accumulated in the same pass.
total_tp = 0
total_fp = 0
total_fn = 0

for tag in sorted(all_tags):
    # Simplified matching: a prediction counts as correct as long as the
    # ground truth contains at least as many occurrences of the same tag.
    tp = min(gt_counter[tag], pred_counter[tag])
    fp = pred_counter[tag] - tp  # never negative because tp is the minimum
    fn = gt_counter[tag] - tp

    total_tp += tp
    total_fp += fp
    total_fn += fn

    # The small epsilon keeps the divisions defined when a count is zero.
    precision = tp / (tp + fp + 1e-9)
    recall = tp / (tp + fn + 1e-9)
    f1 = 2 * precision * recall / (precision + recall + 1e-9)

    print(f"{tag:<15} {precision:.2f}      {recall:.2f}      {f1:.2f}")

# Optional: Micro-average scores
micro_precision = total_tp / (total_tp + total_fp + 1e-9)
micro_recall = total_tp / (total_tp + total_fn + 1e-9)
micro_f1 = 2 * micro_precision * micro_recall / (micro_precision + micro_recall + 1e-9)

print("\nOverall Micro-Averaged Scores:")
print(f"Precision: {micro_precision:.2f}, Recall: {micro_recall:.2f}, F1: {micro_f1:.2f}")


ENTITY          PRECISION  RECALL     F1        
AADHAAR         0.00      0.00      0.00
ADDRESS         0.50      1.00      0.67
AGE             0.00      0.00      0.00
BANK_ACCOUNT    0.00      0.00      0.00
DOB             0.61      1.00      0.76
EMAIL           1.00      1.00      1.00
GENDER          0.00      0.00      0.00
ID              0.00      0.00      0.00
NAME            0.57      1.00      0.73
PAN             1.00      1.00      1.00
PHONE           1.00      0.15      0.27
PIN_CODE        0.00      0.00      0.00
TIME            0.00      0.00      0.00
URL             1.00      1.00      1.00

Overall Micro-Averaged Scores:
Precision: 0.62, Recall: 0.57, F1: 0.60


In [None]:
from sparknlp.annotator import NerDLModel

# Fetch the pretrained English "ner_dl" model and print every label it can emit,
# one per line, so the labels can be mapped onto the evaluation tags.
ner_model = NerDLModel.pretrained("ner_dl", "en")  # or your custom model
labels = ner_model.getClasses()

print("Entity Labels:")
for label in labels:
    print(label)

from presidio_analyzer import AnalyzerEngine

# Build a Presidio analyzer with its default configuration and print the
# entity types it reports as supported, sorted alphabetically.
analyzer = AnalyzerEngine()
entities = analyzer.get_supported_entities()
print(sorted(entities))


ner_dl download started this may take some time.
Approximate size to download 13.6 MB
[OK!]
Entity Labels:
O
B-ORG
B-LOC
B-PER
I-PER
I-ORG
B-MISC
I-LOC
I-MISC
['AU_ABN', 'AU_ACN', 'AU_MEDICARE', 'AU_TFN', 'CREDIT_CARD', 'CRYPTO', 'DATE_TIME', 'EMAIL_ADDRESS', 'IBAN_CODE', 'IN_AADHAAR', 'IN_PAN', 'IN_PASSPORT', 'IN_VEHICLE_REGISTRATION', 'IN_VOTER', 'IP_ADDRESS', 'LOCATION', 'MEDICAL_LICENSE', 'NRP', 'PERSON', 'PHONE_NUMBER', 'SG_NRIC_FIN', 'UK_NHS', 'UK_NINO', 'URL', 'US_BANK_NUMBER', 'US_DRIVER_LICENSE', 'US_ITIN', 'US_PASSPORT', 'US_SSN']


In [None]:
import re

# All patterns that represent patient identifiers
id_patterns = [
    r"\bMRN[:\s]*\d{4,10}\b",             # MRN
    r"\bUHID[:\s]*[A-Za-z0-9]{4,15}\b",   # UHID
    r"\bUID?AI[:\s]*\d{12}\b",            # UIDAI ID (e.g., Aadhaar); the "UIAI" spelling is accepted too
    r"\b[Aa]adhaar\s*[:\s]*\d{4}\s*\d{4}\s*\d{4}\b",  # Aadhaar 1234 5678 9012
    r"\b[Aa]adhaar\s*[:\s]*\d{12}\b",     # Aadhaar 123456789012
    r"\bPatient ID[:\s]*[A-Za-z0-9\-]{4,20}\b",       # Generic ID
    r"\bHospital ID[:\s]*[A-Za-z0-9\-]{4,20}\b"
]

# Any other PII patterns like name, email, etc. you might want to keep separate
email_patterns = [r'\b[\w\.-]+@[\w\.-]+\.\w{2,4}\b']
# The +91-prefixed form has to run before the bare 10-digit form. Patterns are
# applied in list order, so the bare form would otherwise consume the digits
# first and leave the country code behind ("+91-[PHONE]").
phone_patterns = [r'\+91[-\s]?\d{10}\b', r'\b\d{10}\b']
name_patterns = [r'\b(Name|Patient Name)[:\s]*[A-Z][a-z]+\s+[A-Z][a-z]+\b']
address_patterns = [r'\b\d{1,4}\s+\w+\s+\w+(?:\s+\w+)*\b']

# Combine all with their redaction labels
# The result is a flat list of (regex, replacement) tuples in application
# order: identifiers, e-mail, phone, name, address.
all_patterns = [
    *( (p, '[ID]') for p in id_patterns ),
    *( (p, '[EMAIL]') for p in email_patterns ),
    *( (p, '[PHONE]') for p in phone_patterns ),
    *( (p, '[NAME]') for p in name_patterns ),
    *( (p, '[ADDRESS]') for p in address_patterns )
]

# Redaction function
def redact_pii(text):
    """Replace every match of each pattern in ``all_patterns`` with its label.

    Patterns are applied one after another in list order, each on the output
    of the previous substitution. ``None`` is passed through unchanged.
    """
    if text is not None:
        for regex, label in all_patterns:
            text = re.sub(regex, label, text)
    return text



In [None]:
# Updated regex patterns with case insensitivity and flexible spacing


# Ordered (pattern, replacement) pairs. They are applied sequentially, each on
# the output of the previous one, so an earlier entry can consume text that a
# later entry would otherwise have matched.
regex_redactions = [
    (r"\b\d{4}[\s._-]?\d{4}[\s._-]?\d{4}\b", "[AADHAAR]"),
    (r"\b[6-9]\d{2}[\s._-]?\d{3}[\s._-]?\d{4}\b", "[PHONE]"),
    (r"\b[\w._%+-]+@[\w.-]+\.\w+\b", "[EMAIL]"),
    # The label may be written with or without a space ("Hospital No." /
    # "HospitalNo"). `\s?` allows the space; the plural "s" stays optional.
    (r"(MRD|UHID|Reg\.?|Episodes?\s?No\.?|Hospitals?\s?No\.?)[:\s._-]*[A-Z]{2}[\s_-]?\d{6,}", "[ID]"),
    (r"\b\d{2}[-/]\d{2}[-/]\d{4}\b", "[DOB]"),
    (r"\b\d{4}[-/]\d{2}[-/]\d{2}\b", "[DOB]"),
    (r"\b\d{2}[-/]\d{2}[-/]\d{2}\b", "[DOB]"),
    (r"\b\d{1,2}(st|nd|rd|th)?[\s._-]+(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[\s._-]+\d{2,4}\b", "[DOB]"),
    (r"\bAge[:\s-]?\d{1,3}\b", "[AGE]"),
    (r"\b\d{1,3}[\s-]?Yrs\b", "[AGE]"),
    (r"\b(Male|Female|Transgender)\b", "[GENDER]"),
    (r"\b[A-Z]{2}[\s_-]?\d{6,}\b", "[ID]"),
    (r"\b\d{6}\b", "[PIN_CODE]"),
    (r"(Dr\.?|Mr\.?|Mrs\.?|Ms\.?|Maj\.?|Col\.?|Prof\.?|Capt\.?|Smt\.?|Shri\.?|Miss\.?)\s+(?:[A-Z][a-z]+[\s._-]?){1,4}", "[NAME]"),
    (r"\b[A-Z]{5}[\s._-]?\d{4}[\s._-]?[A-Z]\b", "[PAN]"),
    (r"\b[A-PR-WY][0-9]{7}\b", "[PASSPORT]"),
    (r"(Dr|Mr|Mrs|Ms|Prof|Col|Maj|Capt|Shri|Smt|Miss|Master|Md)\.?\s+(?:[A-Z][a-z]*[\s._-]?){1,4}", "[NAME]"),
    (r"\b[A-Z]{3}[\s._-]?\d{7}\b", "[ID]"),
    (r"\b[A-Z]{2}[\s-]?[0-9]{2}[\s-]?[0-9]{11}\b", "[ID]"),
    (r"\b\w+@UIDAI\b", "[ID]"),
    (r"\b(?:\d{1,3}\.){3}\d{1,3}\b", "[IP_ADDRESS]"),
    (r"https?://(?:www\.)?[\w.-]+\.[a-z]{2,6}\b|www\.[\w.-]+\.[a-z]{2,6}\b", "[URL]"),
    (r"\b[\w._-]+@[a-zA-Z]+\b", "[UPI]"),
    # Single backslashes only: inside a raw string a doubled backslash is a
    # literal backslash, which turned this class into "backslash, s, and the
    # range \-_" and stopped it from matching whitespace or hyphen separators.
    (r"\bMR[DN][\s\-_:]?\d{4,}\b", "[MRN]"),
    (r"\bMRN[:\-_\s]?\w{6,12}\b", "[MRN]"),
    (r"\b\d{10,16}\b", "[BANK_ACCOUNT]"),
    (r"\b[A-Z]{4}0[A-Z0-9]{6}\b", "[IFSC]"),
    (r"\b(PAN|ID|Voter ID)[:\s\-]*[A-Z]{5}\d{4}[A-Z]\b", "[PAN]"),
    (r"\b(Flat|House|H\.?No\.?|Quarter|Apt|Apartment|Floor|Block|Sector|Plot|Cantonment|Lane|Road|Street|Nagar|Residency|Chowk)[^\n,]{0,80}\d{6}", "[ADDRESS]"),
    (r"\d{1,4}\s+[A-Za-z]+\s+(Road|Rd|Street|St|Nagar|Layout|Avenue|Block|Colony|Enclave|Cross)\b.*?(?=,|$)", "[ADDRESS]"),
    (r"\b[A-Z]{2,5}[\s._-]?\d{5,10}\b", "[ID]"),
    (r"\bPatient\s*ID[:\s\-]*\w{4,12}\b", "[ID]"),
    (r"\b[Ee]pisode\s*[Nn]o[:\s\-]?\w+\b", "[ID]")
]


# Apply every redaction to the "redacted_text" column in PySpark.
# NOTE(review): no case-insensitive flag is passed here and the patterns carry
# no inline (?i), so this path matches case-sensitively, unlike
# apply_redactions_with_flags below, which uses re.IGNORECASE.
from pyspark.sql.functions import regexp_replace

# Each iteration rewrites the column in place, so later patterns see the
# placeholders inserted by earlier ones.
for pattern, replacement in regex_redactions:
    final_df = final_df.withColumn("redacted_text", regexp_replace("redacted_text", pattern, replacement))

# Alternative approach - if you want to use Python's re module flags
import re

def apply_redactions_with_flags(text):
    """Apply all redactions with proper flags.

    Every (pattern, replacement) pair in ``regex_redactions`` is applied in
    order with ``re.IGNORECASE``; each substitution works on the output of the
    previous one.

    Args:
        text: The string to redact, or ``None``.

    Returns:
        The redacted string, or ``None`` when *text* is ``None`` (so missing
        values pass through instead of raising ``TypeError`` in ``re.sub``).
    """
    if text is None:
        return None
    for pattern, replacement in regex_redactions:
        # Remove (?i) from pattern since we're using re.IGNORECASE flag
        clean_pattern = pattern.replace("(?i)", "")
        text = re.sub(clean_pattern, replacement, text, flags=re.IGNORECASE)
    return text

# For testing individual patterns
def test_pattern(pattern, test_text):
    """Return everything *pattern* finds in *test_text*, ignoring case.

    Any inline ``(?i)`` marker is stripped from the pattern first because the
    case-insensitive flag is supplied separately. The return value is whatever
    ``findall`` yields for the pattern (strings, or tuples when the pattern
    has several groups).
    """
    import re
    flagless = pattern.replace("(?i)", "")
    compiled = re.compile(flagless, re.IGNORECASE)
    return compiled.findall(test_text)

# Test cases for verification
# The expected placeholders in the comments use the labels that
# regex_redactions actually emits. Aadhaar samples use 12 digits; with 16
# digits only the first 12 would be replaced and the rest left in the output.
test_cases = [
    "DR. SHABBER ZAVERI",  # Should match [NAME]
    "dr shabber zaveri",   # Should match [NAME]
    "Mr John Smith",       # Should match [NAME]
    "MRS. PRIYA SHARMA",   # Should match [NAME]
    "Hospital No: MH000731649",  # Should match [ID]
    "HOSPITAL NO MH000731649",   # Should match [ID]
    "hospital no: mh000731649",  # Should match [ID]
    "Phone: 9611827350",   # Should match [PHONE]
    "PHONE 9611827350",    # Should match [PHONE]
    "email: test@GMAIL.COM",  # Should match [EMAIL]
    "Aadhaar: 1234 5678 9012",  # Should match [AADHAAR]
    "AADHAAR 1234-5678-9012"    # Should match [AADHAAR]
]

# Test the patterns
# Prints each sample next to its redacted form for manual inspection.
print("Testing patterns:")
for test_text in test_cases:
    redacted = apply_redactions_with_flags(test_text)
    print(f"Original: {test_text}")
    print(f"Redacted: {redacted}")
    print("-" * 50)

Testing patterns:
Original: DR. SHABBER ZAVERI
Redacted: [NAME]
--------------------------------------------------
Original: dr shabber zaveri
Redacted: [NAME]
--------------------------------------------------
Original: Mr John Smith
Redacted: [NAME]
--------------------------------------------------
Original: MRS. PRIYA SHARMA
Redacted: [NAME]. [NAME]
--------------------------------------------------
Original: Hospital No: MH000731649
Redacted: [PATIENT_ID]
--------------------------------------------------
Original: HOSPITAL NO MH000731649
Redacted: [PATIENT_ID]
--------------------------------------------------
Original: hospital no: mh000731649
Redacted: [PATIENT_ID]
--------------------------------------------------
Original: Phone: 9611827350
Redacted: Phone: [PHONE]
--------------------------------------------------
Original: PHONE 9611827350
Redacted: PHONE [PHONE]
--------------------------------------------------
Original: email: test@GMAIL.COM
Redacted: email: [EMAIL]
----

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
import pandas as pd
from pyspark.sql.functions import pandas_udf
from pyspark.sql.types import StringType

# Load IndicNER model
tokenizer = AutoTokenizer.from_pretrained("ai4bharat/IndicNER")
model = AutoModelForTokenClassification.from_pretrained("ai4bharat/IndicNER")
# aggregation_strategy="simple" is the supported way to request grouped
# entities (results carry an "entity_group" key); it replaces the deprecated
# grouped_entities=True argument.
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
# One-column DataFrame ("text") built from the in-memory sample records.
df = spark.createDataFrame(data, ["text"])

# Function to redact PII using IndicNER
def redact_pii(text):
    """Replace person, organization and location entities with "[REDACTED]".

    Entities come from the module-level ``ner_pipeline``. When an entity
    carries character offsets (``start``/``end``) the exact span of the input
    is replaced; the decoded ``word`` may differ from the source text in case
    or spacing, and a plain ``str.replace`` on it can miss the entity or hit
    unrelated occurrences. Entities without offsets fall back to the literal
    ``word`` replacement.

    Args:
        text: The string to redact, or ``None``.

    Returns:
        The redacted string, or ``None`` when *text* is ``None`` (null cells
        pass through instead of being sent to the model).
    """
    if text is None:
        return None

    # Person, Organization, Location
    targets = [
        entity for entity in ner_pipeline(text)
        if entity["entity_group"] in ("PER", "ORG", "LOC")
    ]
    with_offsets = [
        entity for entity in targets
        if entity.get("start") is not None and entity.get("end") is not None
    ]
    without_offsets = [
        entity for entity in targets
        if entity.get("start") is None or entity.get("end") is None
    ]

    redacted_text = text
    # Splice right-to-left so the offsets of entities further left stay valid.
    for entity in sorted(with_offsets, key=lambda item: item["start"], reverse=True):
        redacted_text = (
            redacted_text[:entity["start"]] + "[REDACTED]" + redacted_text[entity["end"]:]
        )
    # Literal fallback runs last because it changes string length unpredictably.
    for entity in without_offsets:
        redacted_text = redacted_text.replace(entity["word"], "[REDACTED]")
    return redacted_text

# Pandas UDF for Spark
@pandas_udf(StringType())
def redact_pii_udf(texts: pd.Series) -> pd.Series:
    """Run ``redact_pii`` on every element of *texts* and return the results as a Series."""
    redacted = texts.apply(redact_pii)
    return redacted

# Add a new column with redacted text
# redact_pii_udf is evaluated on the "text" column and its output is stored
# in a new "redacted_text" column; the original column is kept.
df_redacted = df.withColumn("redacted_text", redact_pii_udf(df["text"]))

# Show results
# truncate=False so long strings are displayed in full.
df_redacted.show(truncate=False)

import re
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

# Regex patterns for Indian PII
# Applied in insertion order; each key becomes part of the placeholder
# ("[<KEY>_REDACTED]").
patterns = {
    "AADHAAR": r"\b[0-9]{4}\s?[0-9]{4}\s?[0-9]{4}\b",
    # The space/hyphen separator belongs to the optional country-code prefix.
    # As a free-standing optional it also swallowed the character in front of
    # a bare number ("Phone: 96..." became "Phone:[PHONE_REDACTED]").
    "PHONE": r"(?:(?:\+91|91)[ -]?)?[6-9]\d{9}",
    "PAN": r"[A-Z]{5}[0-9]{4}[A-Z]{1}",
    # [A-Za-z], not [A-Z|a-z]: inside a character class "|" is a literal pipe,
    # which let the match run across a "|" following the address.
    "EMAIL": r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b"
}

def redact_with_regex(text):
    """Replace every match of each entry in ``patterns`` with "[<KEY>_REDACTED]".

    Args:
        text: The string to redact, or ``None``.

    Returns:
        The redacted string, or ``None`` when *text* is ``None`` (a null cell
        handed to the UDF passes through instead of raising in ``re.sub``).
    """
    if text is None:
        return None
    for pii_type, pattern in patterns.items():
        text = re.sub(pattern, f"[{pii_type}_REDACTED]", text)
    return text

# Register UDF
# Wraps redact_with_regex as a Spark UDF that returns a string column.
redact_regex_udf = udf(redact_with_regex, StringType())

# Apply regex redaction after IndicNER
# Input is the "redacted_text" column of df_redacted; the output goes into a
# new "fully_redacted_text" column.
df_final = df_redacted.withColumn(
    "fully_redacted_text",
    redact_regex_udf(df_redacted["redacted_text"])
)

# Display the original text next to the fully redacted version, untruncated.
df_final.select("text", "fully_redacted_text").show(truncate=False)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/346 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/872k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.72M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/667M [00:00<?, ?B/s]

Device set to use cpu


NameError: name 'spark' is not defined