# SmolLM3 PII Validation Service

This notebook sets up SmolLM3-3B to validate PII redaction by scanning already-redacted text for any remaining PII that the local models missed.

## Install Dependencies

In [None]:
!pip install -q flask flask-cors pyngrok llama-cpp-python requests

## Download SmolLM3 Model

In [None]:
import os
import shutil
import urllib.request

# Create models directory
os.makedirs('models', exist_ok=True)

# Local target path and remote source of the quantized model
model_path = "models/SmolLM3-Q4_K_M.gguf"
model_url = "https://huggingface.co/unsloth/SmolLM3-3B-GGUF/resolve/main/SmolLM3-3B-Q4_K_M.gguf"

# Check if model already exists
if not os.path.exists(model_path):
    print("Downloading SmolLM3-3B model (Q4_K_M quantized)...")

    # Download into a temporary ".part" file and only rename it to the final
    # name once the transfer finished. A failed or interrupted download can
    # therefore never leave a truncated file at `model_path` that a later run
    # would mistake for a complete model.
    partial_path = model_path + ".part"
    try:
        with urllib.request.urlopen(model_url) as response, open(partial_path, "wb") as out_file:
            shutil.copyfileobj(response, out_file)
        os.replace(partial_path, model_path)
    finally:
        # Only still present if the download did not complete
        if os.path.exists(partial_path):
            os.remove(partial_path)

    # Reached only when the download and rename succeeded (errors propagate)
    print("Model downloaded successfully!")
else:
    print("Model already exists, skipping download.")

## Import Libraries

In [None]:
import reimport jsonimport timeimport threadingfrom datetime import datetimefrom flask import Flask, request, jsonifyfrom flask_cors import CORSfrom pyngrok import ngrokfrom llama_cpp import Llama

## Load SmolLM3 Model

In [None]:
print("Loading SmolLM3-3B model with llama.cpp...")

# Keyword arguments shared by every load attempt below
_llama_common_kwargs = {
    "model_path": "models/SmolLM3-Q4_K_M.gguf",
    "n_ctx": 2048,      # Context window
    "n_threads": 4,     # Number of CPU threads
    "verbose": False,
}

# Load strategy, from most to least demanding:
#   1. all layers offloaded to the GPU, "chatml" chat format
#   2. CPU only, "chatml" chat format
#   3. CPU only, library-default chat format (an error here propagates)
try:
    llm = Llama(n_gpu_layers=-1, chat_format="chatml", **_llama_common_kwargs)
    print("Model loaded successfully with GPU acceleration!")
except Exception as gpu_error:
    print(f"Error loading model: {gpu_error}")
    print("Falling back to CPU-only mode...")
    try:
        llm = Llama(n_gpu_layers=0, chat_format="chatml", **_llama_common_kwargs)
        print("Model loaded successfully with CPU only!")
    except Exception as cpu_error:
        print(f"CPU fallback also failed: {cpu_error}")
        print("Trying without chat_format...")
        llm = Llama(n_gpu_layers=0, **_llama_common_kwargs)
        print("Model loaded successfully without chat format!")

## PII Detection Patterns

Enhanced patterns to detect PII that might have leaked through the initial redaction.

In [None]:
def luhn_check(card_number):
    """
    Validate a credit card number using the Luhn algorithm.

    Args:
        card_number: Candidate number as a string; spaces and dashes are ignored.

    Returns:
        True if the cleaned value consists of at least 13 digits and its Luhn
        checksum is divisible by 10, otherwise False.
    """
    # Remove spaces and dashes
    card_number = re.sub(r'[\s-]', '', card_number)

    if not card_number.isdigit() or len(card_number) < 13:
        return False

    digits = [int(d) for d in card_number]

    # Starting from the second-to-last digit and moving left, double every
    # second digit; a two-digit result is replaced by the sum of its digits.
    for i in range(len(digits) - 2, -1, -2):
        doubled = digits[i] * 2
        digits[i] = doubled // 10 + doubled % 10 if doubled > 9 else doubled

    # Valid if the checksum is divisible by 10
    return sum(digits) % 10 == 0


def is_valid_ssn(ssn):
    """
    Validate SSN format and reject number ranges that are never issued.

    Args:
        ssn: Candidate SSN as a string; dashes are ignored.

    Returns:
        True if the value has exactly 9 digits, the area (first 3 digits) is
        not 000, 666 or 9xx, the group (next 2) is not 00 and the serial
        (last 4) is not 0000. Otherwise False.
    """
    ssn_digits = re.sub(r'-', '', ssn)

    if len(ssn_digits) != 9 or not ssn_digits.isdigit():
        return False

    area = ssn_digits[:3]
    group = ssn_digits[3:5]
    serial = ssn_digits[5:]

    if area in ['000', '666'] or area.startswith('9'):
        return False
    if group == '00':
        return False
    if serial == '0000':
        return False

    return True


def is_valid_account_number(account):
    """
    Validate an account number with extra checks that reduce false positives.

    Args:
        account: Candidate account number as a string; spaces and dashes are ignored.

    Returns:
        True if the cleaned value has 8 to 12 digits, is not a single repeated
        digit and is not one of a few obvious filler sequences. Otherwise False.
    """
    account_digits = re.sub(r'[\s-]', '', account)

    if not account_digits.isdigit() or len(account_digits) < 8 or len(account_digits) > 12:
        return False

    # All same digits (e.g. 0000000000) is not a real account number
    if len(set(account_digits)) == 1:
        return False

    # Obvious filler / sequential values
    sequential_patterns = ['12345678', '87654321', '00000000', '11111111', '22222222']
    if account_digits in sequential_patterns:
        return False

    return True


# Comprehensive PII patterns for validation.
# Each entry has a compiled "pattern", a "category" label and optionally a
# "validator" callable that receives the matched text and returns False to
# discard the match.
PII_VALIDATION_PATTERNS = [
    # Names that might have been missed.
    # The title keyword is matched case-insensitively via the scoped flag
    # (?i:...), but the following name must really be capitalized. A global
    # re.IGNORECASE would also relax [A-Z][a-z]+ and flag ordinary phrases
    # such as "miss you".
    {"pattern": re.compile(r"\b(?i:Mr\.?|Mrs\.?|Ms\.?|Miss|Dr\.?)\s+[A-Z][a-z]+\b"), "category": "PERSON_TITLE"},
    {"pattern": re.compile(r"\b[A-Z][a-z]{2,}\s+[A-Z][a-z]{2,}\b"), "category": "PERSON_NAME"},
    {"pattern": re.compile(r"\b[A-Z][a-z]+(?:\s[A-Z][a-z]+){1,2}\b"), "category": "FULL_NAME"},

    # Contact information
    {"pattern": re.compile(r"\b\d{3}[-.\s]?\d{3}[-.\s]?\d{4}\b"), "category": "PHONE"},
    {"pattern": re.compile(r"\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}\b"), "category": "EMAIL"},

    # Addresses
    {"pattern": re.compile(r"\b\d+\s+[A-Za-z\s]+(?:Street|St|Avenue|Ave|Road|Rd|Drive|Dr|Lane|Ln|Court|Ct|Place|Pl)\b", re.IGNORECASE), "category": "ADDRESS"},

    # Dates that might contain birthdates
    {"pattern": re.compile(r"\b(?:0?[1-9]|1[0-2])[/.-](?:0?[1-9]|[12]\d|3[01])[/.-](?:19|20)\d{2}\b"), "category": "DATE"},
    {"pattern": re.compile(r"\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},?\s+\d{4}\b", re.IGNORECASE), "category": "DATE"},

    # ID numbers
    # SSN: the look-aheads pre-filter invalid area/group/serial values and the
    # validator re-checks them.
    {"pattern": re.compile(r"\b(?!000|666|9\d{2})\d{3}-(?!00)\d{2}-(?!0000)\d{4}\b"), "category": "SSN", "validator": is_valid_ssn},
    # Generic ID formats (e.g. driver's license, employee ID)
    {"pattern": re.compile(r"\b[A-Z]{1,2}\d{6,8}\b|\b\d{2}[A-Z]{2}\d{4}\b|\b[A-Z]\d{8}\b"), "category": "ID_NUMBER"},

    # Medical/Legal specific.
    # Role keyword is case-insensitive, the name after it must be capitalized
    # (otherwise "patient was" would be reported as a person).
    {"pattern": re.compile(r"\b(?i:patient|client|defendant|plaintiff)\s+[A-Z][a-z]+\b"), "category": "PERSON_ROLE"},
    # The identifier must contain at least one digit, so phrases like
    # "in case of" or "cases" are not reported as case numbers.
    {"pattern": re.compile(r"\b(?:case|docket)\s*(?:no\.?|number)?\s*:?\s*(?=[A-Za-z0-9-]*\d)[A-Za-z0-9-]+\b", re.IGNORECASE), "category": "CASE_NUMBER"},

    # Organizations and locations
    {"pattern": re.compile(r"\b[A-Z][a-z]+\s+(?:Hospital|Clinic|Medical Center|Court|University|College)\b"), "category": "ORGANIZATION"},
    {"pattern": re.compile(r"\b[A-Z][a-z]+,\s*[A-Z]{2}\b"), "category": "LOCATION"},

    # Credit cards: 16 digits in groups of four, confirmed by the Luhn checksum
    {"pattern": re.compile(r"\b(?:\d{4}[\s-]?){3}\d{4}\b"), "category": "CREDIT_CARD", "validator": luhn_check},
    # Account numbers: 8-12 digits, filtered by is_valid_account_number
    {"pattern": re.compile(r"\b\d{8,12}\b"), "category": "ACCOUNT_NUMBER", "validator": is_valid_account_number},
]

print(f"Loaded {len(PII_VALIDATION_PATTERNS)} PII validation patterns")

## PII Detection Functions

In [None]:
# Matches a redaction placeholder such as "<PII PERSON 1>" (the format produced
# by redact_text). A paired "<PII ...>...</PII>" form is also treated as one
# placeholder span.
_PLACEHOLDER_RE = re.compile(r"<PII\b[^<>]*>(?:[^<>]*</PII>)?")


def _is_inside_any(start, end, spans):
    """Return True if [start, end) lies completely within one of `spans`."""
    return any(span_start <= start and end <= span_end for span_start, span_end in spans)


def _overlaps_any(start, end, spans):
    """Return True if [start, end) overlaps at least one of `spans`."""
    return any(start < span_end and end > span_start for span_start, span_end in spans)


def detect_remaining_pii(text, patterns=None):
    """
    Detect any PII that remains in supposedly redacted text.

    Every pattern is applied in list order. A match is dropped when it lies
    inside an existing "<PII ...>" placeholder, when the pattern's validator
    rejects it, or when it overlaps a match that was already accepted. The
    last rule means earlier patterns win and every text span is reported at
    most once, which keeps the result safe to pass to redact_text.

    Args:
        text: The text to scan.
        patterns: Optional list of pattern dicts with keys "pattern" (compiled
            regex), "category" and optionally "validator". Defaults to the
            module-level PII_VALIDATION_PATTERNS.

    Returns:
        A list of dicts with keys "entity_group", "word", "start", "end" and
        "score". The score is 0.95 when the text already contains placeholders
        (validation of redacted text) and 0.85 for text without placeholders.
    """
    if patterns is None:
        patterns = PII_VALIDATION_PATTERNS

    if "<PII" in text and ">" in text:
        # Already redacted content: remember where the placeholders are so
        # their own text is never reported as PII.
        placeholder_spans = [m.span() for m in _PLACEHOLDER_RE.finditer(text)]
        score = 0.95  # Very high confidence for validation patterns
    else:
        placeholder_spans = []
        score = 0.85

    entities = []
    accepted_spans = []

    for pattern_info in patterns:
        pattern = pattern_info["pattern"]
        category = pattern_info["category"]
        validator = pattern_info.get("validator")

        for match in pattern.finditer(text):
            start, end = match.span()

            # Skip if this match is inside a PII placeholder
            if _is_inside_any(start, end, placeholder_spans):
                continue

            # Apply additional validation if a validator function exists
            if validator and not validator(match.group(0)):
                continue

            # Avoid duplicates: several patterns can hit the same span
            # (e.g. PERSON_NAME and FULL_NAME); keep only the first.
            if _overlaps_any(start, end, accepted_spans):
                continue

            accepted_spans.append((start, end))
            entities.append({
                "entity_group": category,
                "word": match.group(0),
                "start": start,
                "end": end,
                "score": score
            })

    return entities


def redact_text(text, entities):
    """
    Replace PII entities in `text` with numbered placeholders.

    Entities are processed in document order and numbered per category
    ("<PII CATEGORY 1>", "<PII CATEGORY 2>", ...). A number is skipped if that
    exact placeholder already occurs in `text`, so new placeholders never
    collide with ones from an earlier redaction pass. An entity that overlaps
    an already replaced span is ignored, because all offsets refer to the
    original text and overlapping replacements would corrupt the output.

    Args:
        text: The original text.
        entities: Iterable of dicts with keys "entity_group", "word", "start"
            and "end" (offsets into `text`).

    Returns:
        A tuple (redacted_text, mapping) where mapping maps each inserted
        placeholder to the original word it replaced.
    """
    mapping = {}
    category_counts = {}
    pieces = []
    cursor = 0  # end offset (in the original text) of the last replaced span

    # Document order; for equal starts the longer span is preferred
    ordered = sorted(entities, key=lambda entity: (entity["start"], -entity["end"]))

    for entity in ordered:
        start, end = entity["start"], entity["end"]
        if start < cursor:
            continue  # overlaps a span that was already replaced

        category = entity["entity_group"]
        number = category_counts.get(category, 0) + 1
        placeholder = f"<PII {category} {number}>"
        while placeholder in text:
            # Placeholder already used by a previous redaction pass
            number += 1
            placeholder = f"<PII {category} {number}>"
        category_counts[category] = number

        mapping[placeholder] = entity["word"]
        pieces.append(text[cursor:start])
        pieces.append(placeholder)
        cursor = end

    pieces.append(text[cursor:])
    return "".join(pieces), mapping


print("PII detection functions defined successfully!")

## Flask API Setup

In [None]:
app = Flask(__name__)
CORS(app)


@app.route('/health', methods=['GET'])
def health_check():
    """Liveness probe: returns a static JSON description of the service."""
    return jsonify({
        "status": "healthy",
        "model": "SmolLM3-3B (llama.cpp)",
        "purpose": "PII validation service"
    })


@app.route('/generate', methods=['POST'])
def generate_endpoint():
    """
    Validate supposedly redacted text for remaining PII.

    Expects a JSON object with:
        prompt (str, required): the text to validate.
        max_length (optional, default 50) and temperature (optional,
            default 0.1): echoed back in the response.
        redact_pii (optional, default True): when true, the prompt is scanned
            with detect_remaining_pii and findings are redacted with redact_text.

    Returns:
        200 with the validation result, 400 for a missing/invalid body or
        prompt, 500 with an "error" message for unexpected failures.
    """
    try:
        # silent=True returns None for a missing, malformed or non-JSON body
        # instead of raising, so the client gets a clear 400 rather than a 500.
        data = request.get_json(silent=True)
        if not isinstance(data, dict):
            return jsonify({"error": "Request body must be a JSON object"}), 400

        # Extract parameters
        prompt = data.get('prompt', '')
        max_tokens = data.get('max_length', 50)
        temperature = data.get('temperature', 0.1)
        redact_pii = data.get('redact_pii', True)

        if not prompt:
            return jsonify({"error": "Prompt is required"}), 400
        if not isinstance(prompt, str):
            return jsonify({"error": "Prompt must be a string"}), 400

        # For PII validation, we primarily focus on detecting remaining PII
        # in the supposedly redacted text, not generating new text
        result = {
            "prompt": prompt,
            "max_length": max_tokens,
            "temperature": temperature,
            "validation_mode": True
        }

        # Detect PII in the input (which should be redacted text)
        if redact_pii:
            # Log only the size: the text may still contain PII and must not
            # end up in the notebook output.
            print(f"Validating text for remaining PII ({len(prompt)} characters)...")
            pii_entities = detect_remaining_pii(prompt)

            if pii_entities:
                # Apply additional redactions if PII found
                redacted_text, mapping = redact_text(prompt, pii_entities)
                result.update({
                    "pii_entities": pii_entities,
                    "redacted_text": redacted_text,
                    "mapping": mapping,
                    "pii_count": len(pii_entities),
                    "validation_result": "FAILED - PII detected in supposedly redacted text"
                })
                print(f"⚠️  Found {len(pii_entities)} PII entities in redacted text!")
            else:
                result.update({
                    "pii_entities": [],
                    "redacted_text": prompt,  # No changes needed
                    "mapping": {},
                    "pii_count": 0,
                    "validation_result": "PASSED - No additional PII detected"
                })
                print("✅ No additional PII detected - redaction appears complete")

        return jsonify(result)

    except Exception as e:
        print(f"Error in validation: {str(e)}")
        return jsonify({"error": str(e)}), 500


@app.route('/', methods=['GET'])
def home():
    """Return a short description of the service and its endpoints."""
    return jsonify({
        "message": "SmolLM3 PII Validation Service",
        "description": "Validates redacted text for any remaining PII",
        "endpoints": ["/generate (POST)", "/health (GET)"]
    })


print("Flask app created successfully!")

## Start Flask Server

In [None]:
def run_app():
    """Serve the Flask app on port 5000 of all interfaces (blocking call)."""
    app.run(host='0.0.0.0', port=5000, debug=False, use_reloader=False)


# Run the server in a background daemon thread so the notebook stays usable
# and the thread does not keep the process alive on shutdown.
thread = threading.Thread(target=run_app, daemon=True)
thread.start()

# Give the server a moment to come up before announcing it
time.sleep(3)

print(
    "🚀 Flask API is running on http://localhost:5000",
    "Endpoints available:",
    "- GET  /health     - Health check",
    "- POST /generate   - Validate redacted text for remaining PII",
    "- GET  /          - API information",
    sep="\n",
)

## Setup Ngrok Tunnel

**⚠️ IMPORTANT:** Get your auth token from https://dashboard.ngrok.com/get-started/your-authtoken

In [None]:
import os

print("Setting up ngrok tunnel...")

# The token is read from the NGROK_AUTH_TOKEN environment variable when it is
# set, so the secret does not have to be saved inside the notebook.
# Alternatively, replace the placeholder below with your actual token.
# Get it from https://dashboard.ngrok.com/get-started/your-authtoken
NGROK_AUTH_TOKEN = os.environ.get("NGROK_AUTH_TOKEN") or "YOUR_AUTH_TOKEN_HERE"

if NGROK_AUTH_TOKEN != "YOUR_AUTH_TOKEN_HERE":
    ngrok.set_auth_token(NGROK_AUTH_TOKEN)

    # Open a HTTP tunnel on port 5000
    tunnel = ngrok.connect(5000, bind_tls=True)
    # Recent pyngrok releases return an NgrokTunnel object whose string form is
    # a description, not the URL; the address is in `.public_url`. The getattr
    # fallback keeps a plain-string return value working as well.
    public_url = getattr(tunnel, "public_url", tunnel)
    print(f"\n✅ Public URL: {public_url}")
    print("\n🔗 Copy this URL to your redactor application's 'SmolLM3 Colab URL' field!")

    # Save the public URL to a file
    with open('ngrok_url.txt', 'w') as f:
        f.write(str(public_url))

    print("\n📝 Public URL saved to 'ngrok_url.txt'")

    # Print example usage
    print("\n📋 Example API usage:")
    print(f"curl -X POST {public_url}/generate \\")
    print("  -H \"Content-Type: application/json\" \\")
    print("  -d '{\"prompt\": \"Dr. Smith treats <PII PERSON 1> at the clinic\", \"redact_pii\": true}'")

else:
    print("\n⚠️  Please set your ngrok auth token in the cell above.")
    print("Get your token from: https://dashboard.ngrok.com/get-started/your-authtoken")
    print("Then replace 'YOUR_AUTH_TOKEN_HERE' with your actual token and run this cell again.")
    print("Alternatively, set the NGROK_AUTH_TOKEN environment variable before running this cell.")