# Batch Processing Pipeline

**Process multiple XML files automatically** with Molt-Shield.

This notebook shows engineers how to:
- Process folders of XML files
- Generate sanitization reports
- Archive originals to vault
- Monitor processing status

## 1. Setup

In [None]:
# ============================================================
# SETUP: Install dependencies (run this FIRST)
# ============================================================
# These packages are NOT pre-installed in Google Colab

!pip install -q lxml pydantic pyyaml

print("‚úì Dependencies installed: lxml, pydantic, pyyaml")

# Clone repository (uncomment and add your repo URL)
# !git clone https://github.com/YOUR_USERNAME/molt-shield.git /content/molt-shield

import sys
sys.path.insert(0, '/content/molt-shield/src')

import os
os.chdir('/content/molt-shield')

print("‚úì Environment ready!")

## 2. Create Sample Batch Data

Let's create multiple XML files to simulate a batch processing job.

In [None]:
import os
import json
from pathlib import Path
from datetime import datetime

# Create the working tree: inputs, sanitized outputs, vaults and reports.
for subdir in ('input', 'output', 'vault', 'reports'):
    os.makedirs(os.path.join('batch_data', subdir), exist_ok=True)

# All sample documents share one shape; only the blade id and the two
# measurements differ, so they are generated from a single template.
_XML_TEMPLATE = """<?xml version="1.0"?>
<simulation>
    <blade id="{blade_id}">
        <stress>{stress}</stress>
        <temp>{temp}</temp>
    </blade>
</simulation>"""

# (filename, blade id, stress, temp) - values are kept as strings so the
# generated XML is character-for-character what the file should contain.
_SAMPLES = [
    ("simulation_001.xml", "b1", "850.5", "650.0"),
    ("simulation_002.xml", "b2", "920.3", "680.5"),
    ("simulation_003.xml", "b3", "780.0", "520.0"),
]

# Sample files to process: a list of {"filename": ..., "content": ...} dicts.
BATCH_FILES = [
    {
        "filename": filename,
        "content": _XML_TEMPLATE.format(blade_id=blade_id, stress=stress, temp=temp),
    }
    for filename, blade_id, stress, temp in _SAMPLES
]

# Write batch input files (explicit encoding so the result does not depend
# on the platform's default locale).
for sample in BATCH_FILES:
    path = os.path.join('batch_data/input', sample['filename'])
    with open(path, 'w', encoding='utf-8') as file:
        file.write(sample['content'])
    print(f"✓ Created: batch_data/input/{sample['filename']}")

print(f"\n📁 Created {len(BATCH_FILES)} files to process")

## 3. Define the Batch Processor

In [None]:
from src.gatekeeper import apply_gatekeeper
from src.config import MaskingConfig, load_config
from src.policy_engine import Policy, Rule
from src.vault import Vault
from pathlib import Path
from lxml import etree
import shutil
import json
from datetime import datetime

class BatchProcessor:
    """Process multiple XML files with Molt-Shield.

    Every ``*.xml`` file in ``input_dir`` is run through ``apply_gatekeeper``
    with a fixed policy (mask ``stress`` and ``temp`` values); the sanitized
    result is copied to ``output_dir``, a per-file vault is saved under
    ``vault_dir`` and a JSON summary is written to ``report_dir``.

    Attributes:
        input_dir, output_dir, vault_dir, report_dir: ``Path`` objects built
            from the constructor arguments.
        policy: The masking ``Policy`` applied to every file.
        config: Configuration loaded from ``config/default.yaml``.
        results: Running summary dict with the keys ``timestamp``,
            ``files_processed``, ``files_failed``, ``total_values_masked``
            and ``details`` (one entry per file).
    """

    def __init__(self, input_dir, output_dir, vault_dir, report_dir):
        """Store the working directories and build policy, config and results.

        Args:
            input_dir: Directory scanned for ``*.xml`` files.
            output_dir: Directory that receives the sanitized copies.
            vault_dir: Directory that receives one ``<stem>.vault.json`` per file.
            report_dir: Directory that receives the JSON processing report.
        """
        self.input_dir = Path(input_dir)
        self.output_dir = Path(output_dir)
        self.vault_dir = Path(vault_dir)
        self.report_dir = Path(report_dir)

        # Create policy
        self.policy = Policy(
            version="1.0",
            global_masking=True,
            rules=[
                Rule(tag_pattern="stress", action="mask_value"),
                Rule(tag_pattern="temp", action="mask_value"),
            ],
        )

        # Load config (path is relative to the current working directory)
        self.config = load_config('config/default.yaml')

        # Results tracking
        # NOTE(review): "total_values_masked" is initialised here but nothing
        # in this class updates it - confirm whether apply_gatekeeper exposes
        # a count that should be added.
        self.results = {
            "timestamp": datetime.now().isoformat(),
            "files_processed": 0,
            "files_failed": 0,
            "total_values_masked": 0,
            "details": [],
        }

    def process_file(self, filename):
        """Process a single XML file.

        Args:
            filename: Name of a file inside ``input_dir``.

        Returns:
            ``(True, "Processed: <filename>")`` on success, or
            ``(False, "Failed: <filename> - <error>")`` when any step raised.
            The outcome is also appended to ``self.results["details"]``.
        """
        input_path = self.input_dir / filename

        # Batch boundary: one bad file must not abort the whole run, so any
        # exception is recorded in the results instead of propagating.
        try:
            # Create session-specific vault
            session_id = Path(filename).stem
            vault_path = self.vault_dir / f"{session_id}.vault.json"
            vault = Vault(vault_path)

            # Process
            # NOTE(review): `vault` is not handed to apply_gatekeeper and its
            # second return value is discarded, so the vault saved below may
            # be empty - verify against the gatekeeper API.
            sanitized_path, _ = apply_gatekeeper(
                input_path,
                self.policy,
                self.config,
            )

            # Copy to output (create the destination first so a fresh
            # directory tree does not turn every file into a failure)
            self.output_dir.mkdir(parents=True, exist_ok=True)
            output_path = self.output_dir / filename
            shutil.copy(sanitized_path, output_path)

            # Save vault
            self.vault_dir.mkdir(parents=True, exist_ok=True)
            vault.save()
        except Exception as e:
            self.results["files_failed"] += 1
            self.results["details"].append({
                "filename": filename,
                "status": "failed",
                "error": str(e),
            })
            return False, f"Failed: {filename} - {e}"

        # Track results
        self.results["files_processed"] += 1
        self.results["details"].append({
            "filename": filename,
            "status": "success",
            "output": str(output_path),
            "vault": str(vault_path),
        })
        return True, f"Processed: {filename}"

    def process_all(self):
        """Process all XML files in input directory.

        Files are handled in sorted name order so console output and the
        report are reproducible regardless of filesystem listing order.

        Returns:
            The ``self.results`` summary dict (also written by ``save_report``).
        """
        xml_files = sorted(self.input_dir.glob("*.xml"))

        print(f"📋 Found {len(xml_files)} files to process\n")

        for file_path in xml_files:
            success, message = self.process_file(file_path.name)
            status = "✅" if success else "❌"
            print(f"{status} {message}")

        self.save_report()
        return self.results

    def save_report(self):
        """Save processing report.

        Writes ``self.results`` as indented JSON to
        ``report_dir/report_<YYYYmmdd_HHMMSS>.json``, creating ``report_dir``
        if needed.

        Returns:
            ``Path`` of the report file that was written.
        """
        self.report_dir.mkdir(parents=True, exist_ok=True)

        stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        report_path = self.report_dir / f"report_{stamp}.json"
        with open(report_path, 'w', encoding='utf-8') as f:
            json.dump(self.results, f, indent=2)
        print(f"\n📊 Report saved: {report_path}")
        return report_path

print("✓ BatchProcessor class defined")

## 4. Run Batch Processing

In [None]:
# Initialize processor on the directory tree created in section 2
processor = BatchProcessor(
    input_dir='batch_data/input',
    output_dir='batch_data/output',
    vault_dir='batch_data/vault',
    report_dir='batch_data/reports',
)

# Run batch processing; `results` is the summary dict reused by later cells
print("🚀 Starting batch processing...\n")
results = processor.process_all()

## 5. View Results

In [None]:
# Display results summary
print("=== BATCH PROCESSING SUMMARY ===\n")
print(f"Timestamp: {results['timestamp']}")
print(f"Files processed: {results['files_processed']}")
print(f"Files failed: {results['files_failed']}")

# The output and vault listings differ only in heading, directory and icon,
# so both are produced by one loop.
LISTINGS = (
    ("OUTPUT FILES", 'batch_data/output', "📄"),
    ("VAULT FILES", 'batch_data/vault', "🔐"),
)

for title, directory, icon in LISTINGS:
    print(f"\n=== {title} ===")
    # os.listdir() returns entries in arbitrary order; sort for stable output.
    for name in sorted(os.listdir(directory)):
        size = os.path.getsize(os.path.join(directory, name))
        print(f"  {icon} {name} ({size} bytes)")

## 6. View Sanitized Output

In [None]:
# Print the sanitized version of the first sample file
SANITIZED_EXAMPLE = 'batch_data/output/simulation_001.xml'

print("=== SANITIZED OUTPUT EXAMPLE ===\n")
with open(SANITIZED_EXAMPLE, 'r') as sanitized_file:
    sanitized_xml = sanitized_file.read()
print(sanitized_xml)

## 7. Rehydration Demo

In [None]:
# Demonstrate rehydration
from src.vault import Vault

# Load the vault written for the first sample file
vault = Vault('batch_data/vault/simulation_001.vault.json')
vault.load()

print("=== REHYDRATION TEST ===\n")
print("Vault contents:")
for masked, entry in vault.entries.items():
    print(f"  {masked} → {entry.original_value}")

# Rehydrate a sample. Use a masked token that is actually stored in the
# vault so the demo shows a real substitution; fall back to the placeholder
# when the vault is empty.
# NOTE(review): assumes the keys of `vault.entries` are the masked tokens
# that rehydrate_xml() looks for - confirm against src.vault.
masked_token = next(iter(vault.entries), "VAL_xxx")
sample = f"<stress>{masked_token}</stress>"
rehydrated = vault.rehydrate_xml(sample)

# `sample` is the masked text, not the original, so label it accordingly.
print(f"\nMasked: {sample}")
print(f"Rehydrated: {rehydrated}")

## Summary

You now have a complete batch processing pipeline:

### Features
- Process multiple XML files automatically
- Generate processing reports
- Archive originals to vault
- Track success/failure status

### Directory Structure Created
```
batch_data/
├── input/          # Original XML files
├── output/         # Sanitized files
├── vault/          # Original values (encrypted)
└── reports/        # Processing reports
```

### Next Steps
- Schedule with cron for automated processing
- Add file watching for real-time processing
- Integrate with cloud storage (S3, GCS)