In [None]:
# Databricks Event Hub Producer - Optimized for Spark DataFrames
import gzip
import json
import logging
import os
import random
import time
import uuid
import zlib
from typing import Iterable, List, Dict, Any

from azure.eventhub import EventHubProducerClient, EventData
from azure.eventhub.exceptions import EventHubError, OperationTimeoutError
from pyspark.sql.functions import col, to_json, struct
from pyspark.sql.types import StringType

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# --- Databricks-specific Configuration ---
# Read the Event Hub settings from the "kv-scope" secret scope. Any failure
# (including `dbutils` not being defined) switches to placeholder values.
try:
    EVENT_HUB_CONNECTION_STRING = dbutils.secrets.get("kv-scope", "eh-connstr")
    EVENT_HUB_NAME = dbutils.secrets.get("kv-scope", "eh-name")
    if not EVENT_HUB_NAME:
        # An empty secret value falls back to the default hub name.
        EVENT_HUB_NAME = "datatransferhub"
    logger.info("✅ Successfully retrieved Event Hub credentials from secrets")
except Exception as secrets_error:
    logger.warning(f"⚠️ Could not retrieve secrets: {secrets_error}")
    # Placeholders for testing only; sending will not work with these values.
    EVENT_HUB_CONNECTION_STRING = "YOUR_CONNECTION_STRING_HERE"
    EVENT_HUB_NAME = "datatransferhub"

# --- Constants for Databricks/Event Hub ---
MAX_EVENT_BYTES = 900_000      # Per-event payload cap (~0.9 MB, below the 1 MB limit)
MAX_RETRIES = 7                # Send attempts before a transient failure is re-raised
BASE_DELAY = 0.5               # First backoff delay in seconds
MAX_BACKOFF = 30               # Upper bound for a single backoff sleep, in seconds
BATCH_SIZE = 100               # Default number of records buffered before sending
PARTITION_COUNT = 32           # Modulus used when deriving partition keys (match your hub)

# Echo the effective configuration once, one setting per line.
print(
    "\n".join(
        [
            "🔧 Configuration:",
            f"   Event Hub: {EVENT_HUB_NAME}",
            f"   Max Event Size: {MAX_EVENT_BYTES:,} bytes",
            f"   Batch Size: {BATCH_SIZE} events",
            f"   Partition Count: {PARTITION_COUNT}",
        ]
    )
)

In [None]:
# --- Helper Functions for Databricks + Event Hub ---

def to_bytes(obj) -> bytes:
    """Convert *obj* to the bytes payload sent to Event Hub.

    Args:
        obj: ``bytes``/``bytearray`` (returned as immutable ``bytes``),
            ``str`` (UTF-8 encoded), or any other object (serialized as
            compact JSON and UTF-8 encoded).

    Returns:
        The encoded payload.

    Values JSON cannot represent natively (dates, Decimals, ...) are
    stringified via ``default=str``. This applies to dicts as well as to
    every other container, so a dict holding such a value no longer raises
    ``TypeError`` while a list holding the same value succeeds.
    """
    if isinstance(obj, (bytes, bytearray)):
        return bytes(obj)
    if isinstance(obj, str):
        return obj.encode("utf-8")
    # Compact separators and ensure_ascii=False keep the payload small.
    return json.dumps(
        obj, separators=(",", ":"), ensure_ascii=False, default=str
    ).encode("utf-8")

def gzip_compress(data: bytes) -> bytes:
    """Return *data* as a gzip-compressed byte string."""
    compressed = gzip.compress(data)
    return compressed

def chunk_data(data: bytes, max_size: int) -> List[bytes]:
    """Split *data* into consecutive slices of at most *max_size* bytes.

    Args:
        data: Payload to split.
        max_size: Maximum length of each chunk; must be positive.

    Returns:
        The chunks in order (concatenating them restores *data*). Empty
        input yields an empty list.

    Raises:
        ValueError: If *max_size* is not positive. A negative size used to
            return ``[]`` and silently drop the payload.
    """
    if max_size <= 0:
        raise ValueError(f"max_size must be a positive integer, got {max_size}")
    return [data[start:start + max_size] for start in range(0, len(data), max_size)]

def calculate_partition_key(record: Dict[str, Any], strategy: str = "hash") -> str:
    """Derive a partition-key string in ``"0"`` .. ``str(PARTITION_COUNT - 1)``.

    Args:
        record: Record whose identifying fields drive the key.
        strategy: One of
            ``"hash"`` - bucket the first present field among
            ``customer_id``, ``order_id`` and ``id`` (a random UUID is
            bucketed when none is present);
            ``"round_robin"`` - pick a random bucket (no counter is kept);
            ``"customer_id"`` - bucket ``customer_id`` (``"unknown"`` when
            the field is absent).
            Any other value yields ``"0"``.

    Returns:
        The bucket number as a string.

    Bucketing uses CRC-32 rather than the built-in ``hash()``: string
    hashes are randomized per Python process, so ``hash()`` would map the
    same customer to different keys on different runs.
    """

    def stable_bucket(value) -> str:
        # CRC-32 of the UTF-8 text is identical across processes and machines.
        return str(zlib.crc32(str(value).encode("utf-8")) % PARTITION_COUNT)

    if strategy == "hash":
        for field in ("customer_id", "order_id", "id"):
            candidate = record.get(field)
            # Only None / empty string count as "missing"; a legitimate 0
            # must not fall through to the next field.
            if candidate is not None and candidate != "":
                return stable_bucket(candidate)
        return stable_bucket(uuid.uuid4())

    if strategy == "round_robin":
        # Random pick: spreads load without needing an external counter.
        return str(random.randint(0, PARTITION_COUNT - 1))

    if strategy == "customer_id":
        return stable_bucket(record.get("customer_id", "unknown"))

    return "0"  # Unknown strategy: everything shares one key

def send_batch_with_retry(producer: EventHubProducerClient, batch, max_retries: int = MAX_RETRIES):
    """Send *batch* through *producer*, retrying Event Hub errors with backoff.

    Args:
        producer: Client whose ``send_batch`` is called.
        batch: The batch to send; its size is read with ``len()``.
        max_retries: Maximum number of send attempts. Values below 1 are
            treated as a single attempt, so the batch is never skipped.

    Returns:
        ``True`` once the batch has been sent.

    Raises:
        EventHubError: If the final attempt still fails.
        Exception: Any other error is logged and re-raised immediately.
    """
    attempts = max(1, max_retries)
    delay = BASE_DELAY

    for attempt in range(1, attempts + 1):
        try:
            producer.send_batch(batch)

        except (EventHubError, OperationTimeoutError) as e:
            if attempt == attempts:
                logger.error(f"❌ Failed to send batch after {attempts} attempts: {e}")
                raise

            # Exponential backoff with up to 25% jitter, capped at MAX_BACKOFF.
            jitter = random.uniform(0, delay * 0.25)
            sleep_time = min(delay + jitter, MAX_BACKOFF)
            logger.warning(f"⚠️ Attempt {attempt} failed, retrying in {sleep_time:.2f}s: {e}")
            time.sleep(sleep_time)
            delay = min(delay * 2, MAX_BACKOFF)

        except Exception as e:
            logger.error(f"❌ Unexpected error sending batch: {e}")
            raise

        else:
            # Logged outside the try so a logging problem cannot be mistaken
            # for a send failure. len() is how the batch reports its event
            # count; it has no `count` attribute.
            logger.info(f"✅ Successfully sent batch with {len(batch)} events")
            return True

print("✅ Helper functions defined successfully!")

In [None]:
# --- Databricks Spark DataFrame Processing ---

class EventHubSender:
    """
    Context-managed Event Hub sender with per-partition-key batching.

    Entering the context creates an ``EventHubProducerClient``; leaving it
    closes the client and prints the counters collected in ``self.stats``.
    """

    def __init__(self, connection_string: str, event_hub_name: str):
        """Store the connection settings; the producer is created in ``__enter__``."""
        self.connection_string = connection_string
        self.event_hub_name = event_hub_name
        self.producer = None
        # Running counters, printed by __exit__.
        self.stats = {
            "events_sent": 0,
            "batches_sent": 0,
            "chunks_created": 0,
            "errors": 0
        }

    def __enter__(self):
        """Create the producer client and return this sender.

        Raises:
            Exception: Whatever client creation raises, after logging it.
        """
        try:
            self.producer = EventHubProducerClient.from_connection_string(
                conn_str=self.connection_string,
                eventhub_name=self.event_hub_name
            )
            logger.info(f"🔗 Connected to Event Hub: {self.event_hub_name}")
            return self
        except Exception as e:
            logger.error(f"❌ Failed to connect to Event Hub: {e}")
            raise

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Close the producer (if any) and print the statistics.

        Returns ``None``, so an exception raised inside the ``with`` body
        still propagates to the caller.
        """
        if self.producer:
            self.producer.close()
            logger.info("🔌 Event Hub connection closed")

        print("\n📊 Event Hub Sending Statistics:")
        print(f"   Events sent: {self.stats['events_sent']:,}")
        print(f"   Batches sent: {self.stats['batches_sent']:,}")
        print(f"   Chunks created: {self.stats['chunks_created']:,}")
        print(f"   Errors: {self.stats['errors']:,}")

    def create_event_from_record(self, record: Dict[str, Any], correlation_id: str) -> List[EventData]:
        """
        Build the Event Hub event(s) for one record.

        The record is dumped to JSON (non-JSON values stringified), gzip
        compressed, and the compressed bytes are cut into slices of at most
        MAX_EVENT_BYTES. Each slice is a fragment of a single gzip stream, so
        a consumer has to concatenate all chunks sharing a ``correlation_id``
        in ``chunk_index`` order before decompressing.

        Args:
            record: The record to serialize.
            correlation_id: Identifier stamped on every chunk of this record.

        Returns:
            One ``EventData`` per chunk, or ``[]`` if anything fails (the
            failure is logged and counted in ``stats["errors"]``).
        """
        try:
            # Convert record to JSON and compress
            json_data = json.dumps(record, ensure_ascii=False, default=str)
            compressed_data = gzip_compress(to_bytes(json_data))

            # Split into chunks if too large
            chunks = chunk_data(compressed_data, MAX_EVENT_BYTES)
            events = []

            for chunk_idx, chunk in enumerate(chunks):
                event = EventData(chunk)
                event.content_type = "application/json+gzip"
                event.properties = {
                    "correlation_id": correlation_id,
                    "chunk_index": chunk_idx + 1,  # 1-based
                    "total_chunks": len(chunks),
                    "compressed": True,
                    "schema_version": "v1",
                    "source": "databricks",
                    "timestamp": int(time.time() * 1000)  # milliseconds
                }
                events.append(event)

            # Only records that were actually split count towards the stat.
            if len(chunks) > 1:
                self.stats["chunks_created"] += len(chunks)
                logger.debug(f"📦 Split large record into {len(chunks)} chunks")

            return events

        except Exception as e:
            logger.error(f"❌ Error creating event from record: {e}")
            self.stats["errors"] += 1
            return []

    def send_records_batch(self, records: List[Dict[str, Any]], partition_strategy: str = "hash"):
        """
        Convert *records* to events, group them by partition key, and send
        each group.

        Args:
            records: Records to send; an empty list is a no-op.
            partition_strategy: Strategy name passed to
                ``calculate_partition_key``.
        """
        if not records:
            return

        # Group events by partition key so each batch carries a single key.
        partition_groups = {}

        for record in records:
            correlation_id = str(uuid.uuid4())
            partition_key = calculate_partition_key(record, partition_strategy)
            group = partition_groups.setdefault(partition_key, [])

            events = self.create_event_from_record(record, correlation_id)
            if events:
                group.extend(events)

        # Send each partition group as separate batches
        for partition_key, events in partition_groups.items():
            self._send_events_for_partition(events, partition_key)

    def _send_and_count(self, batch) -> None:
        """Send *batch* with retries, then record it in the statistics."""
        send_batch_with_retry(self.producer, batch)
        self.stats["batches_sent"] += 1
        # The batch reports its event count through len(); it has no
        # `count` attribute.
        self.stats["events_sent"] += len(batch)

    def _send_events_for_partition(self, events: List[EventData], partition_key: str):
        """Send *events* under one partition key, starting a new batch when full.

        Best-effort: any exception is logged and counted in
        ``stats["errors"]`` instead of being raised, and the events not yet
        sent for this key are dropped.
        """
        try:
            current_batch = self.producer.create_batch(partition_key=partition_key)

            for event in events:
                try:
                    current_batch.add(event)
                except ValueError:
                    # Batch is full, send it and create a new one
                    if len(current_batch) > 0:
                        self._send_and_count(current_batch)

                    # Create new batch and add the event. A ValueError here
                    # means the event does not fit even an empty batch; it
                    # is handled by the outer except.
                    current_batch = self.producer.create_batch(partition_key=partition_key)
                    current_batch.add(event)

            # Send final batch if it has events
            if len(current_batch) > 0:
                self._send_and_count(current_batch)

        except Exception as e:
            logger.error(f"❌ Error sending events for partition {partition_key}: {e}")
            self.stats["errors"] += 1

print("✅ EventHubSender class defined successfully!")

In [None]:
# --- Main Databricks to Event Hub Processing ---

def process_dataframe_to_eventhub(
    df, 
    connection_string: str, 
    event_hub_name: str,
    batch_size: int = BATCH_SIZE,
    partition_strategy: str = "hash"
):
    """
    Stream a Spark DataFrame to Event Hub in fixed-size record batches.

    Rows are converted to JSON on the Spark side, pulled with
    ``toLocalIterator()``, buffered, and handed to an ``EventHubSender``
    each time ``batch_size`` records have accumulated.

    Args:
        df: Spark DataFrame to process
        connection_string: Event Hub connection string
        event_hub_name: Event Hub name
        batch_size: Number of records to process in each batch
        partition_strategy: Partitioning strategy ('hash', 'customer_id', 'round_robin')

    Error handling:
        A row that cannot be parsed is logged and skipped. If sending a full
        batch raises, the error is logged and that batch is dropped. The
        buffer is always emptied, so a failed batch is not re-sent together
        with every following row. An error while sending the final partial
        batch propagates to the caller.
    """

    print(f"🚀 Starting DataFrame to Event Hub processing...")
    print(f"   Strategy: {partition_strategy}")
    print(f"   Batch size: {batch_size}")

    # Counting runs a separate pass over the DataFrame; it is used only for
    # the progress percentage and the empty-input shortcut.
    total_records = df.count()
    print(f"   Total records: {total_records:,}")

    if total_records == 0:
        print("⚠️ DataFrame is empty, nothing to process")
        return

    # Serialize every column into one JSON string per row.
    df_json = df.select(to_json(struct(*df.columns)).alias("json_data"))

    processed_records = 0

    with EventHubSender(connection_string, event_hub_name) as sender:

        # Process in batches using toLocalIterator for memory efficiency
        batch_records = []

        for row in df_json.toLocalIterator():
            try:
                # Parse JSON back to dict for processing
                record = json.loads(row["json_data"])
            except Exception as e:
                logger.error(f"❌ Error processing record: {e}")
                continue

            batch_records.append(record)
            if len(batch_records) < batch_size:
                continue

            # Buffer is full: send it, then always start a fresh buffer.
            try:
                sender.send_records_batch(batch_records, partition_strategy)
            except Exception as e:
                logger.error(f"❌ Error sending batch of {len(batch_records)} records: {e}")
            else:
                processed_records += len(batch_records)

                # Progress update
                progress = (processed_records / total_records) * 100
                print(f"📊 Progress: {processed_records:,}/{total_records:,} ({progress:.1f}%)")
            finally:
                batch_records = []  # Reset batch, whether or not the send worked

        # Send remaining records in final batch
        if batch_records:
            sender.send_records_batch(batch_records, partition_strategy)
            processed_records += len(batch_records)

        print(f"✅ Processing complete! Sent {processed_records:,} records to Event Hub")

print("✅ Main processing function defined!")

In [None]:
# --- Example Usage: Sales Customer Data to Event Hub ---

# Example 1: read the sales customer table, preview it, then publish it
print("🔧 Example 1: Basic Sales Customer Processing")

try:
    # Source table (change the name to match your workspace)
    sales_df = spark.table("samples.bakehouse.sales_customers")

    # Preview five rows of the columns of interest, untruncated
    print("\n📋 Sample data:")
    sales_df.select(
        "customer_id",
        "order_id",
        "product_name",
        "order_value",
    ).show(5, truncate=False)

    # Publish with small batches, keyed by customer
    process_dataframe_to_eventhub(
        sales_df,
        EVENT_HUB_CONNECTION_STRING,
        EVENT_HUB_NAME,
        batch_size=50,
        partition_strategy="customer_id",
    )

except Exception as example_error:
    # Every failure in this cell ends up here, not only a missing table.
    print(f"❌ Error in Example 1: {example_error}")
    print("💡 Make sure the table 'samples.bakehouse.sales_customers' exists")

In [None]:
# --- Alternative Examples for Different Data Sources ---

# Example 2: build an in-memory sample DataFrame for testing
print("\n🔧 Example 2: Sample Data Generation")

# Types needed to declare the sample schema
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, TimestampType
from datetime import datetime, timedelta

# Column layout of the sample sales data (all columns nullable)
schema = StructType([
    StructField("customer_id", IntegerType(), True),
    StructField("order_id", StringType(), True),
    StructField("product_name", StringType(), True),
    StructField("order_value", DoubleType(), True),
    StructField("order_date", TimestampType(), True),
    StructField("customer_segment", StringType(), True),
    StructField("region", StringType(), True)
])

# Value pools the random rows draw from
regions = ["North", "South", "East", "West"]
segments = ["Premium", "Standard", "Basic"]
products = ["Widget A", "Widget B", "Widget C", "Gadget X", "Gadget Y"]

# 1000 rows; tuple fields follow the schema order
sample_data = [
    (
        i % 100 + 1,  # customer_id cycles through 1-100
        f"ORD-{i:06d}",  # zero-padded order_id
        random.choice(products),  # product_name
        round(random.uniform(10.0, 500.0), 2),  # order_value
        datetime.now() - timedelta(days=random.randint(0, 365)),  # order_date within the last year
        random.choice(segments),  # customer_segment
        random.choice(regions),  # region
    )
    for i in range(1000)
]

# Materialize as a Spark DataFrame and show a preview
sample_df = spark.createDataFrame(sample_data, schema)

print(f"✅ Created sample DataFrame with {sample_df.count():,} records")
sample_df.show(10)

In [None]:
# Example 3: run the sample data through two partitioning strategies
print("\n🔧 Example 3: Testing Different Partitioning Strategies")

# Hash strategy on the first 100 sample rows
print("\n📊 Testing Hash Partitioning (even distribution):")
try:
    process_dataframe_to_eventhub(
        sample_df.limit(100),
        EVENT_HUB_CONNECTION_STRING,
        EVENT_HUB_NAME,
        batch_size=20,
        partition_strategy="hash",
    )
except Exception as hash_error:
    print(f"❌ Hash partitioning test failed: {hash_error}")

# customer_id strategy on the rows of customers 1-10
print("\n👥 Testing Customer ID Partitioning (events grouped by customer):")
try:
    process_dataframe_to_eventhub(
        sample_df.filter(col("customer_id") <= 10),
        EVENT_HUB_CONNECTION_STRING,
        EVENT_HUB_NAME,
        batch_size=15,
        partition_strategy="customer_id",
    )
except Exception as customer_error:
    print(f"❌ Customer ID partitioning test failed: {customer_error}")

print("\n✅ Partitioning strategy tests complete!")

# 🎯 Databricks Event Hub Integration - Key Features

## ✅ **What This Script Provides:**

### **1. Databricks-Optimized Design**
- Uses `dbutils.secrets` for secure credential management
- Leverages Spark DataFrames with `toLocalIterator()` for memory efficiency
- Proper JSON serialization with `to_json()` and `struct()`
- Progress tracking for large datasets

### **2. Event Hub Best Practices**
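- **Chunking**: Handles large records by splitting the compressed payload into multiple events; consumers must reassemble the chunks (same `correlation_id`, ordered by `chunk_index`) before decompressing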
- **Compression**: Uses gzip to reduce data size
- **Partitioning**: Multiple strategies for optimal throughput
- **Retry Logic**: Exponential backoff for resilient sending
- **Batching**: Configurable batch sizes for performance tuning

### **3. Partitioning Strategies**
- **Hash**: Even distribution across all partitions
- **Customer ID**: Keep related events together (good for ordering)
- **Round Robin**: Simple load balancing (implemented as a random partition pick, not a strict rotation)

### **4. Error Handling & Monitoring**
- Comprehensive logging and statistics
- Graceful error handling with detailed error messages
- Progress tracking for long-running operations

## 🔧 **Configuration for Your Environment:**

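1. **Update Databricks Secrets:**
   ```bash
   # `dbutils.secrets` can only read secrets (get/list); create them with the
   # Databricks CLI or the Secrets REST API. Each command prompts for the value.
   # (Legacy CLI syntax: databricks secrets put --scope kv-scope --key eh-connstr)
   # If "kv-scope" is an Azure Key Vault-backed scope, add the secrets in Key Vault instead.
   databricks secrets put-secret kv-scope eh-connstr   # YOUR_EVENT_HUB_CONNECTION_STRING
   databricks secrets put-secret kv-scope eh-name      # YOUR_EVENT_HUB_NAME
   ```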

2. **Adjust Constants:**
   ```python
   PARTITION_COUNT = 32    # Match your Event Hub partition count
   BATCH_SIZE = 100        # Optimize based on your data size
   MAX_EVENT_BYTES = 900_000  # Adjust based on Event Hub tier
   ```

## 🚀 **Usage Patterns:**

- **Small datasets**: Use higher batch sizes (500-1000)
- **Large datasets**: Use moderate batch sizes (50-200) with progress tracking
- **Real-time streaming**: Combine with Spark Structured Streaming
- **Customer ordering**: Use "customer_id" partitioning strategy