# Kafka Data Generator
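This notebook generates sample client events for testing the Kafka Alerting Pipeline.
Use it to send test messages to your Kafka topic. The Kafka write calls are commented out by default, so update the configuration and uncomment them before running.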


In [15]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
import json
from datetime import datetime, timedelta
import random
from databricks.connect import DatabricksSession

In [None]:
import getpass
import os

# Force personal-access-token (PAT) authentication, overriding whatever the
# VS Code extension configured.
os.environ['DATABRICKS_AUTH_TYPE'] = 'pat'  # Personal Access Token
os.environ['DATABRICKS_HOST'] = 'https://e2-demo-field-eng.cloud.databricks.com'

# Never paste the token into the notebook. Reuse a token that is already
# exported in the environment; otherwise prompt for it without echoing.
# (Assigning a literal here would clobber an exported token and risks
# committing a secret.)
if not os.environ.get('DATABRICKS_TOKEN'):
    os.environ['DATABRICKS_TOKEN'] = getpass.getpass('Databricks personal access token: ')

# Remove env vars flagged as problematic for this connection
# (presumably they steer the client toward other compute/auth -- confirm).
os.environ.pop('DATABRICKS_SERVERLESS_COMPUTE_ID', None)
os.environ.pop('DATABRICKS_METADATA_SERVICE_URL', None)

# Import and create the session only after the environment has been prepared.
from databricks.connect import DatabricksSession

spark = DatabricksSession.builder.serverless().getOrCreate()

# Smoke test: run a trivial query to prove the session works.
print("✅ Connected!")
spark.sql("SELECT 'Hello from Databricks!' as message").show()

✅ Connected!
+--------------------+
|             message|
+--------------------+
|Hello from Databr...|
+--------------------+



In [19]:
# Rebind `spark` to a serverless session configured from the named profile.
spark = (
    DatabricksSession.builder
    .serverless()
    .profile("e2-field-eng-demo")
    .getOrCreate()
)

In [20]:
# Kafka connection settings -- update both values for your environment.
# NOTE(review): `spark` is a Databricks Connect session, so a Kafka write
# presumably runs on the remote compute and "localhost" would resolve there,
# not on this machine -- confirm before enabling the write cells.
KAFKA_TOPIC = "client-events"
KAFKA_BOOTSTRAP_SERVERS = "localhost:9092"

# Fictitious clients that generated events are attributed to.
CLIENTS = [
    {"id": client_id, "name": client_name}
    for client_id, client_name in (
        ("client_001", "Acme Corp"),
        ("client_002", "Global Industries"),
        ("client_003", "Tech Solutions"),
        ("client_004", "Finance Group"),
        ("client_005", "Retail Chain"),
    )
]

# Values drawn at random for each event's "event_type" field.
EVENT_TYPES = [
    "transaction",
    "login",
    "purchase",
    "update",
    "delete",
]

# Values drawn at random for each event's "status" field.
STATUSES = [
    "active",
    "pending",
    "completed",
    "failed",
]


In [21]:
def generate_sample_message():
    """Build one random client event shaped like the pipeline's input records.

    Returns:
        dict: Keys ``client_id``, ``client_name``, ``timestamp`` (ISO-8601
        string, up to one hour before now), ``event_type``, ``data``,
        ``amount`` (rounded to 2 decimals), ``status`` and ``metadata``.
    """
    # The notebook wildcard-imports pyspark.sql.functions, which shadows
    # Python's round(); go through `builtins` to reach the real one.
    import builtins

    chosen_client = random.choice(CLIENTS)

    # Back-date the event by a random 0-3600 seconds.
    current_time = datetime.now()
    age_seconds = random.randint(0, 3600)
    event_time = current_time - timedelta(seconds=age_seconds)

    event_type = random.choice(EVENT_TYPES)
    payload = {
        "key1": f"value_{random.randint(1, 100)}",
        "key2": f"data_{random.randint(1, 100)}",
        "source": random.choice(["web", "mobile", "api"]),
    }
    amount = builtins.round(random.uniform(10, 5000), 2)
    status = random.choice(STATUSES)
    metadata = {
        "source": random.choice(["web", "mobile", "api"]),
        "region": random.choice(["us-east", "us-west", "eu-west", "ap-south"]),
        "version": "1.0",
    }

    return {
        "client_id": chosen_client["id"],
        "client_name": chosen_client["name"],
        "timestamp": event_time.isoformat(),
        "event_type": event_type,
        "data": payload,
        "amount": amount,
        "status": status,
        "metadata": metadata,
    }

# Build the in-memory batch of 100 random events used by the cells below.
sample_messages = [generate_sample_message() for _ in range(100)]

# Preview the first three events as pretty-printed JSON, each followed by a rule.
print("Sample messages generated:")
separator = "-" * 50
for msg in sample_messages[:3]:
    print(json.dumps(msg, indent=2), separator, sep="\n")


Sample messages generated:
{
  "client_id": "client_004",
  "client_name": "Finance Group",
  "timestamp": "2026-02-02T14:54:32.607806",
  "event_type": "purchase",
  "data": {
    "key1": "value_75",
    "key2": "data_14",
    "source": "api"
  },
  "amount": 267.69,
  "status": "pending",
  "metadata": {
    "source": "web",
    "region": "us-west",
    "version": "1.0"
  }
}
--------------------------------------------------
{
  "client_id": "client_004",
  "client_name": "Finance Group",
  "timestamp": "2026-02-02T14:42:26.607843",
  "event_type": "transaction",
  "data": {
    "key1": "value_56",
    "key2": "data_2",
    "source": "web"
  },
  "amount": 4980.9,
  "status": "completed",
  "metadata": {
    "source": "mobile",
    "region": "us-west",
    "version": "1.0"
  }
}
--------------------------------------------------
{
  "client_id": "client_003",
  "client_name": "Tech Solutions",
  "timestamp": "2026-02-02T14:41:53.607856",
  "event_type": "update",
  "data": {
    "ke

## Option 1: Write to Kafka with a Batch DataFrame Write


In [22]:
# Spark schema for the generated events. The timestamp stays a string, and
# the nested "data"/"metadata" dicts are modelled as string-to-string maps.
schema = (
    StructType()
    .add("client_id", StringType())
    .add("client_name", StringType())
    .add("timestamp", StringType())
    .add("event_type", StringType())
    .add("data", MapType(StringType(), StringType()))
    .add("amount", DoubleType())
    .add("status", StringType())
    .add("metadata", MapType(StringType(), StringType()))
)

df = spark.createDataFrame(sample_messages, schema)

# Shape rows as Kafka records: client id as the key, the whole row
# serialised to a JSON string as the value.
record_key = col("client_id").alias("key")
record_value = to_json(struct("*")).alias("value")
kafka_df = df.select(record_key, record_value)

# Write to Kafka
# Uncomment and update with your Kafka configuration
# kafka_df.write \
#     .format("kafka") \
#     .option("kafka.bootstrap.servers", KAFKA_BOOTSTRAP_SERVERS) \
#     .option("topic", KAFKA_TOPIC) \
#     .save()

# Report how many records are staged and show a few untruncated.
print(f"Generated {kafka_df.count()} messages ready to send to Kafka")
kafka_df.show(5, truncate=False)


Generated 100 messages ready to send to Kafka
+----------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|key       |value                                                                                                                                                                                                                                                                                        |
+----------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|client_004|{"client_id":"client_004","client_name":"Fina

## Option 2: Continuous Data Generation (for testing streaming)


In [23]:
# Use rate source for continuous generation
from pyspark.sql.functions import expr, rand, when

# Streaming "rate" source, configured to generate 10 rows per second.
rate_stream = (
    spark.readStream
    .format("rate")
    .option("rowsPerSecond", 10)
    .load()
)

# Transform to match our schema
def generate_streaming_data(batch_df, batch_id):
    """Generate one random sample message per micro-batch row and stage it for Kafka.

    Intended as a ``foreachBatch`` callback for ``rate_stream``. The content
    of ``batch_df`` is ignored; only its row count decides how many messages
    are generated.

    Args:
        batch_df: Micro-batch DataFrame passed in by ``foreachBatch``.
        batch_id: Micro-batch identifier, used only in the progress message.
    """
    # Count the micro-batch once and reuse the number for the progress
    # message instead of launching a second Spark job.
    row_count = batch_df.count()
    messages = [generate_sample_message() for _ in range(row_count)]

    # Use the session that owns the micro-batch rather than the notebook's
    # global `spark`. NOTE(review): under Spark Connect the callback may run
    # outside the notebook process, where that global is unavailable -- confirm.
    messages_df = batch_df.sparkSession.createDataFrame(messages, schema)

    # Convert to Kafka format
    kafka_messages = messages_df.select(
        col("client_id").alias("key"),
        to_json(struct("*")).alias("value")
    )

    # Write to Kafka
    # Uncomment and update with your Kafka configuration
    # kafka_messages.write \
    #     .format("kafka") \
    #     .option("kafka.bootstrap.servers", KAFKA_BOOTSTRAP_SERVERS) \
    #     .option("topic", KAFKA_TOPIC) \
    #     .save()

    print(f"Batch {batch_id}: Generated {row_count} messages")

# Start the streaming query (uncomment to run).
# foreachBatch hands every micro-batch to generate_streaming_data, and
# awaitTermination() keeps the cell blocked until the query stops or fails.
# NOTE(review): `spark` is created with .serverless() earlier in this
# notebook; serverless compute may only allow specific streaming triggers --
# confirm before enabling this block.
# query = rate_stream.writeStream \
#     .foreachBatch(generate_streaming_data) \
#     .outputMode("update") \
#     .start()
# 
# query.awaitTermination()

print("Streaming data generator configured. Uncomment the query section to start.")


Streaming data generator configured. Uncomment the query section to start.


## Test Data with Edge Cases


In [24]:
# Edge-case fixtures for exercising the pipeline's error handling.
def _edge_case(client_id, client_name, amount, status, with_timestamp=True):
    """Build one test event, inserting keys in the same order as the sample messages.

    Args:
        client_id: Value for the "client_id" field.
        client_name: Value for the "client_name" field.
        amount: Value for the "amount" field.
        status: Value for the "status" field.
        with_timestamp: When False the "timestamp" key is omitted entirely.

    Returns:
        dict: A fresh event with its own "data" and "metadata" dicts.
    """
    record = {"client_id": client_id, "client_name": client_name}
    if with_timestamp:
        record["timestamp"] = datetime.now().isoformat()
    record["event_type"] = "test"
    record["data"] = {"test": "data"}
    record["amount"] = amount
    record["status"] = status
    record["metadata"] = {"source": "test"}
    return record


edge_case_messages = [
    # Valid message
    _edge_case("client_001", "Test Client", 100.0, "active"),
    # Missing timestamp (should be dropped by bronze layer)
    _edge_case("client_002", "Test Client 2", 50.0, "pending", with_timestamp=False),
    # Invalid status (should be dropped by silver layer)
    _edge_case("client_003", "Test Client 3", 75.0, "invalid_status"),
    # Negative amount (should be dropped by silver layer)
    _edge_case("client_004", "Test Client 4", -100.0, "completed"),
]

# Print every fixture as numbered, pretty-printed JSON followed by a rule.
print("Edge case test messages:")
for number, msg in enumerate(edge_case_messages, start=1):
    print(f"\n{number}. {json.dumps(msg, indent=2)}")
    print("-" * 50)


Edge case test messages:

1. {
  "client_id": "client_001",
  "client_name": "Test Client",
  "timestamp": "2026-02-02T15:37:54.380123",
  "event_type": "test",
  "data": {
    "test": "data"
  },
  "amount": 100.0,
  "status": "active",
  "metadata": {
    "source": "test"
  }
}
--------------------------------------------------

2. {
  "client_id": "client_002",
  "client_name": "Test Client 2",
  "event_type": "test",
  "data": {
    "test": "data"
  },
  "amount": 50.0,
  "status": "pending",
  "metadata": {
    "source": "test"
  }
}
--------------------------------------------------

3. {
  "client_id": "client_003",
  "client_name": "Test Client 3",
  "timestamp": "2026-02-02T15:37:54.380145",
  "event_type": "test",
  "data": {
    "test": "data"
  },
  "amount": 75.0,
  "status": "invalid_status",
  "metadata": {
    "source": "test"
  }
}
--------------------------------------------------

4. {
  "client_id": "client_004",
  "client_name": "Test Client 4",
  "timestamp": "202