## Writing Data to Various Destinations

In [0]:
# First, let's load our processed streaming data
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Location of the sample streaming events produced by the previous modules
file_path = "/pyspark/video-streaming-data/module3-transform/joins_aggregations/streaming_events.csv"

# Read the CSV: the first row holds the column names and Spark infers the column types
df = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv(file_path)
)

# Preview the first five rows
print("Sample of our streaming data:")
df.limit(5).display()

Sample of our streaming data:


event_id,user_id,content_id,timestamp,duration_seconds,device_type,quality,buffering_count,error_type,ip_address,country,session_id
EVT10000,USR41813,CON10763,2023-09-03T09:18:59Z,565,Web,HD,4,,72.119.240.124,ES,SES10000
EVT10001,USR46484,CON12784,2023-09-09T11:44:27Z,2018,Web,HD,1,,156.3.251.123,FR,SES10001
EVT10002,USR37573,CON16367,2023-09-09T16:51:53Z,2900,TV,4K,3,,182.53.26.241,AU,SES10002
EVT10003,USR46584,CON18916,2023-09-13T08:03:13Z,3242,Tablet,4K,3,,9.203.70.180,FR,SES10003
EVT10004,USR52241,CON18924,2023-09-04T13:07:20Z,4248,TV,4K,1,,152.202.251.124,NL,SES10004


In [0]:
# Report the number of rows in the loaded DataFrame
total_records = df.count()
print(f"Total records: {total_records}")

Total records: 50000


#### 1. WRITE TO PARQUET FORMAT

In [0]:
# Output location for the Parquet copy.
# Parquet is a columnar format that compresses data and provides efficient queries.
parquet_path = "/pyspark/video-streaming-data/module4-load/destinations/output_templates/parquet_output"

# Persist the DataFrame as Parquet, replacing whatever already exists at the path
(
    df.write
    .mode("overwrite")
    .parquet(parquet_path)
)

print(f"Data written to Parquet at: {parquet_path}")

In [0]:
# Verification step: load the Parquet output that was just written
# (reading Parquet is faster than CSV because of Parquet optimizations)
parquet_df = spark.read.parquet(parquet_path)

print("Data read back from Parquet:")
parquet_preview = parquet_df.limit(5)
parquet_preview.display()

#### 2. WRITE TO ORC FORMAT

In [0]:
# Output location for the ORC copy.
# ORC is another columnar format with good compression and performance.
orc_path = "/pyspark/video-streaming-data/module4-load/destinations/output_templates/orc_output"

# Persist the DataFrame as ORC, replacing whatever already exists at the path
(
    df.write
    .mode("overwrite")
    .orc(orc_path)
)

print(f"Data written to ORC at: {orc_path}")

#### 3. WRITE TO DELTA LAKE FORMAT

In [0]:
# Output location for the Delta copy.
# Delta Lake adds ACID transactions, versioning, and many other features.
delta_path = "/pyspark/video-streaming-data/module4-load/destinations/output_templates/delta_output"

# Persist the DataFrame in Delta format, replacing whatever already exists at the path
(
    df.write
    .format("delta")
    .mode("overwrite")
    .save(delta_path)
)

print(f"Data written to Delta Lake at: {delta_path}")

In [0]:
# Verification step: load the Delta output that was just written
delta_reader = spark.read.format("delta")
delta_df = delta_reader.load(delta_path)

print("First 5 rows from Delta Lake:")
delta_preview = delta_df.limit(5)
delta_preview.display()

#### 4. PARTITIONING DATA FOR OPTIMIZED QUERIES

In [0]:
# Output location for the partitioned copy.
# Partitioning by device_type optimizes queries that filter on this column.
partitioned_path = "/pyspark/video-streaming-data/module4-load/destinations/output_templates/partitioned"

# Write Parquet partitioned by device_type, replacing any existing output at the path
(
    df.write
    .partitionBy("device_type")
    .mode("overwrite")
    .parquet(partitioned_path)
)

print(f"Data partitioned by device_type and written to: {partitioned_path}")

In [0]:
# List the partitions that were created.
#
# This notebook relies on Databricks helpers (dbutils, .display()). On Databricks a
# scheme-less path handed to Spark is resolved against DBFS by default, whereas
# os.listdir inspects the driver's local disk, so the listing has to go through
# dbutils.fs.ls. Standard Jupyter has no `dbutils`; in that case fall back to
# os.listdir, which sees the same local filesystem Spark wrote to.
import os

try:
    # Directory entries returned by dbutils.fs.ls carry a trailing "/";
    # strip it so the printed names match what os.listdir would show.
    partitions = [entry.name.rstrip("/") for entry in dbutils.fs.ls(partitioned_path)]
except NameError:
    partitions = os.listdir(partitioned_path)

for partition in partitions:
    print(partition)

#### 5. WRITING TO DATABASE TABLES

In [0]:
# Text widgets for the database connection parameters, as (name, default value, label).
# NOTE(review): the password is collected through a plain text widget (kept for
# compatibility), so whatever is typed is visible in the UI - consider a secret store.
jdbc_widget_specs = [
    ("jdbc_server", "your-azure-sql-server.database.windows.net", "JDBC Server"),
    ("jdbc_database", "videostreamingdb", "Database Name"),
    ("jdbc_table", "streamingevents", "Table Name"),
    ("jdbc_user", "admin", "Username"),
    ("jdbc_password", "your_password_here", "Password"),
]

# Create the widgets in the order listed above
for widget_name, default_value, widget_label in jdbc_widget_specs:
    dbutils.widgets.text(widget_name, default_value, widget_label)

In [0]:
# Read the current value of every connection widget into its own variable
jdbc_server, jdbc_database, jdbc_table, jdbc_user, jdbc_password = (
    dbutils.widgets.get(widget_key)
    for widget_key in (
        "jdbc_server",
        "jdbc_database",
        "jdbc_table",
        "jdbc_user",
        "jdbc_password",
    )
)


In [0]:
# Clean up the UI by removing the connection widgets (their values were read earlier)
widget_names = ["jdbc_database", "jdbc_password", "jdbc_server", "jdbc_user", "jdbc_table"]

# Attempt each removal independently so one failure does not stop the rest
for widget in widget_names:
    try:
        dbutils.widgets.remove(widget)
    except Exception as err:
        print(f"Failed to remove widget {widget}: {err}")
    else:
        print(f"Removed widget: {widget}")

Removed widget: jdbc_database
Removed widget: jdbc_password
Removed widget: jdbc_server
Removed widget: jdbc_user
Removed widget: jdbc_table


In [0]:
# Driver class used for the SQL Server connection
jdbc_driver = "com.microsoft.sqlserver.jdbc.SQLServerDriver"

# Connection URL assembled from the server and database widget values (port 1433)
jdbc_url = f"jdbc:sqlserver://{jdbc_server}:1433;database={jdbc_database}"

# Echo the target (no credentials are printed)
print(f"Configured to write to: {jdbc_server}/{jdbc_database}/{jdbc_table}")

Configured to write to: 57.database.windows.net/videostreamingdb/streamingevents


In [0]:
# With a small dataset (50,000 rows) a low partition count is preferable;
# too many partitions would only create unnecessary overhead.
optimized_df = df.repartition(4)  # 4 partitions for 50,000 rows

# Overwrite the target table over JDBC, using a batch size of 5000 and
# READ_COMMITTED isolation as settings appropriate for this data size.
(
    optimized_df.write
    .format("jdbc")
    .mode("overwrite")
    .option("url", jdbc_url)
    .option("dbtable", jdbc_table)
    .option("user", jdbc_user)
    .option("password", jdbc_password)
    .option("driver", jdbc_driver)
    .option("batchsize", 5000)
    .option("isolationLevel", "READ_COMMITTED")
    .save()
)

print("Data written to database table with optimizations for our dataset")

Data written to database table with optimizations for our dataset


#### 6. Z-ORDERING FOR PERFORMANCE

In [0]:
# 1. Paths: the large input CSV plus the two Delta output locations being compared
base_dir = "/pyspark/video-streaming-data"
large_data_path = f"{base_dir}/module3-transform/optimization/large_events.csv"
zordered_path = f"{base_dir}/module4-load/destinations/output_templates/z_ordered/events"
unoptimized_path = f"{base_dir}/module4-load/destinations/output_templates/unoptimized/events"

# 2. Read the large CSV (header row, inferred column types).
# NOTE(review): this rebinds `df`, the name the earlier cells use for the smaller
# streaming_events dataset, so re-running those cells afterwards writes this data instead.
print("Loading large dataset...")
df = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv(large_data_path)
)
print(f"Dataset loaded: {df.count():,} rows")

Loading large dataset...
Dataset loaded: 100,000 rows


In [0]:
# 3. Write the data as-is to the baseline ("unoptimized") Delta location
print("Creating unoptimized Delta table...")
(
    df.write
    .format("delta")
    .mode("overwrite")
    .save(unoptimized_path)
)

# 4. Write a second copy, then run OPTIMIZE ... ZORDER BY on user_id and content_id
print("Creating and Z-ordering Delta table...")
(
    df.write
    .format("delta")
    .mode("overwrite")
    .save(zordered_path)
)
# Kept as the cell's final expression so the notebook echoes the OPTIMIZE result
spark.sql(f"OPTIMIZE delta.`{zordered_path}` ZORDER BY (user_id, content_id)")


Creating unoptimized Delta table...
Creating and Z-ordering Delta table...


DataFrame[path: string, metrics: struct<numFilesAdded:bigint,numFilesRemoved:bigint,filesAdded:struct<min:bigint,max:bigint,avg:double,totalFiles:bigint,totalSize:bigint>,filesRemoved:struct<min:bigint,max:bigint,avg:double,totalFiles:bigint,totalSize:bigint>,partitionsOptimized:bigint,zOrderStats:struct<strategyName:string,inputCubeFiles:struct<num:bigint,size:bigint>,inputOtherFiles:struct<num:bigint,size:bigint>,inputNumCubes:bigint,mergedFiles:struct<num:bigint,size:bigint>,numOutputCubes:bigint,mergedNumCubes:bigint>,clusteringStats:struct<inputZCubeFiles:struct<numFiles:bigint,size:bigint>,inputOtherFiles:struct<numFiles:bigint,size:bigint>,inputNumZCubes:bigint,mergedFiles:struct<numFiles:bigint,size:bigint>,numOutputZCubes:bigint>,numBins:bigint,numBatches:bigint,totalConsideredFiles:bigint,totalFilesSkipped:bigint,preserveInsertionOrder:boolean,numFilesSkippedToReduceWriteAmplification:bigint,numBytesSkippedToReduceWriteAmplification:bigint,startTimeMs:bigint,endTimeMs:bigint,

In [0]:
# 5. Simple performance test
import time


def _timed_filtered_count(table_path, where_clause):
    """Count the rows of a path-based Delta table that match a filter, and time it.

    Args:
        table_path: Location of the Delta table, queried as delta.`<path>`.
        where_clause: SQL predicate placed after WHERE.

    Returns:
        A ``(row_count, elapsed_ms)`` tuple: the COUNT(*) result and the time the
        query took in milliseconds.
    """
    # Clear Spark's cache first so earlier queries do not skew the measurement
    spark.catalog.clearCache()

    # perf_counter is a monotonic, high-resolution clock meant for measuring
    # intervals; time.time() is wall-clock time and can jump if the clock is adjusted.
    start = time.perf_counter()
    row_count = spark.sql(
        f"SELECT COUNT(*) FROM delta.`{table_path}` WHERE {where_clause}"
    ).collect()[0][0]
    elapsed_ms = (time.perf_counter() - start) * 1000
    return row_count, elapsed_ms


# Test query that benefits from Z-ordering (filters on both Z-ordered columns)
test_query = "user_id LIKE 'USR1%' AND content_id LIKE 'CON1%'"

# Test unoptimized
print("\nTesting unoptimized Delta table...")
unopt_count, unopt_time = _timed_filtered_count(unoptimized_path, test_query)
print(f"Found {unopt_count} rows in {unopt_time:.2f} ms")

# Test Z-ordered
print("\nTesting Z-ordered Delta table...")
zorder_count, zorder_time = _timed_filtered_count(zordered_path, test_query)
print(f"Found {zorder_count} rows in {zorder_time:.2f} ms")

# 6. Show comparison (each query is timed once, so treat the figures as indicative only)
if zorder_time < unopt_time:
    improvement = (unopt_time - zorder_time) / unopt_time * 100
    print(f"\nZ-ordering improved performance by {improvement:.2f}%")
    print(f"Z-ordered query was {unopt_time/zorder_time:.2f}x faster")
else:
    print("\nIn this demo, Z-ordering didn't show performance improvement.")
    print("This can happen with small datasets or cached data.")


Testing unoptimized Delta table...
Found 19902 rows in 1095.26 ms

Testing Z-ordered Delta table...
Found 19902 rows in 703.11 ms

Z-ordering improved performance by 35.80%
Z-ordered query was 1.56x faster


In [0]:
# Z-ordering Benefits: summary printed one line at a time
zorder_benefit_lines = (
    "Key benefits of Z-ordering:",
    "1. Multi-dimensional clustering - efficient for queries with multiple filter conditions",
    "2. Works well with high-cardinality columns (unlike partitioning)",
    "3. No need to decide in advance which columns to optimize (more flexible)",
    "4. Can be applied after table creation (unlike partitioning)",
    "5. Particularly effective for selective queries on large datasets",
)
for benefit_line in zorder_benefit_lines:
    print(benefit_line)


In [0]:
# WRITING BEST PRACTICES: closing checklist printed one line at a time
best_practice_lines = (
    "Best practices when writing data:",
    "1. Choose the right file format for your use case",
    "   - Parquet/ORC: Good for analytics workloads",
    "   - Delta: When you need ACID transactions, time travel, schema enforcement",
    "2. Use appropriate partitioning for large datasets",
    "   - Partition on low-cardinality columns (date, country, category)",
    "   - Avoid over-partitioning (creates too many small files)",
    "3. Use Z-ordering for high-cardinality columns in Delta tables",
    "4. Consider compaction to manage file sizes",
    "5. Use appropriate write modes",
    "   - 'overwrite': Replace existing data",
    "   - 'append': Add to existing data",
    "   - 'ignore': Skip if data exists",
    "   - 'error': Fail if data exists (default)",
)
for practice_line in best_practice_lines:
    print(practice_line)