In [1]:
import pyspark
from pyspark.sql import Column, DataFrame, SparkSession, functions
from pyspark.sql.functions import *
from py4j.java_collections import MapConverter
import shutil
import random
import threading
import os
# Build the Spark configuration for the standalone cluster, wiring in the
# S3A object store, the Delta Lake SQL extensions and the Delight listener,
# then create the SparkContext used by the rest of the notebook.
conf = pyspark.SparkConf()
conf.setMaster("spark://spark:7077")
# Delight access token; None when DELIGHT_SECRET is not exported.
delight_secret = os.environ.get('DELIGHT_SECRET')

# Getting non-existent keys (os.getenv returns None when the key is unset)
FOO = os.getenv('FOO')

# Allow the object-store credentials to be supplied from the environment.
# The fallbacks are the values that used to be hardcoded here, so the
# notebook behaves exactly as before when the variables are not set.
s3_access_key = os.environ.get('MINIO_ACCESS_KEY', 'minio')
s3_secret_key = os.environ.get('MINIO_SECRET_KEY', 'minio123')

conf.set("spark.hadoop.fs.s3a.endpoint", 'http://s3:9000') \
    .set("spark.hadoop.fs.s3a.access.key", s3_access_key) \
    .set("spark.hadoop.fs.s3a.secret.key", s3_secret_key) \
    .set("spark.hadoop.fs.s3a.fast.upload", True) \
    .set("spark.hadoop.fs.s3a.path.style.access", True) \
    .set("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \
    .set("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .set("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
    .set("spark.jars.repositories", "https://oss.sonatype.org/content/repositories/snapshots") \
    .set("spark.jars.packages","co.datamechanics:delight_2.12:latest-SNAPSHOT") \
    .set("spark.extraListeners", "co.datamechanics.delight.DelightListener")

if delight_secret:
    conf.set("spark.delight.accessToken.secret", delight_secret)
else:
    # SparkConf.set() converts its value with str(), so passing None would
    # silently store the literal text "None" as the access token.
    print('DELIGHT_SECRET is not set; spark.delight.accessToken.secret left unconfigured')

sc = pyspark.SparkContext(conf=conf)

https://oss.sonatype.org/content/repositories/snapshots added as a remote repository with the name: repo-1


:: loading settings :: url = jar:file:/usr/local/spark-3.1.2-bin-hadoop3.2/jars/ivy-2.4.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/jovyan/.ivy2/cache
The jars for the packages stored in: /home/jovyan/.ivy2/jars
co.datamechanics#delight_2.12 added as a dependency
io.delta#delta-core_2.12 added as a dependency
com.amazonaws#aws-java-sdk-bundle added as a dependency
org.apache.hadoop#hadoop-aws added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-49194ebf-64da-4c34-bdd8-a5342d33f332;1.0
	confs: [default]
	found co.datamechanics#delight_2.12;latest-SNAPSHOT in repo-1
	found io.delta#delta-core_2.12;1.0.0 in central
	found org.antlr#antlr4;4.7 in central
	found org.antlr#antlr4-runtime;4.7 in central
	found org.antlr#antlr-runtime;3.5.2 in central
	found org.antlr#ST4;4.0.8 in central
	found org.abego.treelayout#org.abego.treelayout.core;1.0.3 in central
	found org.glassfish#javax.json;1.0.4 in central
	found com.ibm.icu#icu4j;58.2 in central
	found com.amazonaws#aws-java-sdk-bundle;1.11.819 in central
	found org.apache.hadoop#hadoop-aws;3.2.0 in centra

In [3]:
# Set the SparkContext log level to INFO. The INFO-level Spark log lines
# that appear in the outputs of the following cells are a result of this.
sc.setLogLevel('INFO')

In [4]:
from pyspark.sql import SparkSession
# Obtain the session through the class-level builder instead of constructing
# a SparkSession by hand just to reach `.builder`. getOrCreate() attaches to
# the SparkContext that is already running, so the configuration set on `sc`
# still applies.
spark = SparkSession.builder.appName("streaming").getOrCreate()
from delta.tables import *

21/11/02 17:01:00 INFO SharedState: Setting hive.metastore.warehouse.dir ('null') to the value of spark.sql.warehouse.dir ('file:/home/jovyan/work/spark-warehouse').
21/11/02 17:01:00 INFO SharedState: Warehouse path is 'file:/home/jovyan/work/spark-warehouse'.


In [5]:
from pyspark.sql.utils import AnalysisException
def get_from_raw_to_delta_stream(streamingRawDF, checkpointLocation, data_uri, partitioned_by):
    """Start a one-shot append of a streaming DataFrame into a Delta location.

    Args:
        streamingRawDF: streaming DataFrame whose ``writeStream`` is used.
        checkpointLocation: value passed as the ``checkpointLocation`` option.
        data_uri: target path handed to ``start()``.
        partitioned_by: column name(s) forwarded to ``partitionBy``.

    Returns:
        Whatever ``start(data_uri)`` returns (the started streaming query).
    """
    writer = streamingRawDF.writeStream
    writer = writer.format("delta")
    writer = writer.outputMode("append")
    writer = writer.partitionBy(partitioned_by)
    writer = writer.option("overwriteSchema", "true")
    # trigger(once=True): requested as a single run rather than a continuous query.
    writer = writer.trigger(once=True)
    writer = writer.options(ignoreDeletes=True)
    writer = writer.option("checkpointLocation", checkpointLocation)
    return writer.start(data_uri)


def get_streaming_df(OBJECTURL, format='json'):
    """Return a streaming DataFrame that reads ``OBJECTURL`` in ``format``.

    For 'json' and 'csv' the schema is inferred first with a one-off batch
    read of the same location and then passed to the streaming reader. Any
    other format (e.g. 'delta') is loaded without an explicit schema.

    Args:
        OBJECTURL: path/URI of the data to read.
        format: source format name passed to the Spark readers.

    Returns:
        The DataFrame produced by ``spark.readStream...load(OBJECTURL)``.
    """
    if format in ('json', 'csv'):
        # Infer the schema with the requested format. This used to be
        # hardcoded to 'json', which parsed CSV input with the JSON reader.
        inferred_schema = spark.read.format(format).load(OBJECTURL).schema
        return spark.readStream.format(format).schema(inferred_schema).load(OBJECTURL)
    return spark.readStream.format(format).load(OBJECTURL)

def getDeltaTableFromPath(path):
    """Return the DeltaTable at ``path``, or None if it is not a Delta table yet.

    An AnalysisException whose message contains "is not a Delta table" is
    treated as "table not created so far": a notice is printed and None is
    returned. Any other AnalysisException is re-raised.
    """
    try:
        return DeltaTable.forPath(spark, path)
    except AnalysisException as error:
        if "is not a Delta table" not in str(error):
            raise error
        print('1st time we call, not yet created')
        return None

def mergetoDF(microDF, batchId):
    """foreachBatch handler: upsert one micro-batch into the global ``deltaDf``.

    The batch is de-duplicated on ``id`` and merged on ``s.id = t.id``;
    matching rows are updated and non-matching rows are inserted.

    Args:
        microDF: the micro-batch DataFrame.
        batchId: identifier of the micro-batch (only used in the log line).
    """
    print(f"inside foreachBatch for batchId{batchId}. rows passed={microDF.count()}")
    deduped = microDF.dropDuplicates(["id"])
    # `deltaDf` is looked up in the notebook's global namespace at call time.
    merge_builder = deltaDf.alias("t").merge(deduped.alias("s"), "s.id = t.id")
    merge_builder = merge_builder.whenMatchedUpdateAll()
    merge_builder = merge_builder.whenNotMatchedInsertAll()
    merge_builder.execute()
# table_category is one of: raw / bronze / silver / gold
def get_uri(table_name, table_category='raw', is_checkpoint=False, base_uri='s3a://minio-sink-bucket/'):
    """Build the storage URI for a table in the given category.

    * 'raw'   -> ``<base_uri>topics/<table_name>`` (``is_checkpoint`` is ignored)
    * other   -> ``<base_uri><category>/checkpoint/<table_name>`` when
                 ``is_checkpoint`` is truthy, else
                 ``<base_uri><category>/data/<table_name>``
    """
    if table_category == 'raw':
        return base_uri + 'topics/' + table_name
    leaf = 'checkpoint/' if is_checkpoint else 'data/'
    return base_uri + table_category + '/' + leaf + table_name

def get_tables_uri(table_name):
    """Return every URI the pipeline uses for ``table_name``.

    Returns:
        A 5-tuple ``(raw, bronze, bronze_checkpoint, silver, silver_checkpoint)``
        built with ``get_uri``.
    """
    raw_table = get_uri(table_name)
    layer_uris = []
    for layer in ('bronze', 'silver'):
        layer_uris.append(get_uri(table_name, table_category=layer))
        layer_uris.append(get_uri(table_name, table_category=layer, is_checkpoint=True))
    return (raw_table, *layer_uris)

In [6]:
# Customers: stream the raw topic into the bronze Delta table.
(raw_table,
 bronze_table,
 bronze_table_checkpoint,
 silver_table,
 silver_table_checkpoint) = get_tables_uri(table_name='customers')

# Keep only the fields nested under "after" and stamp each row with the
# time it was ingested.
streamingCustRawDF = (
    get_streaming_df(raw_table)
    .select(col("after.id"), col("after.first_name"), col("after.last_name"), col("after.email"))
    .withColumn('inserted', current_timestamp())
)

partitioned_by = ['last_name', 'first_name']
stream = get_from_raw_to_delta_stream(
    streamingCustRawDF,
    checkpointLocation=bronze_table_checkpoint,
    data_uri=bronze_table,
    partitioned_by=partitioned_by,
)
# Block until the streaming query has terminated before moving on.
stream.awaitTermination()

21/11/02 17:01:02 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
21/11/02 17:01:02 INFO MetricsSystemImpl: Scheduled Metric snapshot period at 10 second(s).
21/11/02 17:01:02 INFO MetricsSystemImpl: s3a-file-system metrics system started
21/11/02 17:01:03 INFO DelightStreamingConnector: Successfully sent heartbeat
21/11/02 17:01:04 INFO InMemoryFileIndex: It took 138 ms to list leaf files for 1 paths.
21/11/02 17:01:04 INFO InMemoryFileIndex: It took 28 ms to list leaf files for 4 paths.
21/11/02 17:01:06 INFO FileSourceStrategy: Pushed Filters: 
21/11/02 17:01:06 INFO FileSourceStrategy: Post-Scan Filters: 
21/11/02 17:01:06 INFO FileSourceStrategy: Output Data Schema: struct<value: string>
21/11/02 17:01:06 INFO MemoryStore: Block broadcast_0 stored as values in memory (estimated size 183.0 KiB, free 434.2 MiB)
21/11/02 17:01:06 INFO MemoryStore: Block broadcast_0_piece0 stored as bytes in memory (estimated

In [7]:
# Load the bronze customers Delta table and print its current contents.
# NOTE(review): getDeltaTableFromPath returns None when the path is not a
# Delta table yet; .toDF() below would then raise AttributeError.
deltaTable=getDeltaTableFromPath(bronze_table)
print("#############  Original Delta Table ###############")
deltaTable.toDF().show()

#############  Original Delta Table ###############


21/11/02 17:01:27 INFO DataSourceStrategy: Pruning directories with: 
21/11/02 17:01:27 INFO FileSourceStrategy: Pushed Filters: 
21/11/02 17:01:27 INFO FileSourceStrategy: Post-Scan Filters: 
21/11/02 17:01:27 INFO FileSourceStrategy: Output Data Schema: struct<id: bigint, email: string, inserted: timestamp ... 1 more fields>
21/11/02 17:01:27 INFO DelightStreamingConnector: Flushing 1 events, now 3 pending events
21/11/02 17:01:27 INFO CodeGenerator: Code generated in 18.5021 ms
21/11/02 17:01:27 INFO MemoryStore: Block broadcast_7 stored as values in memory (estimated size 184.9 KiB, free 432.5 MiB)
21/11/02 17:01:27 INFO MemoryStore: Block broadcast_7_piece0 stored as bytes in memory (estimated size 30.5 KiB, free 432.5 MiB)
21/11/02 17:01:27 INFO BlockManagerInfo: Added broadcast_7_piece0 in memory on pyspark:43275 (size: 30.5 KiB, free: 434.1 MiB)
21/11/02 17:01:27 INFO SparkContext: Created broadcast 7 from showString at NativeMethodAccessorImpl.java:0
21/11/02 17:01:27 INFO Sna

+----+----------+---------+--------------------+--------------------+
|  id|first_name|last_name|               email|            inserted|
+----+----------+---------+--------------------+--------------------+
|1001|     Sally|   Thomas|sally.thomas@acme...|2021-11-02 16:27:...|
|1004|      Anne|Kretchmar|  annek@noanswer.org|2021-11-02 16:27:...|
|1002|    George|   Bailey|  gbailey@foobar.com|2021-11-02 16:27:...|
|1003|    Edward|   Walker|       ed@walker.com|2021-11-02 16:27:...|
+----+----------+---------+--------------------+--------------------+



21/11/02 17:01:31 INFO TaskSetManager: Finished task 0.0 in stage 7.0 (TID 105) in 217 ms on 172.29.0.2 (executor 0) (1/1)
21/11/02 17:01:31 INFO TaskSchedulerImpl: Removed TaskSet 7.0, whose tasks have all completed, from pool 
21/11/02 17:01:31 INFO DAGScheduler: ResultStage 7 (showString at NativeMethodAccessorImpl.java:0) finished in 0.222 s
21/11/02 17:01:31 INFO DAGScheduler: Job 4 is finished. Cancelling potential speculative or zombie tasks for this job
21/11/02 17:01:31 INFO TaskSchedulerImpl: Killing all running tasks in stage 7: Stage finished
21/11/02 17:01:31 INFO DelightStreamingConnector: Flushing 6 events, now 15 pending events
21/11/02 17:01:31 INFO DelightStreamingConnector: Flushing 1 events, now 16 pending events
21/11/02 17:01:31 INFO DAGScheduler: Job 4 finished: showString at NativeMethodAccessorImpl.java:0, took 0.226189 s
21/11/02 17:01:31 INFO CodeGenerator: Code generated in 13.2897 ms
21/11/02 17:01:31 INFO DelightStreamingConnector: Flushing 1 events, now 1

In [8]:
# Open the bronze customers table as a streaming source. With format 'delta'
# get_streaming_df takes the branch that loads without schema inference.
customer_stream_df=get_streaming_df(bronze_table, 'delta')

In [9]:
# Customers: propagate the bronze stream into the silver Delta table.
# `deltaDf` must stay a module-level name: mergetoDF reads it as the merge target.
deltaDf = getDeltaTableFromPath(silver_table)
data_stream_writer = (
    customer_stream_df.writeStream
    .format("delta")
    .outputMode("append")
    .trigger(once=True)
    .option("checkpointLocation", silver_table_checkpoint)
)
if deltaDf is None:
    # No silver table yet: the first run creates it by writing the stream directly.
    print('first time, creating table')
    customer_silver_query = data_stream_writer.start(silver_table)
else:
    # Silver table exists: upsert every micro-batch through mergetoDF.
    print('not first time, merging data')
    customer_silver_query = data_stream_writer.foreachBatch(mergetoDF).start()

# start() returns immediately. Wait for the one-shot query to finish so that
# later cells read the updated silver table and cannot rebind `deltaDf` while
# mergetoDF is still using it.
customer_silver_query.awaitTermination()

21/11/02 17:01:31 INFO DelegatingLogStore: LogStore org.apache.spark.sql.delta.storage.S3SingleDriverLogStore is used for scheme s3a
21/11/02 17:01:31 INFO DeltaLog: Loading version 0.
21/11/02 17:01:31 INFO Snapshot: [tableId=735be8a5-0563-44bd-82b3-bf808c8f1fbe] DELTA: Compute snapshot for version: 0
21/11/02 17:01:31 INFO MemoryStore: Block broadcast_11 stored as values in memory (estimated size 183.0 KiB, free 433.0 MiB)
21/11/02 17:01:31 INFO MemoryStore: Block broadcast_11_piece0 stored as bytes in memory (estimated size 29.8 KiB, free 433.0 MiB)
21/11/02 17:01:31 INFO BlockManagerInfo: Added broadcast_11_piece0 in memory on pyspark:43275 (size: 29.8 KiB, free: 434.2 MiB)
21/11/02 17:01:31 INFO SparkContext: Created broadcast 11 from toString at String.java:2951
21/11/02 17:01:31 INFO DeltaLogFileIndex: Created DeltaLogFileIndex(JSON, numFilesInSegment: 1, totalFileSize: 1417)
21/11/02 17:01:31 INFO DelightStreamingConnector: Successfully sent payload
21/11/02 17:01:31 INFO Block

not first time, merging data


21/11/02 17:01:35 INFO DelightStreamingConnector: Successfully sent payload
21/11/02 17:01:35 INFO MicroBatchExecution: Starting [id = e7d7244d-4c2d-4595-b94c-ead7337ee48a, runId = 22a0b2af-4a60-4d50-8372-94f791db2c8c]. Use s3a://minio-sink-bucket/silver/checkpoint/customers to store the query checkpoint.
21/11/02 17:01:35 INFO DelightStreamingConnector: Flushing 1 events, now 9 pending events


In [10]:
# Display the current contents of the silver customers Delta table.
# NOTE(review): raises AttributeError if the path is not a Delta table yet,
# because getDeltaTableFromPath returns None in that case.
getDeltaTableFromPath(silver_table).toDF().show()

21/11/02 17:01:35 INFO MicroBatchExecution: Using Source [DeltaSource[s3a://minio-sink-bucket/bronze/data/customers]] from DataSourceV1 named 'delta' [DataSource(org.apache.spark.sql.SparkSession@700e06d7,delta,List(),None,List(),None,Map(path -> s3a://minio-sink-bucket/bronze/data/customers),None)]
21/11/02 17:01:35 WARN MicroBatchExecution: The read limit MaxFiles: 1000 for DeltaSource[s3a://minio-sink-bucket/bronze/data/customers] is ignored when Trigger.Once() is used.
21/11/02 17:01:35 INFO DelightStreamingConnector: Successfully sent payload
21/11/02 17:01:35 INFO FileSourceStrategy: Pushed Filters: 
21/11/02 17:01:35 INFO FileSourceStrategy: Post-Scan Filters: 
21/11/02 17:01:35 INFO FileSourceStrategy: Output Data Schema: struct<id: bigint, first_name: string, last_name: string, email: string, inserted: timestamp ... 3 more fields>
21/11/02 17:01:35 INFO DelightStreamingConnector: Flushing 1 events, now 2 pending events
21/11/02 17:01:35 INFO CodeGenerator: Code generated in 15

+----+----------+---------+--------------------+--------------------+
|  id|first_name|last_name|               email|            inserted|
+----+----------+---------+--------------------+--------------------+
|1001|     Sally|   Thomas|sally.thomas@acme...|2021-11-02 16:27:...|
|1004|      Anne|Kretchmar|  annek@noanswer.org|2021-11-02 16:27:...|
|1002|    George|   Bailey|  gbailey@foobar.com|2021-11-02 16:27:...|
|1003|    Edward|   Walker|       ed@walker.com|2021-11-02 16:27:...|
+----+----------+---------+--------------------+--------------------+



21/11/02 17:01:37 INFO DelightStreamingConnector: Flushing 16 events, now 78 pending events
21/11/02 17:01:37 INFO DelightStreamingConnector: Flushing 1 events, now 79 pending events
21/11/02 17:01:37 INFO DelightStreamingConnector: Flushing 1 events, now 80 pending events


orders table

In [11]:
# Orders: stream the raw topic into the bronze Delta table, then open the
# bronze table as a streaming source for the silver step.
table_name = 'orders'
(raw_table,
 bronze_table,
 bronze_table_checkpoint,
 silver_table,
 silver_table_checkpoint) = get_tables_uri(table_name=table_name)

# Built directly under an orders-specific name; the customers variable
# `streamingCustRawDF` is no longer reused for orders data.
streamingOrderRawDF = get_streaming_df(raw_table).select(
    col("after.order_number"),
    col("after.order_date"),
    col("after.purchaser"),
    col("after.product_id"),
)
partitioned_by = ['purchaser']
stream = get_from_raw_to_delta_stream(
    streamingOrderRawDF,
    checkpointLocation=bronze_table_checkpoint,
    data_uri=bronze_table,
    partitioned_by=partitioned_by,
)
stream.awaitTermination()

# Use a dedicated name here. The global `deltaDf` is the merge target read by
# mergetoDF (customers), so rebinding it to the orders bronze table could
# redirect a customers merge into the wrong table.
orderBronzeDelta = getDeltaTableFromPath(bronze_table)
orderBronzeDelta.toDF().show()
# Placeholder for the silver orders table read by the orders merge handler.
orderDeltaDf = None
order_stream_df = get_streaming_df(bronze_table, 'delta')
def orderMergetoDF(microDF, batchId):
    """foreachBatch handler: upsert one micro-batch into the global ``orderDeltaDf``.

    The batch is de-duplicated on ``order_number`` and merged on
    ``s.order_number = t.order_number``; matching rows are updated and
    non-matching rows are inserted.

    Args:
        microDF: the micro-batch DataFrame.
        batchId: identifier of the micro-batch (only used in the log line).
    """
    print(f"inside foreachBatch for batchId{batchId}. rows passed={microDF.count()}")
    deduped = microDF.dropDuplicates(["order_number"])
    # `orderDeltaDf` is looked up in the notebook's global namespace at call time.
    merge_builder = orderDeltaDf.alias("t").merge(deduped.alias("s"), "s.order_number = t.order_number")
    merge_builder = merge_builder.whenMatchedUpdateAll()
    merge_builder = merge_builder.whenNotMatchedInsertAll()
    merge_builder.execute()
# Orders: propagate the bronze stream into the silver Delta table.
# `orderDeltaDf` must stay a module-level name: orderMergetoDF reads it as the merge target.
orderDeltaDf = getDeltaTableFromPath(silver_table)
data_stream_writer = (
    order_stream_df.writeStream
    .format("delta")
    .outputMode("append")
    .trigger(once=True)
    .option("checkpointLocation", silver_table_checkpoint)
)
if orderDeltaDf is None:
    # No silver table yet: the first run creates it by writing the stream directly.
    print('first time, creating table')
    order_silver_query = data_stream_writer.start(silver_table)
else:
    # Silver table exists: upsert every micro-batch through orderMergetoDF.
    print('not first time, merging data')
    order_silver_query = data_stream_writer.foreachBatch(orderMergetoDF).start()

# start() returns immediately. Wait for the one-shot query to finish so that
# the following cell reads the silver table after the write/merge completes.
order_silver_query.awaitTermination()

21/11/02 17:01:37 INFO TaskSetManager: Finished task 28.0 in stage 14.0 (TID 242) in 53 ms on 172.29.0.6 (executor 1) (33/50)
21/11/02 17:01:37 INFO TaskSetManager: Starting task 29.0 in stage 14.0 (TID 243) (172.29.0.6, executor 1, partition 29, PROCESS_LOCAL, 4457 bytes) taskResourceAssignments Map()
21/11/02 17:01:37 INFO TaskSetManager: Finished task 29.0 in stage 14.0 (TID 243) in 21 ms on 172.29.0.6 (executor 1) (34/50)
21/11/02 17:01:37 INFO TaskSetManager: Starting task 30.0 in stage 14.0 (TID 244) (172.29.0.6, executor 1, partition 30, PROCESS_LOCAL, 4457 bytes) taskResourceAssignments Map()
21/11/02 17:01:37 INFO TaskSetManager: Starting task 31.0 in stage 14.0 (TID 245) (172.29.0.6, executor 1, partition 31, PROCESS_LOCAL, 4457 bytes) taskResourceAssignments Map()
21/11/02 17:01:37 INFO TaskSetManager: Finished task 30.0 in stage 14.0 (TID 244) in 37 ms on 172.29.0.6 (executor 1) (35/50)
21/11/02 17:01:37 INFO TaskSetManager: Starting task 32.0 in stage 14.0 (TID 246) (172.2

+------------+----------+---------+----------+
|order_number|order_date|purchaser|product_id|
+------------+----------+---------+----------+
|       10001|     16816|     1001|       102|
|       10003|     16850|     1002|       106|
|       10002|     16817|     1002|       105|
|       10004|     16852|     1003|       107|
+------------+----------+---------+----------+



21/11/02 17:01:45 INFO BlockManagerInfo: Removed broadcast_32_piece0 on 172.29.0.6:41489 in memory (size: 108.8 KiB, free: 366.1 MiB)
21/11/02 17:01:45 INFO BlockManagerInfo: Removed broadcast_32_piece0 on 172.29.0.2:35013 in memory (size: 108.8 KiB, free: 366.3 MiB)
21/11/02 17:01:45 INFO BlockManagerInfo: Removed broadcast_32_piece0 on pyspark:43275 in memory (size: 108.8 KiB, free: 434.2 MiB)
21/11/02 17:01:45 INFO BlockManagerInfo: Removed broadcast_34_piece0 on pyspark:43275 in memory (size: 5.8 KiB, free: 434.2 MiB)
21/11/02 17:01:45 INFO BlockManagerInfo: Removed broadcast_34_piece0 on 172.29.0.6:41489 in memory (size: 5.8 KiB, free: 366.1 MiB)
21/11/02 17:01:45 INFO BlockManagerInfo: Removed broadcast_31_piece0 on pyspark:43275 in memory (size: 30.4 KiB, free: 434.2 MiB)
21/11/02 17:01:45 INFO BlockManagerInfo: Removed broadcast_31_piece0 on 172.29.0.2:35013 in memory (size: 30.4 KiB, free: 366.3 MiB)
21/11/02 17:01:45 INFO BlockManagerInfo: Removed broadcast_31_piece0 on 172.2

not first time, merging data


21/11/02 17:01:48 INFO TaskSetManager: Finished task 0.0 in stage 33.0 (TID 474) in 58 ms on 172.29.0.6 (executor 1) (1/1)
21/11/02 17:01:48 INFO TaskSchedulerImpl: Removed TaskSet 33.0, whose tasks have all completed, from pool 
21/11/02 17:01:48 INFO DAGScheduler: ResultStage 33 (toString at String.java:2951) finished in 0.067 s
21/11/02 17:01:48 INFO DAGScheduler: Job 17 is finished. Cancelling potential speculative or zombie tasks for this job
21/11/02 17:01:48 INFO TaskSchedulerImpl: Killing all running tasks in stage 33: Stage finished
21/11/02 17:01:48 INFO DAGScheduler: Job 17 finished: toString at String.java:2951, took 2.905963 s
21/11/02 17:01:48 INFO DelightStreamingConnector: Flushing 6 events, now 161 pending events
21/11/02 17:01:48 INFO DelightStreamingConnector: Flushing 1 events, now 162 pending events
21/11/02 17:01:48 INFO Snapshot: [tableId=8eeb3f67-f8d3-452e-98c0-215e4f870cbc] DELTA: Done
21/11/02 17:01:48 INFO DelightStreamingConnector: Flushing 1 events, now 163

In [12]:
# Display the current contents of the silver orders Delta table.
# NOTE(review): raises AttributeError if the path is not a Delta table yet,
# because getDeltaTableFromPath returns None in that case.
getDeltaTableFromPath(silver_table).toDF().show()

21/11/02 17:01:48 INFO MicroBatchExecution: Using Source [DeltaSource[s3a://minio-sink-bucket/bronze/data/orders]] from DataSourceV1 named 'delta' [DataSource(org.apache.spark.sql.SparkSession@700e06d7,delta,List(),None,List(),None,Map(path -> s3a://minio-sink-bucket/bronze/data/orders),None)]
21/11/02 17:01:48 WARN MicroBatchExecution: The read limit MaxFiles: 1000 for DeltaSource[s3a://minio-sink-bucket/bronze/data/orders] is ignored when Trigger.Once() is used.
21/11/02 17:01:49 INFO FileSourceStrategy: Pushed Filters: 
21/11/02 17:01:49 INFO FileSourceStrategy: Post-Scan Filters: 
21/11/02 17:01:49 INFO FileSourceStrategy: Output Data Schema: struct<order_number: bigint, order_date: bigint, purchaser: bigint, product_id: bigint ... 2 more fields>
21/11/02 17:01:49 INFO DelightStreamingConnector: Flushing 1 events, now 165 pending events
21/11/02 17:01:49 INFO CodeGenerator: Code generated in 13.7691 ms
21/11/02 17:01:49 INFO MemoryStore: Block broadcast_40 stored as values in memor

+------------+----------+---------+----------+
|order_number|order_date|purchaser|product_id|
+------------+----------+---------+----------+
|       10003|     16850|     1002|       106|
|       10004|     16852|     1003|       107|
|       10001|     16816|     1001|       102|
|       10002|     16817|     1002|       105|
+------------+----------+---------+----------+



21/11/02 17:01:50 INFO TaskSetManager: Finished task 0.0 in stage 41.0 (TID 613) in 196 ms on 172.29.0.2 (executor 0) (1/1)
21/11/02 17:01:50 INFO TaskSchedulerImpl: Removed TaskSet 41.0, whose tasks have all completed, from pool 
21/11/02 17:01:50 INFO DAGScheduler: ResultStage 41 (showString at NativeMethodAccessorImpl.java:0) finished in 0.457 s
21/11/02 17:01:50 INFO DAGScheduler: Job 21 is finished. Cancelling potential speculative or zombie tasks for this job
21/11/02 17:01:50 INFO TaskSchedulerImpl: Killing all running tasks in stage 41: Stage finished
21/11/02 17:01:50 INFO DelightStreamingConnector: Flushing 89 events, now 89 pending events
21/11/02 17:01:50 INFO DAGScheduler: Job 21 finished: showString at NativeMethodAccessorImpl.java:0, took 0.459835 s
21/11/02 17:01:50 INFO DelightStreamingConnector: Flushing 1 events, now 90 pending events
21/11/02 17:01:50 INFO SparkContext: Starting job: showString at NativeMethodAccessorImpl.java:0
21/11/02 17:01:50 INFO DAGScheduler: 