# This script implements tiers of tables using streaming to pass data from one tier to the next.

## Start pyspark

In [13]:
# import and enable spark

from pyspark.sql.types import StructType, StructField, DoubleType, LongType, StringType
from pyspark.sql import functions as F
from pyspark.sql import SparkSession

import import_ipynb
from Bronze import BronzeETL

from delta.pip_utils import configure_spark_with_delta_pip
    
builder = SparkSession.builder\
       .appName('raw_etl')\
       .config('spark.sql.warehouse.dir', 'pyspark_tables')\
       .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
       .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
       .config('spark.databricks.delta.retentionDurationCheck.enabled', False) \
       .config('spark.databricks.delta.schema.autoMerge.enabled', True) \
       .config('spark.databricks.delta.checkLatestSchemaOnRead', True) \
       .config('delta.enableChangeDataFeed', True) \
       .config('spark.sql.shuffle.partitions', 10) \
       .config('spark.databricks.preemption.enabled', True) \
       .config('spark.databricks.delta.properties.defaults.autoOptimize.optimizeWrite', True)

spark = configure_spark_with_delta_pip(builder).enableHiveSupport().getOrCreate()

23/02/22 18:08:27 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


## Clear previuous tables from other runs

In [22]:
spark.sql('CREATE DATABASE IF NOT EXISTS etl_tiers')
spark.sql('DROP TABLE IF EXISTS etl_tiers.raw')
spark.sql('DROP TABLE IF EXISTS etl_tiers.bronze')
spark.sql('DROP TABLE IF EXISTS etl_tiers.silver')
spark.sql('DROP TABLE IF EXISTS etl_tiers.gold')

23/02/22 18:26:43 ERROR MicroBatchExecution: Query [id = e7e11fdb-754b-4482-9a8f-33b802d298da, runId = 5e9003da-5d59-4e3c-80f6-9c40c244e78a] terminated with error
java.io.FileNotFoundException: No such file or directory: file:/home/ahow/MyGitHub/pyspark_tests/gen_tiers/pyspark_tables/etl_tiers.db/raw/_delta_log
	at io.delta.storage.HadoopFileSystemLogStore.listFrom(HadoopFileSystemLogStore.java:56)
	at org.apache.spark.sql.delta.storage.LogStoreAdaptor.listFrom(LogStore.scala:452)
	at org.apache.spark.sql.delta.storage.DelegatingLogStore.listFrom(DelegatingLogStore.scala:127)
	at org.apache.spark.sql.delta.DeltaLog.getChanges(DeltaLog.scala:297)
	at org.apache.spark.sql.delta.sources.DeltaSourceCDCSupport.filterAndIndexDeltaLogs$1(DeltaSourceCDCSupport.scala:205)
	at org.apache.spark.sql.delta.sources.DeltaSourceCDCSupport.getFileChangesForCDC(DeltaSourceCDCSupport.scala:260)
	at org.apache.spark.sql.delta.sources.DeltaSourceCDCSupport.getFileChangesForCDC$(DeltaSourceCDCSupport.scala:

DataFrame[]

## Create raw table

In [23]:
data = [[1,1.0], [2,1.0], [3,1.0], [4,1.0]]
schema = StructType([
    StructField('idx', LongType(), False),
    StructField('value', DoubleType(), False),
])

df = spark.createDataFrame([], schema=schema)
df.write.partitionBy('idx').format('delta').mode('overwrite').saveAsTable('etl_tiers.raw')
spark.sql("ALTER TABLE etl_tiers.raw SET TBLPROPERTIES (delta.enableChangeDataFeed = true)")

df = spark.createDataFrame(data, schema=schema)
df.write.format('delta').mode('append').saveAsTable('etl_tiers.raw')

spark.sql("SELECT * FROM etl_tiers.raw").toPandas()

23/02/22 18:26:45 WARN HiveExternalCatalog: Couldn't find corresponding Hive SerDe for data source provider delta. Persisting data source table `etl_tiers`.`raw` into Hive metastore in Spark SQL specific format, which is NOT compatible with Hive.


Unnamed: 0,idx,value
0,3,1.0
1,1,1.0
2,2,1.0
3,4,1.0


In [24]:
# describes table history
spark.sql('describe history etl_tiers.raw').toPandas()

  series = series.astype(t, copy=False)


Unnamed: 0,version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
0,2,2023-02-22 18:26:46.091,,,WRITE,"{'mode': 'Append', 'partitionBy': '[]'}",,,,1.0,Serializable,True,"{'numOutputRows': '4', 'numOutputBytes': '1956...",,Apache-Spark/3.3.1 Delta-Lake/2.2.0
1,1,2023-02-22 18:26:45.497,,,SET TBLPROPERTIES,"{'properties': '{""delta.enableChangeDataFeed"":...",,,,0.0,Serializable,True,{},,Apache-Spark/3.3.1 Delta-Lake/2.2.0
2,0,2023-02-22 18:26:45.024,,,CREATE OR REPLACE TABLE AS SELECT,"{'description': None, 'partitionBy': '[""idx""]'...",,,,,Serializable,False,"{'numOutputRows': '0', 'numOutputBytes': '0', ...",,Apache-Spark/3.3.1 Delta-Lake/2.2.0


## Copy inserted data to bronze table using streaming

In [25]:
source_table = 'etl_tiers.raw'
target_table = 'etl_tiers.bronze'

b_etl = BronzeETL(spark)
b_etl.etl(source_table, '', target_table)

from time import sleep
sleep(10)

display(spark.sql('SELECT * FROM etl_tiers.bronze').toPandas())
display(spark.sql('DESCRIBE HISTORY etl_tiers.bronze').toPandas())

spark.sql('use database default')
spark.sql('select * from stream').toPandas()

  series = series.astype(t, copy=False)


23/02/22 18:26:46 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-679c152f-1716-4c86-9376-7915a4e316c2. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
23/02/22 18:26:46 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
23/02/22 18:26:46 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
23/02/22 18:26:46 ERROR MicroBatchExecution: Query [id = 451f0320-dc82-46a7-8179-cd9684d7664f, runId = 22e53847-5b76-410c-926d-1fa94745b072] terminated with error
org.apache.spark.sql.delta.DeltaIllegalStateException: Delta table 11e9442d-85d6-459f-81ef-e8958030b8cb doesn't exist. Please delete your streaming query checkpoint

  series = series.astype(t, copy=False)


Unnamed: 0,idx,value,_change_type,_commit_version,_commit_timestamp
0,4,1.0,insert,2,2023-02-22 18:26:46.091
1,2,1.0,insert,2,2023-02-22 18:26:46.091
2,3,1.0,insert,2,2023-02-22 18:26:46.091
3,1,1.0,insert,2,2023-02-22 18:26:46.091


  series = series.astype(t, copy=False)


Unnamed: 0,version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
0,0,2023-02-22 18:26:47.161,,,CREATE OR REPLACE TABLE AS SELECT,"{'description': None, 'partitionBy': '[]', 'pr...",,,,,Serializable,False,"{'numOutputRows': '4', 'numOutputBytes': '6400...",,Apache-Spark/3.3.1 Delta-Lake/2.2.0


  series = series.astype(t, copy=False)


Unnamed: 0,idx,value,_change_type,_commit_version,_commit_timestamp
0,4,3.14,update_preimage,4,2023-02-22 02:12:41.062
1,4,3.14,update_postimage,4,2023-02-22 02:12:41.062
2,3,3.14,update_preimage,4,2023-02-22 02:12:41.062
3,3,3.14,update_postimage,4,2023-02-22 02:12:41.062
4,3,1.0,update_preimage,3,2023-02-22 02:05:59.913
5,3,3.14,update_postimage,3,2023-02-22 02:05:59.913
6,4,1.0,update_preimage,3,2023-02-22 02:05:59.913
7,4,3.14,update_postimage,3,2023-02-22 02:05:59.913
8,2,1.0,insert,4,2023-02-22 01:47:47.709
9,2,1.0,delete,2,2023-02-22 01:47:45.467


In [26]:
b_etl = BronzeETL(spark)
b_etl.etl(source_table, '', target_table)

spark.sql('update etl_tiers.raw set value = 3.14 where idx > 2')
display(spark.sql('SELECT * FROM etl_tiers.bronze').toPandas())
display(spark.sql('DESCRIBE HISTORY etl_tiers.bronze').toPandas().head(4))

display(spark.sql('DESCRIBE HISTORY etl_tiers.bronze').toPandas().head(4))
spark.sql('select * from etl_tiers.bronze').toPandas()

  series = series.astype(t, copy=False)


23/02/22 18:26:57 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-41f4e09a-1bff-413f-8410-d96d39a836e8. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
23/02/22 18:26:57 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
23/02/22 18:26:57 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
23/02/22 18:26:57 ERROR MicroBatchExecution: Query [id = 451f0320-dc82-46a7-8179-cd9684d7664f, runId = ceaeb6b8-1570-4b9d-adca-14e4296f26ce] terminated with error
org.apache.spark.sql.delta.DeltaIllegalStateException: Delta table 11e9442d-85d6-459f-81ef-e8958030b8cb doesn't exist. Please delete your streaming query checkpoint

  series = series.astype(t, copy=False)


Unnamed: 0,idx,value,_change_type,_commit_version,_commit_timestamp
0,1,1.0,insert,2,2023-02-22 18:26:46.091
1,2,1.0,insert,2,2023-02-22 18:26:46.091
2,3,1.0,insert,2,2023-02-22 18:26:46.091
3,4,1.0,insert,2,2023-02-22 18:26:46.091


  series = series.astype(t, copy=False)


Unnamed: 0,version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
0,1,2023-02-22 18:26:58.540,,,MERGE,"{'matchedPredicates': '[{""actionType"":""update""...",,,,0.0,Serializable,False,"{'numOutputRows': '4', 'numTargetRowsInserted'...",,Apache-Spark/3.3.1 Delta-Lake/2.2.0
1,0,2023-02-22 18:26:47.161,,,CREATE OR REPLACE TABLE AS SELECT,"{'description': None, 'partitionBy': '[]', 'pr...",,,,,Serializable,False,"{'numOutputRows': '4', 'numOutputBytes': '6400...",,Apache-Spark/3.3.1 Delta-Lake/2.2.0


  series = series.astype(t, copy=False)


Unnamed: 0,version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
0,1,2023-02-22 18:26:58.540,,,MERGE,"{'matchedPredicates': '[{""actionType"":""update""...",,,,0.0,Serializable,False,"{'numOutputRows': '4', 'numTargetRowsInserted'...",,Apache-Spark/3.3.1 Delta-Lake/2.2.0
1,0,2023-02-22 18:26:47.161,,,CREATE OR REPLACE TABLE AS SELECT,"{'description': None, 'partitionBy': '[]', 'pr...",,,,,Serializable,False,"{'numOutputRows': '4', 'numOutputBytes': '6400...",,Apache-Spark/3.3.1 Delta-Lake/2.2.0


23/02/22 18:27:00 ERROR MicroBatchExecution: Query [id = 5aa0cb13-c572-4846-b81b-918b83985aae, runId = dd1700e2-be7e-4a73-8233-bad014212745] terminated with error
py4j.Py4JException: An exception was raised by the Python Proxy. Return Message: Traceback (most recent call last):
  File "/home/ahow/main_env/lib/python3.10/site-packages/py4j/clientserver.py", line 617, in _call_proxy
    return_value = getattr(self.pool[obj_id], method)(*params)
  File "/home/ahow/main_env/lib/python3.10/site-packages/pyspark/sql/utils.py", line 272, in call
    raise e
  File "/home/ahow/main_env/lib/python3.10/site-packages/pyspark/sql/utils.py", line 269, in call
    self.func(DataFrame(jdf, self.session), batch_id)
  File "<string>", line 38, in merge
  File "<string>", line 27, in merge
  File "/home/ahow/main_env/lib/python3.10/site-packages/delta/tables.py", line 938, in execute
    self._jbuilder.execute()
  File "/home/ahow/main_env/lib/python3.10/site-packages/py4j/java_gateway.py", line 1321, i

  series = series.astype(t, copy=False)


Unnamed: 0,idx,value,_change_type,_commit_version,_commit_timestamp
0,1,1.0,insert,2,2023-02-22 18:26:46.091
1,2,1.0,insert,2,2023-02-22 18:26:46.091
2,3,3.14,update_postimage,3,2023-02-22 18:26:57.964
3,4,3.14,update_postimage,3,2023-02-22 18:26:57.964


## Create table raw2 and insert values

In [27]:
# create raw2 table and insert values

data = [[5,1.0, 'asdf'], [6,1.0, 'rewq']]
schema = StructType([
    StructField('idx', LongType(), False),
    StructField('value', DoubleType(), False),
    StructField('description', StringType(), False),
])

df = spark.createDataFrame([], schema=schema)
df.write.partitionBy('idx').format('delta').mode('overwrite').saveAsTable('etl_tiers.raw2')
spark.sql("ALTER TABLE etl_tiers.raw2 SET TBLPROPERTIES (delta.enableChangeDataFeed = true)")

df = spark.createDataFrame(data, schema=schema)
df.write.format('delta').mode('append').saveAsTable('etl_tiers.raw2')

spark.sql("SELECT * FROM etl_tiers.raw2").toPandas()

Unnamed: 0,idx,value,description
0,5,1.0,asdf
1,6,1.0,rewq


## Call bronze ETL again using raw2 as source

In [28]:
b_etl = BronzeETL(spark)
b_etl.etl('etl_tiers.raw2', '', target_table)

display(spark.sql('SELECT * FROM etl_tiers.bronze').toPandas())
display(spark.sql('DESCRIBE HISTORY etl_tiers.bronze').toPandas().head(4))
spark.sql('select * from etl_tiers.bronze').toPandas()

  series = series.astype(t, copy=False)


23/02/22 18:28:51 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-c2b2edb9-6bf5-4001-9304-2c6be4c01113. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
23/02/22 18:28:51 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
23/02/22 18:28:51 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
23/02/22 18:28:51 ERROR MicroBatchExecution: Query [id = 451f0320-dc82-46a7-8179-cd9684d7664f, runId = 8662c4a2-81bf-422b-9950-beb9c4523ba4] terminated with error
org.apache.spark.sql.delta.DeltaIllegalStateException: Delta table 11e9442d-85d6-459f-81ef-e8958030b8cb doesn't exist. Please delete your streaming query checkpoint

  series = series.astype(t, copy=False)


Unnamed: 0,idx,value,_change_type,_commit_version,_commit_timestamp,description
0,5,1.0,insert,8,2023-02-22 18:28:24.889,asdf
1,6,1.0,insert,8,2023-02-22 18:28:24.889,rewq
2,1,1.0,insert,2,2023-02-22 18:26:46.091,
3,2,1.0,insert,2,2023-02-22 18:26:46.091,
4,3,3.14,update_postimage,3,2023-02-22 18:26:57.964,
5,4,3.14,update_postimage,3,2023-02-22 18:26:57.964,


  series = series.astype(t, copy=False)


Unnamed: 0,version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
0,4,2023-02-22 18:28:26.485,,,MERGE,"{'matchedPredicates': '[{""actionType"":""update""...",,,,3.0,Serializable,False,"{'numOutputRows': '2', 'numTargetRowsInserted'...",,Apache-Spark/3.3.1 Delta-Lake/2.2.0
1,3,2023-02-22 18:28:24.955,,,MERGE,"{'matchedPredicates': '[{""actionType"":""update""...",,,,2.0,Serializable,False,"{'numOutputRows': '0', 'numTargetRowsInserted'...",,Apache-Spark/3.3.1 Delta-Lake/2.2.0
2,2,2023-02-22 18:26:59.616,,,MERGE,"{'matchedPredicates': '[{""actionType"":""update""...",,,,1.0,Serializable,False,"{'numOutputRows': '4', 'numTargetRowsInserted'...",,Apache-Spark/3.3.1 Delta-Lake/2.2.0
3,1,2023-02-22 18:26:58.540,,,MERGE,"{'matchedPredicates': '[{""actionType"":""update""...",,,,0.0,Serializable,False,"{'numOutputRows': '4', 'numTargetRowsInserted'...",,Apache-Spark/3.3.1 Delta-Lake/2.2.0


  series = series.astype(t, copy=False)


Unnamed: 0,idx,value,_change_type,_commit_version,_commit_timestamp,description
0,5,1.0,insert,8,2023-02-22 18:28:24.889,asdf
1,6,1.0,insert,8,2023-02-22 18:28:24.889,rewq
2,1,1.0,insert,2,2023-02-22 18:26:46.091,
3,2,1.0,insert,2,2023-02-22 18:26:46.091,
4,3,3.14,update_postimage,3,2023-02-22 18:26:57.964,
5,4,3.14,update_postimage,3,2023-02-22 18:26:57.964,


## Check number of rows updated

In [32]:
spark.sql('DESCRIBE HISTORY etl_tiers.bronze').toPandas().loc[0, 'operationMetrics']

  series = series.astype(t, copy=False)


{'numOutputRows': '2',
 'numTargetRowsInserted': '0',
 'numTargetRowsUpdated': '2',
 'numTargetFilesAdded': '1',
 'numTargetFilesRemoved': '1',
 'numTargetRowsDeleted': '0',
 'scanTimeMs': '221',
 'numSourceRows': '2',
 'numTargetChangeFilesAdded': '0',
 'executionTimeMs': '411',
 'numTargetRowsCopied': '0',
 'rewriteTimeMs': '181'}

----------------------------------------
Exception occurred during processing of request from ('127.0.0.1', 56042)
Traceback (most recent call last):
  File "/usr/lib/python3.10/socketserver.py", line 316, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/usr/lib/python3.10/socketserver.py", line 347, in process_request
    self.finish_request(request, client_address)
  File "/usr/lib/python3.10/socketserver.py", line 360, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/usr/lib/python3.10/socketserver.py", line 747, in __init__
    self.handle()
  File "/home/ahow/main_env/lib/python3.10/site-packages/pyspark/accumulators.py", line 281, in handle
    poll(accum_updates)
  File "/home/ahow/main_env/lib/python3.10/site-packages/pyspark/accumulators.py", line 253, in poll
    if func():
  File "/home/ahow/main_env/lib/python3.10/site-packages/pyspark/accumulators.py", line 257, in accum_updates
    num_updates = re