In [85]:
%run ../Utils/delta_utils.ipynb import merge
import json

In [86]:
def write_json(path, data):
    """Serialise ``data`` as JSON (4-space indent) and write it to ``path``.

    Serialisation happens before the file is opened, so a value that cannot be
    encoded raises without touching ``path``. The file is opened in text mode
    ``'w'``, which replaces any existing content.
    """
    payload = json.dumps(data, indent=4)
    with open(path, 'w') as handle:
        handle.write(payload)

In [147]:
class RawETLJSON:
    """Stream JSON files from a folder and upsert every micro-batch into a table.

    ``etl`` chains ``extract`` -> ``transform`` -> ``load``. Each micro-batch is
    handed to ``merge``, which delegates to the module-level ``merge`` helper
    (expected to come from the ``delta_utils`` notebook run at the top of this
    file; its exact semantics are not visible here).
    """

    # Key column(s) passed to the ``merge`` helper, both as the key and as its
    # ``partition`` argument.
    # NOTE(review): partitioning by the primary key looks like one partition per
    # row -- confirm against the helper before using this on large tables.
    pk = ['idx']

    def __init__(self, spark):
        """Store the Spark session used for reading and merging."""
        self.spark = spark

    def extract(self, folder):
        """Return a streaming DataFrame over the JSON files in ``folder``.

        No schema is passed to the reader. ``primitivesAsString`` and
        ``multiLine`` are forwarded to the JSON source unchanged.
        """
        return self.spark.readStream.json(folder, primitivesAsString=True, multiLine=True)

    def merge(self, batch_df, idx):
        """``foreachBatch`` callback: upsert one micro-batch into ``self.target_table``.

        ``idx`` is the batch id supplied by Spark; it is not used. ``load`` must
        have run first because it is what sets ``self.target_table``.
        """
        merge(batch_df, self.target_table, self.pk, spark_session=self.spark, partition=self.pk)

    def transform(self, df):
        """Hook for subclasses to reshape the stream; the base class is a no-op."""
        return df

    def load(self, stream, target_table):
        """Start the streaming write and return the running query.

        The query handle used to be dropped, which left callers with no way to
        await, stop or inspect the stream. Returning it is backward-compatible:
        the method previously returned ``None``.
        """
        # Remembered on the instance because ``merge`` only receives the batch.
        self.target_table = target_table

        writer = (
            stream.writeStream
            .outputMode('update')
            .option("checkpointLocation", f"./checkpoints/{target_table}")
            .foreachBatch(self.merge)
        )
        return writer.start()

    def etl(self, target_table, folder):
        """Run extract -> transform -> load and return the streaming query."""
        stream = self.transform(self.extract(folder))
        return self.load(stream, target_table)

In [148]:
class BronzeETL:
    
    pk = ['idx']
    
    def __init__(self, spark):
        self.spark = spark
        
    def extract(self, target_table, source_table):
        
        last_ts = self.spark.sql(f'DESCRIBE HISTORY {target_table}').toPandas()
        if last_ts.index.size == 0:
            last_ts = '1990-01-01 00:00:00'
        else:
            last_ts = last_ts.loc[0, 'timestamp']
        
        stream = self.spark.readStream.format('delta') \
                     .option('delta.enableChangeDataFeed', True) \
                     .option('readChangeFeed', True) \
                     .option('startingTimestamp', last_ts) \
                     .table(source_table)
        #                      \
    
        return stream
    
    def merge(self, batch_df, idx):
        batch_df.show()
        batch_df = batch_df.filter("_change_type='update_postimage'")
        merge(batch_df, self.target_table, self.pk, spark_session=self.spark, partition=self.pk)
    
    def transform(self, df):
        return df
    
    def load(self, stream, target_table, source_table):
        self.target_table = target_table
        
        query = stream.writeStream.format('delta').outputMode('update') \
                .option("checkpointLocation", f"./checkpoints/{target_table}") \
                .foreachBatch(self.merge).start()
        #                .option('ignoreChanges', True) \
    
    def etl(self, target_table, source_table):
        stream = self.extract(target_table, source_table)
        self.load(stream, target_table, source_table)

In [149]:
# Three sample rows (idx 10-12, all with value 6.0) dropped into the folder the
# raw stream watches.
data = [{'idx': row_id, 'value': 6.0} for row_id in (10, 11, 12)]
write_json('data/json8.json', data)

In [150]:
if __name__ == '__main__':
    from pyspark.sql import SparkSession
    from delta.pip_utils import configure_spark_with_delta_pip

    # Session configuration for a local Delta-enabled Spark with a Hive-style
    # warehouse under ./pyspark_tables.
    builder = (
        SparkSession.builder
        .appName('raw_etl')
        .config('spark.sql.warehouse.dir', 'pyspark_tables')
        .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
        .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
        .config('spark.databricks.delta.retentionDurationCheck.enabled', False)
        .config('spark.databricks.delta.schema.autoMerge.enabled', True)
        .config('spark.databricks.delta.checkLatestSchemaOnRead', True)
        # `delta.enableChangeDataFeed` is a table property; as a session conf it
        # does not switch the change feed on for new tables. It is kept in case
        # other code reads it, and the documented session default is added.
        .config('delta.enableChangeDataFeed', True)
        .config('spark.databricks.delta.properties.defaults.enableChangeDataFeed', True)
        # The JSON stream is opened without an explicit schema.
        .config('spark.sql.streaming.schemaInference', True)
    )

    folder = 'data/'
    target_table = 'streaming_json.raw'
    spark = configure_spark_with_delta_pip(builder).enableHiveSupport().getOrCreate()

    etl = RawETLJSON(spark)
    etl.etl(target_table, folder)

23/02/23 02:05:50 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.
23/02/23 02:05:50 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
23/02/23 02:05:51 ERROR MicroBatchExecution: Query [id = c0fa3602-bf96-4cba-8b3a-c779fcc54cf9, runId = e8ba1299-7612-4b57-b264-30566eb9d473] terminated with error
py4j.Py4JException: An exception was raised by the Python Proxy. Return Message: Traceback (most recent call last):
  File "/home/ahow/main_env/lib/python3.10/site-packages/py4j/clientserver.py", line 617, in _call_proxy
    return_value = getattr(self.pool[obj_id], method)(*params)
  File "/home/ahow/main_env/lib/python3.10/site-packages/pyspark/sql/utils.py", line 272, in call
    raise e
  File "/home/ahow/main_env/lib/python3.10/site-packages/pyspark/sql/utils.py", line 269, in call
    self.func(DataFrame(jdf, self.session), batch_id)
  File "/tmp/ipykernel_7581

In [141]:
if __name__ == '__main__':
    # Feed the bronze table from the raw table's change feed. Relies on the
    # `spark` session created by the raw ETL cell.
    source_table = 'streaming_json.raw'
    target_table = 'streaming_json.bronze'

    etl = BronzeETL(spark)
    etl.etl(target_table=target_table, source_table=source_table)

23/02/23 02:01:00 ERROR MicroBatchExecution: Query [id = c0fa3602-bf96-4cba-8b3a-c779fcc54cf9, runId = fa72829a-c7d9-4b73-b56b-f1d1d403c01b] terminated with error
py4j.Py4JException: An exception was raised by the Python Proxy. Return Message: Traceback (most recent call last):
  File "/home/ahow/main_env/lib/python3.10/site-packages/py4j/clientserver.py", line 617, in _call_proxy
    return_value = getattr(self.pool[obj_id], method)(*params)
  File "/home/ahow/main_env/lib/python3.10/site-packages/pyspark/sql/utils.py", line 272, in call
    raise e
  File "/home/ahow/main_env/lib/python3.10/site-packages/pyspark/sql/utils.py", line 269, in call
    self.func(DataFrame(jdf, self.session), batch_id)
  File "/tmp/ipykernel_75816/349814426.py", line 14, in merge
    merge(batch_df, self.target_table, self.pk, spark_session=self.spark, partition=self.pk)
  File "/tmp/ipykernel_75816/1499376899.py", line 33, in merge
    .execute()
  File "/home/ahow/main_env/lib/python3.10/site-packages/d

  series = series.astype(t, copy=False)


23/02/23 02:01:00 ERROR MicroBatchExecution: Query [id = 92f85b4e-093c-4fb9-a4a8-a1c633fc4319, runId = c96bfef4-a5b7-4da2-8fff-288939f2ab28] terminated with error
org.apache.spark.sql.delta.DeltaAnalysisException: The provided timestamp (2023-02-23 01:52:19.912) is after the latest version available to this
table (2023-02-22 23:35:10.549). Please use a timestamp before or at 2023-02-22 23:35:10.
	at org.apache.spark.sql.delta.DeltaErrorsBase.timestampGreaterThanLatestCommit(DeltaErrors.scala:1302)
	at org.apache.spark.sql.delta.DeltaErrorsBase.timestampGreaterThanLatestCommit$(DeltaErrors.scala:1298)
	at org.apache.spark.sql.delta.DeltaErrors$.timestampGreaterThanLatestCommit(DeltaErrors.scala:2489)
	at org.apache.spark.sql.delta.sources.DeltaSource$.getStartingVersionFromTimestamp(DeltaSource.scala:1046)
	at org.apache.spark.sql.delta.sources.DeltaSource.getStartingVersion$lzycompute(DeltaSource.scala:993)
	at org.apache.spark.sql.delta.sources.DeltaSource.getStartingVersion(DeltaSour

In [142]:
# Show the full contents of the bronze table.
(
    spark
    .sql('SELECT * FROM streaming_json.bronze')
    .toPandas()
)
#spark.sql('DELETE FROM streaming_json.bronze').toPandas()

Unnamed: 0,idx,value,_change_type,_commit_version,_commit_timestamp


In [129]:
# Show the full contents of the raw table.
(
    spark
    .sql('SELECT * FROM streaming_json.raw')
    .toPandas()
)
#spark.sql('DESCRIBE HISTORY streaming_json.raw').toPandas().loc[2,'operationParameters']

Unnamed: 0,idx,value
0,11,6.0
1,12,6.0
2,10,6.0
3,1,4.0
4,4,4.0
5,3,4.0
6,9,4.0
7,2,4.0
8,5,1.0
9,6,1.0


In [74]:
# Show the bronze table ordered by its key column.
(
    spark
    .sql('select * from streaming_json.bronze order by idx')
    .toPandas()
)

AnalysisException: Table or view not found: streaming_json.bronze; line 1 pos 14;
'Sort ['idx ASC NULLS FIRST], true
+- 'Project [*]
   +- 'UnresolvedRelation [streaming_json, bronze], [], false


In [61]:
# Load the raw table's commit history into pandas; `om` keeps the per-commit
# operation metrics for the cell that prints them.
df = (
    spark
    .sql('DESCRIBE HISTORY streaming_json.raw')
    .toPandas()
)
om = df['operationMetrics']
df

  series = series.astype(t, copy=False)


Unnamed: 0,version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
0,7,2023-02-22 23:16:42.531,,,SET TBLPROPERTIES,"{'properties': '{""delta.enableChangeDataFeed"":...",,,,6.0,Serializable,True,{},,Apache-Spark/3.3.1 Delta-Lake/2.2.0
1,6,2023-02-22 23:15:44.253,,,MERGE,"{'matchedPredicates': '[{""actionType"":""update""...",,,,5.0,Serializable,False,"{'numOutputRows': '3', 'numTargetRowsInserted'...",,Apache-Spark/3.3.1 Delta-Lake/2.2.0
2,5,2023-02-22 23:15:07.368,,,SET TBLPROPERTIES,"{'properties': '{""delta.enableChangeDataFeed"":...",,,,4.0,Serializable,True,{},,Apache-Spark/3.3.1 Delta-Lake/2.2.0
3,4,2023-02-22 20:45:08.060,,,MERGE,"{'matchedPredicates': '[{""actionType"":""update""...",,,,3.0,Serializable,False,"{'numOutputRows': '6', 'numTargetRowsInserted'...",,Apache-Spark/3.3.1 Delta-Lake/2.2.0
4,3,2023-02-22 20:44:45.590,,,MERGE,"{'matchedPredicates': '[{""actionType"":""update""...",,,,2.0,Serializable,False,"{'numOutputRows': '4', 'numTargetRowsInserted'...",,Apache-Spark/3.3.1 Delta-Lake/2.2.0
5,2,2023-02-22 20:40:42.763,,,MERGE,"{'matchedPredicates': '[{""actionType"":""update""...",,,,1.0,Serializable,False,"{'numOutputRows': '4', 'numTargetRowsInserted'...",,Apache-Spark/3.3.1 Delta-Lake/2.2.0
6,1,2023-02-22 20:40:20.535,,,MERGE,"{'matchedPredicates': '[{""actionType"":""update""...",,,,0.0,Serializable,False,"{'numOutputRows': '4', 'numTargetRowsInserted'...",,Apache-Spark/3.3.1 Delta-Lake/2.2.0
7,0,2023-02-22 20:37:43.615,,,CREATE OR REPLACE TABLE AS SELECT,"{'description': None, 'partitionBy': '[""idx""]'...",,,,,Serializable,False,"{'numOutputRows': '4', 'numOutputBytes': '1852...",,Apache-Spark/3.3.1 Delta-Lake/2.2.0


In [36]:
# Pretty-print each commit's operation metrics as its own JSON document.
for v in om:
    rendered = json.dumps(v, indent=4)
    print(rendered)

{
    "numOutputRows": "6",
    "numTargetRowsInserted": "2",
    "numTargetRowsUpdated": "4",
    "numTargetFilesAdded": "6",
    "numTargetFilesRemoved": "4",
    "numTargetRowsDeleted": "0",
    "scanTimeMs": "230",
    "numSourceRows": "6",
    "numTargetChangeFilesAdded": "0",
    "executionTimeMs": "455",
    "numTargetRowsCopied": "0",
    "rewriteTimeMs": "214"
}
{
    "numOutputRows": "4",
    "numTargetRowsInserted": "0",
    "numTargetRowsUpdated": "4",
    "numTargetFilesAdded": "4",
    "numTargetFilesRemoved": "4",
    "numTargetRowsDeleted": "0",
    "scanTimeMs": "230",
    "numSourceRows": "4",
    "numTargetChangeFilesAdded": "0",
    "executionTimeMs": "424",
    "numTargetRowsCopied": "0",
    "rewriteTimeMs": "186"
}
{
    "numOutputRows": "4",
    "numTargetRowsInserted": "2",
    "numTargetRowsUpdated": "2",
    "numTargetFilesAdded": "4",
    "numTargetFilesRemoved": "2",
    "numTargetRowsDeleted": "0",
    "scanTimeMs": "267",
    "numSourceRows": "4",
    "nu

In [64]:
# Read another project's bronze table straight from its Delta directory.
df = spark.read.load('../gen_tiers/pyspark_tables/etl_tiers.db/bronze/', format='delta')
df.toPandas()

  series = series.astype(t, copy=False)


Unnamed: 0,idx,value,_change_type,_commit_version,_commit_timestamp,description
0,5,1.0,insert,8,2023-02-22 18:28:24.889,asdf
1,6,1.0,insert,8,2023-02-22 18:28:24.889,rewq
2,1,1.0,insert,2,2023-02-22 18:26:46.091,
3,2,1.0,insert,2,2023-02-22 18:26:46.091,
4,3,3.14,update_postimage,3,2023-02-22 18:26:57.964,
5,4,3.14,update_postimage,3,2023-02-22 18:26:57.964,
