In [0]:
# Declare the notebook's input widgets. Each entry is (widget name, default
# value); the widgets are created in the order listed here.
_WIDGET_DEFAULTS = (
    ("src_table", "default_src"),
    ("target_table", "default_target"),
    ("sql_group", "default_id"),
    ("db_name", "default"),
    ("mode", "default"),
    ("hostname", "default"),
    ("key_scope", "default"),
    ("cred_type", "default"),
    ("userid", "default"),
    ("password", "default"),
    ("tenantId", "tenant_id"),
)

for _widget_name, _widget_default in _WIDGET_DEFAULTS:
    dbutils.widgets.text(_widget_name, _widget_default)

In [0]:
# Read the current value of every input widget into a notebook-level variable.
# The targets on the left and the widget names on the right are listed in the
# same order, so each variable receives the widget of the same name.
(
    src_table,
    target_table,
    sql_group,
    db_name,
    mode,
    hostname,
    key_scope,
    cred_type,
    userid,
    password,
    tenantId,
) = [
    dbutils.widgets.get(_widget_name)
    for _widget_name in (
        "src_table",
        "target_table",
        "sql_group",
        "db_name",
        "mode",
        "hostname",
        "key_scope",
        "cred_type",
        "userid",
        "password",
        "tenantId",
    )
]

In [0]:

import time
from datetime import datetime

# Stream new rows from `src_table` into the Synapse table `target_table` and
# record the outcome in notebook-level variables (completion_status, row_cnt,
# start_time, end_time, time_taken, query_stream).
storagename= "jridatalakesng"
tempdir = "abfss://synapsestaging@" + storagename + ".dfs.core.windows.net/mydir"
checkpoint_location = f"abfss://checkpoint@{storagename}.dfs.core.windows.net/synapsewrite/{target_table}"

start_time = datetime.now()
print(f"{datetime.now()}: INFO : Table : {target_table}. Processing started. {start_time}")

# Pre-initialise everything the reporting code below reads, so that a failure
# anywhere inside the try block still yields a well-formed "Failed" report
# instead of a NameError that hides the real error.
end_time = None
row_cnt = None
query_stream = None

try:
    # Reading as a stream ensures that only new data is picked up. It also
    # makes the load easy to restart after a failure.
    spark.readStream \
        .option("skipChangeCommits", "True") \
        .table(src_table) \
        .createOrReplaceTempView("temp_view")

    df = spark.sql("SELECT * FROM temp_view")

    # Options shared by both authentication modes.
    # NOTE(review): the original computed an unused "dbo."-qualified table name;
    # `dbtable` is still passed exactly as supplied in `target_table`.
    writer = (
        df.writeStream
        .format("com.databricks.spark.sqldw")
        .option("host", hostname)
        .trigger(availableNow=True)
        .option("port", "1433")
        .option("database", db_name)
        .option("dbtable", target_table)
        .option("mode", mode)
        .option("tempDir", tempdir)
        .option("forwardSparkAzureStorageCredentials", "true")
        .option("checkpointLocation", checkpoint_location)
    )

    # `userid` and `password` hold secret *key names*; the values are fetched
    # from the secret scope and passed straight to the writer.
    if cred_type == "useridpassword":
        writer = (
            writer
            .option("user", dbutils.secrets.get(scope=key_scope, key=userid))
            .option("password", dbutils.secrets.get(scope=key_scope, key=password))
        )
    else:
        writer = (
            writer
            .option("tenantId", tenantId)
            .option("clientId", dbutils.secrets.get(scope=key_scope, key=userid))
            .option("clientSecret", dbutils.secrets.get(scope=key_scope, key=password))
        )

    query_stream = writer.start()

    print(f"{datetime.now()}: INFO : Stream id is : {query_stream.id}")
    print(f"{datetime.now()}: INFO : Stream run id is : {query_stream.runId}")

    # Wait for stream write to terminate before reporting back status
    query_stream.awaitTermination()
    end_time = datetime.now()

    # lastProgress is None when the query finished without reporting any
    # progress (nothing to process); treat that as zero rows, not a failure.
    # NOTE(review): this is the row count of the last reported progress only.
    raw_data = query_stream.lastProgress
    row_cnt = raw_data['numInputRows'] if raw_data else 0
    completion_status = "Success"
except Exception as e:
    print(f"{datetime.now()}: ERROR : {e}")
    completion_status = "Failed"

# On failure before the stream terminated, end_time was never set.
if end_time is None:
    end_time = datetime.now()

time_taken = end_time - start_time
outcome = "completed" if completion_status == "Success" else "failed"
print(f"{datetime.now()}: INFO : Row_count {row_cnt}")
print(f"{datetime.now()}: INFO : Table {target_table} {outcome}. {end_time}")
print(f"{datetime.now()}: INFO : time taken {time_taken}")
#spark.streams.resetTerminated() # Otherwise awaitAnyTermination() will return immediately after first stream has terminated
#spark.streams.awaitAnyTermination()

In [0]:
import json

# Build the JSON status message handed back to the caller of this notebook.
# The stream identifiers are read with getattr so that a `query_stream` of None
# (no query handle available) produces null ids instead of an AttributeError
# that would prevent the status from being reported at all.
exit_payload = {
    "sql_group": sql_group,
    "source_table": src_table,
    "target_table": target_table,
    "status": completion_status,
    "time_taken": str(time_taken),
    "load_start_ts": str(start_time),
    "load_end_ts": str(end_time),
    "rows_written": row_cnt,
    "stream_runid": getattr(query_stream, "runId", None),
    "stream_id": getattr(query_stream, "id", None),
}
exit_msg = json.dumps(exit_payload)

dbutils.notebook.exit(exit_msg)