In [ ]:
%%configure -f
{
"conf": {
     "spark.dynamicAllocation.disableIfMinMaxNotSpecified.enabled": true,
     "spark.dynamicAllocation.enabled": true,
     "spark.dynamicAllocation.minExecutors": 2,
     "spark.dynamicAllocation.maxExecutors": 8
   }
}

In [ ]:
# Notebook parameters. Every value in this cell is an empty placeholder.
# NOTE(review): presumably the calling pipeline overrides them at run time — confirm.

# Names of the parquet tables, relative to minted_tables_output_path, that this notebook reads
enriched_images_tbl_name = ''
clustered_images_tbl_name_1 = ''
clustered_images_tbl_name_2 = ''
clustered_images_tbl_name_3 = ''
clustered_multimodal_tbl_name = ''
image_contents_tbl_name = ''
# Folder whose config.json holds the batch configuration
batch_root = ''
# Batch identifier; prefixes the name of every table this notebook writes
batch_num = ''
# Free-text description, forwarded to the logs and to the Event Grid payload
batch_description = ''
# NOTE(review): input_container is not referenced anywhere in the code visible here
input_container=''
# output_container, blob_account_name and azure_storage_domain are combined into
# the abfss:// URI that the JSON and CSV outputs are written to
output_container=''
blob_account_name = ''
# Value assigned to Hadoop's fs.defaultFS before the batch config is read
file_system = ""
azure_storage_domain = ''
# Reported as files_processed_count in the Event Grid payload
image_file_count = 0
# Path prefix under which the parquet tables are read and written
minted_tables_output_path = ""

In [ ]:
# Initiate logging
import logging
from opencensus.ext.azure.log_exporter import AzureLogHandler
from opencensus.ext.azure.trace_exporter import AzureExporter
from opencensus.trace import config_integration
from opencensus.trace.samplers import AlwaysOnSampler
from opencensus.trace.tracer import Tracer

# Correlate log records with the active trace
config_integration.trace_integrations(['logging'])

instrumentation_connection_string = mssparkutils.credentials.getSecretWithLS("keyvault", "AppInsightsConnectionString")

logger = logging.getLogger(__name__)
# logging.getLogger returns the same logger object every time this cell runs, so
# only attach the Azure exporter once; a second handler would export every
# record twice.
if not any(isinstance(handler, AzureLogHandler) for handler in logger.handlers):
    logger.addHandler(AzureLogHandler(connection_string=instrumentation_connection_string))
logger.setLevel(logging.INFO)

# Every span is exported (AlwaysOnSampler) to the same Application Insights resource as the logs
tracer = Tracer(
    exporter=AzureExporter(
        connection_string=instrumentation_connection_string
    ),
    sampler=AlwaysOnSampler()
)

# Spool parameters
# Attached to the INITIALISED log record below as custom dimensions
run_time_parameters = {'custom_dimensions': {
    'image_contents_tbl_name': image_contents_tbl_name,
    'enriched_images_tbl_name': enriched_images_tbl_name,
    'clustered_images_tbl_name_1': clustered_images_tbl_name_1,
    'clustered_images_tbl_name_2': clustered_images_tbl_name_2,
    'clustered_images_tbl_name_3': clustered_images_tbl_name_3,
    'batch_description': batch_description,
    'batch_root': batch_root,
    'batch_num': batch_num,
    'notebook_name': mssparkutils.runtime.context['notebookname']
} }

logger.info(f"{mssparkutils.runtime.context['notebookname']}: INITIALISED", extra=run_time_parameters)

In [ ]:
# Column names of the image contents and enriched images tables
file_path_col = "path"
file_name_col = "file_name"
file_type_col = "file_type"
image_analysis_col = "analysis_results"
image_analysis_error_col = "image_analysis_error"
read_col = "read_results"
read_error_col = "read_error"

# Column names as stored in the image clustering tables; cluster_path_col is
# the key those tables are joined on
cluster_path_col = "original_uri"
cluster_col = "cluster"
image_expl_col = "Explanations"
x_col = "X"
y_col = "Y"

# NOTE(review): the *_1 names below are not referenced in the code visible here
cluster_col_1 = "cluster_1"
image_expl_col_1 = "Explanations_1"
x_col_1 = "X_1"
y_col_1 = "Y_1"

# Names the second clustering table's columns are renamed to before the join
cluster_col_2 = "cluster_2"
image_expl_col_2 = "Explanations_2"
x_col_2 = "X_2"
y_col_2 = "Y_2"

# Names the third clustering table's columns are renamed to before the join
cluster_col_3 = "cluster_3"
image_expl_col_3 = "Explanations_3"
x_col_3 = "X_3"
y_col_3 = "Y_3"

# For multimodal clustering results
# cluster_path_col_4 is the join key; the *_4 names are the rename targets
cluster_path_col_4 = "file_path"
summarized_text_xsum_col = "summarized_text_xsum"
summarized_text_xsum_col_4 = "summarized_text_xsum_4"
topic_name_col = "topic_name"
topic_name_col_4 = "topic_name_4"
cluster_col_4 = "cluster_4"
x_col_4 = "X_4"
y_col_4 = "Y_4"

# NOTE(review): output_cols and error_cols are not referenced in the code visible here
output_cols = [ 
    file_name_col, 
    file_type_col
]

error_cols = [
    image_analysis_error_col
]

In [ ]:
import json
import os
import random
import uuid
import csv
from types import SimpleNamespace

import pyspark.sql.functions as F
from pyspark.sql.functions import col
from pyspark.sql.types import StringType, StructType, StructField, MapType
from pyspark import SparkContext
from pyspark.sql import SparkSession

# Initialise session and config
# `spark` is read on the next line before it is assigned below, so it must
# already exist when this cell runs.
# NOTE(review): presumably the notebook runtime pre-defines `spark` — confirm.
sc = spark.sparkContext
# NOTE(review): the app name interpolates the whole runtime context object,
# not just its 'notebookname' entry — confirm this is intended.
spark = SparkSession.builder.appName(f"ImageProcessing {mssparkutils.runtime.context}").getOrCreate()

def read_batch_config(batch_root: str):
    """
    Read ``{batch_root}/config.json`` and return it as nested SimpleNamespace objects.

    We read the config file using the Java File System API as we do not need to let multiple nodes read individual lines and join it
    all back together again.

    Side effect: sets Hadoop's ``fs.defaultFS`` to the notebook-level ``file_system`` value.

    Raises:
        FileNotFoundError: if the config file does not exist (also logged as an error).
        json.JSONDecodeError: if the file content is not valid JSON.
    """
    # Change our file system from 'synapse' to 'input'
    hadoop_conf = sc._jsc.hadoopConfiguration()
    hadoop_conf.set("fs.defaultFS", file_system)

    fs = sc._jvm.org.apache.hadoop.fs.FileSystem.get(hadoop_conf)
    config_path = sc._jvm.org.apache.hadoop.fs.Path(f'{batch_root}/config.json')

    # There is no fallback config: fail here with a clear error instead of
    # letting fs.open() raise an opaque JVM exception further down.
    if not fs.exists(config_path):
        logger.error(f'{config_path} not found.')
        raise FileNotFoundError(f'{config_path} not found.')

    # Open our file directly rather than through spark
    input_stream = fs.open(config_path)  # FSDataInputStream
    try:
        config_string = sc._jvm.java.io.BufferedReader(
            sc._jvm.java.io.InputStreamReader(input_stream, sc._jvm.java.nio.charset.StandardCharsets.UTF_8)
            ).lines().collect(sc._jvm.java.util.stream.Collectors.joining("\n"))
    finally:
        # Release the underlying stream even when reading fails
        input_stream.close()

    # Load it into json; every JSON object becomes a SimpleNamespace so that
    # keys are reachable as attributes
    return json.loads(config_string, object_hook=lambda dictionary: SimpleNamespace(**dictionary))

# Load the batch configuration inside its own trace span; a failure is logged
# with its traceback and then re-raised
with tracer.span(name=f"Load config: {mssparkutils.runtime.context['notebookname']}"):
    try:
        config = read_batch_config(batch_root)
    except Exception as e:
        logger.exception(e)
        raise e

    # Set log level
    # Only "INFO" is honoured: any other configured value switches the logger
    # to ERROR and config.log_level is overwritten to match
    if config.log_level == "INFO":
        logger.setLevel(logging.INFO)
    else:
        logger.setLevel(logging.ERROR)
        config.log_level = "ERROR"

In [ ]:
import pyodbc
from pyspark.sql.functions import col, current_timestamp, from_json
# Dedicated and serverless SQL config
# dedicated_database is used for the batch status table, database for the
# external tables created over the parquet outputs
dedicated_database = 'dedicated'
database = 'minted'   
driver= '{ODBC Driver 17 for SQL Server}'

# secrets
# Credentials and endpoints are read through the "keyvault" linked service
sql_user_name = mssparkutils.credentials.getSecretWithLS("keyvault", "SynapseSQLUserName")
sql_user_pwd = mssparkutils.credentials.getSecretWithLS("keyvault", "SynapseSQLPassword")
serverless_sql_endpoint = mssparkutils.credentials.getSecretWithLS("keyvault", "SynapseServerlessSQLEndpoint")
dedicated_sql_endpoint = mssparkutils.credentials.getSecretWithLS("keyvault", "SynapseDedicatedSQLEndpoint")

In [ ]:
# Build images_df: one row per image in the contents table, left-joined with
# its enrichment results and the four clustering results
with tracer.span(name=f'Read the dataframe from the given table {image_contents_tbl_name}'):
    # Load Dataframes and rename columns 
    image_contents_df = spark.read.parquet(f'{minted_tables_output_path}{image_contents_tbl_name}')
    enriched_images_df = spark.read.parquet(f'{minted_tables_output_path}{enriched_images_tbl_name}')
    # The first clustering table keeps its original column names
    clustered_images_df_1 = spark.read.parquet(f'{minted_tables_output_path}{clustered_images_tbl_name_1}')
    # The second and third get a _2 / _3 suffix so all three can sit side by side after the join
    clustered_images_df_2 = (spark.read.parquet(f'{minted_tables_output_path}{clustered_images_tbl_name_2}')
        .withColumnRenamed(image_expl_col, image_expl_col_2)
        .withColumnRenamed(cluster_col, cluster_col_2)
        .withColumnRenamed(x_col, x_col_2)
        .withColumnRenamed(y_col, y_col_2)
    )
    clustered_images_df_3 = (spark.read.parquet(f'{minted_tables_output_path}{clustered_images_tbl_name_3}')
        .withColumnRenamed(image_expl_col, image_expl_col_3)
        .withColumnRenamed(cluster_col, cluster_col_3)
        .withColumnRenamed(x_col, x_col_3)
        .withColumnRenamed(y_col, y_col_3)
    )
    # For multimodal clustering results
    clustered_multimodal_df = (spark.read.parquet(f'{minted_tables_output_path}{clustered_multimodal_tbl_name}')
        .withColumnRenamed(summarized_text_xsum_col, summarized_text_xsum_col_4)
        .withColumnRenamed(topic_name_col, topic_name_col_4)
        .withColumnRenamed(cluster_col, cluster_col_4)
        .withColumnRenamed(x_col, x_col_4)
        .withColumnRenamed(y_col, y_col_4)
    )
    # Keep only the rows whose type_of_file is 'images' ...
    clustered_multimodal_df = (clustered_multimodal_df
        .where(clustered_multimodal_df.type_of_file == 'images')
    )	
    # ... then drop the discriminator column, which is no longer needed
    clustered_multimodal_df = (clustered_multimodal_df
        .drop(clustered_multimodal_df.type_of_file)
    )
    # Join Dataframes
    # Every join is a left outer join, so each row of image_contents_df is
    # kept even when a right-hand table has no match for it
    images_df = (image_contents_df
        .join(enriched_images_df, file_path_col, 'left_outer')
        .join(clustered_images_df_1, [image_contents_df[file_path_col] == clustered_images_df_1[cluster_path_col]], 'left_outer')
        .join(clustered_images_df_2, [image_contents_df[file_path_col] == clustered_images_df_2[cluster_path_col]], 'left_outer')
        .join(clustered_images_df_3, [image_contents_df[file_path_col] == clustered_images_df_3[cluster_path_col]], 'left_outer')
        .join(clustered_multimodal_df, [image_contents_df[file_path_col] == clustered_multimodal_df[cluster_path_col_4]], 'left_outer')    
        # Select specific columns
        # Columns are taken through their source dataframe, which leaves the
        # clustering tables' own join-key columns out of the result
        .select(
            image_contents_df.path,
            image_contents_df.file_name,
            image_contents_df.file_type,
            enriched_images_df[image_analysis_col],
            enriched_images_df[image_analysis_error_col],
            enriched_images_df[read_col],
            enriched_images_df[read_error_col],
            clustered_images_df_1[image_expl_col],
            clustered_images_df_1[cluster_col],
            clustered_images_df_1[x_col],
            clustered_images_df_1[y_col],
            clustered_images_df_2[image_expl_col_2],
            clustered_images_df_2[cluster_col_2],
            clustered_images_df_2[x_col_2],
            clustered_images_df_2[y_col_2],
            clustered_images_df_3[image_expl_col_3],
            clustered_images_df_3[cluster_col_3],
            clustered_images_df_3[x_col_3],
            clustered_images_df_3[y_col_3],
            clustered_multimodal_df[summarized_text_xsum_col_4], # For multimodal clustering results
            clustered_multimodal_df[topic_name_col_4], # For multimodal clustering results
            clustered_multimodal_df[cluster_col_4], # For multimodal clustering results
            clustered_multimodal_df[x_col_4], # For multimodal clustering results
            clustered_multimodal_df[y_col_4] # For multimodal clustering results
        )
        # Rename file path column
        .withColumnRenamed(file_path_col, 'file_path')
        # Stamp every row with the batch identifier
        .withColumn("batch_num", F.lit(batch_num))
    )

In [ ]:
def _summarise_error_messages(df, error_col, response_schema, message_path):
    """Count the rows of ``df`` per error message.

    The JSON string in ``<error_col>.response`` is parsed with
    ``response_schema``; ``message_path`` selects the message field from the
    parsed struct (aliased ``data``). Returns a dataframe with the two
    columns ``Errors`` and ``Count``.
    """
    return (df
        .select(from_json(f"{error_col}.response", response_schema).alias("data"))
        .select(message_path)
        .groupBy("message")
        .count()
        .selectExpr("message as Errors", "count as Count")
    )


def _error_summary_ddl(tbl_name):
    """Return the SQL that creates, if missing, the external table over an (Errors, Count) parquet folder."""
    return f'''
        IF NOT EXISTS (SELECT * FROM sys.tables WHERE name = '{tbl_name}')
        CREATE EXTERNAL TABLE [{tbl_name}] (
            [Errors] nvarchar(1000),
            [Count] bigint
        )
        WITH (
            LOCATION = 'minted_tables/{tbl_name}/**',
            DATA_SOURCE = [synapse_<<STORAGE_ACCOUNT_NAME>>_dfs_core_windows_net],
            FILE_FORMAT = [SynapseParquetFormat]
        )
    '''


# Write images_df and two per-message error summaries as parquet, then create
# the external tables over them if they do not exist yet
with tracer.span(name='Persist processed images as SQL table'):
    processed_images_tbl_name = f'{batch_num}_processed_images'
    images_df.write.mode("overwrite").parquet(f'{minted_tables_output_path}{processed_images_tbl_name}')
    sql_command_processed_images = f'''
        IF NOT EXISTS (SELECT * FROM sys.tables WHERE name = '{processed_images_tbl_name}')
        CREATE EXTERNAL TABLE [{processed_images_tbl_name}] (
            [file_path] nvarchar(1000),
            [file_name] nvarchar(1000),
            [file_type] nvarchar(1000),
            [analysis_results] varchar(max),
            [image_analysis_error] varchar(max),
            [read_results] varchar(max),
            [read_error] varchar(max),
            [Explanations] nvarchar(4000),
            [cluster] bigint,
            [X] float,
            [Y] float,
            [Explanations_2] nvarchar(4000),
            [cluster_2] bigint,
            [X_2] float,
            [Y_2] float,
            [Explanations_3] nvarchar(4000),
            [cluster_3] bigint,
            [X_3] float,
            [Y_3] float,
            [summarized_text_xsum_4] nvarchar(4000),
            [topic_name_4] nvarchar(4000),
            [cluster_4] bigint,
            [X_4] float,
            [Y_4] float,
            [batch_num] nvarchar(4000)
        )
        WITH (
            LOCATION = 'minted_tables/{processed_images_tbl_name}/**',
            DATA_SOURCE = [synapse_<<STORAGE_ACCOUNT_NAME>>_dfs_core_windows_net],
            FILE_FORMAT = [SynapseParquetFormat]
        )
    '''

    # The batch_num_image_read_errors dataframe is used as a simplified query in the powerBI query
    # The read error response nests its message under an "error" map
    read_error_schema = StructType([
        StructField("error", MapType(StringType(), StringType()), True)
    ])
    batch_num_image_read_errors = _summarise_error_messages(
        images_df, read_error_col, read_error_schema, "data.error.message")

    batch_num_image_read_errors_tbl_name = f'{batch_num}_image_read_errors'
    batch_num_image_read_errors.write.mode("overwrite").parquet(f'{minted_tables_output_path}{batch_num_image_read_errors_tbl_name}')

    # Create the external table for batch_num_image_read_errors to be used in powerBI
    sql_command_batch_num_image_read_errors = _error_summary_ddl(batch_num_image_read_errors_tbl_name)

    # The batch_num_image_analyze_errors dataframe is used as a simplified query in the powerBI query
    # The analysis error response carries its message at the top level
    analyze_error_schema = StructType([
        StructField("code", StringType(), True),
        StructField("requestId", StringType(), True),
        StructField("message", StringType(), True)
    ])
    batch_num_image_analyze_errors = _summarise_error_messages(
        images_df, image_analysis_error_col, analyze_error_schema, "data.message")

    batch_num_image_analyze_errors_tbl_name = f'{batch_num}_image_analyze_errors'
    batch_num_image_analyze_errors.write.mode("overwrite").parquet(f'{minted_tables_output_path}{batch_num_image_analyze_errors_tbl_name}')

    # Create the external table for batch_num_image_analyze_errors to be used in powerBI
    sql_command_image_analysis_errors = _error_summary_ddl(batch_num_image_analyze_errors_tbl_name)

    with pyodbc.connect('DRIVER='+driver+';SERVER=tcp:'+serverless_sql_endpoint+';PORT=1433;DATABASE='+database+';UID='+sql_user_name+';PWD='+ sql_user_pwd) as conn:
        with conn.cursor() as cursor:
            for sql_command in (
                sql_command_processed_images,
                sql_command_image_analysis_errors,
                sql_command_batch_num_image_read_errors,
            ):
                cursor.execute(sql_command)

# Write one JSON document per image row to the output container
with tracer.span(name='Persist processed images as json'):
    # Root folder of this batch in the output container
    output_path = f'abfss://{output_container}@{blob_account_name}.dfs.{azure_storage_domain}/{batch_num}'

    # Add a "json" column holding the whole row serialised as a JSON object.
    # images_df is rebound here, so code that runs later sees this extra column too.
    images_df = images_df \
        .withColumn("json", F.to_json(F.struct(col("*"))))

    # Bring every row back to the driver
    out_lst = images_df.collect()

    # One file per row, named after the row's file_name; existing files are overwritten
    for row in out_lst:
        json_path = f'{output_path}/image_processing_json/{row.file_name}.output.json'
        mssparkutils.fs.put(json_path, row.json, overwrite=True)

def _persist_and_collect_report(report_df, report_num):
    """Write one clustering report as parquet and return its rows.

    The parquet folder is ``{batch_num}_clustered_images_report_{report_num}``
    under ``minted_tables_output_path``. The returned rows are the result of
    ``report_df.collect()`` and feed the CSV export.
    """
    report_df.write.mode("overwrite").parquet(f"{minted_tables_output_path}{batch_num}_clustered_images_report_{report_num}")
    return report_df.collect()


def _upload_rows_as_csv(local_file_name, header, rows, destination_path):
    """Write ``rows`` under ``header`` to a local CSV file and upload its text.

    Saving to a 'local' file first as we can't save directly to the output
    container via open(). An existing destination file is overwritten.
    """
    # newline='' is what the csv module documents for files it writes to; the
    # explicit encoding avoids depending on the driver's locale.
    with open(local_file_name, 'w', newline='', encoding='utf-8') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(header)
        writer.writerows(rows)

    with open(local_file_name, 'r', encoding='utf-8') as csv_file:
        mssparkutils.fs.put(destination_path, csv_file.read(), overwrite=True)


with tracer.span(name='Persist clustering results as CSV and a table'):
    # Source columns in images_df for each of the three image clustering runs.
    # Every report exposes them under the same un-suffixed column names.
    image_cluster_sources = [
        (image_expl_col, cluster_col, x_col, y_col),
        (image_expl_col_2, cluster_col_2, x_col_2, y_col_2),
        (image_expl_col_3, cluster_col_3, x_col_3, y_col_3),
    ]
    clustered_df_1, clustered_df_2, clustered_df_3 = [
        images_df.select(
            col('file_path').alias(cluster_path_col),
            col(expl_src).alias(image_expl_col),
            col(cluster_src).alias(cluster_col),
            col(x_src).alias(x_col),
            col(y_src).alias(y_col),
        ).withColumn("batch_num", F.lit(batch_num))
        for expl_src, cluster_src, x_src, y_src in image_cluster_sources
    ]

    # For multimodal clustering results
    # Rows containing any null are dropped from this report only
    clustered_df_4 = (images_df
        .select(
            col('file_path'),
            col(summarized_text_xsum_col_4).alias(summarized_text_xsum_col),
            col(topic_name_col_4).alias(topic_name_col),
            col(cluster_col_4).alias(cluster_col),
            col(x_col_4).alias(x_col),
            col(y_col_4).alias(y_col),
        )
        .withColumn("batch_num", F.lit(batch_num))
        .na.drop()
    )

    # Save each clustering result as parquet and keep its rows for the CSV export
    clustered_lst_1 = _persist_and_collect_report(clustered_df_1, 1)
    clustered_lst_2 = _persist_and_collect_report(clustered_df_2, 2)
    clustered_lst_3 = _persist_and_collect_report(clustered_df_3, 3)
    clustered_lst_4 = _persist_and_collect_report(clustered_df_4, 4)

    # External table definitions: reports 1-3 share one layout ...
    ddl_statements = []
    for tbl_name in [f'{batch_num}_clustered_images_report_{i}' for i in [1, 2, 3]]:
        sql_cmd = f'''
            IF NOT EXISTS (SELECT * FROM sys.tables WHERE name = '{tbl_name}')
            CREATE EXTERNAL TABLE [{tbl_name}] (
                [original_uri] nvarchar(4000),
                [Explanations] nvarchar(max),
                [cluster] bigint,
                [X] float,
                [Y] float,
                [batch_num] nvarchar(4000)
            )
            WITH (
                LOCATION = 'minted_tables/{tbl_name}/**',
                DATA_SOURCE = [synapse_<<STORAGE_ACCOUNT_NAME>>_dfs_core_windows_net],
                FILE_FORMAT = [SynapseParquetFormat]
            )
        '''
        ddl_statements.append(sql_cmd)

    # ... while the multimodal report (4) has its own columns
    clustered_images_report_tbl_name_4 = f'{batch_num}_clustered_images_report_4'

    clustered_df_sql_command_4 = f"""IF NOT EXISTS (SELECT * FROM sys.tables WHERE name = '{clustered_images_report_tbl_name_4}')
    CREATE EXTERNAL TABLE [{clustered_images_report_tbl_name_4}]
    (
        [file_path] nvarchar(4000),
        [summarized_text_xsum] nvarchar(max),
        [topic_name] nvarchar(max),
        [cluster] bigint,
        [X] float,
        [Y] float,
        [batch_num] nvarchar(4000)
    ) WITH (
            LOCATION = 'minted_tables/{clustered_images_report_tbl_name_4}/**',
            DATA_SOURCE = [synapse_<<STORAGE_ACCOUNT_NAME>>_dfs_core_windows_net],
            FILE_FORMAT = [SynapseParquetFormat]
            )"""
    ddl_statements.append(clustered_df_sql_command_4)

    # All four statements run over a single connection
    with pyodbc.connect('DRIVER='+driver+';SERVER=tcp:'+serverless_sql_endpoint+';PORT=1433;DATABASE='+database+';UID='+sql_user_name+';PWD='+ sql_user_pwd) as conn:
        with conn.cursor() as cursor:
            for ddl_statement in ddl_statements:
                cursor.execute(ddl_statement)

    # Output as CSV
    csv_exports = [
        ('clusters_1.csv', clustered_df_1.columns, clustered_lst_1),
        ('clusters_2.csv', clustered_df_2.columns, clustered_lst_2),
        ('clusters_3.csv', clustered_df_3.columns, clustered_lst_3),
        ('clusters_4.csv', clustered_df_4.columns, clustered_lst_4),  # For multimodal clustering results
    ]
    for csv_name, header, rows in csv_exports:
        _upload_rows_as_csv(csv_name, header, rows, f'{output_path}/image_processing_clustering/{csv_name}')

In [ ]:
# return name of new table
# Wrapped under 'custom_dimensions' so the dict can be passed as `extra` to the logger
output = {'custom_dimensions': {
    'batch_num': batch_num,
    'processed_images_tbl_name': processed_images_tbl_name,
    'notebook_name': mssparkutils.runtime.context['notebookname']
} }

## Raise clustering complete event
Raising this event also depends on the image enrichments having completed; this dependency is accepted.

In [ ]:
# Prepare the event contents
df = images_df

with tracer.span(name='preparing contents to send to event grid'):
    from datetime import datetime
    # NOTE: datetime.now() is naive, so %Z renders as an empty string here
    now = datetime.now().strftime("%Y-%m-%dT%H:%M:%S%Z")
    web_app_uri = config.rule_sets.webapp_uri
    subscriber_uri = config.rule_sets.teams_webhook_uri
    alert_email = config.rule_sets.alert_email
    # NOTE(review): files_count is not used in the code visible here, yet it
    # costs a full count over images_df — confirm whether it is still needed.
    files_count = df.count()

    # Number of images per cluster of the first clustering run, ordered by cluster
    df_cluster_count = df.groupBy("cluster").count().orderBy('cluster', ascending=True)
    cluster_json_list = df_cluster_count.toJSON().collect()
    num_of_clusters = df_cluster_count.distinct().count()

    # generate the Event Grid json
    # The payload is built as a dict and serialised by json.dumps, so quotes,
    # backslashes or newlines in a value (e.g. batch_description) cannot
    # corrupt it. Values the string template used to quote are kept as
    # strings via str().
    event_data_obj = {
        "batch_id": str(batch_num),
        "batch_description": str(batch_description),
        "eventDate": now,
        "eventMetrics": {
            "event_type": "image",
            "files_processed_count": str(image_file_count),
            "event_detail_uri": f"https://{web_app_uri}/reports",
            "num_of_clusters": num_of_clusters,
            # Each element of cluster_json_list is one JSON object per cluster
            "clusters": [json.loads(cluster_json) for cluster_json in cluster_json_list],
        },
        "teams_webhook_endpoint": str(subscriber_uri),
        "alert_email": str(alert_email),
    }

    event_data = json.dumps(event_data_obj)

    print(event_data)

In [ ]:
# Raise the event
# Publishes event_data_obj to the Event Grid topic; a send failure is logged and re-raised
with tracer.span(name='sending message to event grid'):    
    from azure.identity import ClientSecretCredential
    from azure.eventgrid import EventGridPublisherClient, EventGridEvent    

    # Get value from keyvault to build Event Grid Topic event
    subscription_id = TokenLibrary.getSecretWithLS("keyvault", 'SubscriptionId')
    resource_group_name = TokenLibrary.getSecretWithLS("keyvault", 'ResourceGroupName')
    event_grid_topic_name = TokenLibrary.getSecretWithLS("keyvault", 'EventGridTopicName')
    event_grid_topic_endpoint = TokenLibrary.getSecretWithLS("keyvault", 'EventGridTopicEndpointUri')
    tenant_id = TokenLibrary.getSecretWithLS("keyvault", 'TenantID')
    client_id = TokenLibrary.getSecretWithLS("keyvault", 'ADAppRegClientId')
    client_secret = TokenLibrary.getSecretWithLS("keyvault", 'ADAppRegClientSecret')
    # Resource path of the topic, passed as the event's `topic`
    event_grid_topic = f'/subscriptions/{subscription_id}/resourceGroups/{resource_group_name}/providers/Microsoft.EventGrid/topics/{event_grid_topic_name}'
    # Authenticate with the app registration's client secret
    credential = ClientSecretCredential(tenant_id, client_id, client_secret)
    client = EventGridPublisherClient(event_grid_topic_endpoint, credential)

    try:
        # queue event grid message
        event = EventGridEvent(data=event_data_obj, subject="MINTED/ClusterAlert", event_type="MINTED.ruleTriggered", data_version="1.0", topic=event_grid_topic)
        client.send(event)
        print("done")
    except Exception as e:
        logger.exception(e)
        raise e


In [ ]:
from time import sleep
import datetime
# Update Status Table
def get_recent_status(batch_num, driver, dedicated_sql_endpoint, dedicated_database, sql_user_name, sql_user_pwd):
    """Return the furthest-progressed status row of a batch.

    Queries ``[dbo].[batch_status]`` in the dedicated database for the row of
    ``batch_num`` with the highest ``num_stages_complete``.

    Returns:
        A ``(num_stages_complete, description)`` tuple.

    Raises:
        LookupError: if the table holds no row for ``batch_num``.
    """
    query = """
        SELECT TOP (1) 
        [num_stages_complete], [description]
        FROM [dbo].[batch_status] 
        WHERE [batch_id] = ?
        ORDER BY [num_stages_complete] DESC;
    """
    with pyodbc.connect(f'DRIVER={driver};SERVER=tcp:{dedicated_sql_endpoint};PORT=1433;DATABASE={dedicated_database};UID={sql_user_name};PWD={sql_user_pwd}',autocommit=True) as conn:
        with conn.cursor() as cursor:
            # batch_num is bound as a parameter, never interpolated into the SQL
            cursor.execute(query, batch_num)
            row = cursor.fetchone()

    # fetchone() returns None when nothing matched; report that explicitly
    # instead of failing on tuple unpacking.
    if row is None:
        raise LookupError(f'No batch_status row found for batch_id {batch_num!r}')

    num_stages_complete, description = row
    return num_stages_complete, description

def update_status_table(status_text, minted_tables_path, batch_num, driver, dedicated_sql_endpoint, sql_user_name, sql_user_pwd,
                        max_retries=10, retry_delay_seconds=3, total_stages=10):
    """Advance the batch's stage counter by one and record a new status text.

    Reads the current ``num_stages_complete`` via ``get_recent_status``, adds
    one, and writes ``[<stages>/<total_stages>] <status_text>`` together with
    the current timestamp to ``batch_status``. Any failure is logged as a
    warning and the whole read-and-update is retried.

    Args:
        status_text: Text placed after the ``[n/total]`` prefix.
        minted_tables_path: Unused; kept so existing callers keep working.
        batch_num: Batch whose status row is updated.
        driver, dedicated_sql_endpoint, sql_user_name, sql_user_pwd: ODBC
            connection details. The database name comes from the
            notebook-level ``dedicated_database``.
        max_retries: Maximum number of attempts.
        retry_delay_seconds: Pause between two attempts.
        total_stages: Denominator shown in the status prefix.

    Raises:
        ValueError: if ``max_retries`` is smaller than 1.
        Exception: the error of the last attempt once all attempts failed.
    """
    # Imported locally because the notebook also binds the name `datetime`
    # to the class (`from datetime import datetime`) in another cell.
    import datetime as dt

    if max_retries < 1:
        raise ValueError('max_retries must be at least 1')

    sql_command = "UPDATE batch_status SET status = ?, update_time_stamp = ?, num_stages_complete = ? WHERE batch_id = ?"
    last_error = None

    for attempt in range(1, max_retries + 1):
        try:
            stages_complete, _ = get_recent_status(batch_num, driver, dedicated_sql_endpoint, dedicated_database, sql_user_name, sql_user_pwd)
            stages_complete += 1
            status = f'[{stages_complete}/{total_stages}] {status_text}'
            time_stamp = dt.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

            with pyodbc.connect('DRIVER='+driver+';SERVER=tcp:'+dedicated_sql_endpoint+';PORT=1433;DATABASE='+dedicated_database+';UID='+sql_user_name+';PWD='+ sql_user_pwd+'',autocommit=True) as conn:
                with conn.cursor() as cursor:
                    # autocommit=True, so no explicit commit is needed
                    cursor.execute(sql_command, status, time_stamp, stages_complete, batch_num)
            return
        except Exception as e:
            last_error = e
            logger.warning(f'Failed to update status table: {e}, retrying . . .')
            # Waiting only makes sense when another attempt follows
            if attempt < max_retries:
                sleep(retry_delay_seconds)

    raise last_error

# Advance the batch's stage counter and record that image processing has finished
update_status_table('Image Processing Complete', minted_tables_output_path, batch_num, driver, dedicated_sql_endpoint, sql_user_name, sql_user_pwd)


# Return the object to the pipeline
# The OUTPUT record carries the output dict's custom dimensions
logger.info(f"{mssparkutils.runtime.context['notebookname']}: OUTPUT", extra=output)
# NOTE(review): a dict is passed as the exit value — confirm the caller handles that form
mssparkutils.notebook.exit(output['custom_dimensions'])        