In [ ]:
%%configure -f
{
"conf": {
     "spark.dynamicAllocation.disableIfMinMaxNotSpecified.enabled": true,
     "spark.dynamicAllocation.enabled": true,
     "spark.dynamicAllocation.minExecutors": 2,
     "spark.dynamicAllocation.maxExecutors": 8,
     "spark.jars.packages": "com.microsoft.azure:synapseml_2.12:0.10.0-19-c3a445c5-SNAPSHOT",
     "spark.jars.repositories": "https://mmlspark.azureedge.net/maven",
     "spark.jars.excludes": "org.scala-lang:scala-reflect,org.apache.spark:spark-tags_2.12,org.scalactic:scalactic_2.12,org.scalatest:scalatest_2.12,com.fasterxml.jackson.core:jackson-databind",
     "spark.yarn.user.classpath.first": "true"
   }
}

In [ ]:
# Notebook parameters. Every value defaults to an empty string here;
# presumably the calling pipeline overrides them at run time — TODO confirm.
azure_storage_domain = ""  # not referenced in the cells shown here
batch_num = ""  # prefix of the enriched-images table name; also logged as a custom dimension
batch_root = ""  # folder that is expected to contain config.json
file_system = ""  # value written to the Hadoop "fs.defaultFS" setting before the config is read
image_contents_tbl_name = ""  # parquet table (relative to minted_tables_output_path) holding the images
blob_account_name = ""  # not referenced in the cells shown here
minted_tables_output_path = ""  # path prefix for the parquet tables read and written by this notebook

In [ ]:
# Names of the DataFrame columns this notebook works with
file_path_col = "path"
image_analytics_key_col = "image_analytics_key"

# Concurrency value handed to the cognitive service transformers
cog_svc_concurrency = 1

# Secrets fetched through the "keyvault" linked service
instrumentation_connection_string = mssparkutils.credentials.getSecretWithLS(
    "keyvault", "AppInsightsConnectionString"
)
# The keys secret is one comma-separated string; split it into a list of keys
image_analytics_keys = mssparkutils.credentials.getSecretWithLS(
    "keyvault", "ComputerVisonKeys"
).split(",")

In [ ]:
# Initiate logging
import logging
from opencensus.ext.azure.log_exporter import AzureLogHandler
from opencensus.ext.azure.trace_exporter import AzureExporter
from opencensus.trace import config_integration
from opencensus.trace.samplers import AlwaysOnSampler
from opencensus.trace.tracer import Tracer

# Switch on the opencensus integration for the standard logging module
config_integration.trace_integrations(['logging'])

# Notebook logger: INFO and above, exported through the Azure log handler
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
logger.addHandler(
    AzureLogHandler(connection_string=instrumentation_connection_string)
)

# Tracer that samples every span and exports it with the same connection string
tracer = Tracer(
    sampler=AlwaysOnSampler(),
    exporter=AzureExporter(connection_string=instrumentation_connection_string),
)

# Run parameters attached to the start-up log record as custom dimensions
run_time_parameters = {
    'custom_dimensions': dict(
        batch_num=batch_num,
        file_system=file_system,
        image_contents_tbl_name=image_contents_tbl_name,
        notebook_name=mssparkutils.runtime.context['notebookname'],
    )
}

logger.info(
    f"{mssparkutils.runtime.context['notebookname']}: INITIALISED",
    extra=run_time_parameters,
)

In [ ]:
import json
import os
import random
import uuid
from types import SimpleNamespace

import pyspark.sql.functions as F
from pyspark.sql.functions import col
from pyspark.sql.types import StringType, StructType, StructField
from pyspark import SparkContext
from pyspark.sql import SparkSession

from synapse.ml.cognitive import *

# Initialise session and config
# NOTE: `spark` is read on the next line before this cell assigns it, so a `spark`
# session must already exist when the cell runs (presumably the one supplied by
# the notebook runtime — TODO confirm).
sc = spark.sparkContext
# NOTE(review): the app name interpolates the whole `mssparkutils.runtime.context`
# value, not just its 'notebookname' entry that the rest of the notebook uses — confirm intent.
spark = SparkSession.builder.appName(f"TextProcessing {mssparkutils.runtime.context}").getOrCreate()

def read_batch_config(batch_root: str):
    """
    Read ``{batch_root}/config.json`` and return it as nested ``SimpleNamespace`` objects.

    We read the config file using the Java File System API as we do not need to let
    multiple nodes read individual lines and join it all back together again.

    Side effect: the Hadoop ``fs.defaultFS`` setting is set to the notebook-level
    ``file_system`` value and is left that way after the call.

    Args:
        batch_root: Folder that contains ``config.json``.

    Returns:
        The parsed JSON document, with every JSON object converted to a
        ``SimpleNamespace`` so values can be read as attributes.

    Raises:
        FileNotFoundError: If ``config.json`` does not exist under ``batch_root``.
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    hadoop_conf = sc._jsc.hadoopConfiguration()

    # Point Hadoop's default file system at the one that holds the batch
    hadoop_conf.set("fs.defaultFS", file_system)

    fs = sc._jvm.org.apache.hadoop.fs.FileSystem.get(hadoop_conf)
    config_path = sc._jvm.org.apache.hadoop.fs.Path(f'{batch_root}/config.json')

    # There is no fallback config: a missing file is an error, so fail here with a
    # clear message instead of letting fs.open() fail on the same missing path.
    if not fs.exists(config_path):
        logger.error(f'{config_path} not found.')
        raise FileNotFoundError(f'{config_path} not found.')

    # Open our file directly rather than through spark
    input_stream = fs.open(config_path)  # FSDataInputStream
    try:
        config_string = sc._jvm.java.io.BufferedReader(
            sc._jvm.java.io.InputStreamReader(input_stream, sc._jvm.java.nio.charset.StandardCharsets.UTF_8)
        ).lines().collect(sc._jvm.java.util.stream.Collectors.joining("\n"))
    finally:
        # Always release the underlying stream, even if reading fails
        input_stream.close()

    # Load it into json
    return json.loads(config_string, object_hook=lambda dictionary: SimpleNamespace(**dictionary))

with tracer.span(name=f"Load config: {mssparkutils.runtime.context['notebookname']}"):
    # Read the batch configuration; any failure is logged and then re-raised
    try:
        config = read_batch_config(batch_root)
    except Exception as err:
        logger.exception(err)
        raise err

    # Only "INFO" is honoured as a log level; any other value is forced to "ERROR"
    if config.log_level != "INFO":
        logger.setLevel(logging.ERROR)
        config.log_level = "ERROR"
    else:
        logger.setLevel(logging.INFO)

In [ ]:
import pyodbc
from pyspark.sql.functions import current_timestamp
# Credentials and endpoint for serverless SQL, fetched through the "keyvault" linked service
serverless_sql_endpoint = mssparkutils.credentials.getSecretWithLS(
    "keyvault", "SynapseServerlessSQLEndpoint"
)
sql_user_name = mssparkutils.credentials.getSecretWithLS(
    "keyvault", "SynapseSQLUserName"
)
sql_user_pwd = mssparkutils.credentials.getSecretWithLS(
    "keyvault", "SynapseSQLPassword"
)

# ODBC driver and database used for the serverless SQL connection
driver = "{ODBC Driver 17 for SQL Server}"
database = "minted"

In [ ]:
with tracer.span(name='Load image contents table'):
    # The table is stored as parquet under the minted tables output path
    df_image_contents = spark.read.parquet(f'{minted_tables_output_path}{image_contents_tbl_name}')

with tracer.span(name='Distribute cognitive service keys across rows'):
    def rand_key():
        """Return one of the loaded image analytics keys, chosen uniformly at random."""
        return random.choice(image_analytics_keys)

    # Spark assumes UDFs are deterministic and may then drop or repeat calls while
    # optimising; this UDF is random by design, so declare it non-deterministic.
    udf_rand_key = F.udf(rand_key, StringType()).asNondeterministic()

    # Give every row its own key so requests are spread across all available keys
    df_image_contents = df_image_contents.withColumn(image_analytics_key_col, udf_rand_key())

In [ ]:
# Use the AnalyzeImage transformer to process the list of visual features for each image
with tracer.span(name='Analyze Images with Cog Services'):
    # Each row supplies its own subscription key (image_analytics_key_col) and its
    # image bytes ("content"). Results go to "analysis_results" and errors to
    # "image_analysis_error".
    analysis = (AnalyzeImage()
            .setLocation(config.location)
            .setSubscriptionKeyCol(image_analytics_key_col)
            .setVisualFeatures(["Categories", "Color", "Description", "Faces", "Objects", "Tags", "Adult"])
            .setDetails(["Landmarks"])
            .setDescriptionExclude(["Celebrities"])
            .setOutputCol("analysis_results")
            .setImageBytesCol("content")
            .setErrorCol("image_analysis_error")
            .setConcurrency(cog_svc_concurrency)
            # NOTE(review): the next line is a template placeholder, presumably replaced
            # before the notebook runs (e.g. with an endpoint setter) — confirm.
            <<SYNAPSE_ML_ANALYZE_IMAGE_ENDPOINT_CMD>>
            )

    # Adds the analysis output and error columns to the image table
    df_enriched_images = analysis.transform(df_image_contents)

# Use the ReadImage transformer to OCR any text within each image
with tracer.span(name="Read Images with Cognitive Services"):
        # Same per-row key and image bytes columns as the analysis step. Results go
        # to "read_results" and errors to "read_error".
        read = (ReadImage()
                .setLocation(config.location)
                .setSubscriptionKeyCol(image_analytics_key_col)
                .setImageBytesCol("content")
                .setOutputCol("read_results")
                .setErrorCol("read_error")
                .setConcurrency(cog_svc_concurrency)
                # presumably keeps the job running when a request exhausts its retries — verify against SynapseML docs
                .setSuppressMaxRetriesExceededException(True)
                # NOTE(review): the next line is a template placeholder, presumably replaced
                # before the notebook runs (e.g. with an endpoint setter) — confirm.
                <<SYNAPSE_ML_READ_IMAGE_ENDPOINT_CMD>>
                )
        # Runs on the already-analysed frame, so both result sets end up in one DataFrame
        df_enriched_images = read.transform(df_enriched_images)

In [ ]:
with tracer.span(name='Persist enriched images as table'):
    enriched_image_tbl_name = f'{batch_num}_enriched_images'

    # The raw image bytes are not persisted with the enrichment results
    df_enriched_images = df_enriched_images.drop('content')
    df_enriched_images.write.mode("overwrite").parquet(f'{minted_tables_output_path}{enriched_image_tbl_name}')

    # Create an external table over the parquet output unless sys.tables already
    # lists a table with this name.
    # NOTE(review): the table name comes from batch_num and is interpolated into the
    # statement; identifiers cannot be bound as query parameters, so batch_num must
    # come from a trusted source.
    # NOTE(review): LOCATION is hard-coded to 'minted_tables/...' while the parquet
    # files are written under minted_tables_output_path — confirm the two agree.
    sql_command = f'''
        IF NOT EXISTS (SELECT * FROM sys.tables WHERE name = '{enriched_image_tbl_name}') 
        CREATE EXTERNAL TABLE [{enriched_image_tbl_name}] (
            [path] nvarchar(1000), 
            [modificationTime] datetime2(7), 
            [length] bigint,
            [file_name] nvarchar(1000), 
            [file_type] nvarchar(1000), 
            [image_analytics_key] nvarchar(1000),
            [image_analysis_error] varchar(max),
            [analysis_results] varchar(max),
            [read_error] varchar(max),
            [read_results] varchar(max),
        )
        WITH (
            LOCATION = 'minted_tables/{enriched_image_tbl_name}/**', 
            DATA_SOURCE = [synapse_<<STORAGE_ACCOUNT_NAME>>_dfs_core_windows_net], 
            FILE_FORMAT = [SynapseParquetFormat]
        )
    '''
    conn = pyodbc.connect('DRIVER='+driver+';SERVER=tcp:'+serverless_sql_endpoint+';PORT=1433;DATABASE='+database+';UID='+sql_user_name+';PWD='+ sql_user_pwd)
    try:
        # `with conn` commits on success but does not close the connection
        with conn:
            with conn.cursor() as cursor:
                cursor.execute(sql_command)
    finally:
        # Release the connection explicitly so it is not left open
        conn.close()


# Summary of this run, including the name of the newly written table
output = {
    'custom_dimensions': dict(
        batch_num=batch_num,
        enriched_image_tbl_name=enriched_image_tbl_name,
        file_system=file_system,
        notebook_name=mssparkutils.runtime.context['notebookname'],
    )
}

# Log the summary, then hand it back to the pipeline as the notebook's exit value
logger.info(
    f"{mssparkutils.runtime.context['notebookname']}: OUTPUT",
    extra=output,
)
mssparkutils.notebook.exit(output['custom_dimensions'])