In [ ]:
%%configure -f
{
"conf": {
     "spark.dynamicAllocation.disableIfMinMaxNotSpecified.enabled": true,
     "spark.dynamicAllocation.enabled": true,
     "spark.dynamicAllocation.minExecutors": 2,
     "spark.dynamicAllocation.maxExecutors": 4
   }
}

In [ ]:
# Notebook parameters. Every value defaults to an empty string here;
# presumably the calling pipeline overrides them at run time — TODO confirm.
image_contents_tbl_name = ''  # parquet table read when loading the images table
batch_root = ''  # folder that contains this batch's config.json
batch_num = ''  # prefix of the output table name; also logged as a custom dimension
file_system = ''  # value written to Hadoop's fs.defaultFS before config.json is read
minted_tables_output_path = ""  # path prefix for the parquet tables read and written below

In [ ]:
import numpy as np
import io
import pandas as pd
import ntpath
import os

from pyspark.sql.types import StringType
from pyspark.sql.functions import col, pandas_udf, lit, struct, PandasUDFType, udf
import pyspark.sql.types as Types
from pyspark.ml.linalg import Vectors, VectorUDT

from PIL import Image
from tensorflow.keras.applications.inception_v3 import InceptionV3, preprocess_input, decode_predictions

In [ ]:
# Initiate logging
import logging
from opencensus.ext.azure.log_exporter import AzureLogHandler
from opencensus.ext.azure.trace_exporter import AzureExporter
from opencensus.trace import config_integration
from opencensus.trace.samplers import AlwaysOnSampler
from opencensus.trace.tracer import Tracer

# Register the 'logging' integration with OpenCensus tracing.
config_integration.trace_integrations(['logging'])

# The connection string is read as a secret rather than being stored in the notebook.
instrumentation_connection_string = mssparkutils.credentials.getSecretWithLS("keyvault", "AppInsightsConnectionString")

# Notebook logger: records are exported through AzureLogHandler, starting at INFO level.
logger = logging.getLogger(__name__)
logger.addHandler(AzureLogHandler(connection_string=instrumentation_connection_string))
logger.setLevel(logging.INFO)

# Tracer used by the `with tracer.span(...)` blocks in later cells; AlwaysOnSampler
# is configured so that no span is sampled out.
tracer = Tracer(
    exporter=AzureExporter(
        connection_string=instrumentation_connection_string
    ),
    sampler=AlwaysOnSampler()
)

# Spool parameters: the run-time inputs are attached to the INITIALISED log record
# as custom dimensions.
run_time_parameters = {'custom_dimensions': {
    'image_contents_tbl_name': image_contents_tbl_name,
    'batch_root': batch_root,
    'batch_num': batch_num,
    'notebook_name': mssparkutils.runtime.context['notebookname']
} }
  
logger.info(f"{mssparkutils.runtime.context['notebookname']}: INITIALISED", extra=run_time_parameters)

In [ ]:
import json
import os
import random
import uuid
from types import SimpleNamespace

import pyspark.sql.functions as F
from pyspark.sql.functions import col
from pyspark.sql.types import StringType, StructType, StructField
from pyspark import SparkContext
from pyspark.sql import SparkSession

# Initialise session and config
# NOTE(review): `spark` is read here before it is assigned on the next statement, so
# this relies on the runtime having already defined `spark` — confirm.
sc = spark.sparkContext
# NOTE(review): the app name interpolates the whole runtime context object, whereas the
# rest of the notebook uses context['notebookname'] — confirm this is intended.
spark = SparkSession.builder.appName(f"ImageProcessing {mssparkutils.runtime.context}").getOrCreate()

def read_batch_config(batch_root: str):
    """
    Read ``{batch_root}/config.json`` and return it as nested ``SimpleNamespace`` objects.

    We read the config file using the Java File System API as we do not need to let
    multiple nodes read individual lines and join it all back together again.

    Side effect: Hadoop's ``fs.defaultFS`` is set to the module-level ``file_system``
    value before the file is looked up.

    Args:
        batch_root: Folder that contains the batch's ``config.json``.

    Returns:
        The parsed JSON document; every JSON object is turned into a ``SimpleNamespace``.

    Raises:
        FileNotFoundError: If ``config.json`` does not exist under ``batch_root``.
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    hadoop_conf = sc._jsc.hadoopConfiguration()

    # Change our file system from 'synapse' to 'input'
    hadoop_conf.set("fs.defaultFS", file_system)

    fs = sc._jvm.org.apache.hadoop.fs.FileSystem.get(hadoop_conf)
    config_path = sc._jvm.org.apache.hadoop.fs.Path(f'{batch_root}/config.json')

    # There is no fallback config: a missing file is logged and reported explicitly
    # instead of surfacing later as an opaque JVM error from fs.open().
    if not fs.exists(config_path):
        logger.error(f'{config_path} not found.')
        raise FileNotFoundError(f'{config_path} not found.')

    # Open our file directly rather than through spark
    input_stream = fs.open(config_path)  # FSDataInputStream
    try:
        config_string = sc._jvm.java.io.BufferedReader(
            sc._jvm.java.io.InputStreamReader(input_stream, sc._jvm.java.nio.charset.StandardCharsets.UTF_8)
        ).lines().collect(sc._jvm.java.util.stream.Collectors.joining("\n"))
    finally:
        # Release the underlying stream even when reading fails
        input_stream.close()

    # Load it into json
    return json.loads(config_string, object_hook=lambda dictionary: SimpleNamespace(**dictionary))

with tracer.span(name=f"Load config: {mssparkutils.runtime.context['notebookname']}"):
    # A failed read is logged together with its traceback and then propagated
    try:
        config = read_batch_config(batch_root)
    except Exception as load_error:
        logger.exception(load_error)
        raise load_error

    # "INFO" is the only level honoured from the config; any other value is forced to ERROR
    if config.log_level != "INFO":
        logger.setLevel(logging.ERROR)
        config.log_level = "ERROR"
    else:
        logger.setLevel(logging.INFO)

    # Custom dimensions attached to the JOB_CONFIG log record
    job_config_parameters = {
        'custom_dimensions': {
            'batch_num': batch_num,
            'notebook_name': mssparkutils.runtime.context['notebookname'],
        },
    }

    logger.info(f"{mssparkutils.runtime.context['notebookname']}: JOB_CONFIG", extra=job_config_parameters)

In [ ]:
import pyodbc
from pyspark.sql.functions import col
# serverless SQL config: database name and ODBC driver that are concatenated into the
# pyodbc connection string when the external table is created.
database = 'minted'   
driver= '{ODBC Driver 17 for SQL Server}'

# secrets: SQL user name, password and serverless endpoint are read with
# getSecretWithLS so they are not stored in the notebook.
sql_user_name = mssparkutils.credentials.getSecretWithLS("keyvault", "SynapseSQLUserName")
sql_user_pwd = mssparkutils.credentials.getSecretWithLS("keyvault", "SynapseSQLPassword")
serverless_sql_endpoint = mssparkutils.credentials.getSecretWithLS("keyvault", "SynapseServerlessSQLEndpoint")

In [ ]:
with tracer.span(name='Load images table'):
    # Read the image contents parquet table and keep only the four columns used downstream
    images_df = (
        spark.read
        .parquet(f'{minted_tables_output_path}{image_contents_tbl_name}')
        .select('path', 'file_name', 'file_type', 'content')
    )
    # images_df = spark.sql("SELECT path, file_name, file_type, content FROM " + image_contents_tbl_name)

In [ ]:
def get_filename(row):
    """Return the last component of the path ``row``, split with ``ntpath`` rules."""
    _, tail = ntpath.split(row)
    return tail

def preprocess(img_data):
    """
    Decode encoded image bytes into an array ready for InceptionV3.

    The image is converted to RGB, resized to 299x299 and handed to
    ``preprocess_input``. When the bytes cannot be decoded, an all-zero image of
    the same shape and dtype is substituted so that one unreadable file does not
    abort the whole batch.

    Args:
        img_data: Content of an image file as bytes.

    Returns:
        The result of ``preprocess_input`` applied to a (299, 299, 3) float32 array.
    """
    try:
        img = Image.open(io.BytesIO(img_data)).convert('RGB')
        img = img.resize((299, 299))
        x = np.asarray(img, dtype="float32")
    except Exception:
        # Deliberate best-effort fallback. `Exception` (not a bare except) keeps
        # KeyboardInterrupt/SystemExit propagating, and the placeholder uses the same
        # dtype as the success path so a batch never mixes float32 and float64 arrays.
        x = np.zeros((299, 299, 3), dtype="float32")
    return preprocess_input(x)

def keras_model_udf(model_fn):
    """
    Wrap a Keras model factory in an iterator-style pandas UDF.

    Args:
        model_fn: Zero-argument callable returning the model. It is called once per
            run of the inner generator, before the first batch is consumed.

    Returns:
        A pandas UDF whose declared result is a struct with the fields
        ``class``, ``desc``, ``score`` and ``inceptionv3``.
    """
    def predict(image_batch_iter):
        # The model is built once and reused for every batch of this iterator
        model = model_fn()
        for img_series in image_batch_iter:
            processed_images = np.array([preprocess(raw) for raw in img_series])
            predictions = model.predict(processed_images, batch_size=64)
            # top=1 gives a one-element list per image; keep just that element
            top_labels = [decoded[0] for decoded in decode_predictions(predictions, top=1)]
            # One row per image: the decoded label tuple followed by its full prediction vector
            rows = [label + (vector,) for label, vector in zip(top_labels, predictions)]
            yield pd.DataFrame(rows)

    return pandas_udf(
        "class: string, desc: string, score:float, inceptionv3: array<float>",
        PandasUDFType.SCALAR_ITER,
    )(predict)

def inceptionv3_fn():
    """
    Build an InceptionV3 model and load the broadcast weights into it.

    Returns:
        An InceptionV3 model whose weights come from ``bc_model_weights.value``.
    """
    # weights=None builds the architecture only. The ImageNet weights would be
    # overwritten on the next line anyway, so loading (and, when not cached,
    # downloading) them on every call is wasted work.
    model = InceptionV3(weights=None)
    model.set_weights(bc_model_weights.value)
    return model

In [ ]:
with tracer.span(name='Extract features and predict InceptionV3'):
    # Instantiate InceptionV3 here and broadcast its weights; inceptionv3_fn reads them
    # back through bc_model_weights.value when it rebuilds the model.
    model = InceptionV3()
    bc_model_weights = sc.broadcast(model.get_weights())
    inceptionv3_udf = keras_model_udf(inceptionv3_fn)
    # Adds a "preds" column computed from the image bytes held in "content"
    predictions = images_df.withColumn("preds", inceptionv3_udf(col("content")))

In [ ]:
with tracer.span(name='Build vectors dataframe'):
    # Converts an array column into a dense ML vector column
    list_to_vector_udf = udf(lambda l: Vectors.dense(l), VectorUDT())
    # Keep the original image columns, flatten the two "preds" fields that are used, and
    # add "features": the same inceptionv3 array expressed as a vector.
    df_with_vectors = predictions.select(
        predictions["path"],
        predictions["content"],
        predictions["file_name"],
        predictions["file_type"],
        predictions["preds.desc"], 
        predictions["preds.inceptionv3"], 
        list_to_vector_udf(predictions["preds.inceptionv3"]).alias("features")
    )
with tracer.span(name='Persist to feature image table'):
    # The output table is named after the batch; an existing parquet output is overwritten.
    image_features_tbl_name = f'{batch_num}_image_features'
    df_with_vectors.write.mode("overwrite").parquet(f'{minted_tables_output_path}{image_features_tbl_name}')
    # Create the external table over the parquet files only if sys.tables has no table
    # with that name yet.
    # NOTE(review): LOCATION hard-codes 'minted_tables/' while the parquet path above uses
    # minted_tables_output_path — confirm both point at the same folder.
    # NOTE(review): the table name is interpolated into the SQL text from batch_num;
    # confirm batch_num only ever comes from a trusted source.
    sql_command = f'''
        IF NOT EXISTS (SELECT * FROM sys.tables WHERE name = '{image_features_tbl_name}') 
        CREATE EXTERNAL TABLE [{image_features_tbl_name}] (
            [path] nvarchar(1000), 
            [content] varbinary(max), 
            [file_name] nvarchar(1000), 
            [file_type] nvarchar(1000), 
            [desc] nvarchar(4000), 
            [inceptionv3] varchar(max),
            [features] varchar(max)
        )
        WITH (
            LOCATION = 'minted_tables/{image_features_tbl_name}/**', 
            DATA_SOURCE = [synapse_<<STORAGE_ACCOUNT_NAME>>_dfs_core_windows_net], 
            FILE_FORMAT = [SynapseParquetFormat]
        )
    '''
    # NOTE(review): per the pyodbc documentation, leaving a connection's `with` block commits
    # but does not close the connection — confirm whether an explicit close is wanted.
    with pyodbc.connect('DRIVER='+driver+';SERVER=tcp:'+serverless_sql_endpoint+';PORT=1433;DATABASE='+database+';UID='+sql_user_name+';PWD='+ sql_user_pwd) as conn:
        with conn.cursor() as cursor:
            cursor.execute(sql_command)

# return name of new table
# image_features_tbl_name is assigned inside the 'Persist to feature image table' span.
output = {'custom_dimensions': {
    'batch_num': batch_num,
    'image_features_tbl_name': image_features_tbl_name,
    'notebook_name': mssparkutils.runtime.context['notebookname']
} }

# Return the object to the pipeline: the full dict goes to the log record, while only
# the inner 'custom_dimensions' mapping is passed to notebook.exit.
logger.info(f"{mssparkutils.runtime.context['notebookname']}: OUTPUT", extra=output)
mssparkutils.notebook.exit(output['custom_dimensions'])