In [ ]:
%%configure -f
{
"conf": {
     "spark.dynamicAllocation.disableIfMinMaxNotSpecified.enabled": true,
     "spark.dynamicAllocation.enabled": true,
     "spark.dynamicAllocation.minExecutors": 2,
     "spark.dynamicAllocation.maxExecutors": 4
   }
}

In [ ]:
import numpy as np
import io
import pandas as pd
import ntpath
import os

from pyspark.sql.types import StringType
from pyspark.sql.functions import col, pandas_udf, lit, struct, PandasUDFType, udf
import pyspark.sql.types as Types
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.ml.feature import PCA
from pyspark.ml.clustering import KMeans, BisectingKMeans
from pyspark.ml import Pipeline
from sklearn.manifold import TSNE

from PIL import Image
from tensorflow.keras.applications.inception_v3 import InceptionV3, preprocess_input, decode_predictions


In [ ]:
# Notebook parameters. Every value defaults to an empty string here;
# presumably the calling pipeline overrides them at run time — TODO confirm.
# Name of the parquet table, relative to minted_tables_output_path, that holds the image feature vectors.
image_features_tbl_name = ''
# Root folder of the batch; '<batch_root>/config.json' is read as the batch config.
batch_root = ''
# Batch identifier; used as the prefix of the clustered image table name.
batch_num = ''
# Value written to Hadoop's fs.defaultFS before the batch config is read.
file_system = ''
# Key of the job section inside the batch config; its last character is appended to the output table name.
image_clustering_config = ''
# NOTE(review): the next two are not referenced by the visible notebook code — confirm whether they are still needed.
blob_account_name = ''
azure_storage_domain = ''
# Path prefix under which the feature table is read and the clustered image table is written.
minted_tables_output_path = ""

In [ ]:
# Initiate logging
import logging
from opencensus.ext.azure.log_exporter import AzureLogHandler
from opencensus.ext.azure.trace_exporter import AzureExporter
from opencensus.trace import config_integration
from opencensus.trace.samplers import AlwaysOnSampler
from opencensus.trace.tracer import Tracer

# Enable the OpenCensus integration for the standard `logging` module.
config_integration.trace_integrations(['logging'])

# Application Insights connection string, read from the secret store.
instrumentation_connection_string = mssparkutils.credentials.getSecretWithLS("keyvault", "AppInsightsConnectionString")

logger = logging.getLogger(__name__)
# logging.getLogger() returns the same logger object on every call, so
# re-running this cell would otherwise stack another AzureLogHandler and
# export every record more than once. Attach the handler only if absent.
if not any(isinstance(handler, AzureLogHandler) for handler in logger.handlers):
    logger.addHandler(AzureLogHandler(connection_string=instrumentation_connection_string))
logger.setLevel(logging.INFO)

# Tracer used for the `tracer.span(...)` blocks in the rest of the notebook;
# AlwaysOnSampler means every span is sampled.
tracer = Tracer(
    exporter=AzureExporter(
        connection_string=instrumentation_connection_string
    ),
    sampler=AlwaysOnSampler()
)

# Spool parameters: the run-time parameters are attached to the log record
# as custom dimensions.
run_time_parameters = {'custom_dimensions': {
    'image_features_tbl_name': image_features_tbl_name,
    'batch_root': batch_root,
    'batch_num': batch_num,
    'notebook_name': mssparkutils.runtime.context['notebookname']
} }

logger.info(f"{mssparkutils.runtime.context['notebookname']}: INITIALISED", extra=run_time_parameters)

In [ ]:
import json
import os
import random
import uuid
from types import SimpleNamespace

import pyspark.sql.functions as F
from pyspark.sql.functions import col
from pyspark.sql.types import StringType, StructType, StructField, IntegerType, FloatType
from pyspark import SparkContext
from pyspark.sql import SparkSession

# Initialise session and config
# Resolve the session first and derive the context from it, so `sc` does not
# depend on a `spark` variable having been defined before this cell ran.
spark = SparkSession.builder.appName(f"ImageProcessing {mssparkutils.runtime.context}").getOrCreate()
sc = spark.sparkContext

def read_batch_config(batch_root: str):
    """
    Read ``<batch_root>/config.json`` and return it as nested ``SimpleNamespace`` objects.

    We read the config file using the Java File System API as we do not need
    to let multiple nodes read individual lines and join it all back together
    again.

    Side effect: sets ``fs.defaultFS`` on the Hadoop configuration to the
    notebook-level ``file_system`` value.

    Args:
        batch_root: Folder that contains the batch's ``config.json``.

    Returns:
        The parsed JSON document, with every JSON object converted to a
        ``SimpleNamespace``.

    Raises:
        FileNotFoundError: If ``config.json`` does not exist under ``batch_root``.
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    hadoop_conf = sc._jsc.hadoopConfiguration()

    # Point the default file system at the one that holds the batch input.
    hadoop_conf.set("fs.defaultFS", file_system)

    fs = sc._jvm.org.apache.hadoop.fs.FileSystem.get(hadoop_conf)
    config_path = sc._jvm.org.apache.hadoop.fs.Path(f'{batch_root}/config.json')

    # There is no fallback config: fail here with a clear message instead of
    # letting fs.open() fail on the missing file.
    if not fs.exists(config_path):
        logger.error(f'{config_path} not found.')
        raise FileNotFoundError(f'{config_path} not found.')

    # Open our file directly rather than through spark
    input_stream = fs.open(config_path)  # FSDataInputStream
    try:
        config_string = sc._jvm.java.io.BufferedReader(
            sc._jvm.java.io.InputStreamReader(input_stream, sc._jvm.java.nio.charset.StandardCharsets.UTF_8)
            ).lines().collect(sc._jvm.java.util.stream.Collectors.joining("\n"))
    finally:
        # Always release the underlying stream, even if reading fails.
        input_stream.close()

    # Load it into json; object_hook turns each JSON object into a SimpleNamespace
    return json.loads(config_string, object_hook=lambda dictionary: SimpleNamespace(**dictionary))

with tracer.span(name=f"Load config: {mssparkutils.runtime.context['notebookname']}"):
    try:
        config = read_batch_config(batch_root)
        # Pick the job section named by the notebook parameter; an unknown
        # section name raises KeyError, which is logged and re-raised below.
        job_config = vars(config)[image_clustering_config]
    except Exception as e:
        logger.exception(e)
        # Bare raise re-raises the active exception unchanged.
        raise

    # Set log level: anything other than "INFO" — including a config that has
    # no log_level entry at all — is treated as ERROR.
    if getattr(config, 'log_level', None) == "INFO":
        logger.setLevel(logging.INFO)
    else:
        logger.setLevel(logging.ERROR)
        config.log_level = "ERROR"

    # Job settings attached to the log record as custom dimensions.
    job_config_parameters = {'custom_dimensions': {
        'batch_num': batch_num,
        'k': job_config.k,
        'pca1': job_config.pca1,
        'perplexity': job_config.perplexity,
        'algorithm' : job_config.algorithm,
        'max_iter': job_config.max_iter,
        'notebook_name': mssparkutils.runtime.context['notebookname']
    } }

    logger.info(f"{mssparkutils.runtime.context['notebookname']}: JOB_CONFIG", extra=job_config_parameters)

In [ ]:
import pyodbc
from pyspark.sql.functions import col
# Connection settings for the serverless SQL endpoint
driver = '{ODBC Driver 17 for SQL Server}'
database = 'minted'

# SQL login and endpoint address, all read through the same secret lookup
get_secret = mssparkutils.credentials.getSecretWithLS
serverless_sql_endpoint = get_secret("keyvault", "SynapseServerlessSQLEndpoint")
sql_user_name = get_secret("keyvault", "SynapseSQLUserName")
sql_user_pwd = get_secret("keyvault", "SynapseSQLPassword")

In [ ]:
with tracer.span(name='Load images table'):
    # Read the feature table and keep only the columns used downstream.
    wanted_columns = ['path', 'file_name', 'features', 'inceptionv3', 'desc', 'content']
    df_with_vectors = spark.read.parquet(f'{minted_tables_output_path}{image_features_tbl_name}').select(*wanted_columns)

    # Clustering fails on a data set with a single record (as the current
    # test data has), so with fewer than two records we write an empty result
    # table and leave the notebook here.
    if df_with_vectors.count() < 2:
        empty_schema = StructType([
            StructField(field_name, field_type())
            for field_name, field_type in (
                ('original_uri', StringType),
                ('file_name', StringType),
                ('Explanations', StringType),
                ('cluster', IntegerType),
                ('X', FloatType),
                ('Y', FloatType),
            )
        ])
        df = spark.createDataFrame(data=spark.sparkContext.emptyRDD(), schema=empty_schema)

        clustered_image_tbl_name = f'{batch_num}_clustered_image_{image_clustering_config[-1]}'
        df.write.mode("overwrite").parquet(f'{minted_tables_output_path}{clustered_image_tbl_name}')

        # Hand the name of the new table (plus the reason for stopping early)
        # back to the caller.
        early_exit_dimensions = {
            'batch_num': batch_num,
            'clustered_image_tbl_name': clustered_image_tbl_name,
            'notebook_name': mssparkutils.runtime.context['notebookname'],
            'error': "Not enough records to cluster"
        }
        output = {'custom_dimensions': early_exit_dimensions}
        logger.info(f"{mssparkutils.runtime.context['notebookname']}: OUTPUT", extra=output)
        mssparkutils.notebook.exit(early_exit_dimensions)


In [ ]:
with tracer.span(name='Create image clustering pipeline'):
    k = int(job_config.k)
    max_iter = int(job_config.max_iter)

    # Stage 1: project the raw feature vectors onto `pca1` principal components.
    pca_1 = PCA(k=int(job_config.pca1), inputCol="features", outputCol="pca_features")

    # Stage 2: cluster the PCA output. featuresCol must be set explicitly:
    # both estimators default to "features", which would bypass the PCA stage
    # and cluster the raw vectors instead. max_iter is applied to both
    # algorithms so the configured value is never silently ignored.
    if job_config.algorithm == "kmeans":
        kmeans = KMeans(k=k, seed=42, initMode="k-means||", distanceMeasure="cosine",
                        maxIter=max_iter, featuresCol="pca_features")
    else:
        kmeans = BisectingKMeans(k=k, seed=42, distanceMeasure="cosine",
                                 maxIter=max_iter, featuresCol="pca_features")
    pipeline = Pipeline(stages=[pca_1, kmeans])

with tracer.span(name='Fit image pipeline'):
    model = pipeline.fit(df_with_vectors)

with tracer.span(name='Transform image pipeline'):
    results = model.transform(df_with_vectors)

with tracer.span(name='Convert to pandas image dataframe'):
    # Drop the columns that are not part of the output before collecting, so
    # they are never shipped to the driver just to be discarded there.
    pandas_df = results.drop("content", "inceptionv3", "pca_features").toPandas()

with tracer.span(name='T-SNE dimensionality reduction'):
    # One row per image: convert each Spark vector to a NumPy row and build a 2-D array.
    features = np.array([vector.toArray() for vector in pandas_df['features']])
    # scikit-learn requires perplexity < number of samples; clamp the
    # configured value so very small batches still produce an embedding.
    perplexity = min(int(job_config.perplexity), len(features) - 1)
    tsne = TSNE(verbose=1, perplexity=perplexity)
    X_embedded = tsne.fit_transform(features)

with tracer.span(name='Clean up image dataframe'):
    # Rename to the column names of the clustered image table and attach the
    # two embedding coordinates.
    pandas_df.rename(columns = {'path':'original_uri', 'prediction':'cluster', 'desc': 'Explanations'}, inplace = True)
    pandas_df['X'] = X_embedded[:,0]
    pandas_df['Y'] = X_embedded[:,1]
    # The feature vectors were only needed as t-SNE input.
    pandas_df.drop(columns = ["features"], inplace = True)

with tracer.span(name='Move back to spark RDD'):
    df = spark.createDataFrame(pandas_df)

with tracer.span(name='Persist to clustered image table'):
    # Write the clustered images as parquet, replacing any previous output for this table name.
    clustered_image_tbl_name = f'{batch_num}_clustered_image_' + str(image_clustering_config[-1])
    df.write.mode("overwrite").parquet(f'{minted_tables_output_path}{clustered_image_tbl_name}')

    # The table name is built from notebook parameters and identifiers in DDL
    # cannot be bound as query parameters, so escape it for the two contexts
    # it appears in: single-quoted string literal and [bracketed] identifier.
    tbl_name_literal = clustered_image_tbl_name.replace("'", "''")
    tbl_name_identifier = clustered_image_tbl_name.replace("]", "]]")

    # Register the parquet folder as an external table unless a table with
    # that name already exists.
    ext_table_command = f"""
        IF NOT EXISTS (SELECT * FROM sys.tables WHERE name = '{tbl_name_literal}') 
        CREATE EXTERNAL TABLE [{tbl_name_identifier}] (
            [original_uri] nvarchar(4000),
            [file_name] nvarchar(4000),
            [Explanations] nvarchar(4000),
            [cluster] bigint,
            [X] float,
            [Y] float
        ) 
        WITH (
            LOCATION = 'minted_tables/{tbl_name_literal}/**', 
            DATA_SOURCE = [synapse_<<STORAGE_ACCOUNT_NAME>>_dfs_core_windows_net], 
            FILE_FORMAT = [SynapseParquetFormat]
        )
    """
    # pyodbc's connection context manager commits on exit but does not close
    # the connection, so commit and close explicitly.
    conn = pyodbc.connect(f'DRIVER={driver};SERVER=tcp:{serverless_sql_endpoint};PORT=1433;DATABASE={database};UID={sql_user_name};PWD={sql_user_pwd}')
    try:
        with conn.cursor() as cursor:
            cursor.execute(ext_table_command)
        conn.commit()
    finally:
        conn.close()

# Result handed back to the caller: the batch, the table that was written and
# the notebook that produced it.
result_dimensions = {
    'batch_num': batch_num,
    'clustered_image_tbl_name': clustered_image_tbl_name,
    'notebook_name': mssparkutils.runtime.context['notebookname']
}
output = {'custom_dimensions': result_dimensions}

# Log the result, then end the notebook run with it as the exit value.
logger.info(f"{mssparkutils.runtime.context['notebookname']}: OUTPUT", extra=output)
mssparkutils.notebook.exit(result_dimensions)