In [ ]:
%%configure -f
{
"conf": {
     "spark.dynamicAllocation.disableIfMinMaxNotSpecified.enabled": true,
     "spark.dynamicAllocation.enabled": true,
     "spark.dynamicAllocation.minExecutors": 2,
     "spark.dynamicAllocation.maxExecutors": 8
   }
}

In [ ]:
# Notebook parameters. Every value defaults to an empty string here; presumably the
# calling pipeline overrides them at run time (TODO confirm).
media_tbl_name = ""  # name of the parquet table, appended to minted_tables_output_path, that lists the media files
batch_root = ""  # folder expected to contain this batch's config.json
batch_num = ""  # batch identifier: matched against batch_status.batch_id and used as prefix of the output table name
file_system = ""  # value assigned to Hadoop's fs.defaultFS before config.json is read
azure_storage_domain = ""  # not referenced in the visible part of this notebook
blob_account_name = ""  # not referenced in the visible part of this notebook
minted_tables_output_path = ""  # prefix concatenated directly with table names (no separator is inserted)

In [ ]:
# Initiate logging
import logging
from opencensus.ext.azure.log_exporter import AzureLogHandler
from opencensus.ext.azure.trace_exporter import AzureExporter
from opencensus.trace import config_integration
from opencensus.trace.samplers import AlwaysOnSampler
from opencensus.trace.tracer import Tracer

# Enable the OpenCensus integration for the standard logging module.
config_integration.trace_integrations(['logging'])

# The Application Insights connection string is fetched from the "keyvault" linked service.
instrumentation_connection_string = mssparkutils.credentials.getSecretWithLS(
    "keyvault",
    "AppInsightsConnectionString",
)

# Notebook-wide logger that also ships records to Azure.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
logger.addHandler(
    AzureLogHandler(connection_string=instrumentation_connection_string)
)

# Tracer used for the `with tracer.span(...)` sections below; every span is sampled.
tracer = Tracer(
    exporter=AzureExporter(connection_string=instrumentation_connection_string),
    sampler=AlwaysOnSampler(),
)

# Run-time parameters attached to log records as custom dimensions.
run_time_parameters = {
    'custom_dimensions': {
        'batch_root': batch_root,
        'batch_num': batch_num,
        'file_system': file_system,
        'media_tbl_name': media_tbl_name,
        'notebook_name': mssparkutils.runtime.context['notebookname'],
    },
}

logger.info(
    f"{mssparkutils.runtime.context['notebookname']}: INITIALISED",
    extra=run_time_parameters,
)

In [ ]:
import json
import random
from types import SimpleNamespace
from typing import List

import pyspark.sql.functions as F
from pyspark.sql.functions import col
from pyspark.sql.types import StringType, StructType, StructField
from pyspark import SparkContext
from pyspark.sql import SparkSession

def read_batch_config(batch_root: str):
    """
    Read ``{batch_root}/config.json`` and return it as nested ``SimpleNamespace`` objects.

    We read the config file using the Java File System API as we do not need to let
    multiple nodes read individual lines and join it all back together again.

    Relies on the notebook-level names ``sc`` (Spark context), ``file_system`` and
    ``logger``. As a side effect, Hadoop's ``fs.defaultFS`` is set to ``file_system``.

    Args:
        batch_root: Folder that contains ``config.json``.

    Returns:
        The parsed JSON document, with every JSON object converted to a
        ``SimpleNamespace`` so that keys are reachable as attributes.

    Raises:
        FileNotFoundError: If ``config.json`` does not exist under ``batch_root``.
    """
    jvm = sc._jvm
    hadoop_conf = sc._jsc.hadoopConfiguration()

    # Change our file system from 'synapse' to 'input'
    hadoop_conf.set("fs.defaultFS", file_system)

    fs = jvm.org.apache.hadoop.fs.FileSystem.get(hadoop_conf)
    config_path = jvm.org.apache.hadoop.fs.Path(f'{batch_root}/config.json')

    # There is no fallback config: fail here with a clear error instead of letting
    # fs.open() raise an opaque JVM exception on the missing file.
    if not fs.exists(config_path):
        message = f'{config_path} not found.'
        logger.error(message)
        raise FileNotFoundError(message)

    # Open our file directly rather than through spark
    input_stream = fs.open(config_path)  # FSDataInputStream
    try:
        reader = jvm.java.io.BufferedReader(
            jvm.java.io.InputStreamReader(input_stream, jvm.java.nio.charset.StandardCharsets.UTF_8)
        )
        config_string = reader.lines().collect(jvm.java.util.stream.Collectors.joining("\n"))
    finally:
        # Always release the underlying stream, even if reading fails.
        input_stream.close()

    # Load it into json
    return json.loads(config_string, object_hook=lambda dictionary: SimpleNamespace(**dictionary))

with tracer.span(name='Initialise Spark session'):
    # `spark` is read here before it is assigned on the next line, so it must already
    # exist when this cell runs (presumably pre-defined by the notebook runtime — TODO confirm).
    sc = spark.sparkContext
    # NOTE(review): the app name interpolates the whole runtime context object, whereas the
    # rest of the notebook uses context['notebookname'] — confirm this is intended.
    spark = SparkSession.builder.appName(f"ImageProcessing {mssparkutils.runtime.context}").getOrCreate()

with tracer.span(name=f"Load config: {mssparkutils.runtime.context['notebookname']}"):
    try:
        config = read_batch_config(batch_root)
    except Exception as load_error:
        # Record the failure (with traceback) before propagating it.
        logger.exception(load_error)
        raise load_error

    # Only "INFO" is honoured; any other configured level is forced to ERROR
    # and the config object is updated to match.
    if config.log_level != "INFO":
        logger.setLevel(logging.ERROR)
        config.log_level = "ERROR"
    else:
        logger.setLevel(logging.INFO)

In [ ]:
import pyodbc
from pyspark.sql.functions import current_timestamp
# Database names and ODBC driver for the dedicated and serverless SQL pools.
dedicated_database = 'dedicated'
database = 'minted'
driver = '{ODBC Driver 17 for SQL Server}'

# SQL credentials and endpoints, each fetched from the "keyvault" linked service
# in the order the names are listed.
sql_user_name, sql_user_pwd, serverless_sql_endpoint, dedicated_sql_endpoint = (
    mssparkutils.credentials.getSecretWithLS("keyvault", secret_name)
    for secret_name in (
        "SynapseSQLUserName",
        "SynapseSQLPassword",
        "SynapseServerlessSQLEndpoint",
        "SynapseDedicatedSQLEndpoint",
    )
)

In [ ]:
from time import sleep
import datetime
# Update Status Table
def get_recent_status(batch_num, driver, dedicated_sql_endpoint, dedicated_database, sql_user_name, sql_user_pwd):
    """
    Fetch the most advanced status row recorded for a batch.

    Queries ``[dbo].[batch_status]`` for rows whose ``batch_id`` equals ``batch_num``
    and takes the one with the highest ``num_stages_complete``.

    Args:
        batch_num: Batch identifier, bound to the ``batch_id`` query parameter.
        driver: ODBC driver name used in the connection string.
        dedicated_sql_endpoint: Host name of the SQL endpoint to connect to.
        dedicated_database: Database name to connect to.
        sql_user_name: SQL login name.
        sql_user_pwd: SQL login password.

    Returns:
        Tuple ``(num_stages_complete, description)`` taken from the selected row.

    Raises:
        LookupError: If no ``batch_status`` row exists for ``batch_num``.
    """
    query = """
        SELECT TOP (1)
        [num_stages_complete], [description]
        FROM [dbo].[batch_status]
        WHERE [batch_id] = ?
        ORDER BY [num_stages_complete] DESC;
    """
    connection_string = (
        f'DRIVER={driver};SERVER=tcp:{dedicated_sql_endpoint};PORT=1433;'
        f'DATABASE={dedicated_database};UID={sql_user_name};PWD={sql_user_pwd}'
    )
    with pyodbc.connect(connection_string, autocommit=True) as conn:
        with conn.cursor() as cursor:
            cursor.execute(query, batch_num)
            row = cursor.fetchone()
            # fetchone() yields None when nothing matched; report that explicitly
            # rather than failing on the tuple unpacking below.
            if row is None:
                raise LookupError(f'No batch_status row found for batch_id {batch_num!r}')
            num_stages_complete, description = row
            return num_stages_complete, description

def update_status_table(status_text, minted_tables_path, batch_num, driver, dedicated_sql_endpoint, sql_user_name, sql_user_pwd, max_retries=10, retry_delay=3):
    """
    Advance the batch's stage counter by one and record a new status message.

    Reads the current ``num_stages_complete`` through ``get_recent_status``, increments
    it, and writes ``status``, ``update_time_stamp`` and ``num_stages_complete`` back to
    ``batch_status`` for the row(s) whose ``batch_id`` equals ``batch_num``. Any exception
    raised while reading or writing triggers another attempt.

    The database name is taken from the notebook-level ``dedicated_database``.

    Args:
        status_text: Human-readable status, stored as ``[<stage>/10] <status_text>``.
        minted_tables_path: Unused; kept so existing callers keep working.
        batch_num: Batch identifier, bound to the ``batch_id`` query parameter.
        driver: ODBC driver name used in the connection string.
        dedicated_sql_endpoint: Host name of the SQL endpoint to connect to.
        sql_user_name: SQL login name.
        sql_user_pwd: SQL login password.
        max_retries: Maximum number of attempts before giving up.
        retry_delay: Seconds to wait between attempts.

    Raises:
        ValueError: If ``max_retries`` is smaller than 1.
        Exception: The exception raised by the last failed attempt.
    """
    if max_retries < 1:
        raise ValueError('max_retries must be at least 1')

    sql_command = "UPDATE batch_status SET status = ?, update_time_stamp = ?, num_stages_complete = ? WHERE batch_id = ?"
    connection_string = (
        f'DRIVER={driver};SERVER=tcp:{dedicated_sql_endpoint};PORT=1433;'
        f'DATABASE={dedicated_database};UID={sql_user_name};PWD={sql_user_pwd}'
    )

    last_exc = None
    for attempt in range(1, max_retries + 1):
        try:
            stages_complete, _ = get_recent_status(batch_num, driver, dedicated_sql_endpoint, dedicated_database, sql_user_name, sql_user_pwd)
            stages_complete += 1
            status = f'[{stages_complete}/10] {status_text}'
            time_stamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

            # autocommit=True commits the UPDATE as soon as it executes,
            # so no explicit commit is needed.
            with pyodbc.connect(connection_string, autocommit=True) as conn:
                with conn.cursor() as cursor:
                    cursor.execute(sql_command, status, time_stamp, stages_complete, batch_num)
            return
        except Exception as e:
            last_exc = e
            if attempt == max_retries:
                # Out of attempts: do not sleep again, fall through and re-raise.
                logger.warning(f'Failed to update status table: {e}, giving up after {max_retries} attempts')
                break
            logger.warning(f'Failed to update status table: {e}, retrying . . .')
            sleep(retry_delay)

    raise last_exc

# Record in the status table that this notebook's stage has begun.
update_status_table(
    'Media Prep Started',
    minted_tables_output_path,
    batch_num,
    driver,
    dedicated_sql_endpoint,
    sql_user_name,
    sql_user_pwd,
)

In [ ]:
with tracer.span(name='Get media contents'):
    # Load the media table so its contents can be used by downstream notebooks.
    media_df = spark.read.parquet(f'{minted_tables_output_path}{media_tbl_name}')

    # NOTE: an earlier, now disabled, variant read the binary content of the
    # *.avi, *.mp4, *.mp3, *.mpg, *.wmv, *.wav and *.mov files under batch_root and
    # inner-joined it to media_df on file_path. Only the metadata columns are kept now.
    media_content_df = media_df.select(
        col("file_path").alias("path"),
        col("file_name"),
        col("file_type"),
    )

## De-duplicate Media
For the MINTED 2.0 Accelerator, it is assumed that the media data provided is already de-duplicated.

In [ ]:
with tracer.span(name='De-duplicate media'):
    # Insert image de-duplication logic here, based on the types of image content being consumed.
    # Placeholder statement only: no de-duplication is performed in this span.
    temp = ''

In [ ]:
# Persist Media 
with tracer.span(name='Persist final media set to table'):
    media_contents_tbl_name = f"{batch_num}_media_contents"

    # Write the final media set as parquet, replacing any previous output.
    media_content_df.write.mode("overwrite").parquet(
        f'{minted_tables_output_path}{media_contents_tbl_name}'
    )

    # Create the external table over the parquet output unless one with that name exists.
    # NOTE(review): the table name is interpolated into the SQL text, so batch_num
    # must come from a trusted source.
    ext_table_command = (
        f"IF NOT EXISTS (SELECT * FROM sys.tables WHERE name = '{media_contents_tbl_name}') "
        f"CREATE EXTERNAL TABLE [{media_contents_tbl_name}] "
        "([path] nvarchar(4000), [file_name] nvarchar(4000), [file_type] nvarchar(4000)) "
        f"WITH (LOCATION = 'minted_tables/{media_contents_tbl_name}/**', "
        "DATA_SOURCE = [synapse_<<STORAGE_ACCOUNT_NAME>>_dfs_core_windows_net], "
        "FILE_FORMAT = [SynapseParquetFormat])"
    )
    with pyodbc.connect(
        f'DRIVER={driver};SERVER=tcp:{serverless_sql_endpoint};PORT=1433;DATABASE={database};UID={sql_user_name};PWD={sql_user_pwd}'
    ) as conn, conn.cursor() as cursor:
        cursor.execute(ext_table_command)

In [ ]:
# return values to be used by other notebooks
# Values handed back for use by other notebooks.
output = {
    'custom_dimensions': {
        'batch_num': batch_num,
        'media_contents_tbl_name': media_contents_tbl_name,
        'notebook_name': mssparkutils.runtime.context['notebookname'],
    },
}

# Log the output, then end the notebook run, returning the values to the pipeline.
logger.info(
    f"{mssparkutils.runtime.context['notebookname']}: OUTPUT",
    extra=output,
)
mssparkutils.notebook.exit(output['custom_dimensions'])