In [ ]:
%%configure -f
{
"conf": {
     "spark.dynamicAllocation.disableIfMinMaxNotSpecified.enabled": true,
     "spark.dynamicAllocation.enabled": true,
     "spark.dynamicAllocation.minExecutors": 2,
     "spark.dynamicAllocation.maxExecutors": 8
   }
}

In [ ]:
# This is a parameters cell where we define the batch_file details as params to be passed in by the pipeline.
# Every parameter defaults to an empty string; the trailing notes describe how
# each value is used by the cells visible in this notebook.
log_level = ''  # not read in the visible cells; the effective level comes from config.log_level
batch_num = ''  # prefix of the '<batch_num>_submitted_media' and '<batch_num>_enriched_media' table names
batch_root = ''  # folder from which '<batch_root>/config.json' is read
output_container = ''  # not referenced in the visible cells
file_system = ''  # applied as the Hadoop 'fs.defaultFS' before config.json is read
json_path = ''  # destination of the enrichment JSON written with mssparkutils.fs.put
video_id = ''  # id of the video whose index is requested from the Video Indexer API
state = ''  # not referenced in the visible cells
media_path = ''  # compared with the `path` column of the submitted-media table; also written to the outputs
media_file_name = ''  # written to the outputs as the media file name
batch_description = ''  # not referenced in the visible cells
azure_storage_domain = ''  # domain suffix used when building the abfss:// minted_tables path
blob_account_name = ''  # storage account name used when building the abfss:// minted_tables path

In [ ]:
from azure.identity import ClientSecretCredential
from azure.mgmt.keyvault import KeyVaultManagementClient
from pyspark.sql import SparkSession
import json
from types import SimpleNamespace
from requests.structures import CaseInsensitiveDict
import requests as req
from datetime import datetime

In [ ]:
# Initiate logging
import logging
from opencensus.ext.azure.log_exporter import AzureLogHandler
from opencensus.ext.azure.trace_exporter import AzureExporter
from opencensus.trace import config_integration
from opencensus.trace.samplers import AlwaysOnSampler
from opencensus.trace.tracer import Tracer

# Enable the OpenCensus integration for the standard logging module.
config_integration.trace_integrations(['logging'])

# A single Application Insights connection string is shared by the log handler
# and the trace exporter.
instrumentation_connection_string = mssparkutils.credentials.getSecretWithLS(
    "keyvault", "AppInsightsConnectionString"
)

# Notebook logger: INFO by default, with records forwarded to Application Insights.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
logger.addHandler(
    AzureLogHandler(connection_string=instrumentation_connection_string)
)

# Tracer used by the `with tracer.span(...)` blocks in the following cells;
# AlwaysOnSampler is configured as the sampler.
tracer = Tracer(
    exporter=AzureExporter(connection_string=instrumentation_connection_string),
    sampler=AlwaysOnSampler(),
)

# Spool parameters: attached to log records through `extra`.
media_contents_tbl_name = f'{batch_num}_submitted_media'
run_time_parameters = dict(
    custom_dimensions=dict(
        batch_num=batch_num,
        file_system=file_system,
        media_contents_tbl_name=media_contents_tbl_name,
        notebook_name=mssparkutils.runtime.context['notebookname'],
    )
)

logger.info(f"{mssparkutils.runtime.context['notebookname']}: INITIALISED", extra=run_time_parameters)


In [ ]:
# Load keys, set defaults
with tracer.span(name='load values from key vault'):
    # Each secret is fetched exactly once. instrumentation_connection_string is
    # not fetched again here: the logging cell already loaded it in order to
    # build `logger` and `tracer`, which this cell depends on.
    subscription_id = mssparkutils.credentials.getSecretWithLS("keyvault", "SubscriptionId")
    resource_group_name = mssparkutils.credentials.getSecretWithLS("keyvault", "ResourceGroupName")

    # App registration used to authenticate (see `credential` below).
    tenant_id = mssparkutils.credentials.getSecretWithLS("keyvault", "TenantID")
    client_id = mssparkutils.credentials.getSecretWithLS("keyvault", "ADAppRegClientId")
    client_secret = mssparkutils.credentials.getSecretWithLS("keyvault", "ADAppRegClientSecret")

    storage_account_name = mssparkutils.credentials.getSecretWithLS("keyvault", "StorageAccountName")
    storage_account_key = mssparkutils.credentials.getSecretWithLS("keyvault", "StorageAccountKey")
    vi_account_name = mssparkutils.credentials.getSecretWithLS("keyvault", "VideoIndexerAccountName")

    # NOTE(review): the "<<TF_VAR_...>>" values look like placeholders that are
    # substituted at deployment time - confirm against the deployment scripts.
    apiUrl = "<<TF_VAR_azure_avam_api_domain>>"  # api's are documented here... https://api-portal.videoindexer.ai/
    azure_resource_manager = "<<TF_VAR_azure_arm_management_api>>"

    # Credential used later to request the ARM bearer token.
    credential = ClientSecretCredential(tenant_id, client_id, client_secret)

In [ ]:
# Initialise session and config
# `spark` is bound first so that `sc` is derived from the session obtained in
# this cell instead of relying on a `spark` name defined before the cell ran.
# NOTE(review): the app name interpolates the whole runtime context object,
# whereas other cells use context['notebookname'] - confirm which is intended.
spark = SparkSession.builder.appName(f"TextProcessing {mssparkutils.runtime.context}").getOrCreate()
sc = spark.sparkContext

def read_batch_config(batch_root: str):
    """
    Read ``{batch_root}/config.json`` and return it as nested SimpleNamespace objects.

    We read the config file using the Java File System API as we do not need to
    let multiple nodes read individual lines and join it all back together again.

    Side effect: sets the Hadoop ``fs.defaultFS`` of the shared Spark context
    to the notebook-level ``file_system`` value.

    Args:
        batch_root: Folder that contains the batch's ``config.json``.

    Returns:
        The parsed configuration; every JSON object becomes a SimpleNamespace,
        so values are reached by attribute access (e.g. ``config.log_level``).

    Raises:
        FileNotFoundError: If ``config.json`` does not exist under ``batch_root``.
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    # Change our file system from 'synapse' to 'input'
    hadoop_conf = sc._jsc.hadoopConfiguration()
    hadoop_conf.set("fs.defaultFS", file_system)

    fs = sc._jvm.org.apache.hadoop.fs.FileSystem.get(hadoop_conf)
    config_path = sc._jvm.org.apache.hadoop.fs.Path(f'{batch_root}/config.json')

    # There is no fallback configuration: a missing file is reported and the
    # read is aborted here instead of failing later inside fs.open().
    if not fs.exists(config_path):
        logger.error(f'{config_path} not found.')
        raise FileNotFoundError(f'{config_path} not found.')

    # Open our file directly rather than through spark
    input_stream = fs.open(config_path)  # FSDataInputStream
    try:
        config_string = sc._jvm.java.io.BufferedReader(
            sc._jvm.java.io.InputStreamReader(input_stream, sc._jvm.java.nio.charset.StandardCharsets.UTF_8)
        ).lines().collect(sc._jvm.java.util.stream.Collectors.joining("\n"))
    finally:
        # Release the underlying stream whether or not the read succeeded.
        input_stream.close()

    # Load it into json
    return json.loads(config_string, object_hook=lambda dictionary: SimpleNamespace(**dictionary))

with tracer.span(name=f"Load config: {mssparkutils.runtime.context['notebookname']}"):
    # Any failure while reading the batch config is logged and then re-raised.
    try:
        config = read_batch_config(batch_root)
    except Exception as err:
        logger.exception(err)
        raise err

    # Set log level: only "INFO" is honoured, every other value is normalised
    # to "ERROR" (both in the config object and on the logger).
    if config.log_level != "INFO":
        config.log_level = "ERROR"
    logger.setLevel(getattr(logging, config.log_level))

In [ ]:
import pyodbc
# Secrets for the Synapse serverless SQL endpoint, read from Key Vault
serverless_sql_endpoint = mssparkutils.credentials.getSecretWithLS(
    "keyvault", "SynapseServerlessSQLEndpoint"
)
sql_user_name = mssparkutils.credentials.getSecretWithLS(
    "keyvault", "SynapseSQLUserName"
)
sql_user_pwd = mssparkutils.credentials.getSecretWithLS(
    "keyvault", "SynapseSQLPassword"
)

# Serverless SQL settings: ODBC driver and target database
driver = '{ODBC Driver 17 for SQL Server}'
database = 'minted'

# Folder in the 'synapse' container under which the minted tables are stored as parquet
minted_tables_path = f'abfss://synapse@{blob_account_name}.dfs.{azure_storage_domain}/minted_tables/'

# Retrieve access tokens and retrieve data from VI

In [ ]:
with tracer.span(name='Get ARM access token (bearer token)'):
    # Request a token for the management API's ".default" scope using the
    # client-secret credential; only the token string is kept.
    token_context = "<<TF_VAR_azure_arm_management_api>>/.default"
    arm_token = credential.get_token(
        token_context
    ).token

In [ ]:
with tracer.span(name='Get VI account details'):
    api_version = config.video_indexer_api_version
    # Get Account information
    # these top level API's are documented and testable here... https://docs.microsoft.com/en-us/rest/api/videoindexer/accounts/list
    request_url = f'{azure_resource_manager}/subscriptions/{subscription_id}/resourcegroups/{resource_group_name}/providers/Microsoft.VideoIndexer/accounts/{vi_account_name}/?api-version={api_version}'
    headers = CaseInsensitiveDict({
        "Accept": "application/json",
        "Authorization": "Bearer " + arm_token,
    })
    response = req.get(request_url, headers=headers)
    # Stop here with the HTTP status when the call is rejected, rather than
    # failing below with a KeyError on an error payload.
    response.raise_for_status()
    response = response.json()

    # Account id and region are needed to address the Video Indexer API.
    vi_account_id = response['properties']['accountId']
    vi_account_location = response['location']

In [ ]:
with tracer.span(name='Get account level access token for Azure Video Analyzer for Media'):
    # Get account level access token for Azure Video Analyzer for Media
    request_url = f'{azure_resource_manager}/subscriptions/{subscription_id}/resourceGroups/{resource_group_name}/providers/Microsoft.VideoIndexer/accounts/{vi_account_name}/generateAccessToken?api-version={api_version}'
    headers = CaseInsensitiveDict({
        "Accept": "application/json",
        "Authorization": "Bearer " + arm_token,
    })
    # Request body built directly as a dict (None is serialised as JSON null):
    # a Contributor token scoped to the whole account.
    body = {
        "permissionType": "Contributor",
        "scope": "Account",
        "projectId": None,
        "videoId": None,
    }
    response = req.post(request_url, headers=headers, json=body)
    # Stop here with the HTTP status when the call is rejected, rather than
    # failing below with a KeyError on an error payload.
    response.raise_for_status()
    response = response.json()
    account_access_token = response["accessToken"]

In [ ]:
with tracer.span(name='Retrieve media enrichments'):
    # Retrieve media enrichments: the account access token is sent as the
    # `accessToken` query parameter; no extra headers are set.
    params = CaseInsensitiveDict()
    params["accessToken"] = account_access_token
    headers = CaseInsensitiveDict()
    request_url = f'{apiUrl}/{vi_account_location}/Accounts/{vi_account_id}/Videos/{video_id}/Index'
    response = req.get(request_url, headers=headers, params=params)
    # Stop here with the HTTP status when the index cannot be fetched, so an
    # error payload is never persisted as if it were the enrichment.
    response.raise_for_status()

    # Keep both the parsed payload and its JSON string form for the next cells.
    response_json = response.json()
    response_str = json.dumps(response_json)

In [ ]:
# Build a one-row DataFrame holding the enrichment payload for this media file
columns = [
    "media_path",
    "media_file_name",
    "video_id",
    "enrichments",
    "original_lang",
]
data = [
    [
        media_path,
        media_file_name,
        video_id,
        response_str,
        # source language reported for the first video in the index response
        response_json["videos"][0]["insights"]["sourceLanguage"],
    ]
]
df_enriched_media = spark.createDataFrame(data, schema=columns)

# Persist Enrichments

In [ ]:
with tracer.span(name='Persist processed text as json'):
    # Look up the submitted-media row for this file to recover its file type.
    submitted_media_tbl_name = f'{batch_num}_submitted_media'
    df_enriched_media_sql = spark.read.parquet(f'{minted_tables_path}{submitted_media_tbl_name}')
    df_enriched_media_sql = df_enriched_media_sql.where(df_enriched_media_sql.path == media_path)

    # first() returns None when no row matches; report that explicitly rather
    # than failing with "'NoneType' object is not subscriptable".
    submitted_media_row = df_enriched_media_sql.first()
    if submitted_media_row is None:
        missing_row_message = f'No row with path {media_path} found in {submitted_media_tbl_name}'
        logger.error(missing_row_message)
        raise ValueError(missing_row_message)
    file_type = submitted_media_row['file_type']

    media_output_dict = {
        'file_path': media_path,
        'file_name': media_file_name,
        'file_type': file_type,
        'batch_num': batch_num,
        'media_enrichment': json.loads(response_str)
    }
    media_output_json = json.dumps(media_output_dict, indent=4)

    # Write (or overwrite) the JSON document at the path supplied by the pipeline.
    mssparkutils.fs.put(json_path, media_output_json, overwrite=True)

In [ ]:
with tracer.span(name='Perist media enrichments to sql'):
    # Table name derived from the batch number, with "/" and "." replaced by
    # "_" before it is used as a folder name.
    enriched_media_tbl_name = (
        f'{batch_num}_enriched_media'
        .replace("/", "_")
        .replace(".", "_")
    )

    # Append this file's row to the batch's enriched-media parquet folder.
    enriched_media_writer = df_enriched_media.write.mode("append")
    enriched_media_writer.parquet(f'{minted_tables_path}{enriched_media_tbl_name}')

In [ ]:
# Summary of this run; nested under 'custom_dimensions' so the same object can
# be passed to the logger as `extra`.
output = dict(
    custom_dimensions=dict(
        batch_num=batch_num,
        enriched_media_tbl_name=enriched_media_tbl_name,
        file_system=file_system,
        notebook_name=mssparkutils.runtime.context['notebookname'],
    )
)

# Log the summary, then hand the inner mapping back to the pipeline as the exit value.
logger.info(f"{mssparkutils.runtime.context['notebookname']}: OUTPUT", extra=output)
mssparkutils.notebook.exit(output['custom_dimensions'])
