In [ ]:
# Notebook parameters. Every value defaults to an empty string here; presumably
# the calling pipeline overrides them before the notebook runs — TODO confirm.
blob_account_name = ''  # storage account name used when building the blob/dfs URLs
input_container = ''  # container that holds the satellite image and the AIS CSV
output_container = ''  # container the generated KML is uploaded to
image_file_path = ''  # blob path of the satellite image inside input_container
ais_file_path = ''  # blob path of the AIS CSV inside input_container
azure_storage_domain = ''  # storage endpoint suffix inserted into the URLs
config_path = ''  # path of the configuration JSON, relative to the input container root
kml_path = ''  # blob name given to the KML uploaded into output_container

In [ ]:
from azure.storage.blob import generate_blob_sas, BlobSasPermissions, generate_container_sas, ContainerSasPermissions, BlobClient
from datetime import datetime, timedelta
import fsspec
import json
import logging
import os
from py4j.protocol import Py4JJavaError
from pyspark.sql.functions import abs
from pyspark.sql.functions import col, date_trunc
from pyspark.sql.functions import date_format
from pyspark.sql.functions import lit
from pyspark.sql import SparkSession
from pyspark.sql.functions import to_timestamp
import requests
import xml.dom.minidom
from opencensus.ext.azure.log_exporter import AzureLogHandler

# Setup Logger

In [ ]:
# Read the Application Insights connection string from Key Vault through the
# "keyvault" linked service.
instrumentation_connection_string = mssparkutils.credentials.getSecretWithLS("keyvault", "AppInsightsConnectionString")
logger = logging.getLogger(__name__)
# logging.getLogger returns the same logger object every time this cell runs
# within a session, so attach the Azure handler only once. Without the guard
# each re-run adds another handler and every record is exported repeatedly.
if not any(isinstance(handler, AzureLogHandler) for handler in logger.handlers):
    logger.addHandler(AzureLogHandler(connection_string=instrumentation_connection_string))
logger.setLevel(logging.INFO)

# Set paths for satellite image and configuration
    Used for retrieving the satellite image timestamp

In [ ]:
# Folder portion of the image blob path; both per-image roots below hang off it.
image_folder = os.path.dirname(image_file_path)

# HTTPS (blob endpoint) form: container root, then the image folder under it.
image_path = f'https://{blob_account_name}.blob.{azure_storage_domain}/{input_container}/'
image_root = f"{image_path}{image_folder}"

# ABFSS (dfs endpoint) form of the same two locations.
image_path_abfss = f'abfss://{input_container}@{blob_account_name}.dfs.{azure_storage_domain}/'
image_root_abfss = f'{image_path_abfss}{image_folder}'

# Shared configuration file kept in the "configuration" container.
global_config_path = f'abfss://configuration@{blob_account_name}.dfs.{azure_storage_domain}/anomdet.config.global.json'

# Load the satellite configuration file
    Used for retrieving the satellite image timestamp

In [ ]:
# Obtain (or reuse) the Spark session, naming the application after the
# current notebook runtime context.
spark = (
    SparkSession.builder
    .appName(f"AnomalyDetection {mssparkutils.runtime.context}")
    .getOrCreate()
)

# Spark context, used further down to read the configuration file as text.
sc = spark.sparkContext

# Function to ensure the config file exists
def prepare_config(image_root: str, global_config_path: str):
    """
    Make sure a config file is available in the batch root.

    Looks for ``anomdet.config.json`` directly under ``image_root``. If the
    file is not there, the global config is copied to that location.

    Args:
        image_root: Folder that should contain ``anomdet.config.json``. It is
            handed to ``mssparkutils.fs``, so it must be a path that API can
            read and write (the notebook passes the ``abfss://`` root).
        global_config_path: Path of the global config that is copied over when
            the batch has no config of its own.

    Raises:
        Py4JJavaError: If the lookup fails for any reason other than the file
            not existing, or if the copy fails (for example because there is
            no config under ``global_config_path``, which indicates an error
            in the pipeline set up).
    """
    image_config_path = f'{image_root}/anomdet.config.json'
    try:
        # Reading the head of the file is used purely as an existence check.
        mssparkutils.fs.head(image_config_path)
    except Py4JJavaError as e:
        if 'java.io.FileNotFoundException' not in str(e):
            # Anything other than "file not found" is unexpected; re-raise it
            # with its original traceback.
            raise
        # File doesn't exist, copy over the global config
        mssparkutils.fs.cp(global_config_path, image_config_path)

# Prepare and load the configuration file. The ABFSS root is passed explicitly
# so the function works on its argument instead of reaching for a global.
prepare_config(image_root=image_root_abfss, global_config_path=global_config_path)
config = json.loads(''.join(sc.textFile(f'{image_path_abfss}/{config_path}').collect()))

# Setup the POST body and URL request to the gdal_container img_info endpoint 
    Used for retrieving the satellite image timestamp

In [ ]:
# Storage account key, read from Key Vault through the "keyvault" linked service
storage_account_key = mssparkutils.credentials.getSecretWithLS('keyvault', 'StorageAccountKey')

# Read-only SAS token for the image blob, expiring one hour from now
in_blob_sas_tkn = generate_blob_sas(
    account_name=blob_account_name,
    container_name=input_container,
    blob_name=image_file_path,
    account_key=storage_account_key,
    permission=BlobSasPermissions(read=True),
    expiry=datetime.utcnow() + timedelta(hours=1),
)

# Options for the info call: ask for JSON output
info_config = {"format": "json"}

# Where the input image lives and the token needed to read it
in_img_metadata = {
    "blob_acct": blob_account_name,
    "container": input_container,
    "blob_path": image_file_path,
    "sas_token": in_blob_sas_tkn,
}

# POST body
gdal_info = {"info_options": info_config, "in_img": in_img_metadata}

# Host URL, taken from the loaded configuration
gdal_host_url = config['gdal_host']['app_url']

# Call the gdal_container img_info HTTP POST endpoint to retrieve the satellite metadata
    Used for retrieving the satellite image timestamp

In [ ]:
# Function to call the img_info endpoint
def call_info(gdal_endpoint, info_metadata, api_key):
    """POST ``info_metadata`` to the ``img_info`` endpoint and return the response.

    The decoded JSON body is also pretty-printed to stdout.

    Args:
        gdal_endpoint: Base URL of the host; ``/img_info/`` is appended to it.
        info_metadata: JSON-serialisable request body.
        api_key: Key sent in both the ``Gdal-Subscription-Key`` and ``KEY``
            request headers.

    Returns:
        The ``requests`` response object. This function never raises: when the
        POST itself fails the empty string ``""`` is returned, and when the
        POST succeeds but the body is not valid JSON the response object is
        still returned. In both cases the failure is logged.
    """
    resp = ""
    headers = {
        # Request headers
        "Content-Type": "application/json",
        "Gdal-Subscription-Key": api_key,
        "KEY": api_key
    }
    url = f"{gdal_endpoint}/img_info/"
    try:
        resp = requests.post(url=url, json=info_metadata, headers=headers)
        print(json.dumps(resp.json(), indent=4, sort_keys=True))
    except Exception:
        # Best effort by design: callers receive whatever was obtained so far.
        # logger.exception records the message together with the traceback.
        # The previous logger.error('Exception', e) passed an argument with no
        # placeholder in the message, so the real error never got logged.
        logger.exception("Call to %s failed", url)
    return resp

# Ask the img_info endpoint for the satellite image metadata
info_resp = call_info(
    gdal_endpoint=gdal_host_url,
    info_metadata=gdal_info,
    api_key=config['gdal_host']['key'],
)

# Record the returned metadata in the application log
logger.info(json.dumps(info_resp.json(), indent=4, sort_keys=True))

# Get the timestamp from the Satellite Image and convert to a PySpark Timestamp type

In [ ]:
# Pull the TIFFTAG_DATETIME entry out of the img_info response body
info_json = info_resp.json()
image_timestamp = info_json['img_metadata']['metadata']['']['TIFFTAG_DATETIME']

# Wrap the raw string in a one-row, one-column DataFrame so Spark can parse it
image_time_df = (
    spark.createDataFrame([image_timestamp], "string")
    .toDF('image_timestamp')
)
image_time_df.show(truncate=False)

# Replace the string column with its parsed timestamp equivalent
image_time_df = image_time_df.withColumn(
    "image_timestamp",
    to_timestamp(col("image_timestamp"), "yyyy-MM-dd'T'HH:mm:ss.SSSSSSSSS'Z'"),
)
image_time_df.show(truncate=False)

# Bring the parsed value back to the driver as a timestamp object
image_time = image_time_df.collect()[0].image_timestamp

# Process and filter the AIS data

In [ ]:
# ABFSS location of the AIS CSV inside the input container
ais_csv_file_url = f'abfss://{input_container}@{blob_account_name}.dfs.{azure_storage_domain}/{ais_file_path}'

# Load the semicolon-delimited AIS file (header row, schema inference off)
# and drop duplicate rows
ais_df = (
    spark.read
    .options(inferSchema='False', header=True, delimiter=';')
    .csv(ais_csv_file_url)
    .distinct()
)

# --- Disabled: narrowing the AIS rows to the satellite image time ---
# # Find the time difference in minutes from the Satellite image
# ais_df = ais_df.withColumn('TIMESTAMP', to_timestamp(col('TIMESTAMP'))) \
#                .withColumn('DIFFINMIN', abs((col('TIMESTAMP').cast('long') - lit(image_time).cast('long'))/60))

# # Filter the AIS data so that it includes times within +-10 min of the satellite image
# ais_df = ais_df.filter((ais_df['DIFFINMIN'] <= 10))
# ais_df = ais_df.sort('SHIPNAME')

# # Filter the AIS data with time closest to time of the satellite image
# ais_df = ais_df.groupBy(['SHIPNAME']).min('DIFFINMIN').withColumnRenamed('min(DIFFINMIN)', 'DIFFINMIN').join(ais_df, ['SHIPNAME','DIFFINMIN'])
# ais_df = ais_df.sort('SHIPNAME')
# ais_df = ais_df.drop('DIFFINMIN')
# ais_df = ais_df.fillna('None')

# Rewrite TIMESTAMP as a "yyyy-MM-dd HH:mm:ss" formatted string
ais_df = ais_df.withColumn(
    'TIMESTAMP',
    date_format(col('TIMESTAMP'), "yyyy-MM-dd HH:mm:ss"),
)
#ais_df = ais_df.withColumn('date', date_trunc("yyyy-MM-dd HH:mm:ss", col('TIMESTAMP')))

# Preview the first ten rows untruncated, then print the schema to check the datatypes
ais_df.show(10, False)
ais_df.printSchema()

# Functions to convert AIS data to KML

In [ ]:
def create_placemark(klm_document, row, name_description, extended_data, lat_long):
    """Build a KML ``<Placemark>`` element for one AIS row.

    Building is best effort: when a value cannot be read from ``row`` (missing
    column, null value where text is required, ...) a warning is logged and
    the corresponding element is left empty instead of failing the whole row.

    Args:
        klm_document: ``xml.dom.minidom`` document used to create the nodes.
        row: One AIS record, indexable by column name.
        name_description: Two column names: the first fills ``<name>``, the
            second fills ``<description>``. Both values must already be
            strings, otherwise the element stays empty.
        extended_data: Column names written as ``<Data name=...><value>``
            entries under ``<ExtendedData>``; values are converted with
            ``str``.
        lat_long: Two column names: latitude first, longitude second.

    Returns:
        The ``<Placemark>`` element. It is not attached to any parent.
    """

    def add_text(parent, build_text, what):
        # Append the text produced by build_text() to parent. The node is only
        # created and attached when building succeeds, so a failure can never
        # attach a stale node left over from an earlier value.
        try:
            parent.appendChild(klm_document.createTextNode(build_text()))
        except Exception as exc:
            logger.warning("Could not write %s to placemark: %r", what, exc)

    def coordinates_text():
        latitude = row[lat_long[0]]
        longitude = row[lat_long[1]]
        if latitude is None or longitude is None:
            raise ValueError("latitude/longitude is null")
        # KML orders a coordinate tuple as longitude,latitude.
        return f"{longitude},{latitude}"

    # <Placemark>
    placemark_element = klm_document.createElement('Placemark')

    # <name>
    name_element = klm_document.createElement('name')
    add_text(name_element, lambda: row[name_description[0]], 'name')
    placemark_element.appendChild(name_element)

    # <description>
    description_element = klm_document.createElement('description')
    add_text(description_element, lambda: row[name_description[1]], 'description')
    placemark_element.appendChild(description_element)

    # <ExtendedData>
    ext_element = klm_document.createElement('ExtendedData')
    placemark_element.appendChild(ext_element)

    # <Data name="NAME"> <value> VALUE </value> </Data>
    for key in extended_data:
        data_element = klm_document.createElement('Data')
        data_element.setAttribute('name', key)
        value_element = klm_document.createElement('value')
        data_element.appendChild(value_element)
        add_text(value_element, lambda: str(row[key]), f'extended data {key!r}')
        ext_element.appendChild(data_element)

    # <Point> <coordinates> long,lat </coordinates> </Point>
    point_element = klm_document.createElement('Point')
    placemark_element.appendChild(point_element)
    coor_element = klm_document.createElement('coordinates')
    add_text(coor_element, coordinates_text, 'coordinates')
    point_element.appendChild(coor_element)

    return placemark_element

def convert_ais_csv_to_kml(ais_df, file_name, kml_path, name_description, extended_data, lat_long, overwrite=False):
    """Convert the AIS rows to a KML document, save it locally and upload it.

    One ``<Placemark>`` is created per row of ``ais_df`` (all rows are
    collected to the driver). The document is written to ``file_name`` and
    uploaded to the blob ``kml_path`` in ``output_container``. The notebook
    globals ``blob_account_name``, ``output_container``, ``storage_account_key``
    and ``azure_storage_domain`` are read to reach the storage account.

    Args:
        ais_df: Spark DataFrame holding the AIS records.
        file_name: Local path the KML is written to; missing parent folders
            are created.
        kml_path: Blob name of the uploaded KML inside ``output_container``.
        name_description: Passed through to ``create_placemark``.
        extended_data: Passed through to ``create_placemark``.
        lat_long: Passed through to ``create_placemark``.
        overwrite: Forwarded to ``upload_blob``. With the default ``False``
            the upload behaves as before; per the Azure SDK it is expected to
            fail when the blob already exists (confirm against the SDK docs).
    """
    # Create a kml document
    klm_document = xml.dom.minidom.Document()

    # Root <kml> element with its namespace declarations: default, Google
    # extensions, Open GIS and Atom. The element is attached to the document
    # once; re-appending it after every attribute was redundant.
    kml_element = klm_document.createElementNS('http://www.opengis.net/kml/2.2', 'kml')
    for attribute, namespace in (
        ('xmlns', 'http://www.opengis.net/kml/2.2'),
        ('xmlns:gx', 'http://www.google.com/kml/ext/2.2'),
        ('xmlns:kml', 'http://www.opengis.net/kml/2.2'),
        ('xmlns:atom', 'http://www.w3.org/2005/Atom'),
    ):
        kml_element.setAttribute(attribute, namespace)
    klm_document.appendChild(kml_element)

    # Add the <Document> tag
    document_element = kml_element.appendChild(klm_document.createElement('Document'))

    # Create a <Placemark> tag for each row
    for row in ais_df.collect():
        document_element.appendChild(
            create_placemark(klm_document, row, name_description, extended_data, lat_long)
        )

    # Container-level SAS token for the output container, valid for one hour
    out_cont_sas_tkn = generate_container_sas(
        account_name=blob_account_name,
        container_name=output_container,
        account_key=storage_account_key,
        permission=ContainerSasPermissions(read=True, list=True, write=True, add=True, create=True, update=True),
        expiry=datetime.utcnow() + timedelta(hours=1),
    )
    connection_string = f'DefaultEndpointsProtocol=https;AccountName={blob_account_name};AccountKey={storage_account_key};EndpointSuffix={azure_storage_domain}'
    blob = BlobClient.from_connection_string(
        conn_str=connection_string,
        container_name=f'{output_container}',
        blob_name=f'{kml_path}',
        credential=out_cont_sas_tkn,
    )

    # Serialise once as UTF-8 bytes so the XML declaration and the bytes on
    # disk agree regardless of the platform's default text encoding.
    kml_bytes = klm_document.toxml(encoding='utf-8')

    # Write XML to file, creating the parent folder when file_name has one
    parent_folder = os.path.dirname(file_name)
    if parent_folder:
        os.makedirs(parent_folder, exist_ok=True)
    with open(file_name, mode='wb') as f:
        f.write(kml_bytes)

    # Write xml to output_container
    blob.upload_blob(kml_bytes, overwrite=overwrite)

# Convert AIS data to KML

In [ ]:
# Column headers from the AIS CSV file, grouped by the placemark part they feed
name_description = ['SHIPNAME', 'TYPE_NAME']
extended_data = [
    'IMO', 'MMSI', 'LENGTH', 'WIDTH', 'SPEED',
    'STATUS', 'COURSE', 'HEADING', 'TIMESTAMP',
]
lat_long = ['LAT', 'LON']

# Convert AIS csv data to KML; the local file is named after the AIS file
# with everything from ".csv" onwards removed
convert_ais_csv_to_kml(
    ais_df=ais_df,
    file_name=ais_file_path.split('.csv')[0],
    kml_path=kml_path,
    name_description=name_description,
    extended_data=extended_data,
    lat_long=lat_long,
)