**Configure connection to ADLS Gen2 using a service principal**

In [0]:
# Root URIs of the bronze and silver containers in the storage account.
bronze_path = "abfss://bronze@livbusdatastore.dfs.core.windows.net/"
silver_path = "abfss://silver@livbusdatastore.dfs.core.windows.net/"

# Service principal credentials are pulled from the `livbodsbus-keyvault`
# secret scope at run time so that nothing sensitive lives in the notebook.
ServicePrincipalId = dbutils.secrets.get(scope="livbodsbus-keyvault", key="dbx-client-ID")
ServicePrincipalKey = dbutils.secrets.get(scope="livbodsbus-keyvault", key="dbx-secret")
TenantId = dbutils.secrets.get(scope="livbodsbus-keyvault", key="dbx-tenant-ID")

# OAuth (client credentials) settings for the storage account, listed in the
# order in which they are applied to the Spark session configuration.
adls_oauth_settings = [
    ("fs.azure.account.auth.type.livbusdatastore.dfs.core.windows.net",
     "OAuth"),
    ("fs.azure.account.oauth.provider.type.livbusdatastore.dfs.core.windows.net",
     "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider"),
    ("fs.azure.account.oauth2.client.id.livbusdatastore.dfs.core.windows.net",
     ServicePrincipalId),
    ("fs.azure.account.oauth2.client.secret.livbusdatastore.dfs.core.windows.net",
     ServicePrincipalKey),
    ("fs.azure.account.oauth2.client.endpoint.livbusdatastore.dfs.core.windows.net",
     f"https://login.microsoftonline.com/{TenantId}/oauth2/token"),
]

for setting_name, setting_value in adls_oauth_settings:
    spark.conf.set(setting_name, setting_value)


**Batch loading of files with Auto Loader**

In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql import SparkSession

# Glob over the partition folders (operator / year / month / day) of the bronze container.
# Note: this rebinds `bronze_path`, which previously held the container root.
bronze_path = "abfss://bronze@livbusdatastore.dfs.core.windows.net/operator=*/year=*/month=*/day=*"

# Directory handed to Auto Loader as its schema location (schema metadata is kept here).
schema_location = "abfss://bronze@livbusdatastore.dfs.core.windows.net/schemas/bus_activity_schema/"

# STEP 1: Open an Auto Loader (cloudFiles) stream over the bronze JSON files

# Value passed to Auto Loader's `cloudFiles.maxFilesPerTrigger` option.
MaxFilesPerTrigger = 600

# Column types are inferred from the JSON content instead of defaulting to strings.
df_batch = (
    spark.readStream
    .format("cloudFiles")
    .option("cloudFiles.format", "json")
    .option("cloudFiles.inferColumnTypes", "true")
    .option("cloudFiles.schemaLocation", schema_location)
    .option("cloudFiles.maxFilesPerTrigger", MaxFilesPerTrigger)
    .load(bronze_path)
)

# STEP 2: Tag each row with its source file path and the timestamp embedded in the file name
# (file names are expected to end in `_<yyyyMMdd>_<HHmmss>.json`; a non-matching
# name yields an empty string from regexp_extract).
# NOTE(review): Databricks documents the `_metadata.file_path` column as the
# replacement for input_file_name() — confirm before migrating, since the
# regexes below depend on the exact path format.
source_file = input_file_name()
file_stamp = regexp_extract(col("file_path"), r'_(\d{8}_\d{6})\.json', 1)
df_with_metadata = (
    df_batch
    .withColumn("file_path", source_file)
    .withColumn("file_timestamp", file_stamp)
)

# STEP 3: Pull the operator out of the `operator=<value>` folder in the path
operator_from_path = regexp_extract(col("file_path"), r'operator=([^/]+)', 1)
df_with_operator = df_with_metadata.withColumn("operator_extracted", operator_from_path)

# STEP 4: Promote the fields nested under `Siri` to top-level columns
# (relies on schema inference having produced `Siri` as a struct).
siri_columns = [
    col("Siri.*"),
    col("operator_extracted").alias("operator"),
    col("file_timestamp"),
    col("file_path"),
]
df_siri = df_with_operator.select(*siri_columns)

# One output row per VehicleActivity element. Only the activity struct and
# the file timestamp are carried forward; `operator` and `file_path` are not.
vehicle_activities = col("ServiceDelivery.VehicleMonitoringDelivery.VehicleActivity")
df_flat = df_siri.select(
    explode(vehicle_activities).alias("activity"),
    col("file_timestamp"),
)

# STEP 5: Flatten the nested JSON structure into one column per attribute


def _non_null_or(path, cast_to=None, fallback=None):
    """Build `when(col(path).isNotNull(), value).otherwise(fallback)`.

    `value` is the column at `path`, cast to `cast_to` when one is given;
    `fallback` defaults to an untyped NULL literal.

    NOTE(review): the guard is evaluated per row, so it maps NULL to NULL; it
    does not appear to protect against `path` being absent from the inferred
    schema — confirm.
    """
    source = col(path)
    value = source if cast_to is None else source.cast(cast_to)
    null_value = lit(None) if fallback is None else fallback
    return when(source.isNotNull(), value).otherwise(null_value)


# Output columns, in the order in which they are written to the silver table.
activity_columns = [
    # Activity-level fields
    col("activity.RecordedAtTime").alias("recorded_at_time"),
    col("activity.ItemIdentifier").alias("item_identifier"),
    col("activity.ValidUntilTime").alias("valid_until_time"),

    # Journey description
    col("activity.MonitoredVehicleJourney.LineRef").alias("line_ref"),
    col("activity.MonitoredVehicleJourney.DirectionRef").alias("direction_ref"),
    col("activity.MonitoredVehicleJourney.FramedVehicleJourneyRef.DataFrameRef").alias("data_frame_ref"),
    col("activity.MonitoredVehicleJourney.FramedVehicleJourneyRef.DatedVehicleJourneyRef")
    .cast("string")
    .alias("dated_vehicle_journey_ref"),
    col("activity.MonitoredVehicleJourney.PublishedLineName").alias("published_line_name"),
    col("activity.MonitoredVehicleJourney.OperatorRef").alias("operator_ref"),
    col("activity.MonitoredVehicleJourney.OriginRef").cast("string").alias("origin_ref"),
    col("activity.MonitoredVehicleJourney.OriginName").alias("origin_name"),
    col("activity.MonitoredVehicleJourney.DestinationRef").cast("string").alias("destination_ref"),
    col("activity.MonitoredVehicleJourney.DestinationName").alias("destination_name"),
    col("activity.MonitoredVehicleJourney.OriginAimedDepartureTime").alias("origin_aimed_departure_time"),
    col("activity.MonitoredVehicleJourney.DestinationAimedArrivalTime").alias("destination_aimed_arrival_time"),

    # Vehicle position and identity
    col("activity.MonitoredVehicleJourney.VehicleLocation.Longitude").alias("longitude"),
    col("activity.MonitoredVehicleJourney.VehicleLocation.Latitude").alias("latitude"),
    col("activity.MonitoredVehicleJourney.BlockRef").alias("block_ref"),
    col("activity.MonitoredVehicleJourney.VehicleRef").cast("string").alias("vehicle_ref"),

    # Fields under `Extensions`, wrapped in the null guard
    _non_null_or(
        "activity.Extensions.VehicleJourney.Operational.TicketMachine.TicketMachineServiceCode"
    ).alias("ticket_machine_service_code"),
    _non_null_or(
        "activity.Extensions.VehicleJourney.Operational.TicketMachine.JourneyCode"
    ).alias("journey_code"),
    _non_null_or(
        "activity.Extensions.VehicleJourney.VehicleUniqueId", cast_to="string"
    ).alias("vehicle_unique_id"),

    col("activity.MonitoredVehicleJourney.Bearing").alias("bearing"),

    # `Monitored` falls back to a NULL that is explicitly typed as boolean
    _non_null_or(
        "activity.MonitoredVehicleJourney.Monitored",
        fallback=lit(None).cast("boolean"),
    ).alias("monitored"),

    _non_null_or(
        "activity.Extensions.VehicleJourney.DriverRef", cast_to="string"
    ).alias("driver_ref"),

    col("file_timestamp"),
]

# `ingestion_timestamp` is the file-name timestamp parsed and stored back as a
# string; the year / month / day columns are derived from that string column.
parsed_file_time = to_timestamp(col("file_timestamp"), "yyyyMMdd_HHmmss")
df_selected = (
    df_flat.select(*activity_columns)
    .withColumn("ingestion_timestamp", parsed_file_time.cast("string"))
    .withColumn("year", year("ingestion_timestamp"))
    .withColumn("month", month("ingestion_timestamp"))
    .withColumn("day", dayofmonth("ingestion_timestamp"))
)

# Streaming checkpoint and Delta output locations in the silver container.
checkpoint_path = "abfss://silver@livbusdatastore.dfs.core.windows.net/_checkpoints/bus_activity/"
output_path = "abfss://silver@livbusdatastore.dfs.core.windows.net/bus_activity/"

# STEP 6: Append the flattened rows to the Delta table in the silver layer.
#
# The `availableNow` trigger processes everything that is pending when the
# query starts and then stops, which keeps this an incremental batch job.
# Unlike `once=True` (which handles the whole backlog in a single micro-batch
# and is deprecated on Databricks), it splits the backlog into several
# micro-batches, so source rate limits such as `cloudFiles.maxFilesPerTrigger`
# are honoured.
query = (
    df_selected.writeStream
    .format("delta")
    .outputMode("append")
    .option("checkpointLocation", checkpoint_path)
    .partitionBy("operator_ref", "year", "month", "day")
    .trigger(availableNow=True)
    .start(output_path)
)

# Block the cell until the pending files have been written and the query stops.
query.awaitTermination()

In [0]:
# Make the silver Delta output queryable with SQL as `silver.bus_activity`.
spark.sql("CREATE SCHEMA IF NOT EXISTS silver")

# The table is defined on top of the Delta files already written to
# `output_path`; IF NOT EXISTS keeps the cell safe to re-run.
register_bus_activity_sql = f"""
  CREATE TABLE IF NOT EXISTS silver.bus_activity
  USING DELTA
  LOCATION '{output_path}'
"""
spark.sql(register_bus_activity_sql)

DataFrame[]