In [2]:
import os
import sys
from pyspark.sql.types import StructType, StructField, StringType, DoubleType
from pyspark.sql.functions import col

# Make the project's `src` directory importable from this notebook.
# The location is resolved relative to the kernel's working directory:
# three directory levels up from it, then into `src`.
current_dir = os.getcwd()  # Current working directory
project_root = os.path.abspath(os.path.join(current_dir, '..', '..', '..', 'src'))  # Go up 3 levels and append 'src'
if project_root not in sys.path:  # Re-running the cell must not stack duplicate entries
    sys.path.append(project_root)  # Add src to sys.path
from logging_utils.logger import TaskLogger
from unikargo_utils import add_pipeline_metadata
from config import get_log_config, get_table_config

In [None]:
# Create widgets (required for ADF → Databricks integration).
# Each widget is declared with a default so the notebook also runs when no
# parameters are supplied by the caller.
dbutils.widgets.text("pipeline_id", "")
dbutils.widgets.text("run_id", "")
dbutils.widgets.text("task_id", "")
dbutils.widgets.text("processed_timestamp", "")
dbutils.widgets.text("catalog", "unikargo_dev")

# Extract values from widgets
pipeline_id = dbutils.widgets.get("pipeline_id")
run_id = dbutils.widgets.get("run_id")
task_id = dbutils.widgets.get("task_id")
processed_timestamp = dbutils.widgets.get("processed_timestamp")
catalog = dbutils.widgets.get("catalog")

# Logging parameters for run context
pipeline_name = "pl_unikargo_medallion"
rows_processed = 0
log_type = 'task'
environment = "dev"
entity = "airports"
layer = "bronze"

# Source location notes:
# - The Unity Catalog volume path
#   /Volumes/{catalog}/00_raw/source_unicargo_data/airports.csv is not used
#   because ADF can't read from external Unity Catalog volumes.
# - The registered volume is 806d999a-a9fd-4bef-aa04-f1ee2b077888,
#   mapped to abfss://medallion@adlsunikarrgodev.dfs.core.windows.net/raw/volumes
# - The resulting raw file path is
#   abfss://medallion@adlsunikarrgodev.dfs.core.windows.net/raw/volumes/airports.csv
#
# Resolve the table configuration from the run-context variables declared
# above rather than repeating their literal values, so the two cannot drift.
airports_cfg = get_table_config(entity=entity, layer=layer, environment=environment)
print(airports_cfg)


TableConfig(catalog='unikargo_dev', schema='01_bronze', table='unikargo_airport_bronze', layer='bronze', table_key=None, format='delta', raw_path='abfss://medallion@adlsunikarrgodev.dfs.core.windows.net/raw/volumes/airports.csv')


To use databricks widgets interactively in your notebook, please install databricks sdk using:
	pip install 'databricks-sdk[notebook]'
Falling back to default_value_only implementation for databricks widgets.


In [4]:
# Explicit schema applied when reading the raw airports CSV.
# Every column is nullable; the coordinates are doubles, the rest are strings.
# NOTE(review): the second column is named "airline" although this schema
# describes airports — confirm against the CSV header before renaming.
airports_schema = (
    StructType()
    .add("iata_code", StringType(), True)
    .add("airline", StringType(), True)
    .add("city", StringType(), True)
    .add("state", StringType(), True)
    .add("country", StringType(), True)
    .add("latitude", DoubleType(), True)
    .add("longitude", DoubleType(), True)
)

In [5]:
# --- Task 1: Read the raw airports CSV into a DataFrame
airports_csv_path = airports_cfg.raw_path
# Operation name follows the airports entity, consistent with
# "tsk_airports_persist_bronze" used by the write step of this notebook
# (it was previously logged under a "flights" name).
operation = "tsk_airports_read_raw"

with TaskLogger(
    operation=operation,
    pipeline_name=pipeline_name,
    source_path=airports_csv_path,
    log_running=False,
) as logger:

    # Read with the explicit schema; the first line of the file is a header.
    airports_df = (
        spark.read
        .schema(airports_schema)
        .option("header", "true")
        .csv(airports_csv_path)
    )

    # Count the rows that were read and hand the figure to the logger
    # before the `with` block completes.
    rows_processed = airports_df.count()
    logger.set_metrics(rows=rows_processed)

In [6]:
# -----------------------------
# --- Task 2: Add metadata to the dataframe (Pipeline identifier, Run identifier and Task identifier)
# -----------------------------
# Operation name follows the airports entity, consistent with
# "tsk_airports_persist_bronze" used by the write step of this notebook
# (it was previously logged under a "flights" name).
operation = "tsk_airports_add_metadata"

with TaskLogger(
    operation=operation,
    pipeline_name=pipeline_name,
    log_running=False,
) as logger:

    # Attach the identifiers received through the notebook widgets.
    airports_df = add_pipeline_metadata(airports_df, pipeline_id, run_id, task_id)

    # Count rows after transformation
    rows_processed = airports_df.count()

    # Update metrics before completion
    logger.set_metrics(rows=rows_processed)


In [7]:
# -----------------------------
# --- Task 3: Persist the airports dataframe to the bronze table
# -----------------------------
operation = "tsk_airports_persist_bronze"
target_path = airports_cfg.full_name  # fully qualified table name taken from the config
print(target_path)

with TaskLogger(
    log_running=False,
    target_path=target_path,
    pipeline_name=pipeline_name,
    operation=operation,
) as logger:

    # Row count is taken before the write starts.
    rows_processed = airports_df.count()

    # Replace the table contents, and its schema, with this run's data.
    (
        airports_df.write
        .format("delta")
        .mode("overwrite")
        .option("overwriteSchema", "true")
        .saveAsTable(target_path)
    )

    # Report the count to the logger once the write has returned.
    logger.set_metrics(rows=rows_processed)


unikargo_dev.01_bronze.unikargo_airport_bronze


In [9]:
# -------- Debug helper: read the Delta task logs and show the latest entries --------
# This cell is for inspection only; the pipeline steps above do not depend on it.

log_path = get_log_config(log_type, environment=environment)
logs_df = spark.read.format("delta").load(log_path)

# Newest records first, limited to 20 rows with untruncated column values.
latest_logs_df = logs_df.orderBy(col("timestamp").desc())
latest_logs_df.show(20, truncate=False)

+-----------+---------------------+-----------+-----------------------+------------------------------------+---------------------------+-------+-------+-----------------+--------------------------------------------------------------------------------+-----------------------------------------------------+----------+-------------+--------------------------+----------+
|pipeline_id|pipeline_name        |environment|run_id                 |task_id                             |operation                  |status |rows   |execution_time_ms|source_path                                                                     |target_path                                          |error_type|error_message|timestamp                 |log_date  |
+-----------+---------------------+-----------+-----------------------+------------------------------------+---------------------------+-------+-------+-----------------+--------------------------------------------------------------------------------+-----------