In [1]:
import os
import sys
from pyspark.sql.types import StructType, StructField, StringType
from pyspark.sql.functions import col

# Make the project's `src` directory importable from this notebook.
current_dir = os.getcwd()  # working directory at the time the cell runs
# `src` is resolved three directory levels above the working directory.
project_root = os.path.abspath(os.path.join(current_dir, '..', '..', '..', 'src'))
# Guard the append so that re-running this cell does not pile up duplicate sys.path entries.
if project_root not in sys.path:
    sys.path.append(project_root)
from logging_utils import TaskLogger
from unikargo_utils import add_pipeline_metadata
from config import get_log_adls_path, get_table_config
from io_utils import _get_widget

Loaded environments config from: C:\Users\Dele\Documents\D. Professional Registration\IT\DATA-EnGR\00_data_engr_projects\unicargo\unicargo_dab\configs\environments.yaml
Loaded tables config from: C:\Users\Dele\Documents\D. Professional Registration\IT\DATA-EnGR\00_data_engr_projects\unicargo\unicargo_dab\configs\tables.yaml


In [None]:
# Create widgets (required for ADF → Databricks integration)
# dbutils.widgets.text("pipeline_id", "")
# dbutils.widgets.text("run_id", "")
# dbutils.widgets.text("task_id", "")
# dbutils.widgets.text("processed_timestamp", "")
# dbutils.widgets.text("catalog", "unikargo_dev") # Requires an ADF variable for ADF runs
# dbutils.widgets.text("ENV", "dev") 

# # Extract values from widgets
# pipeline_id = dbutils.widgets.get("pipeline_id")
# run_id = dbutils.widgets.get("run_id")
# task_id = dbutils.widgets.get("task_id")
# processed_timestamp = dbutils.widgets.get("processed_timestamp")
# catalog = dbutils.widgets.get("catalog")
# ENV = dbutils.widgets.get("ENV")  # -> "dev". From the variables set in databricks.yml and unikargo_etl_pipeline_nb.job.yml

# Run-context values shared by the task loggers in the cells below.
pipeline_name = "pl_unikargo_medallion"
rows_processed = 0  # placeholder; each task overwrites it with its own count

# "ENV" is read through the project's widget helper; "dev" is passed as its second
# argument (presumably the fallback when the widget is not set -- confirm in io_utils).
environment = _get_widget("ENV", "dev")

# Which table this notebook handles. The lookup below uses these variables rather than
# repeating the literals, so the two can never drift apart.
entity = "airlines"
layer = "bronze"

airlines_cfg = get_table_config(entity=entity, layer=layer, environment=environment)
print(airlines_cfg)

In [None]:
# Schema of the airlines feed: two nullable string columns.
airlines_schema = StructType(
    [
        StructField(column_name, StringType(), True)
        for column_name in ("iata_code", "airline")
    ]
)

In [None]:
# -----------------------------
# Task 1: load the raw airlines CSV into a dataframe
# -----------------------------

airlines_csv_path = airlines_cfg.raw_path
print(airlines_csv_path)
operation = "tsk_airlines_read_raw"

with TaskLogger(
    operation=operation,
    pipeline_name=pipeline_name,
    source_path=airlines_csv_path,
    log_running=False,  # leave False unless a "RUNNING" log entry is explicitly wanted
) as logger:
    # The first line of the file is a header; columns are typed by the explicit schema.
    airlines_df = spark.read.csv(
        airlines_csv_path,
        schema=airlines_schema,
        header="true",
    )

    # Record the row count on the logger before the context exits.
    rows_processed = airlines_df.count()
    logger.set_metrics(rows=rows_processed)

In [None]:
# -----------------------------
# Task 2: attach run lineage to the dataframe (pipeline, run and task identifiers)
# -----------------------------

operation = "tsk_airlines_add_metadata"

with TaskLogger(operation=operation, pipeline_name=pipeline_name, log_running=False) as logger:
    # The three identifiers are looked up on the logger's `kwargs` mapping;
    # a missing key yields None via .get().
    lineage_ids = {
        id_name: logger.kwargs.get(id_name)
        for id_name in ("pipeline_id", "run_id", "task_id")
    }
    airlines_df = add_pipeline_metadata(airlines_df, **lineage_ids)

    # Row count after the transformation, recorded on the logger before the context exits.
    rows_processed = airlines_df.count()
    logger.set_metrics(rows=rows_processed)

In [None]:
# -----------------------------
# Task 3: persist the dataframe to the bronze table
# -----------------------------

# Despite the variable name, this value is handed to saveAsTable as the table identifier.
target_path = airlines_cfg.full_name
print(target_path)
operation = "tsk_airlines_persist_bronze"

with TaskLogger(
    operation=operation,
    pipeline_name=pipeline_name,
    target_path=target_path,
    log_running=False,
) as logger:
    # The count is taken before the write, not after it.
    rows_processed = airlines_df.count()

    # Replace the table contents and allow the stored schema to be replaced too.
    (
        airlines_df.write
        .mode("overwrite")
        .option("overwriteSchema", "true")
        .saveAsTable(target_path)
    )

    # Record the row count on the logger before the context exits.
    logger.set_metrics(rows=rows_processed)

In [2]:
# -------- Debug helper: read the Delta task log and print the most recent entries --------
log_type = "task"
environment = _get_widget("ENV", "dev")
# Location of the task-level log for this environment.
log_path = get_log_adls_path(log_type, environment=environment)
logs_df = spark.read.load(log_path, format="delta")
# Newest first, 20 rows, without truncating wide columns.
logs_df.orderBy(col("timestamp").desc()).show(20, truncate=False)


+-----------+----------------+-----------+-----------------------+---------------------------------------------------+--------------------------------------------------+----------+------------------+--------------+--------------+-------+--------------+-----------------+-------------------------------------------------------------------------------+----------------------------------------------+---------------------+---------------------------------------------------------------------+-----------+-----------+--------------------------+----------+----------------------------------------------------------------------------+-------------------------------------+
|pipeline_id|pipeline_name   |environment|run_id                 |run_name                                           |task_id                                           |step_index|step_type         |parent_task_id|attempt_number|status |rows_processed|execution_time_ms|source_path                                                    