In [1]:
# Databricks Notebook: silver_processing.py
import os
import sys
from pyspark.sql.window import Window
from pyspark.sql.functions import col, row_number

# Make the project's `src` directory importable so the project-local modules
# imported right after this block (logging_utils, config) can be resolved.
current_dir = os.getcwd()  # Current working directory of the notebook kernel
# Go up 3 levels from the working directory and append 'src'
project_root = os.path.abspath(os.path.join(current_dir, '..', '..', '..', 'src'))
# Guard the append: re-running this cell must not keep adding the same
# directory to sys.path over and over.
if project_root not in sys.path:
    sys.path.append(project_root)
from logging_utils import TaskLogger
from config import get_log_config

Loaded tables config from: C:\Users\Dele\Documents\D. Professional Registration\IT\DATA-EnGR\00_data_engr_projects\unicargo\unicargo_dab\tables.yaml


In [2]:
# Run-level settings shared by the cells below.
pipeline_name = "pl_unikargo_medallion"  # passed to every TaskLogger in this notebook
log_type = "task"                        # first argument to get_log_config()
environment = "dev"                      # forwarded to get_log_config(environment=...)

In [3]:
# ------------------------------------------------------------
# Step 1: read the airlines data from the bronze table
# ------------------------------------------------------------

# Fully qualified name of the table this step reads.
source_path = "unikargo_dev.01_bronze.unikargo_airlines_bronze"
operation = "tsk_airlines_read_bronze"

# The read and the row count both run inside the TaskLogger context manager;
# the row count is handed to the logger through set_metrics() before the
# `with` block exits.
with TaskLogger(
    pipeline_name=pipeline_name,
    operation=operation,
    source_path=source_path,
    log_running=False,  # keep this False unless you explicitly want a "RUNNING" entry
) as logger:
    airlines_df = spark.read.table(source_path)

    # count() is a Spark action, so this is where the table is actually scanned.
    rows_processed = airlines_df.count()
    logger.set_metrics(rows=rows_processed)

In [4]:
# -----------------------------
# --- Task 2: Deduplicate & Generate Airline Dimension
# -----------------------------

operation = "tsk_airlines_dim_build"

with TaskLogger(
    operation=operation,
    pipeline_name=pipeline_name,
    log_running=False
) as logger:

    # Build the airline dimension from the bronze frame:
    #   1. drop rows without an IATA code,
    #   2. keep one row per IATA code,
    #   3. assign a sequential surrogate key (airline_sk) ordered by IATA code.
    # NOTE: Window.orderBy() without partitionBy() makes Spark move all rows
    # into a single partition to number them.
    # NOTE(review): dropDuplicates keeps an arbitrary row per iata_code, so if
    # the same code appears with different `airline` values the retained name
    # is not guaranteed to be stable between runs - confirm this is acceptable.
    dim_airline = (
        airlines_df
        .filter(col("iata_code").isNotNull())
        .dropDuplicates(["iata_code"])
        .select("iata_code", "airline")
        .withColumn("airline_sk", row_number().over(Window.orderBy("iata_code")))
        .select("airline_sk", "iata_code", "airline")
    )

    # Report the size of the dimension this task produced. The previous code
    # counted airlines_df (the raw bronze input), so the logged metric ignored
    # the null filter and the de-duplication.
    rows_processed = dim_airline.count()

    # Update metrics before completion
    logger.set_metrics(rows=rows_processed)

In [5]:
# -----------------------------
# Write the Delta table
# -----------------------------
operation = "tsk_airlines_persist_silver"
# Back-tick quoted catalog.schema.table name of the silver dimension table.
target_path = "`unikargo_dev`.`02_silver`.`unikargo_dim_airline_silver`"

# NOTE(review): target_path is not passed to TaskLogger here, although the read
# step passes source_path - confirm whether the logger accepts a target path
# and, if so, supply it so the log entry records where the data was written.
with TaskLogger(
    operation=operation,
    pipeline_name=pipeline_name,
    log_running=False
) as logger:

    # Count the rows of the frame that is actually persisted. The previous
    # code counted airlines_df (the raw bronze input), which does not match
    # what ends up in the silver table.
    rows_processed = dim_airline.count()

    # Replace the table contents; overwriteSchema lets the stored schema be
    # replaced together with the data.
    dim_airline.write \
        .format("delta") \
        .mode("overwrite") \
        .option("overwriteSchema", "true") \
        .saveAsTable(target_path)

    # Update metrics before completion
    logger.set_metrics(rows=rows_processed)



In [6]:
# -------- Debug aid: read the Delta task log and show the latest entries --------
# Location of the task log, resolved from the log type and the environment.
log_path = get_log_config(log_type, environment=environment)

logs_df = spark.read.format("delta").load(log_path)

# Newest entries first; print 20 rows without truncating column values.
logs_df.orderBy("timestamp", ascending=False).show(n=20, truncate=False)

+------------------------------------+---------------------+-----------+-----------------------+------------------------------------+------------------------------+-------+----+-----------------+--------------------------------------------------------------------------------+-----------------------------------------------------+----------+-------------+--------------------------+----------+
|pipeline_id                         |pipeline_name        |environment|run_id                 |task_id                             |operation                     |status |rows|execution_time_ms|source_path                                                                     |target_path                                          |error_type|error_message|timestamp                 |log_date  |
+------------------------------------+---------------------+-----------+-----------------------+------------------------------------+------------------------------+-------+----+-----------------+-----------------