In [15]:
import os
import sys
from datetime import datetime


from pyspark.sql.types import StructType, StructField, StringType, TimestampType, LongType
from pyspark.sql.functions import create_map, lit, current_timestamp
from pyspark.sql.utils import AnalysisException

In [16]:
# Make the project's `src` package directory importable from this notebook.
# The path is resolved three levels above the current working directory
# (assumes the kernel's working directory is the notebook's own folder - TODO confirm).
current_dir = os.getcwd()
project_root = os.path.abspath(os.path.join(current_dir, '..', '..', '..', 'src'))

# Re-running this cell must not keep appending the same entry to sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)

# Project-local helpers; these imports only resolve once `src` is on sys.path.
from logging_utils.logger import log_pipeline_event, log_task_event
from unikargo_utils import add_pipeline_metadata

In [17]:
# Explicit schema for the airlines CSV: two nullable string columns,
# declared in the order they are applied to the file.
airlines_schema = StructType([
    StructField(column_name, StringType(), True)
    for column_name in ("iata_code", "airline")
])

In [18]:
# Declare the notebook parameters as text widgets (required for the
# ADF -> Databricks integration). Each entry is (widget name, default value).
widget_defaults = (
    ("pipeline_id", ""),
    ("run_id", ""),
    ("task_id", ""),
    ("processed_timestamp", ""),
    ("catalog", "unikargo_dev"),  # Requires an ADF variable for ADF runs
)
for widget_name, default_value in widget_defaults:
    dbutils.widgets.text(widget_name, default_value)

# Read the current widget values into notebook-level variables.
pipeline_id = dbutils.widgets.get("pipeline_id")
run_id = dbutils.widgets.get("run_id")
task_id = dbutils.widgets.get("task_id")
processed_timestamp = dbutils.widgets.get("processed_timestamp")
catalog = dbutils.widgets.get("catalog")

In [19]:
# Run context: the pipeline name is passed to the log_task_event calls in the
# task cells; the environment label is kept alongside it.
environment = "dev"
pipeline_name = "airlines_ingestion"

# Initial run-level bookkeeping: no rows processed yet, empty message,
# optimistic status, and the wall-clock time at which this cell ran.
rows_processed = 0
message = ""
status = "SUCCESS"
start_time = datetime.now()

In [20]:
# --- Task 1: Read ---
# Load the raw airlines CSV with the predefined schema, count its rows and
# record the outcome through log_task_event. Any failure (including a failure
# of the SUCCESS log call itself) is logged as FAILED and then re-raised.
try:
    airlines_df = (
        spark.read
        .schema(airlines_schema)    # predefined schema, no type inference
        .option("header", "true")   # treat the first line as a header row, not data
        .csv("abfss://medallion@adlsunikarrgodev.dfs.core.windows.net/raw/volumes/airlines.csv")  # added for adf
    )
    rows_processed = airlines_df.count()

    # Log SUCCESS
    log_task_event(
        status="SUCCESS",
        rows=rows_processed,
        message="Airlines data read successfully",
        pipeline_name=pipeline_name,
    )

except Exception as e:
    # Log FAILURE under the same pipeline name as the success path.
    # Logging is best-effort: whatever goes wrong while logging is printed,
    # never allowed to replace the original error raised above.
    try:
        log_task_event(
            status="FAILED",
            message=str(e),
            pipeline_name=pipeline_name,
        )
    except Exception as log_e:
        print(f"Failed to log task event: {log_e}")
    raise  # re-raise original error


In [21]:
from pyspark.sql.functions import col

LOG_PATH_TASK = "abfss://medallion@adlsunikarrgodev.dfs.core.windows.net/logs/task_logs"

# Best-effort diagnostic: show the 10 most recent task-log rows (newest first).
# A failure here must not stop the notebook, but the real error is printed so
# that e.g. an access or schema problem is not mistaken for "no logs yet".
try:
    logs_df = spark.read.format("delta").load(LOG_PATH_TASK)
    logs_df.orderBy(col("timestamp").desc()).show(10, truncate=False)
except Exception as exc:
    print(
        f"Could not read task logs at {LOG_PATH_TASK} "
        f"(none written yet?): {type(exc).__name__}: {exc}"
    )


+------------------------------------+--------------------+------------+---------------------+-----------+-------+----+----------------------------------+--------------------------+
|pipeline_id                         |pipeline_name       |environment |run_id               |task_id    |status |rows|message                           |timestamp                 |
+------------------------------------+--------------------+------------+---------------------+-----------+-------+----+----------------------------------+--------------------------+
|7c95a047-5673-42e1-95f3-f3eb01fbaab7|airlines_ingestion  |unikargo_dev|manual_run_1756729892|manual_task|SUCCESS|14  |Airlines data read successfully   |2025-09-01 12:31:32.996132|
|dafb2265-e81a-49ee-a42e-412d67fcbe71|manual_pipeline_name|unikargo_dev|manual_run_1756729773|manual_task|SUCCESS|14  |Airlines data written successfully|2025-09-01 12:29:34.483091|
|ffcb778e-d60e-4b72-a6cb-9bb7c5683208|manual_pipeline_name|unikargo_dev|manual_run_1756729

In [None]:
# --- Task 2: Transform ---
# Attach the pipeline, run and task identifiers to the dataframe through
# add_pipeline_metadata (presumably as a metadata column - confirm against the
# helper), count the rows and record the outcome through log_task_event.
try:
    airlines_df = add_pipeline_metadata(airlines_df, pipeline_id, run_id, task_id)

    # Count rows after transformation
    rows_processed = airlines_df.count()

    # Log SUCCESS
    log_task_event(
        status="SUCCESS",
        rows=rows_processed,
        message="Metadata column added successfully",
        pipeline_name=pipeline_name,
    )

except Exception as e:
    # Log FAILURE with the pipeline *name* (not the pipeline id), matching the
    # success path. Logging is best-effort: an error raised while logging is
    # printed so it cannot replace the original transform error.
    try:
        log_task_event(
            status="FAILED",
            message=str(e),
            pipeline_name=pipeline_name,
        )
    except Exception as log_e:
        print(f"Failed to log task event: {log_e}")
    raise  # re-raise original error

In [23]:
from pyspark.sql.functions import col

LOG_PATH_TASK = "abfss://medallion@adlsunikarrgodev.dfs.core.windows.net/logs/task_logs"

# Best-effort diagnostic: show the 10 most recent task-log rows (newest first).
# A failure here must not stop the notebook, but the real error is printed so
# that e.g. an access or schema problem is not mistaken for "no logs yet".
try:
    logs_df = spark.read.format("delta").load(LOG_PATH_TASK)
    logs_df.orderBy(col("timestamp").desc()).show(10, truncate=False)
except Exception as exc:
    print(
        f"Could not read task logs at {LOG_PATH_TASK} "
        f"(none written yet?): {type(exc).__name__}: {exc}"
    )

+------------------------------------+--------------------+------------+---------------------+-----------+-------+----+----------------------------------+--------------------------+
|pipeline_id                         |pipeline_name       |environment |run_id               |task_id    |status |rows|message                           |timestamp                 |
+------------------------------------+--------------------+------------+---------------------+-----------+-------+----+----------------------------------+--------------------------+
|1fe5a411-3ece-4011-bc5f-17b37db9d50a|airlines_ingestion  |unikargo_dev|manual_run_1756729894|manual_task|SUCCESS|14  |Metadata column added successfully|2025-09-01 12:31:35.29408 |
|7c95a047-5673-42e1-95f3-f3eb01fbaab7|airlines_ingestion  |unikargo_dev|manual_run_1756729892|manual_task|SUCCESS|14  |Airlines data read successfully   |2025-09-01 12:31:32.996132|
|dafb2265-e81a-49ee-a42e-412d67fcbe71|manual_pipeline_name|unikargo_dev|manual_run_1756729

In [24]:
# # --- Task 2: Transform
# airlines_df = airlines_df.withColumn("metadata",
#                    create_map(
#                        lit("pipeline_id"), lit(pipeline_id),
#                        lit("run_id"), lit(run_id),
#                        lit("task_id"), lit(task_id),
#                        lit("processed_timestamp"), lit(processed_timestamp),
#                    ))

# # df.show(5)
# rows_processed = airlines_df.count()

In [25]:
# --- Task 3: Write ---
# Overwrite the bronze airlines table in the catalog selected by the `catalog`
# widget and record the outcome through log_task_event. Any failure is logged
# as FAILED and then re-raised.
try:
    # Count rows first so the figure can be reported in the SUCCESS log entry
    rows_processed = airlines_df.count()

    # Overwrite the table, replacing its schema as well as its data
    (
        airlines_df.write
        .mode("overwrite")
        .option("overwriteSchema", "true")
        .saveAsTable(f"`{catalog}`.`01_bronze`.`unikargo_airlines_bronze`")
    )

    # Log SUCCESS
    log_task_event(
        status="SUCCESS",
        rows=rows_processed,
        message="Airlines data written successfully",
        pipeline_name=pipeline_name,
    )

except Exception as e:
    # Log FAILURE under the same pipeline name as the success path.
    # Logging is best-effort: whatever goes wrong while logging is printed,
    # never allowed to replace the original error raised above.
    try:
        log_task_event(
            status="FAILED",
            message=str(e),
            pipeline_name=pipeline_name,
        )
    except Exception as log_e:
        print(f"Failed to log task event: {log_e}")
    raise  # re-raise original error


In [26]:
# --- Task 3: Write


# airlines_df.write.\
# mode("overwrite").\
# option("overwriteSchema", "true").\
# saveAsTable(f"`{catalog}`.`01_bronze`.`unikargo_airlines_bronze`") 


# log_task_event(pipeline_id, run_id, "write_bronze", "SUCCESS", rows=df.count())

In [27]:
# end_time = datetime.now()
# log_pipeline_event(
#     pipeline_name=pipeline_name,
#     environment=environment,
#     run_id=run_id,
#     status=status,
#     start_time=start_time,
#     end_time=end_time,
#     rows_processed=rows_processed,
#     message=message
# )

In [28]:
from pyspark.sql.functions import col

LOG_PATH_TASK = "abfss://medallion@adlsunikarrgodev.dfs.core.windows.net/logs/task_logs"

# Best-effort diagnostic: show the 10 most recent task-log rows (newest first).
# A failure here must not stop the notebook, but the real error is printed so
# that e.g. an access or schema problem is not mistaken for "no logs yet".
try:
    logs_df = spark.read.format("delta").load(LOG_PATH_TASK)
    logs_df.orderBy(col("timestamp").desc()).show(10, truncate=False)
except Exception as exc:
    print(
        f"Could not read task logs at {LOG_PATH_TASK} "
        f"(none written yet?): {type(exc).__name__}: {exc}"
    )

+------------------------------------+--------------------+------------+---------------------+-----------+-------+----+----------------------------------+--------------------------+
|pipeline_id                         |pipeline_name       |environment |run_id               |task_id    |status |rows|message                           |timestamp                 |
+------------------------------------+--------------------+------------+---------------------+-----------+-------+----+----------------------------------+--------------------------+
|c203e7fc-0ec1-4043-a1ad-1ed8caebb5a8|airlines_ingestion  |unikargo_dev|manual_run_1756729898|manual_task|SUCCESS|14  |Airlines data written successfully|2025-09-01 12:31:38.926468|
|1fe5a411-3ece-4011-bc5f-17b37db9d50a|airlines_ingestion  |unikargo_dev|manual_run_1756729894|manual_task|SUCCESS|14  |Metadata column added successfully|2025-09-01 12:31:35.29408 |
|7c95a047-5673-42e1-95f3-f3eb01fbaab7|airlines_ingestion  |unikargo_dev|manual_run_1756729