In [None]:
import os
import sys
from datetime import datetime

from pyspark.sql.types import StructType, StructField, StringType, DoubleType
from pyspark.sql.utils import AnalysisException
from pyspark.sql.functions import col
# from pyspark.sql.functions import create_map, lit


# Make the project's 'src' directory importable so the local packages
# imported further down in this cell can be resolved.
# NOTE(review): the location is derived from the kernel's working directory,
# so this assumes the notebook runs from its own folder — confirm for ADF runs.
current_dir = os.getcwd()
# Three levels up from the working directory, then into 'src'.
project_root = os.path.abspath(os.path.join(current_dir, '..', '..', '..', 'src'))
# Guard the append so re-running this cell does not stack duplicate entries.
if project_root not in sys.path:
    sys.path.append(project_root)

from logging_utils.logger import log_task_status
from unikargo_utils import add_pipeline_metadata

In [14]:
# Run parameters (added for ADF): every parameter is exposed as a text widget
# and read back right after it is declared. All defaults are empty except
# `catalog`, which falls back to "unikargo_dev".
dbutils.widgets.text("pipeline_id", "")
pipeline_id = dbutils.widgets.get("pipeline_id")

dbutils.widgets.text("run_id", "")
run_id = dbutils.widgets.get("run_id")

dbutils.widgets.text("task_id", "")
task_id = dbutils.widgets.get("task_id")

dbutils.widgets.text("processed_timestamp", "")
processed_timestamp = dbutils.widgets.get("processed_timestamp")

dbutils.widgets.text("catalog", "unikargo_dev")
catalog = dbutils.widgets.get("catalog")


# Static run context used when logging task status.
pipeline_name = "airports_ingestion"
environment = "dev"

# Initial run state; `rows_processed` is overwritten by each task cell.
# NOTE(review): `environment`, `start_time`, `status` and `message` are not
# referenced by the other cells visible in this notebook — confirm whether
# they are still needed.
status, message, rows_processed = "SUCCESS", "", 0
start_time = datetime.now()

# Delta location holding the task-level log records.
LOG_PATH_TASK = "abfss://medallion@adlsunikarrgodev.dfs.core.windows.net/logs/task_logs"

In [16]:
# Explicit schema applied when reading airports.csv: five string columns
# followed by two double-precision coordinates, all nullable.
# NOTE(review): the second column is called "airline" although this is the
# airports dataset — confirm the name is intended before relying on it.
airports_schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("iata_code", StringType()),
        ("airline", StringType()),
        ("city", StringType()),
        ("state", StringType()),
        ("country", StringType()),
        ("latitude", DoubleType()),
        ("longitude", DoubleType()),
    )
])

In [None]:
SHOW VOLUMES;


In [None]:
# --- Task 1: Read the raw airports CSV
#
# ADF cannot read from external Unity Catalog volumes, so as a workaround the
# file is read straight from the ADLS location behind the registered volume
# (volume 806d999a-a9fd-4bef-aa04-f1ee2b077888, mapped to
# abfss://medallion@adlsunikarrgodev.dfs.core.windows.net/raw/volumes).
# Volume path this replaces:
#   /Volumes/{catalog}/00_raw/source_unicargo_data/airports.csv
try:
    airports_df = (
        spark.read
        .schema(airports_schema)
        .option("header", "true")
        .csv("abfss://medallion@adlsunikarrgodev.dfs.core.windows.net/raw/volumes/airports.csv")
    )

    # count() is a Spark action, so the file is actually scanned here and the
    # resulting row count is what gets logged for this task.
    rows_processed = airports_df.count()

    log_task_status(
        status="SUCCESS",
        rows=rows_processed,
        message="Airports data read successfully",
        pipeline_name=pipeline_name,
    )

except Exception as e:
    # Best-effort failure log. It carries the same pipeline_name as the
    # SUCCESS record, and any error raised while logging is only printed so it
    # can never replace the original failure re-raised below.
    try:
        log_task_status(
            status="FAILED",
            message=str(e),
            pipeline_name=pipeline_name,
        )
    except Exception as log_e:
        print(f"Failed to log task event: {log_e}")
    raise  # re-raise the original error so the run is marked as failed
    

In [None]:
# --- Task 2: Add metadata to the dataframe
# (pipeline identifier, run identifier and task identifier)
try:
    airports_df = add_pipeline_metadata(airports_df, pipeline_id, run_id, task_id)

    # Count rows after the transformation; this is the figure that gets logged.
    rows_processed = airports_df.count()

    log_task_status(
        status="SUCCESS",
        rows=rows_processed,
        message="Metadata column added successfully",
        pipeline_name=pipeline_name,
    )

except Exception as e:
    # Best-effort failure log. The record is tagged with pipeline_name (the
    # pipeline's name, not its id) so it matches the SUCCESS record, and any
    # error raised while logging is only printed so it can never replace the
    # original failure re-raised below.
    try:
        log_task_status(
            status="FAILED",
            message=str(e),
            pipeline_name=pipeline_name,
        )
    except Exception as log_e:
        print(f"Failed to log task event: {log_e}")
    raise  # re-raise the original error so the run is marked as failed


In [None]:
# --- Task 3: Write to bronze
try:
    # Count before writing so the SUCCESS record reports the rows handed to
    # the writer.
    rows_processed = airports_df.count()

    # Replace the bronze table on every run, including its schema.
    (
        airports_df.write
        .format("delta")
        .mode("overwrite")
        .option("overwriteSchema", "true")
        .saveAsTable(f"`{catalog}`.`01_bronze`.`unikargo_airports_bronze`")
    )

    log_task_status(
        status="SUCCESS",
        rows=rows_processed,
        message="Airports data written successfully",
        pipeline_name=pipeline_name,
    )

except Exception as e:
    # Best-effort failure log. It carries the same pipeline_name as the
    # SUCCESS record, and any error raised while logging is only printed so it
    # can never replace the original failure re-raised below.
    try:
        log_task_status(
            status="FAILED",
            message=str(e),
            pipeline_name=pipeline_name,
        )
    except Exception as log_e:
        print(f"Failed to log task event: {log_e}")
    raise  # re-raise the original error so the run is marked as failed




In [21]:
# Debug helper: show the 10 most recent task-log records, newest first.
try:
    logs_df = spark.read.format("delta").load(LOG_PATH_TASK)
    logs_df.orderBy(col("timestamp").desc()).show(10, truncate=False)
except Exception as read_err:
    # Deliberately best-effort: this cell must not fail the notebook. The
    # underlying error is printed as well, because "no logs yet" is only one
    # of the reasons the read or the sort can fail.
    print(f"No task logs found yet at {LOG_PATH_TASK}")
    print(f"Underlying error: {type(read_err).__name__}: {read_err}")

+------------------------------------+------------------+------------+--------------------+----------+-------+----+----------------------------------+--------------------------+
|pipeline_id                         |pipeline_name     |environment |run_id              |task_id   |status |rows|message                           |timestamp                 |
+------------------------------------+------------------+------------+--------------------+----------+-------+----+----------------------------------+--------------------------+
|765f651e-3443-41e7-98a4-d40089f91d6a|airports_ingestion|unikargo_dev|local_run_1757018002|local_test|SUCCESS|322 |Airports data written successfully|2025-09-04 20:33:22.877883|
|57fb6ee3-b388-46bf-b5ac-041d706d9785|airports_ingestion|unikargo_dev|local_run_1757017990|local_test|SUCCESS|322 |Metadata column added successfully|2025-09-04 20:33:11.043569|
|fa409e29-3240-4cca-9db7-7aecaaf1e3f5|airports_ingestion|unikargo_dev|local_run_1757017982|local_test|SUCCESS|