In [None]:
import os
import sys
from datetime import datetime


from pyspark.sql.types import StructType, StructField, IntegerType, StringType
from pyspark.sql.utils import AnalysisException
from pyspark.sql.functions import create_map, lit
from pyspark.sql.functions import col

# Make the project's `src` directory importable from this notebook.
# The notebook is assumed to live three directory levels below the folder
# that contains `src` (relative to the current working directory).
current_dir = os.getcwd()
project_root = os.path.abspath(os.path.join(current_dir, '..', '..', '..', 'src'))

# Guard the append so re-running this cell does not keep adding duplicate
# entries to sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)

from logging_utils.logger import log_task_status
from unikargo_utils import add_pipeline_metadata

In [20]:


# Notebook parameters exposed as text widgets (widget name -> default value).
# Declaration order is preserved because dicts keep insertion order.
_WIDGET_DEFAULTS = {
    "pipeline_id": "",
    "run_id": "",
    "task_id": "",
    "processed_timestamp": "",
    "catalog": "unikargo_dev",
}
for _widget_name, _widget_default in _WIDGET_DEFAULTS.items():
    dbutils.widgets.text(_widget_name, _widget_default)

# Read the current widget values back, in the same order they were declared.
pipeline_id, run_id, task_id, processed_timestamp, catalog = (
    dbutils.widgets.get(_widget_name) for _widget_name in _WIDGET_DEFAULTS
)

# Run-context values used for logging. `rows_processed` is only an initial
# value here; later cells overwrite it with real row counts.
pipeline_name = "flights_ingestion"
environment = "dev"
start_time = datetime.now()
status, message, rows_processed = "SUCCESS", "", 0

# Location where task-level log records are stored.
LOG_PATH_TASK = "abfss://medallion@adlsunikarrgodev.dfs.core.windows.net/logs/task_logs"

In [21]:
# Explicit schema for the raw flights CSV, declared as (column name, Spark type)
# pairs in file order. Every column is nullable.
_FLIGHTS_COLUMNS = [
    ("year", IntegerType),
    ("month", IntegerType),
    ("day", IntegerType),
    ("day_of_week", IntegerType),
    ("airline", StringType),
    ("flight_number", IntegerType),
    ("tail_number", StringType),
    ("origin_airport", StringType),
    ("destination_airport", StringType),
    ("scheduled_departure", IntegerType),
    ("departure_time", IntegerType),
    ("departure_delay", IntegerType),
    ("taxi_out", IntegerType),
    ("wheels_off", IntegerType),
    ("scheduled_time", IntegerType),
    ("elapsed_time", IntegerType),
    ("air_time", IntegerType),
    ("distance", IntegerType),
    ("wheels_on", IntegerType),
    ("taxi_in", IntegerType),
    ("scheduled_arrival", IntegerType),
    ("arrival_time", IntegerType),
    ("arrival_delay", IntegerType),
    ("diverted", IntegerType),
    ("cancelled", IntegerType),
    ("cancellation_reason", StringType),
    ("air_system_delay", IntegerType),
    ("security_delay", IntegerType),
    ("airline_delay", IntegerType),
    ("late_aircraft_delay", IntegerType),
    ("weather_delay", IntegerType),
]

flights_schema = StructType(
    [
        StructField(_column_name, _column_type(), True)
        for _column_name, _column_type in _FLIGHTS_COLUMNS
    ]
)

In [None]:
# Read the raw flights CSV with the explicit schema and log the outcome.
try:
    flights_df = (
        spark.read
        .schema(flights_schema)
        .option("header", "true")
        # .csv(f"/Volumes/{catalog}/00_raw/source_unicargo_data/flights.csv")
        .csv("abfss://medallion@adlsunikarrgodev.dfs.core.windows.net/raw/volumes/flights.csv")  # added for adf
    )

    # count() is an action, so it forces the file to be read here and surfaces
    # read errors inside this try block.
    rows_processed = flights_df.count()

    # Log SUCCESS
    log_task_status(
        status="SUCCESS",
        rows=rows_processed,
        message="Flights data read successfully",
        pipeline_name=pipeline_name,
    )

except Exception as e:
    # Log FAILURE with the same pipeline name as the success record.
    try:
        log_task_status(
            status="FAILED",
            message=str(e),
            pipeline_name=pipeline_name,
        )
    except Exception as log_e:
        # Catch any logging error (not only AnalysisException) so that a
        # failure to log can never replace the original error raised below.
        print(f"Failed to log task event: {log_e}")
    raise  # re-raise original error




In [None]:
# Attach pipeline metadata to the flights data and log the outcome.
try:
    # Rebind `flights_df` to the enriched frame so that the row count below and
    # the later write cell operate on the data that carries the metadata.
    # (Previously the result was stored under an unused name and discarded.)
    # Presumably add_pipeline_metadata returns a new DataFrame - TODO confirm.
    flights_df = add_pipeline_metadata(flights_df, pipeline_id, run_id, task_id)

    # Count rows after transformation
    rows_processed = flights_df.count()

    # Log SUCCESS
    log_task_status(
        status="SUCCESS",
        rows=rows_processed,
        message="Metadata column added successfully",
        pipeline_name=pipeline_name,
    )
except Exception as e:
    # Log FAILURE under the pipeline *name* (not the pipeline id), matching
    # the success record.
    try:
        log_task_status(
            status="FAILED",
            message=str(e),
            pipeline_name=pipeline_name,
        )
    except Exception as log_e:
        # A failure to log must not replace the original error raised below.
        print(f"Failed to log task event: {log_e}")
    raise  # re-raise original error


# df = df.withColumn("metadata",
#                    create_map(
#                        lit("pipeline_id"), lit(pipeline_id),
#                        lit("run_id"), lit(run_id),
#                        lit("task_id"), lit(task_id),
#                        lit("processed_timestamp"), lit(processed_timestamp),
#                    ))

In [None]:
# Write the flights data to the bronze table and log the outcome.
try:
    # Count rows first so the number is available for the success log.
    rows_processed = flights_df.count()

    # Replace the table contents; overwriteSchema allows the table schema to be
    # replaced along with the data.
    (
        flights_df.write
        .mode("overwrite")
        .option("overwriteSchema", "true")
        .saveAsTable(f"`{catalog}`.`01_bronze`.`unikargo_flights_bronze`")
    )

    # Log SUCCESS
    log_task_status(
        status="SUCCESS",
        rows=rows_processed,
        message="Flights data written successfully",
        pipeline_name=pipeline_name,
    )

except Exception as e:
    # Log FAILURE with the same pipeline name as the success record.
    try:
        log_task_status(
            status="FAILED",
            message=str(e),
            pipeline_name=pipeline_name,
        )
    except Exception as log_e:
        # Catch any logging error (not only AnalysisException) so that a
        # failure to log can never replace the original error raised below.
        print(f"Failed to log task event: {log_e}")
    raise  # re-raise original error


In [25]:
# Debug helper: show the ten most recent task-log records.
try:
    logs_df = spark.read.format("delta").load(LOG_PATH_TASK)
    logs_df.orderBy(col("timestamp").desc()).show(10, truncate=False)
except Exception as e:
    # Best-effort: this cell must not fail the notebook. Print the underlying
    # error too, so that other problems (permissions, a missing column, ...)
    # are not mistaken for "no logs yet".
    print(f"No task logs found yet at {LOG_PATH_TASK}")
    print(f"Reason: {type(e).__name__}: {e}")

+------------------------------------+------------------+------------+--------------------+----------+-------+-------+----------------------------------+--------------------------+
|pipeline_id                         |pipeline_name     |environment |run_id              |task_id   |status |rows   |message                           |timestamp                 |
+------------------------------------+------------------+------------+--------------------+----------+-------+-------+----------------------------------+--------------------------+
|859730bb-7fa0-4876-909f-91701faadfde|flights_ingestion |unikargo_dev|local_run_1757018668|local_test|SUCCESS|5819079|Flights data written successfully |2025-09-04 20:44:28.772976|
|04d5e861-639b-4970-9fcc-d1ed92f234b2|flights_ingestion |unikargo_dev|local_run_1757018634|local_test|SUCCESS|5819079|Metadata column added successfully|2025-09-04 20:43:54.751967|
|dea48e03-b9d6-49ed-9d96-c23b825748d2|flights_ingestion |unikargo_dev|local_run_1757018629|loca