In [1]:
from pyspark.sql.types import StructType, StructField, StringType, DoubleType
from pyspark.sql.functions import create_map, lit

In [None]:
# Declare the notebook's text widgets (added for ADF), each with its default
# value. Only `catalog` has a non-empty default.
for widget_name, default_value in (
    ("pipeline_id", ""),
    ("run_id", ""),
    ("task_id", ""),
    ("processed_timestamp", ""),
    ("catalog", "unikargo_dev"),
):
    dbutils.widgets.text(widget_name, default_value)

In [None]:
# Read the current value of each widget into a variable of the same name.
_read_widget = dbutils.widgets.get

pipeline_id         = _read_widget("pipeline_id")
run_id              = _read_widget("run_id")
task_id             = _read_widget("task_id")
processed_timestamp = _read_widget("processed_timestamp")
catalog             = _read_widget("catalog")

In [2]:
# Explicit schema applied when reading the airports CSV. All seven columns are
# nullable: five string columns followed by two double columns.
# NOTE(review): in the sample output of df.show(5) the `airline` column holds
# values such as "Lehigh Valley Int..." and "Abilene Regional ...", which look
# like airport names — confirm the column name is intended.
airports_schema = (
    StructType()
    .add("iata_code", StringType(), True)
    .add("airline", StringType(), True)
    .add("city", StringType(), True)
    .add("state", StringType(), True)
    .add("country", StringType(), True)
    .add("latitude", DoubleType(), True)
    .add("longitude", DoubleType(), True)
)

In [None]:
# Read the airports CSV using the explicit schema, with the "header" option
# set to "true".
# Alternative source kept for reference (not used):
#   f"/Volumes/{catalog}/00_raw/source_unicargo_data/airports.csv"
# NOTE(review): the ABFSS path below (added for ADF) is a fixed literal and
# does not use `catalog` — confirm this is intended.
airports_csv_path = "abfss://medallion@adlsunikarrgodev.dfs.core.windows.net/raw/volumes/airports.csv"

df = (
    spark.read
    .schema(airports_schema)
    .option("header", "true")
    .csv(airports_csv_path)
)

 

In [5]:
# Print the first five rows as a quick visual check of the parsed columns.
df.show(n=5)

+---------+--------------------+-----------+-----+-------+--------+----------+
|iata_code|             airline|       city|state|country|latitude| longitude|
+---------+--------------------+-----------+-----+-------+--------+----------+
|      ABE|Lehigh Valley Int...|  Allentown|   PA|    USA|40.65236|  -75.4404|
|      ABI|Abilene Regional ...|    Abilene|   TX|    USA|32.41132|  -99.6819|
|      ABQ|Albuquerque Inter...|Albuquerque|   NM|    USA|35.04022|-106.60919|
|      ABR|Aberdeen Regional...|   Aberdeen|   SD|    USA|45.44906| -98.42183|
|      ABY|Southwest Georgia...|     Albany|   GA|    USA|31.53552| -84.19447|
+---------+--------------------+-----------+-----+-------+--------+----------+
only showing top 5 rows


In [None]:
# Add a `metadata` map column holding the widget values under fixed keys.
# create_map takes alternating key/value columns, so each (key, value) pair is
# flattened into two consecutive literal columns, in the order listed.
df = df.withColumn(
    "metadata",
    create_map(*[
        lit(item)
        for entry in (
            ("pipeline_id", pipeline_id),
            ("run_id", run_id),
            ("task_id", task_id),
            ("processed_timestamp", processed_timestamp),
        )
        for item in entry
    ]),
)

In [None]:
# Save the DataFrame as a Delta table in the `01_bronze` schema of the selected
# catalog. Write mode is "overwrite" and the "overwriteSchema" option is set to
# "true".
(
    df.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable(f"`{catalog}`.`01_bronze`.`unikargo_airports_bronze`")
)
