![./ImageLab.png](./Images/ImageLab.png "./ImageLab.png")

# Data Engineering with Lakeflow, Jobs, AutoLoader and more

## Catalog and schema creation

In [0]:
# Base storage path for this notebook, supplied through the "param_location" widget.
# The widget is read once so that every derived path shares exactly the same root.
LOCATION_VAR = dbutils.widgets.get("param_location")
# Passed to Auto Loader as the root for `cloudFiles.schemaLocation`.
SCHEMA_LOCATION = f"{LOCATION_VAR}/schema"
# Passed to the streaming writers as the root for `checkpointLocation`.
CHECKPOINT_LOCATION = f"{LOCATION_VAR}/checkpoint"
from pyspark.sql import functions as F

In [0]:
# The catalog's managed storage root sits under the base path in LOCATION_VAR.
catalog_root = f"{LOCATION_VAR}/catalogautoloader"
spark.sql(f"CREATE CATALOG IF NOT EXISTS medallion_autoloader MANAGED LOCATION '{catalog_root}'")

# One schema per medallion layer, each with its own managed location under the catalog root.
for layer in ("bronze", "silver", "gold"):
    spark.sql(
        f"CREATE DATABASE IF NOT EXISTS medallion_autoloader.{layer} "
        f"MANAGED LOCATION '{catalog_root}/{layer}'"
    )

## Load Table Data

Since this is an incremental streaming use case, we first need to load the data. For this scenario, we fetch the data from object storage, now relying on Auto Loader's checkpointing and on Delta Lake's Change Data Feed (CDF) to process it incrementally.

Customer Table

In [0]:
# Ingest the customer CSV files into the bronze table with Auto Loader.

from pyspark.sql.types import StructType, StructField, StringType

# Explicit schema for the customer files: every column is declared as a nullable string.
csv_schema = StructType([
    StructField("customer_bk", StringType(), True),
    StructField("customer_name", StringType(), True),
    StructField("birth_date", StringType(), True),
    StructField("segment", StringType(), True),
    StructField("region", StringType(), True),
    StructField("effective_ts", StringType(), True)
])

# Streaming source over the customer folder. Besides the CSV columns, each row
# also carries the path and modification time of the file it was read from.
df_raw_customer = (
    spark.readStream
    .format("cloudFiles")
    .option("cloudFiles.format", "csv")
    .option("cloudFiles.schemaLocation", SCHEMA_LOCATION + "/customer")
    .schema(csv_schema)
    .option("header", "true")
    .load(f"{LOCATION_VAR}/data_medallion/customer")
    .selectExpr(
        "*",
        "_metadata.file_path as file_path",
        "_metadata.file_modification_time as file_mod_time",
    )
)

# The checkpoint lives in a per-table sub-folder, mirroring the schema location
# above, so this query never shares checkpoint state with another stream.
# NOTE(review): if this notebook was already run against the bare
# CHECKPOINT_LOCATION, the new path starts ingestion from scratch -- reset or
# migrate the bronze table accordingly.
customer_bronze_query = (
    df_raw_customer.writeStream
    .option("checkpointLocation", CHECKPOINT_LOCATION + "/customer")
    .trigger(availableNow=True)
    .toTable("medallion_autoloader.bronze.dim_customer")
)

# toTable() only starts the query and returns immediately. Block until the
# availableNow run has finished so the statements below see a fully loaded table.
customer_bronze_query.awaitTermination()

# NOTE(review): the section text mentions relying on CDF, yet the change data
# feed is switched off for this table -- confirm this is intended for bronze.
spark.sql("""ALTER TABLE medallion_autoloader.bronze.dim_customer
SET TBLPROPERTIES (delta.enableChangeDataFeed = false)""")

display(spark.table("medallion_autoloader.bronze.dim_customer"))