Task 1
Overview: Explore the provided source data and load it into bronze layer
- There are 4 folders with csv data in:
s3://de-40-training-raw/final_exam_data/<customers/orders/order_items/products>
- Read the data and explore the provided datasets
- Create Notebook with Databricks autoloader to load the data into bronze
layer tables
o Header is included in the file
o Delimiter is comma
o Add control columns
▪ ingest_datetime – current datetime of the load
▪ batch_id – id of the autoloader process (incremental identifier
of each processed micro batch). Reference:
https://spark.apache.org/docs/latest/api/python/reference/pyspark.ss/api/pyspark.sql.streaming.DataStreamWriter.foreachBatch.html
o Keep all rows, no historization is required at this level.
o Target schema is your buddy group schema.
o Target table name template is:
<yourname>_bronze_<customers/orders/order_items/products>
Example: kirilovl_bronze_customers
o The bronze tables should be in delta format
o Notebook to generate test data (optional):
https://dbc-c1c28eda-2f09.cloud.databricks.com/editor/notebooks/348054849368280?o=372384865193042#command/5114878503435833

In [0]:
###################
##### Bronze ######
###################

from pyspark.sql.functions import current_timestamp, lit

# Base paths and target schema
base_path = "s3://de-40-training-raw/final_exam_data/"
schema_base_path = "s3://de-40-training-raw/output_data/buddy_group_1/"
target_schema = "de_pyspark_training_catalog.buddy_group_1"

# Source folders under base_path; each one is loaded into its own bronze table.
bronze_tables = ("customers", "orders", "order_items", "products")


def load_bronze_table(table_name):
    """Load one CSV source folder into its bronze Delta table with Auto Loader.

    Reads ``{base_path}{table_name}`` as a ``cloudFiles`` CSV stream (header
    row, comma delimiter), adds the ``ingest_datetime`` and ``batch_id``
    control columns, and appends every micro-batch to
    ``{target_schema}.amanolov_bronze_{table_name}_exam``. The call blocks
    until the ``availableNow`` run has finished.

    Args:
        table_name: Name of the source folder, which is also used as the
            suffix of the bronze table name (e.g. ``"customers"``).
    """
    source_path = f"{base_path}{table_name}"
    # Schema tracking, checkpoint and table data live side by side under one
    # per-table root directory.
    table_root = f"{schema_base_path}amanolov_bronze_{table_name}_exam"
    schema_location = f"{table_root}/schema"
    checkpoint_location = f"{table_root}/checkpoint"
    output_path = f"{table_root}/data"
    target_table = f"{target_schema}.amanolov_bronze_{table_name}_exam"

    df_stream = (
        spark.readStream.format("cloudFiles")
        .option("cloudFiles.format", "csv")
        .option("header", "true")
        .option("delimiter", ",")
        # Auto Loader reads its rate limit from the "cloudFiles."-prefixed
        # option; the un-prefixed "maxFilesPerTrigger" is not applied by the
        # cloudFiles source, so the 5-file limit never took effect.
        .option("cloudFiles.maxFilesPerTrigger", 5)
        .option("cloudFiles.schemaLocation", schema_location)
        .load(source_path)
        .withColumn("ingest_datetime", current_timestamp())
    )

    def write_batch(batch_df, batch_id):
        """Append one micro-batch to the bronze table, stamped with its id."""
        # NOTE(review): a plain append is not idempotent if a micro-batch is
        # retried after a failure; acceptable here since bronze keeps all rows.
        (
            batch_df.withColumn("batch_id", lit(batch_id))
            .write.format("delta")
            .mode("append")
            .option("mergeSchema", "true")
            .option("path", output_path)
            .saveAsTable(target_table)
        )

    # No .format(...) on the stream writer: foreachBatch is the sink, and the
    # Delta format is set on the batch writer inside write_batch.
    (
        df_stream.writeStream
        .option("checkpointLocation", checkpoint_location)
        .foreachBatch(write_batch)
        .trigger(availableNow=True)
        .start()
        .awaitTermination()
    )


# Run the loads one after another; each call waits for its stream to finish.
for table_name in bronze_tables:
    load_bronze_table(table_name)



In [0]:
%sql
-- Spot-check of the bronze customers table: returns every column, including
-- the _rescued_data, ingest_datetime and batch_id columns shown in the output below.
select * from de_pyspark_training_catalog.buddy_group_1.amanolov_bronze_customers_exam;

CUSTOMER_ID,CUST_FIRST_NAME,CUST_LAST_NAME,CUST_ADDRESS.COUNTRY_ID,CUST_ADDRESS.STATE_PROVINCE,CUST_ADDRESS.CITY,CUST_ADDRESS.POSTAL_CODE,CUST_ADDRESS.STREET_ADDRESS,PHONE_NUMBER,CUST_EMAIL,ACCOUNT_MGR_ID,DATE_OF_BIRTH,MARITAL_STATUS,GENDER,_rescued_data,ingest_datetime,batch_id
144,Sivaji,Landis,US,IA,Cedar Rapids,52401,221 3Rd Ave Se # 300,+1 319 123 4301,Sivaji.Landis@GOLDENEYE.EXAMPLE.COM,145,09-FEB-70,married,M,,2025-06-26T18:22:28.6Z,0
145,Mammutti,Pacino,US,WI,Eau Claire,54701,2120 Heights Dr,+1 745 123 4306,Mammutti.Pacino@GREBE.EXAMPLE.COM,145,19-FEB-46,single,M,,2025-06-26T18:22:28.6Z,0
146,Elia,Fawcett,US,WI,Milwaukee,53217,8989 N Port Washington Rd,+1 414 123 4307,Elia.Fawcett@JACANA.EXAMPLE.COM,145,12-MAR-63,married,F,,2025-06-26T18:22:28.6Z,0
147,Ishwarya,Roberts,US,WI,Milwaukee,53223,6555 W Good Hope Rd,+1 414 123 4308,Ishwarya.Roberts@LAPWING.EXAMPLE.COM,145,21-MAR-44,single,F,,2025-06-26T18:22:28.6Z,0
148,Gustav,Steenburgen,US,WI,Madison,53714,1314 N Stoughton Rd,+1 608 123 4309,Gustav.Steenburgen@PINTAIL.EXAMPLE.COM,145,10-APR-50,married,M,,2025-06-26T18:22:28.6Z,0
149,Markus,Rampling,US,WI,Madison,53704,4715 Sprecher Rd,+1 608 123 4318,Markus.Rampling@PUFFIN.EXAMPLE.COM,145,20-APR-41,single,M,,2025-06-26T18:22:28.6Z,0
150,Goldie,Slater,US,WI,Milwaukee,53218,6161 N 64Th St,+1 414 123 4323,Goldie.Slater@PYRRHULOXIA.EXAMPLE.COM,145,11-MAY-51,married,M,,2025-06-26T18:22:28.6Z,0
151,Divine,Aykroyd,US,WI,Milwaukee,53227,11016 W Lincoln Ave,+1 414 123 4324,Divine.Aykroyd@REDSTART.EXAMPLE.COM,145,20-MAY-76,single,M,,2025-06-26T18:22:28.6Z,0
152,Dieter,Matthau,US,WI,Milwaukee,53227,8600 W National Ave,+1 414 123 4328,Dieter.Matthau@VERDIN.EXAMPLE.COM,145,09-JUN-22,married,M,,2025-06-26T18:22:28.6Z,0
153,Divine,Sheen,US,WI,Madison,53704,615 N Sherman Ave,+1 608 123 4332,Divine.Sheen@COWBIRD.EXAMPLE.COM,145,20-JUN-67,single,M,,2025-06-26T18:22:28.6Z,0
