In [0]:
%run ./01-config

In [0]:
from pyspark.sql import functions as F

class Bronze:
    """Bronze-layer ingestion: stream raw multiplexed Kafka JSON files into a table.

    Relies on two names supplied by the surrounding notebook session rather
    than imported here: ``spark`` (the active SparkSession) and ``Config``
    (presumably defined by the ``01-config`` notebook -- confirm).
    """

    def __init__(self, env):
        """Resolve storage locations and select the working catalog/schema.

        Args:
            env: Catalog name. It is interpolated into the ``USE`` statement
                and into the fully qualified target table name.
        """
        self.Conf = Config()
        self.landing_dir = self.Conf.base_data_dir + "/raw"
        self.checkpoint_dir = self.Conf.base_checkpoint_dir + "/checkpoints"
        self.catalog = env
        self.db_name = self.Conf.db_name
        spark.sql(f"USE {self.catalog}.{self.db_name}")

    def consume_kafka_multiplex(self, once=True, processing_time="5 seconds"):
        """Start the stream that loads ``<landing_dir>/kafka-raw`` into ``bronze``.

        Args:
            once: When truthy, use an ``availableNow`` trigger; otherwise use a
                ``processingTime`` trigger.
            processing_time: Trigger interval used when ``once`` is falsy.

        Returns:
            The handle returned by ``DataStreamWriter.toTable`` for the
            started query.
        """
        schema = "key string, value string, topic string, partition bigint, offset bigint, timestamp bigint"

        stream_df = (
            spark.readStream
            .format("cloudFiles")
            .schema(schema)
            # NOTE(review): Auto Loader documents this limit under the name
            # "cloudFiles.maxFilesPerTrigger"; confirm the unprefixed option
            # is honoured by the runtime in use.
            .option("maxFilesPerTrigger", 1)
            .option("cloudFiles.format", "json")
            .load(self.landing_dir + "/kafka-raw")
            # Presumably the incoming value is epoch milliseconds, hence the
            # division by 1000 before the cast -- confirm against the producer.
            .withColumn("timestamp", (F.col("timestamp") / 1000).cast("timestamp"))
            .withColumn("load_time", F.current_timestamp())
            .withColumn("source_file", F.col("_metadata.file_path"))
            # Derived from the converted timestamp; used as a partition column.
            .withColumn("year_month", F.date_format("timestamp", "yyyy-MM"))
        )

        stream_writer = (
            stream_df.writeStream
            .option("checkpointLocation", self.checkpoint_dir + "/bronze")
            .option("mergeSchema", True)
            .partitionBy("topic", "year_month")
        )

        # Truthiness (not ``== True``) so this branch always agrees with the
        # ``if once`` check in ``consume``.
        if once:
            stream_writer = stream_writer.trigger(availableNow=True)
        else:
            stream_writer = stream_writer.trigger(processingTime=processing_time)

        target_table = f"{self.catalog}.{self.db_name}.bronze"
        return stream_writer.toTable(target_table)

    def consume(self, once=True, processing_time="5 seconds"):
        """Run the bronze ingestion, blocking until it finishes when ``once`` is truthy.

        Args:
            once: Forwarded to ``consume_kafka_multiplex``. When truthy, this
                method waits for the started query to terminate.
            processing_time: Forwarded to ``consume_kafka_multiplex``.
        """
        print("Starting bronze layer consumption...")
        query = self.consume_kafka_multiplex(once, processing_time)
        if once:
            # Wait on the query this call started rather than on every active
            # stream in the session: unrelated streams cannot stall this
            # method, and the wait is tied to this query's own handle.
            query.awaitTermination()
        print("Finished bronze layer consumption...")





                  
