In [0]:
%run ../imports/imports

In [0]:
%run ../config/paths

In [0]:
%run ../utilities/functions

In [0]:
# Show the three layer root paths, one per line.
print(landing_path, raw_path, bronze_path, sep="\n")

In [0]:
class LandingStreamReader:
    """Streaming reader for one dataset's landing folder, using the ``cloudFiles`` source.

    Instances are meant to be created through ``LandingStreamReader.Builder``.
    Constructing one derives the dataset's landing path and schema location from
    the builder values and creates the schema-location directory.
    """

    def __init__(self, builder):
        """Copy the configuration from ``builder`` and prepare the schema location.

        Args:
            builder: A ``LandingStreamReader.Builder`` (or any object exposing the
                same attributes).

        Side effects:
            Calls ``dbutils.fs.mkdirs`` on the dataset's schema location.
        """
        self.datasource = builder.datasource
        self.dataset = builder.dataset
        self.landing_path = builder.landing_path
        self.raw_path = builder.raw_path
        self.bronze_path = builder.bronze_path
        self.format = builder.format
        self.options = builder.options
        self.metadata_columns = builder.metadata_columns
        self.partitionColumn = builder.partitionColumn
        self.formatted_date_column_params = builder.formatted_date_column_params
        self.dataset_landing_path = f'{self.landing_path}/{self.datasource}/{self.dataset}'
        self.dataset_bronze_schema_location = f'{self.bronze_path}/{self.datasource}/{self.dataset}_schema'
        dbutils.fs.mkdirs(self.dataset_bronze_schema_location)

    def __str__(self):
        return f"LandingStreamReader(datasource='{self.datasource}', dataset='{self.dataset}')"

    def read_cloudFiles(self):
        """Open the streaming read on the dataset's landing path.

        The stream is passed through ``add_metadata_columns`` and, when both a
        partition column and date-formatting parameters are configured, through
        ``add_formatted_date_column``. Both helpers are defined outside this
        class; their exact output columns are not visible here.

        Returns:
            The resulting streaming DataFrame.
        """
        stream_df = (
            spark.readStream
            .format("cloudFiles")
            # A builder that never received options leaves this attribute as None.
            .options(**(self.options or {}))
            .option("cloudFiles.schemaLocation", self.dataset_bronze_schema_location)
            .load(self.dataset_landing_path)
        )

        stream_df = add_metadata_columns(
            stream_df,
            self.landing_path,
            self.raw_path,
            self.format,
            self.metadata_columns,
        )

        if self.partitionColumn and self.formatted_date_column_params:
            stream_df = add_formatted_date_column(
                stream_df,
                self.partitionColumn,
                **self.formatted_date_column_params,
            )

        return stream_df

    class Builder:
        """Fluent builder for ``LandingStreamReader``; every setter returns the builder."""

        def __init__(self):
            # Every attribute read by LandingStreamReader.__init__ must exist here,
            # otherwise build() fails with AttributeError when a setter is skipped.
            self.datasource = None
            self.dataset = None
            self.landing_path = None
            self.raw_path = None
            self.bronze_path = None
            self.format = None
            self.options = None
            self.metadata_columns = None
            self.partitionColumn = None
            self.formatted_date_column_params = None

        def set_datasource(self, datasource):
            self.datasource = datasource
            return self

        def set_dataset(self, dataset):
            self.dataset = dataset
            return self

        def set_landing_path(self, landing_path):
            self.landing_path = landing_path
            return self

        def set_raw_path(self, raw_path):
            self.raw_path = raw_path
            return self

        def set_bronze_path(self, bronze_path):
            self.bronze_path = bronze_path
            return self

        def set_format(self, format):
            self.format = format
            return self

        def set_options(self, options):
            self.options = options
            return self

        def set_metadata_columns(self, metadata_columns):
            self.metadata_columns = metadata_columns
            return self

        def set_partitionColumn(self, partitionColumn):
            self.partitionColumn = partitionColumn
            return self

        def set_formatted_date_column_params(self, formatted_date_column_params):
            self.formatted_date_column_params = formatted_date_column_params
            return self

        def build(self):
            """Create the ``LandingStreamReader`` from the values set so far."""
            return LandingStreamReader(self)

In [0]:
class BronzeStreamWriter:
    """Writes streaming micro-batches of one dataset to its bronze table.

    Instances are meant to be created through ``BronzeStreamWriter.Builder``.
    ``append_2_bronze`` has the ``(batch_df, batch_id)`` signature expected by
    ``foreachBatch``.
    """

    def __init__(self, builder):
        """Copy the configuration from ``builder`` and derive dataset locations.

        Args:
            builder: A ``BronzeStreamWriter.Builder`` (or any object exposing the
                same attributes).

        Side effects:
            Calls ``dbutils.fs.mkdirs`` on the dataset's raw, bronze and
            checkpoint locations.
        """
        self.datasource = builder.datasource
        self.dataset = builder.dataset
        self.landing_path = builder.landing_path
        self.raw_path = builder.raw_path
        self.bronze_path = builder.bronze_path
        self.bronze_table_format = builder.bronze_table_format
        self.bronze_write_mode = builder.bronze_write_mode
        self.options = builder.options
        self.partitionColumn = builder.partitionColumn
        self.formatted_date_column_params = builder.formatted_date_column_params
        self.dataset_landing_path = f"{self.landing_path}/{self.datasource}/{self.dataset}"
        self.dataset_raw_path = f"{self.raw_path}/{self.datasource}/{self.dataset}"
        self.dataset_bronze_path = f"{self.bronze_path}/{self.datasource}/{self.dataset}"
        self.dataset_checkpoint_location = f"{self.dataset_bronze_path}_checkpoint"
        self.table = f"hive_metastore.bronze.{self.datasource}__{self.dataset}"
        self.query_name = f"bronze-{self.datasource}-{self.dataset}"

        dbutils.fs.mkdirs(self.dataset_raw_path)
        dbutils.fs.mkdirs(self.dataset_bronze_path)
        dbutils.fs.mkdirs(self.dataset_checkpoint_location)

    def __str__(self):
        return f"BronzeStreamWriter(datasource='{self.datasource}', dataset='{self.dataset}')"

    def archive_raw_files(self, df):
        """Move the files referenced by ``df`` from the landing area to the raw area.

        Does nothing when ``df`` has no ``_ingested_filename`` column. Otherwise
        the distinct file names are collected to the driver and, for each
        non-empty one, the source location is obtained by substituting the
        dataset's raw path with its landing path.

        NOTE(review): this assumes ``_ingested_filename`` already holds the
        raw-area path of each file; confirm against ``add_metadata_columns``.
        """
        if "_ingested_filename" not in df.columns:
            return

        ingested_rows = df.select("_ingested_filename").distinct().collect()
        for row in ingested_rows:
            raw_file = row["_ingested_filename"]
            if not raw_file:
                continue
            landing_file = raw_file.replace(self.dataset_raw_path, self.dataset_landing_path)
            # Create the destination's parent directory (everything up to the last '/').
            dbutils.fs.mkdirs(raw_file[0:raw_file.rfind('/') + 1])
            dbutils.fs.mv(landing_file, raw_file)

    def write_data(self, df):
        """Save ``df`` to the bronze table at the dataset's bronze path.

        Uses the configured table format and write mode with ``mergeSchema``
        enabled. When a partition column is configured the table is partitioned
        by the formatted date column (``formatted_date_column_params["output_col"]``)
        if date-formatting parameters are present, otherwise by the partition
        column itself.
        """
        spark.sql("CREATE DATABASE IF NOT EXISTS hive_metastore.bronze")

        writer = (
            df.write
            .format(self.bronze_table_format)
            .mode(self.bronze_write_mode)
            .option("mergeSchema", "true")
        )

        if self.partitionColumn:
            if self.formatted_date_column_params:
                partition_by = self.formatted_date_column_params["output_col"]
            else:
                partition_by = self.partitionColumn
            # Keep the returned writer instead of relying on in-place mutation.
            writer = writer.partitionBy(partition_by)

        (
            writer
            .option("path", self.dataset_bronze_path)
            .saveAsTable(self.table)
        )

    def append_2_bronze(self, batch_df, batch_id):
        """Process one micro-batch: persist, write, archive, unpersist.

        Args:
            batch_df: The micro-batch DataFrame.
            batch_id: The micro-batch identifier; not used.

        Any exception from writing or archiving propagates to the caller, but
        the batch is always unpersisted first so the cache is not left held.
        """
        batch_df.persist()
        try:
            self.write_data(batch_df)
            self.archive_raw_files(batch_df)
        finally:
            batch_df.unpersist()

    class Builder:
        """Fluent builder for ``BronzeStreamWriter``; every setter returns the builder."""

        def __init__(self):
            self.datasource = None
            self.dataset = None
            self.landing_path = None
            self.raw_path = None
            self.bronze_path = None
            self.bronze_table_format = None
            self.bronze_write_mode = None
            self.options = None
            self.partitionColumn = None
            self.formatted_date_column_params = None

        def set_datasource(self, datasource):
            self.datasource = datasource
            return self

        def set_dataset(self, dataset):
            self.dataset = dataset
            return self

        def set_landing_path(self, landing_path):
            self.landing_path = landing_path
            return self

        def set_raw_path(self, raw_path):
            self.raw_path = raw_path
            return self

        def set_bronze_path(self, bronze_path):
            self.bronze_path = bronze_path
            return self

        def set_bronze_table_format(self, bronze_table_format):
            self.bronze_table_format = bronze_table_format
            return self

        def set_bronze_write_mode(self, bronze_write_mode):
            self.bronze_write_mode = bronze_write_mode
            return self

        def set_options(self, options):
            self.options = options
            return self

        def set_partitionColumn(self, partitionColumn):
            self.partitionColumn = partitionColumn
            return self

        def set_formatted_date_column_params(self, formatted_date_column_params):
            self.formatted_date_column_params = formatted_date_column_params
            return self

        def build(self):
            """Create the ``BronzeStreamWriter`` from the values set so far."""
            return BronzeStreamWriter(self)

In [0]:
# Dataset-level settings (datasource, dataset, source options, metadata, partitioning).
json_path = '../config/datasets/batch/retail_sales_order.json'

# JSON is UTF-8 by specification, so do not depend on the platform default encoding.
with open(json_path, encoding='utf-8') as f:
    config = json.load(f)

# Bronze-layer settings (table format and write mode).
bronze_config_path = '../config/bronze.json'
with open(bronze_config_path, encoding='utf-8') as f:
    bronze_config = json.load(f)

In [0]:
# Disabled manual values:
#format = "jpg"
#datasource = 'tensorflow'
#dataset = "flower_photos"

bronze_format = bronze_config.get('format')
bronze_mode = bronze_config.get('mode')

print(bronze_format)
print(bronze_mode)

options = config.get('source').get('options')
format = options.get('cloudFiles.format')
datasource = config.get('datasource')
dataset = config.get('dataset')
metadata_columns = config.get('metadata')

# A dataset without a "partition" section is valid: the reader and the writer
# both treat a None partition column as "no partitioning".
partition_config = config.get("partition") or {}
partitionColumn = partition_config.get("column")
formatted_date_column_params = partition_config.get("formatted_date_column_params")

print(format)
print(datasource)
print(dataset)
print(options)
print(metadata_columns)
print(partitionColumn)
print(formatted_date_column_params)

dataset_landing_path = f"{landing_path}/{datasource}/{dataset}"
dataset_raw_path = f"{raw_path}/{datasource}/{dataset}"
dataset_bronze_path = f"{bronze_path}/{datasource}/{dataset}"

print(dataset_landing_path)
print(dataset_raw_path)
print(dataset_bronze_path)

In [0]:
# Assemble the landing stream reader from the values extracted from the configuration.
reader = (
    LandingStreamReader.Builder()
    .set_landing_path(landing_path)
    .set_raw_path(raw_path)
    .set_bronze_path(bronze_path)
    .set_datasource(datasource)
    .set_dataset(dataset)
    .set_format(format)
    .set_options(options)
    .set_partitionColumn(partitionColumn)
    .set_formatted_date_column_params(formatted_date_column_params)
    .set_metadata_columns(metadata_columns)
    .build()
)

print(reader)

In [0]:
# Inspect the date-formatting parameters the reader was built with.
reader.formatted_date_column_params

In [0]:
# Inspect the source options that will be passed to the cloudFiles stream.
print(reader.options)

In [0]:
# Assemble the bronze stream writer; no options are set, so the builder default (None) is kept.
writer = (
    BronzeStreamWriter.Builder()
    .set_landing_path(landing_path)
    .set_raw_path(raw_path)
    .set_bronze_path(bronze_path)
    .set_datasource(datasource)
    .set_dataset(dataset)
    .set_bronze_write_mode(bronze_mode)
    .set_bronze_table_format(bronze_format)
    .set_formatted_date_column_params(formatted_date_column_params)
    .set_partitionColumn(partitionColumn)
    .build()
)

print(writer)

In [0]:
# Inspect the date-formatting parameters the writer was built with.
writer.formatted_date_column_params

In [0]:
# Run the landing -> bronze ingestion: every micro-batch is handed to
# writer.append_2_bronze.
bronze_query = (
    reader
    .read_cloudFiles()
    .writeStream
    .foreachBatch(writer.append_2_bronze)
    .trigger(availableNow=True)
    #.trigger(processingTime="60 seconds") # continuous (always-on) mode
    .option("checkpointLocation", writer.dataset_checkpoint_location)
    .queryName(writer.query_name)
    .start()
)

# start() returns as soon as the query is launched. With availableNow the query
# stops on its own once the pending files are processed, so block here; otherwise
# the cells below could read the bronze data before the run has finished.
# Remove this call if the processingTime trigger is enabled: that query never ends.
bronze_query.awaitTermination()

In [0]:
# Preview ten rows of the bronze Delta data (read by path), ordered by
# `_ingested_at` descending.
query = f"""
select * 
from delta.`{writer.dataset_bronze_path}`
order by _ingested_at desc
limit 10
"""
display(spark.sql(query))

In [0]:
# List the distinct `_ingested_filename` values present in the bronze Delta data.
query = f"""
select distinct _ingested_filename 
from delta.`{writer.dataset_bronze_path}`
"""
display(spark.sql(query))