# Pipeline: Silver to Gold

## Data Source

- **Catalog Location:** `workspace.hospital_silver`
- **Dimensional Tables:** `cities, departments, diagnoses, procedures, providers, insurance`
- **Format:** Delta Lake Table


## Destination

- **Catalog Location:** `workspace.hospital_gold`
- **Dimensional Tables:** `dim_cities, dim_departments, dim_diagnoses, dim_procedures, dim_providers, dim_insurance`
- **Format:** Delta Lake Table 

# Configurations

In [0]:
# Name of the table to process, supplied through the "dimensional_table"
# notebook widget (empty by default).
dbutils.widgets.text("dimensional_table", "")
# Strip surrounding whitespace so a blank or padded value cannot slip past the
# check below and then silently match none of the per-table cells.
dimensional_table = dbutils.widgets.get("dimensional_table").strip()

# Raise explicitly instead of using `assert`, which is removed when Python
# runs with optimizations enabled (-O).
if not dimensional_table:
    raise ValueError("Missing required parameter: dimensional_table")

In [0]:
# Catalog that holds every schema used by this notebook.
catalog_name = "workspace"

# Schema names, one per layer.
schema_bronze = "hospital_bronze"
schema_silver = "hospital_silver"
schema_gold = "hospital_gold"

# S3 path of the source data.
data_source = "s3://buckethospitaldata/data_batching/"

In [0]:
def read_from_silver(dimensional_table):
    """Open a streaming read on a table in the silver schema.

    Args:
        dimensional_table: Table name inside ``{catalog_name}.{schema_silver}``.

    Returns:
        The DataFrame returned by ``spark.readStream.table`` for that table.
    """
    silver_table = f"{catalog_name}.{schema_silver}.{dimensional_table}"
    return spark.readStream.table(silver_table)



def write_to_gold(dataframe, checkpoint_location: str, gold_table: str):
    """Append a streaming DataFrame to an existing gold-schema Delta table.

    Only columns present in both ``dataframe`` and the target table are
    written; every other input column is dropped before the write.

    Args:
        dataframe: Streaming DataFrame to write.
        checkpoint_location: Path passed to the stream as its
            ``checkpointLocation`` option.
        gold_table: Table name inside ``{catalog_name}.{schema_gold}``. The
            table must already exist, because its columns are read to decide
            which input columns to keep.

    Returns:
        The ``StreamingQuery`` for the write, after it has terminated.

    Raises:
        ValueError: If ``dataframe`` shares no column with the target table.
    """
    target_table = f"{catalog_name}.{schema_gold}.{gold_table}"

    # Column names of the existing target table.
    target_columns = set(spark.table(target_table).columns)

    # Keep the input's own column order. Building the list from a set
    # intersection would yield an arbitrary order that can differ between runs.
    common_columns = [name for name in dataframe.columns if name in target_columns]
    if not common_columns:
        raise ValueError(
            f"No columns in common between the input DataFrame and {target_table}"
        )

    # Write to the gold layer using writeStream.
    query = (
        dataframe.select(common_columns)
        .writeStream
        .format("delta")
        .outputMode("append")
        .option("checkpointLocation", checkpoint_location)
        .trigger(once=True)
        .toTable(target_table)
    )

    # toTable() only starts the query. Wait for the single triggered batch so
    # that a failure in the stream is raised here instead of being lost.
    query.awaitTermination()
    return query

# Load Tables

## Patients

## Cities

In [0]:
if dimensional_table == "cities":
    print(f"Preparing streaming write for '{dimensional_table}' table...")

    # Target gold table and the checkpoint directory used by its stream.
    gold_table = f"dim_{dimensional_table}"
    checkpoint_location = f"s3://buckethospitaldata/pipeline_checkpoints/data_batching/_checkpoints/gold/{dimensional_table}"

    # Stream the silver table, then discard its 'rescued_data' column.
    df_cities = read_from_silver(dimensional_table)
    df_cities = df_cities.drop("rescued_data")

    # Hand the stream over to the gold writer.
    write_to_gold(df_cities, checkpoint_location, gold_table)

    print(f"Write to {gold_table} initiated.")


## Departments

In [0]:
if dimensional_table == "departments":
    print(f"Preparing streaming write for '{dimensional_table}' table...")

    # Target gold table and the checkpoint directory used by its stream.
    gold_table = f"dim_{dimensional_table}"
    checkpoint_location = f"s3://buckethospitaldata/pipeline_checkpoints/data_batching/_checkpoints/gold/{dimensional_table}"

    # Stream the silver table, then discard its 'rescued_data' column.
    df_departments = read_from_silver(dimensional_table)
    df_departments = df_departments.drop("rescued_data")

    # Hand the stream over to the gold writer.
    write_to_gold(df_departments, checkpoint_location, gold_table)

    print(f"Write to {gold_table} initiated.")


## Diagnoses

In [0]:
if dimensional_table == "diagnoses":
    print(f"Preparing streaming write for '{dimensional_table}' table...")

    # Target gold table and the checkpoint directory used by its stream.
    gold_table = f"dim_{dimensional_table}"
    checkpoint_location = f"s3://buckethospitaldata/pipeline_checkpoints/data_batching/_checkpoints/gold/{dimensional_table}"

    # Stream the silver table, then discard its 'rescued_data' column.
    df_diagnoses = read_from_silver(dimensional_table)
    df_diagnoses = df_diagnoses.drop("rescued_data")

    # Hand the stream over to the gold writer.
    write_to_gold(df_diagnoses, checkpoint_location, gold_table)

    print(f"Write to {gold_table} initiated.")


## Insurance

In [0]:
if dimensional_table == "insurance":
    print(f"Preparing streaming write for '{dimensional_table}' table...")

    # Target gold table and the checkpoint directory used by its stream.
    gold_table = f"dim_{dimensional_table}"
    checkpoint_location = f"s3://buckethospitaldata/pipeline_checkpoints/data_batching/_checkpoints/gold/{dimensional_table}"

    # Stream the silver table, then discard its 'rescued_data' column.
    df_insurance = read_from_silver(dimensional_table)
    df_insurance = df_insurance.drop("rescued_data")

    # Hand the stream over to the gold writer.
    write_to_gold(df_insurance, checkpoint_location, gold_table)

    print(f"Write to {gold_table} initiated.")


## Procedures

In [0]:
if dimensional_table == "procedures":
    print(f"Preparing streaming write for '{dimensional_table}' table...")

    # Target gold table and the checkpoint directory used by its stream.
    gold_table = f"dim_{dimensional_table}"
    checkpoint_location = f"s3://buckethospitaldata/pipeline_checkpoints/data_batching/_checkpoints/gold/{dimensional_table}"

    # Stream the silver table, then discard its 'rescued_data' column.
    df_procedures = read_from_silver(dimensional_table)
    df_procedures = df_procedures.drop("rescued_data")

    # Hand the stream over to the gold writer.
    write_to_gold(df_procedures, checkpoint_location, gold_table)

    print(f"Write to {gold_table} initiated.")


## Providers

In [0]:
if dimensional_table == "providers":
    print(f"Preparing streaming write for '{dimensional_table}' table...")

    # Target gold table and the checkpoint directory used by its stream.
    gold_table = f"dim_{dimensional_table}"
    checkpoint_location = f"s3://buckethospitaldata/pipeline_checkpoints/data_batching/_checkpoints/gold/{dimensional_table}"

    # Stream the silver table, then discard its 'rescued_data' column.
    df_providers = read_from_silver(dimensional_table)
    df_providers = df_providers.drop("rescued_data")

    # Hand the stream over to the gold writer.
    write_to_gold(df_providers, checkpoint_location, gold_table)

    print(f"Write to {gold_table} initiated.")
