In [0]:
# NOTE(review): disabled earlier approach, kept for reference only; nothing in
# this cell executes. It listed every regular file under the landing volume,
# read each one as a CSV with a header row, and overwrote a Delta table in
# workspace.bronze named after the file (extension dropped, "-" and "."
# replaced by "_"). It appears to be superseded by the control-table-driven
# load further down -- TODO confirm and delete this cell.
# import os

# landing_path = "/Volumes/workspace/source_data/landing"
# schema = "workspace.bronze"

# for file_name in os.listdir(landing_path):
#     file_path = os.path.join(landing_path, file_name)
#     if os.path.isfile(file_path):
#         table_name = os.path.splitext(file_name)[0].replace("-", "_").replace(".", "_")
#         df = spark.read.option("header", "true").csv(file_path)
#         df.write.format("delta").mode("overwrite").saveAsTable(f"{schema}.{table_name}")

In [0]:
# Target schema that the loaded tables are written into.
schema = "workspace.bronze"

# Control tables, restricted to the rows flagged is_active = true:
# one row per source file ...
source_details_df = (
    spark.read.table("workspace.control.source_file_details")
    .where("is_active = true")
)
# ... and the per-source field specifications.
source_specs_df = (
    spark.read.table("workspace.control.source_file_specifications")
    .where("is_active = true")
)

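For each active entry in `source_file_details`, look up its field definitions in `source_file_specifications`, validate the file's columns against that specification, cast the columns to the declared data types, and write the result to a Delta table in the bronze schema.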


In [0]:
# Load every active source file into a Delta table under `schema`.
# For each row of source_details_df: fetch the field specification, read the
# file, check its columns against the specification, cast to the declared
# types, then overwrite or append to the target table.
# Raises ValueError (a subclass of Exception) when a source has no field
# specification or when the file's columns do not match it.
for row in source_details_df.collect():
    source_id = row['source_id']
    file_path = row['file_path']
    file_format = row['file_format']
    file_delimitor = row['file_delimitor']
    is_header = row['is_header']
    file_name = row['file_name']
    load_type = row['load_type']
    table_name = file_name.replace("-", "_").replace(".", "_")

    # Field specifications for this source, in declared column order.
    # A Column expression is used instead of an interpolated SQL string so
    # that a quote character inside source_id cannot break the predicate.
    spec_fields = (
        source_specs_df
        .filter(source_specs_df["source_id"] == source_id)
        .orderBy("field_order")
        .select("field_name", "data_type")
        .collect()
    )
    if not spec_fields:
        # Fail early with a clear message instead of an obscure toDF/select error.
        raise ValueError(
            f"No active field specifications found for source_id {source_id!r} ({file_name})"
        )

    field_names = [f['field_name'] for f in spec_fields]
    field_types = [f['data_type'] for f in spec_fields]

    # Everything is read as strings (inferSchema off); types come from the spec.
    # NOTE(review): assumes is_header is a real boolean -- a string such as
    # "false" would be truthy here. Confirm against the control table schema.
    read_options = {
        "inferSchema": "false",
        "header": "true" if is_header else "false",
    }
    # Only pass a delimiter when the control table provides one, so that a
    # NULL value falls back to the reader's default instead of being sent as None.
    if file_delimitor is not None:
        read_options["delimiter"] = file_delimitor
    df = spark.read.format(file_format).options(**read_options).load(file_path)

    if is_header:
        # Header columns must match the spec names, in order, ignoring case.
        df_cols = [c.lower() for c in df.columns]
        spec_cols = [c.lower() for c in field_names]
        if df_cols != spec_cols:
            raise ValueError(f"Schema mismatch for {file_name}: file columns {df.columns} != spec {field_names}")
    elif len(df.columns) != len(field_names):
        # No header to compare names against, so at least the width must agree.
        raise ValueError(
            f"Column count mismatch for {file_name}: file has {len(df.columns)} columns, "
            f"spec has {len(field_names)}"
        )

    # Columns are now known to line up positionally with the spec, so rename
    # them to the spec names. This works the same with or without a header and
    # does not depend on the session's case-sensitivity setting.
    df = df.toDF(*field_names)

    # Cast every column in a single projection rather than a withColumn loop,
    # which adds one projection to the plan per column.
    # NOTE(review): depending on the session's ANSI setting, values that cannot
    # be cast may become NULL rather than raise -- confirm this is acceptable.
    df = df.select(
        [df[name].cast(dtype).alias(name) for name, dtype in zip(field_names, field_types)]
    )

    # "full_load" (any casing) replaces the table; every other load type appends.
    write_mode = "overwrite" if load_type.lower() == "full_load" else "append"
    df.write.format("delta").mode(write_mode).saveAsTable(f"{schema}.{table_name}")