# init

Initializes the job by creating the schemas and tables it needs in Databricks, skipping any that already exist.

In [None]:
# Job parameters, read from the notebook widgets.
ctr_catalog_name = dbutils.widgets.get("ctr_catalog_name")
raw_catalog_name = dbutils.widgets.get("raw_catalog_name")
stg_catalog_name = dbutils.widgets.get("stg_catalog_name")
managed_location = dbutils.widgets.get("managed_location")

# Schema in the control catalog that holds the ingestion configuration and log tables.
spark.sql(f"""
    create schema if not exists {ctr_catalog_name}.loading
    managed location '{managed_location}'
""")

# Ingestion configuration table, keyed by (schema_name, table_name).
# Every column that takes part in the primary key is declared NOT NULL, since
# primary key columns must not be nullable.
spark.sql(f"""
    create table if not exists {ctr_catalog_name}.loading.data_ingestion (
        schema_name string not null,
        table_name string not null,
        primary_key string not null,
        stg_primary_key string,
        active boolean not null,
        filter string,
        selected string,
        constraint data_ingestion_pk primary key (schema_name, table_name)
    )
""")

# Ingestion log table, keyed by the target table plus the ingestion timestamp.
# As above, the primary key columns are declared NOT NULL.
spark.sql(f"""
    create table if not exists {ctr_catalog_name}.loading.data_ingestion_log (
        source_catalog_name string,
        source_schema_name string not null,
        source_table_name string not null,
        ingestion_date timestamp not null,
        target_catalog_name string not null,
        target_schema_name string not null,
        target_table_name string not null,
        movements integer,
        error string,
        constraint data_ingestion_log_pk primary key (target_catalog_name, target_schema_name, target_table_name, ingestion_date)
    )
""")

In [None]:
from lib.naming import pascal_to_snake

# Distinct source schemas that have at least one active entry in the ingestion
# configuration. The `active` filter runs before the projection: once only
# `schema_name` is selected (and de-duplicated) the `active` column is no longer
# part of the result, so filtering afterwards is not reliable.
df_schemas = (spark.read
    .table(f"{ctr_catalog_name}.loading.data_ingestion")
    .where("active = true")
    .select("schema_name")
    .distinct()
)

for row in df_schemas.collect():
    schema_name = row["schema_name"]

    # Schema in the raw zone keeps the name found in the configuration table.
    spark.sql(f"""
        create schema if not exists {raw_catalog_name}.{schema_name}
        managed location '{managed_location}';    
    """)

    # The schema in the staging zone will be created following snake_case naming convention, as this
    # standard will be used for schemas, tables and columns from the staging zone onwards.
    spark.sql(f"""
        create schema if not exists {stg_catalog_name}.{pascal_to_snake(schema_name)}
        managed location '{managed_location}';    
    """)