# init

Initializes the job by creating the necessary schemas and tables in Databricks (if they do not already exist) and, when `loading.data_ingestion` is first created, populating it with its initial entries.

In [1]:
import sys, os

sys.path.append(os.path.abspath("../"))

In [None]:
# Job parameters are supplied through Databricks widgets: three catalog names
# (ctr, raw, stg) and the storage location used for the managed schemas.
get_widget = dbutils.widgets.get

ctr_catalog_name = get_widget("ctr_catalog_name")
raw_catalog_name = get_widget("raw_catalog_name")
stg_catalog_name = get_widget("stg_catalog_name")
managed_location = get_widget("managed_location")

# The `loading` schema of the ctr catalog holds the ingestion configuration
# and log tables created further down; IF NOT EXISTS keeps re-runs harmless.
create_loading_schema_sql = f"""
    CREATE SCHEMA IF NOT EXISTS {ctr_catalog_name}.loading
    MANAGED LOCATION '{managed_location}'
"""
spark.sql(create_loading_schema_sql)

# Create and seed the ingestion configuration table. Seeding is tied to the
# table being absent, so re-running the notebook never duplicates the rows.
if not spark.catalog.tableExists(f"{ctr_catalog_name}.loading.data_ingestion"):
    # The primary-key columns (schema_name, table_name) are declared NOT NULL
    # explicitly: Databricks documents primary-key columns as NOT NULL columns,
    # so the constraint should not be declared over nullable columns.
    spark.sql(f"""
        CREATE TABLE IF NOT EXISTS {ctr_catalog_name}.loading.data_ingestion (
            schema_name string not null,
            table_name string not null,
            primary_key string not null,
            stg_primary_key string,
            active boolean not null,
            filter string,
            selected string,
            partition_column string,
            num_partitions int,
            constraint data_ingestion_pk primary key (schema_name, table_name)
        )
    """)

    # Initial configuration rows. `:start_date` is stored verbatim as part of
    # the filter text (it is not interpolated by this f-string).
    spark.sql(f"""
        INSERT INTO {ctr_catalog_name}.loading.data_ingestion (
            schema_name, table_name, primary_key, selected, filter, active, stg_primary_key, partition_column, num_partitions
        ) VALUES 
            ('Sales', 'SalesTerritory', 'TerritoryID', NULL, 'ModifiedDate >= :start_date', TRUE, 'id', 'ModifiedDate', 4),
            ('Sales', 'ShoppingCartItem', 'ShoppingCartItemID', NULL, 'ModifiedDate >= :start_date', TRUE, 'id', 'ModifiedDate', 4),
            ('Sales', 'SpecialOffer', 'SpecialOfferID', NULL, 'ModifiedDate >= :start_date', TRUE, 'id', 'ModifiedDate', 4),
            ('Sales', 'CurrencyRate', 'CurrencyRateID', NULL, 'ModifiedDate >= :start_date', TRUE, 'id', 'ModifiedDate', 4),
            ('Sales', 'Customer', 'CustomerID', NULL, 'ModifiedDate >= :start_date', TRUE, 'id', 'ModifiedDate', 4),
            ('Sales', 'SalesReason', 'SalesReasonID', NULL, 'ModifiedDate >= :start_date', TRUE, 'id', 'ModifiedDate', 4),
            ('Sales', 'CreditCard', 'CreditCardID', NULL, 'ModifiedDate >= :start_date', TRUE, 'id', 'ModifiedDate', 4),
            ('Sales', 'SalesTaxRate', 'SalesTaxRateID', NULL, 'ModifiedDate >= :start_date', TRUE, 'id', 'ModifiedDate', 4),
            ('Sales', 'PersonCreditCard', 'BusinessEntityID, CreditCardID', NULL, 'ModifiedDate >= :start_date', TRUE, NULL, 'ModifiedDate', 4),
            ('Sales', 'SalesTerritoryHistory', 'BusinessEntityID, StartDate, TerritoryID', NULL, 'ModifiedDate >= :start_date', TRUE, NULL, 'ModifiedDate', 4),
            ('Sales', 'SpecialOfferProduct', 'SpecialOfferID, ProductID', NULL, 'ModifiedDate >= :start_date', TRUE, NULL, 'ModifiedDate', 4),
            ('Sales', 'CountryRegionCurrency', 'CountryRegionCode, CurrencyCode', NULL, 'ModifiedDate >= :start_date', TRUE, NULL, NULL, NULL),
            ('Sales', 'Currency', 'CurrencyCode', NULL, 'ModifiedDate >= :start_date', TRUE, NULL, 'ModifiedDate', 4),
            ('Sales', 'SalesOrderDetail', 'SalesOrderID, SalesOrderDetailID', NULL, 'ModifiedDate >= :start_date', TRUE, NULL, 'ModifiedDate', 4),
            ('Sales', 'SalesOrderHeader', 'SalesOrderID', NULL, 'ModifiedDate >= :start_date', TRUE, NULL, 'ModifiedDate', 4),
            ('Sales', 'SalesOrderHeaderSalesReason', 'SalesOrderID, SalesReasonID', NULL, 'ModifiedDate >= :start_date', TRUE, NULL, 'ModifiedDate', 4),
            ('Sales', 'SalesPerson', 'BusinessEntityID', NULL, 'ModifiedDate >= :start_date', TRUE, NULL, 'ModifiedDate', 4),
            ('Sales', 'SalesPersonQuotaHistory', 'BusinessEntityID, QuotaDate', NULL, 'ModifiedDate >= :start_date', TRUE, NULL, 'ModifiedDate', 4),
            ('Sales', 'Store', 'BusinessEntityID', NULL, 'ModifiedDate >= :start_date', TRUE, NULL, 'ModifiedDate', 4)
    """)

# Create the ingestion log table if it does not exist yet. Each row records an
# ingestion timestamp, the source and target table coordinates, a `movements`
# count and an `error` text.
#
# The four primary-key columns are declared NOT NULL explicitly: Databricks
# documents primary-key columns as NOT NULL columns, so the constraint should
# not be declared over nullable columns.
# NOTE(review): the writer of this table is not visible here - confirm it never
# writes NULL into target_* or ingestion_date.
spark.sql(f"""
    CREATE TABLE IF NOT EXISTS {ctr_catalog_name}.loading.data_ingestion_log (
        ingestion_date timestamp not null,
        source_catalog_name string,
        source_schema_name string not null,
        source_table_name string not null,
        target_catalog_name string not null,
        target_schema_name string not null,
        target_table_name string not null,
        movements integer,
        error string,
        constraint data_ingestion_log_pk primary key (target_catalog_name, target_schema_name, target_table_name, ingestion_date)
    )
""")

In [None]:
from lib.naming import pascal_to_snake

# Distinct source schemas that have at least one active table configured.
# The `active` filter must run BEFORE projecting to `schema_name`: once the
# frame is reduced to distinct schema names the flag is no longer part of the
# result, so filtering afterwards either fails to resolve the column or
# evaluates it after deduplication and can drop a schema that still has active
# tables.
df_schemas = (spark.read
    .table(f"{ctr_catalog_name}.loading.data_ingestion")
    .where("active = true")
    .select("schema_name")
    .distinct()
)

# Create one schema per source schema in both the raw and the stg catalogs.
for schema_name in [row["schema_name"] for row in df_schemas.collect()]:
    # The raw catalog keeps the source schema name unchanged.
    spark.sql(f"""
        CREATE SCHEMA IF NOT EXISTS {raw_catalog_name}.{schema_name}
        MANAGED LOCATION '{managed_location}';    
    """)

    # The schema in the staging zone will be created following snake_case naming convention, as this
    # standard will be used for schemas, tables and columns from the staging zone onwards.
    spark.sql(f"""
        CREATE SCHEMA IF NOT EXISTS {stg_catalog_name}.{pascal_to_snake(schema_name)}
        MANAGED LOCATION '{managed_location}';    
    """)