# Default notebook

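This notebook is executed by a Databricks Workflows job, as defined in `resources/bundle.job.yml`. It reads the `run_date` and catalog-name widgets, extracts the `sales.store` table described in the ingestion configuration, and loads it into the staging catalog as two tables: `Store` and `StoreDemographics` (the parsed `Demographics` XML column).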

In [2]:
import sys
import os
from datetime import datetime, timedelta

# Extend the import path with the directory two levels above the working
# directory (presumably so the project-local `lib` package can be imported).
project_root = os.path.abspath("../../")
sys.path.append(project_root)

# Job parameters arrive through Databricks widgets.
widgets = dbutils.widgets

# `run_date` is given as YYYY-MM-DD; `start_date` is the day before it.
run_date = datetime.strptime(widgets.get("run_date"), '%Y-%m-%d')
start_date = run_date - timedelta(days=1)

# Catalog names, read in the same order as before.
ctr_catalog_name, raw_catalog_name, stg_catalog_name = (
    widgets.get(widget_name)
    for widget_name in ("ctr_catalog_name", "raw_catalog_name", "stg_catalog_name")
)

In [0]:
from pyspark.sql.functions import from_xml, schema_of_xml
from lib.el import DataLoader, DeltaDataLoader

def _split_csv(value):
    """Split a comma-separated string into a list of names.

    All spaces are removed before splitting on commas.
    Returns None when `value` is None.
    """
    if value is None:
        return None
    return value.replace(" ", "").split(",")


# Fetch the ingestion configuration row for the sales.store table; the match
# on schema and table name is case-insensitive.
store_data_ingestion = (spark.read
    .table(f"{ctr_catalog_name}.loading.data_ingestion")
    .select(
        "schema_name",
        "table_name",
        "primary_key",
        "filter",
        "selected")
    .where("LOWER(schema_name) = 'sales' AND LOWER(table_name) = 'store'")
    .first()
)

# first() returns None for an empty result, so fail with an actionable
# message instead of an IndexError from indexing an empty collect() list.
if store_data_ingestion is None:
    raise ValueError(
        f"No ingestion configuration found for sales.store in "
        f"{ctr_catalog_name}.loading.data_ingestion"
    )

schema_name = store_data_ingestion["schema_name"]
table_name = store_data_ingestion["table_name"]

# The primary key is passed to the loaders below, so a missing value is
# reported explicitly rather than surfacing as an AttributeError on None.
primary_key = _split_csv(store_data_ingestion["primary_key"])
if primary_key is None:
    raise ValueError(
        f"Ingestion configuration for {schema_name}.{table_name} has no primary_key"
    )

# `selected` is optional: None is passed through to the loader unchanged.
selected = _split_csv(store_data_ingestion["selected"])

# Loader for the raw table described by the ingestion configuration.
store_data_loader = DeltaDataLoader(
    catalog_name = raw_catalog_name,
    schema_name = schema_name,
    table_name = table_name,
    primary_key = primary_key,
    selected = selected
)

staging_table_exists = spark.catalog.tableExists(
    f"{stg_catalog_name}.{schema_name}.{table_name}"
)
filter_template = store_data_ingestion["filter"]

# Apply the configured filter only when the staging table already exists;
# otherwise extract without a filter. A null filter in the configuration is
# also treated as "no filter" instead of crashing on None.replace(...).
if staging_table_exists and filter_template is not None:
    # The ":start_date" placeholder is replaced with a SQL TO_DATE expression.
    start_date_filter = f"TO_DATE('{start_date.strftime('%Y-%m-%d')}', 'yyyy-MM-dd')"
    extract_filter = filter_template.replace(":start_date", start_date_filter)
else:
    extract_filter = None

# Named `extract_filter` so the Python builtin `filter` is not shadowed.
store_data_loader.extract(extract_filter)

# Fetch one row up front: it serves both as the emptiness check and as the
# XML sample for schema inference, so a separate count() action is not needed.
store_df = store_data_loader.df
sample_store_row = store_df.first()

if sample_store_row is not None:
    # NOTE(review): the XML schema is inferred from this single row only; if
    # its Demographics value is null or lacks one of the elements selected
    # below, this cell will fail — confirm the data guarantees this.
    demographics_schema = schema_of_xml(sample_store_row["Demographics"])

    # Parse the Demographics XML into a struct and flatten the fields of
    # interest next to the primary key columns.
    df_store_demographics = (store_df
        .withColumn(
            "Demographics",
            from_xml("Demographics", demographics_schema)
        )
        .select(
            *primary_key,
            "Demographics.AnnualRevenue",
            "Demographics.AnnualSales",
            "Demographics.BankName",
            "Demographics.Brands",
            "Demographics.BusinessType",
            "Demographics.Internet",
            "Demographics.NumberEmployees",
            "Demographics.Specialty",
            "Demographics.SquareFeet",
            "Demographics.YearOpened"
        )
    )

    spark.sql(f"create schema if not exists {stg_catalog_name}.{schema_name}")

    demographics_table_name = f"{stg_catalog_name}.{schema_name}.StoreDemographics"

    # Load the flattened demographics into their own staging table.
    (DataLoader
        .fromDataFrame(df_store_demographics, *primary_key)
        .load_into(demographics_table_name)
    )

    # Load the store rows themselves without the raw Demographics XML column.
    (store_data_loader
        .apply(lambda df: df.drop("Demographics"))
        .load_into(f"{stg_catalog_name}.{schema_name}.Store")
    )