In [0]:
%sql
-- External location for the `bronze` container of storage account
-- `adlsexternalfororders`, accessed through storage credential
-- `azure_cred_devandtest`. IF NOT EXISTS keeps the cell safe to re-run.
CREATE EXTERNAL LOCATION IF NOT EXISTS databricksextdl_bronze
  URL 'abfss://bronze@adlsexternalfororders.dfs.core.windows.net/'
  WITH (STORAGE CREDENTIAL `azure_cred_devandtest`)

In [0]:
%sql
-- External location for the `silver` container of storage account
-- `adlsexternalfororders`, accessed through storage credential
-- `azure_cred_devandtest`. IF NOT EXISTS keeps the cell safe to re-run.
CREATE EXTERNAL LOCATION IF NOT EXISTS databricksextdl_silver
  URL 'abfss://silver@adlsexternalfororders.dfs.core.windows.net/'
  WITH (STORAGE CREDENTIAL `azure_cred_devandtest`)

In [0]:
-- External location for the `gold` container of storage account
-- `adlsexternalfororders`, accessed through storage credential
-- `azure_cred_devandtest`. IF NOT EXISTS keeps the cell safe to re-run.
CREATE EXTERNAL LOCATION IF NOT EXISTS databricksextdl_gold
  URL 'abfss://gold@adlsexternalfororders.dfs.core.windows.net/'
  WITH (STORAGE CREDENTIAL `azure_cred_devandtest`)




In [0]:
-- External location for the `ordercatalog` container of storage account
-- `adlsexternalfororders`, accessed through storage credential
-- `azure_cred_devandtest`. The same URL is used below as the catalog's
-- managed location.
CREATE EXTERNAL LOCATION IF NOT EXISTS databricksextdl_ordercatalog
  URL 'abfss://ordercatalog@adlsexternalfororders.dfs.core.windows.net/'
  WITH (STORAGE CREDENTIAL `azure_cred_devandtest`)


In [0]:

-- External location for the `rawdata` container of storage account
-- `adlsexternalfororders`, accessed through storage credential
-- `azure_cred_devandtest`. IF NOT EXISTS keeps the cell safe to re-run.
CREATE EXTERNAL LOCATION IF NOT EXISTS databricksextdl_rawdata
  URL 'abfss://rawdata@adlsexternalfororders.dfs.core.windows.net/'
  WITH (STORAGE CREDENTIAL `azure_cred_devandtest`)

In [0]:
-- Create the `ordercatalog` catalog (no-op if it already exists) and root its
-- managed storage at the `ordercatalog` container.
CREATE CATALOG IF NOT EXISTS ordercatalog
  MANAGED LOCATION 'abfss://ordercatalog@adlsexternalfororders.dfs.core.windows.net/';

In [0]:
%sql

-- Make `ordercatalog` the current catalog so the unqualified schema names in
-- the statements that follow resolve inside it.
USE CATALOG ordercatalog;

-- COMMAND ----------
-- DBTITLE 1,Step 2: Create Schemas (Databases) with Managed Locations
-- Every schema below gets its own MANAGED LOCATION, so its managed tables are
-- stored under the given ADLS Gen2 container rather than the catalog default.

-- rawdata_schema: raw, untransformed data.
-- NOTE(review): a later cell writes JSON files directly to
-- abfss://rawdata@adlsexternalfororders.dfs.core.windows.net/orders/, which is
-- inside this managed location. Unity Catalog may reject path-based access that
-- overlaps managed storage; confirm, and consider a dedicated sub-directory.
CREATE SCHEMA IF NOT EXISTS rawdata_schema
  MANAGED LOCATION 'abfss://rawdata@adlsexternalfororders.dfs.core.windows.net/';

-- bronze_schema: ingested raw data, validated and possibly enriched with metadata.
CREATE SCHEMA IF NOT EXISTS bronze_schema
  MANAGED LOCATION 'abfss://bronze@adlsexternalfororders.dfs.core.windows.net/';

-- silver_schema: cleaned, transformed and potentially de-duplicated data.
CREATE SCHEMA IF NOT EXISTS silver_schema
  MANAGED LOCATION 'abfss://silver@adlsexternalfororders.dfs.core.windows.net/';

-- Gold Schema: aggregated, highly curated, ready-for-consumption data.
-- Managed tables in this schema are stored under the `gold` container.
-- (The abandoned, commented-out LOCATION clause that used to sit in the middle
-- of this statement, carrying its own ';', has been removed.)
CREATE SCHEMA IF NOT EXISTS gold_schema
MANAGED LOCATION 'abfss://gold@adlsexternalfororders.dfs.core.windows.net/';

-- COMMAND ----------
-- DBTITLE 1,Step 3: Verify the Catalog and Schemas
-- List the catalogs, then the schemas inside `ordercatalog`, to confirm that
-- the objects created above exist.
SHOW CATALOGS;
SHOW SCHEMAS IN ordercatalog;

-- To inspect a single schema's properties (including its managed location),
-- describe it, e.g.:
--   DESCRIBE SCHEMA EXTENDED ordercatalog.gold_schema;


In [0]:
%python


from pyspark.sql.functions import current_timestamp, lit, rand, round, col
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, TimestampType
import random
import os

# Column layout of a raw order record; every column is nullable.
# Note: a later cell rebinds `raw_order_schema` with `price` as FloatType, so
# that later definition is the one in effect when the DataFrames are built.
raw_order_schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("order_id", StringType()),
        ("customer_id", StringType()),
        ("product_id", StringType()),
        ("quantity", IntegerType()),
        ("price", DoubleType()),
        ("order_timestamp", TimestampType()),
        ("status", StringType()),
    )
])


In [0]:
%python
import random
from datetime import datetime
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType, TimestampType

# Schema applied to the generated order tuples. Field order matches the tuple
# returned by generate_order(); every column is nullable.
raw_order_schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("order_id", StringType()),
        ("customer_id", StringType()),
        ("product_id", StringType()),
        ("quantity", IntegerType()),
        ("price", FloatType()),
        ("order_timestamp", TimestampType()),
        ("status", StringType()),
    )
])

# Build one synthetic order record.
def generate_order(order_id_prefix="ORD"):
    """Return one randomly generated order as a 7-tuple.

    Args:
        order_id_prefix: Text placed before the dash in the order id.

    Returns:
        ``(order_id, customer_id, product_id, quantity, price,
        order_timestamp, status)`` where the ids are ``"<prefix>-<number>"``
        strings, ``quantity`` is an int in 1..10, ``price`` is a float in
        5..500 with two decimals, ``order_timestamp`` is ``datetime.now()``
        and ``status`` is one of PENDING / COMPLETED / CANCELLED / SHIPPED.
    """
    order_number = random.randint(10000, 99999)
    customer_number = random.randint(2000, 9999)
    product_number = random.randint(10, 100)
    units = random.randint(1, 10)
    # Format-then-parse keeps two decimals without relying on the builtin
    # round(), which an earlier cell shadows with pyspark.sql.functions.round.
    unit_price = float(format(random.uniform(5, 500), ".2f"))
    created_at = datetime.now()
    order_status = random.choice(["PENDING", "COMPLETED", "CANCELLED", "SHIPPED"])
    return (
        f"{order_id_prefix}-{order_number}",
        f"CUST-{customer_number}",
        f"PROD-{product_number}",
        units,
        unit_price,
        created_at,
        order_status,
    )

# --- Batch 1: initial load -------------------------------------------------
num_orders = 100
orders_data = [generate_order() for _ in range(num_orders)]

# Create DataFrame
raw_orders_df = spark.createDataFrame(orders_data, schema=raw_order_schema)

# Directory that receives the raw JSON output. Spark writes part files into
# this directory; it does not create a single named .json file.
# NOTE(review): this path lies under the container root that an earlier cell
# declares as rawdata_schema's MANAGED LOCATION. Unity Catalog may refuse
# path-based writes that overlap managed storage; confirm.
raw_data_path = "abfss://rawdata@adlsexternalfororders.dfs.core.windows.net/orders/"

# Initial load: overwrite whatever a previous run left in the directory.
raw_orders_df.write.format("json").mode("overwrite").save(raw_data_path)

print(f"Generated {num_orders} raw orders to: {raw_data_path}")

# --- Batch 2: new orders plus updates to some batch-1 orders ----------------
num_orders_batch2 = 250
num_updates = 10  # how many batch-1 orders get an updated record
orders_data_batch2 = [
    generate_order() for _ in range(num_orders_batch2 - num_updates)
]  # 240 brand-new orders

# Pick up to `num_updates` order ids from batch 1. A plain collect() is used
# instead of going through the RDD API.
existing_order_ids = [
    row["order_id"]
    for row in raw_orders_df.select("order_id").distinct().limit(num_updates).collect()
]
for oid in existing_order_ids:
    # Re-issue the order under the same id with fresh attributes and a
    # terminal status. Value ranges match generate_order().
    order_id = oid
    customer_id = f"CUST-{random.randint(2000, 9999)}"  # may or may not match the original customer
    product_id = f"PROD-{random.randint(10, 100)}"
    quantity = random.randint(1, 10)
    price = float(f"{random.uniform(5, 500):.2f}")
    order_timestamp = datetime.now()
    status = random.choice(["COMPLETED", "SHIPPED"])  # Update to a completed status
    orders_data_batch2.append((order_id, customer_id, product_id, quantity, price, order_timestamp, status))

raw_orders_df_batch2 = spark.createDataFrame(orders_data_batch2, schema=raw_order_schema)
# Append so batch 1 stays in place: overwriting here would delete the very
# orders that the update records refer to.
raw_orders_df_batch2.write.format("json").mode("append").save(raw_data_path)
print(f"Generated {len(orders_data_batch2)} raw orders (including updates) to: {raw_data_path}")

In [0]:
-- Materialise the raw orders as a Delta table in the bronze schema.
-- IF NOT EXISTS matches the other DDL in this notebook and lets the cell be
-- re-run without failing once the table exists. An existing table is left
-- untouched, so re-running does not reload data.
-- NOTE(review): ordercatalog.rawdata_schema.rawdata is not created in the
-- cells visible here; presumably it is defined elsewhere. Confirm it exists
-- before running this cell.
CREATE TABLE IF NOT EXISTS ordercatalog.bronze_schema.allorders
USING DELTA
AS SELECT * FROM ordercatalog.rawdata_schema.rawdata
