In [0]:
# Execute the sibling notebook 00_setup_storage so that the parameters it
# defines can be used by the cells below.
# NOTE(review): that notebook's contents are not visible here — presumably it
# defines the storage path variables; confirm against 00_setup_storage.
%run "./00_setup_storage"


In [0]:
# Databricks notebook source
# -----------------------------
# STEP 0: External storage locations for each pipeline layer
# -----------------------------
# All layers are folders under the same abfss container root, so each layer
# path is derived from that single root URI.
_lake_root = "abfss://datalake@etldatalakeabhi.dfs.core.windows.net/"

raw_path        = _lake_root + "raw/"
bronze_path     = _lake_root + "bronze/"
silver_path     = _lake_root + "silver/"
gold_path       = _lake_root + "gold/"
checkpoint_path = _lake_root + "checkpoint/"

# Echo the resolved locations so the run output records what was configured.
print("External storage paths configured:")
print(f"RAW: {raw_path}")
print(f"BRONZE: {bronze_path}")
print(f"SILVER: {silver_path}")
print(f"GOLD: {gold_path}")
print(f"CHECKPOINT: {checkpoint_path}")

# Optional access check: list the contents of the raw folder.
display(dbutils.fs.ls(raw_path))

In [0]:
# Databricks notebook source
# -----------------------------
# STEP 1: Ensure the bronze, silver and gold schemas exist in Unity Catalog
# -----------------------------
# One CREATE SCHEMA IF NOT EXISTS statement is issued per layer, in order,
# inside the ws_databricks_etl catalog.
for layer in ("bronze", "silver", "gold"):
    spark.sql(f"CREATE SCHEMA IF NOT EXISTS ws_databricks_etl.{layer}")

print("Schemas created successfully: bronze, silver, gold")


In [0]:
# Databricks notebook source
# -----------------------------
# STEP 2: Download orders.csv from GitHub and load it into a Spark DataFrame
# -----------------------------
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import current_timestamp

github_url = "https://raw.githubusercontent.com/abhishektripathi27/databricks-etl-pipeline/main/data/orders.csv"

# pandas reads the CSV straight from the raw GitHub URL.
pdf = pd.read_csv(github_url)

# Turn the pandas frame into a Spark DataFrame and append an
# ingestion_timestamp column populated by current_timestamp().
orders_df = (
    SparkSession.builder.getOrCreate()
    .createDataFrame(pdf)
    .withColumn("ingestion_timestamp", current_timestamp())
)

# Render the resulting DataFrame for a visual check.
display(orders_df)


In [0]:
# Databricks notebook source
# -----------------------------
# STEP 3: Persist orders_df in Delta format under the bronze external path
# -----------------------------
# Target folder for the orders data inside the bronze layer.
bronze_orders_target = f"{bronze_path}orders/"

# Overwrite mode is requested, so any data already at the target is replaced.
(
    orders_df.write
    .mode("overwrite")
    .format("delta")
    .save(bronze_orders_target)
)

print("Bronze Delta written successfully to external location")


In [0]:
# Databricks notebook source
# -----------------------------
# STEP 4: Register the bronze orders Delta location as a Unity Catalog table
# -----------------------------
# The DDL points the table at the bronze orders folder; IF NOT EXISTS makes
# the statement a no-op when the table is already registered.
bronze_orders_ddl = (
    "\n"
    "CREATE TABLE IF NOT EXISTS ws_databricks_etl.bronze.orders\n"
    "USING DELTA\n"
    f"LOCATION '{bronze_path}orders/'\n"
)
spark.sql(bronze_orders_ddl)

print("Bronze orders table registered successfully in Unity Catalog")


In [0]:
# Verify the bronze write: list the files in the orders folder, then read the
# Delta data back and preview it.
#
# The folder is derived from bronze_path (the same variable the write and
# table-registration cells use) rather than a second hard-coded URI, so this
# check always inspects the location the pipeline actually wrote to.
bronze_orders_path = bronze_path + "orders/"

# List bronze orders folder
display(dbutils.fs.ls(bronze_orders_path))

# Read and preview
df = spark.read.format("delta").load(bronze_orders_path)
display(df)
