In [0]:
# Notebook parameters, registered as text widgets.
# Each entry is (widget name, default value); widgets are created in list order.
widget_defaults = [
    # Storage and environment
    ("storage_account_name", "etldatalakeabhi"),
    ("container_name", "datalake"),
    ("env", "dev"),
    # Source file URL and other pipeline settings
    ("github_url", "https://raw.githubusercontent.com/abhishektripathi27/databricks-etl-pipeline/main/data/orders.csv"),
    ("min_order_amount", "0.01"),
    ("cancelled_status", "cancelled"),
]

for widget_name, default_value in widget_defaults:
    dbutils.widgets.text(widget_name, default_value)


In [0]:
# Read the notebook parameters back from the widgets.
storage_account = dbutils.widgets.get("storage_account_name")
container_name = dbutils.widgets.get("container_name")
env = dbutils.widgets.get("env")
github_url = dbutils.widgets.get("github_url")
cancelled_status = dbutils.widgets.get("cancelled_status")
# The minimum order amount is converted to a float for later use.
min_order_amount = float(dbutils.widgets.get("min_order_amount"))

# Dynamic ADLS paths: every layer sits under the same container root,
# so the root is built once and each layer only appends its folder.
adls_root = f"abfss://{container_name}@{storage_account}.dfs.core.windows.net"
raw_path = f"{adls_root}/raw/"
bronze_path = f"{adls_root}/bronze/"
silver_path = f"{adls_root}/silver/"
gold_path = f"{adls_root}/gold/"
checkpoint_path = f"{adls_root}/checkpoint/"


In [0]:
# Build the ADLS paths for each layer and print them for inspection.
#
# The storage account and container are read from the notebook widgets instead of
# being hard-coded, so this cell no longer overwrites the widget-driven
# configuration with literals: changing a widget value is reflected here.
storage_account = dbutils.widgets.get("storage_account_name")
container = dbutils.widgets.get("container_name")

# All layers share the same container root.
adls_root = f"abfss://{container}@{storage_account}.dfs.core.windows.net"

raw_path = f"{adls_root}/raw/"
bronze_path = f"{adls_root}/bronze/"
silver_path = f"{adls_root}/silver/"
gold_path = f"{adls_root}/gold/"
checkpoint_path = f"{adls_root}/checkpoint/"

print("RAW:", raw_path)
print("BRONZE:", bronze_path)
print("SILVER:", silver_path)
print("GOLD:", gold_path)
print("CHECKPOINT:", checkpoint_path)


# Unity Catalog External Location Setup Guide

To access your Azure storage (`etldatalakeabhi.dfs.core.windows.net`) on serverless compute, you need to set up Unity Catalog external locations.

## Prerequisites (Azure Portal Steps)

### Step 1: Create an Access Connector for Azure Databricks

1. Log in to the **Azure Portal**
2. Click **+ Create a resource**
3. Search for **"Access Connector for Azure Databricks"** and select it
4. Click **Create** and fill in:
   * **Subscription**: Your Azure subscription
   * **Resource Group**: Your resource group (or create new)
   * **Name**: `databricks-access-connector-etl` (or your preferred name)
   * **Region**: Same region as your storage account
5. On the **Managed Identity** tab:
   * Set **Status** to **On** (for system-assigned managed identity)
6. Click **Review + create**, then **Create**
7. Once deployed, go to the resource and **copy the Resource ID**
   * Format: `/subscriptions/<subscription-id>/resourceGroups/<resource-group>/providers/Microsoft.Databricks/accessConnectors/<connector-name>`

### Step 2: Grant Storage Access to the Managed Identity

1. In Azure Portal, navigate to your storage account: **`etldatalakeabhi`**
2. Go to **Access Control (IAM)**
3. Click **+ Add** → **Add role assignment**
4. Select role: **Storage Blob Data Contributor** (or **Storage Blob Data Owner** if you also need to manage ACLs and ownership)
5. Click **Next**
6. Select **Managed identity**
7. Click **+ Select members**
8. Find and select your access connector: **`databricks-access-connector-etl`**
9. Click **Review + assign**

### Step 3: Configure Network Access (if storage has firewall enabled)

If your storage account has network restrictions:
1. Go to your storage account → **Networking**
2. Under **Exceptions**, enable: **"Allow Azure services on the trusted services list to access this storage account"**

---

## Next Steps

Once you complete the Azure Portal steps above, proceed to the next cells to create the storage credential and external locations in Databricks.

## Create Storage Credential in Databricks

After completing the Azure Portal steps, you have two options:

### Option A: Using the Databricks UI (Recommended for first-time setup)

1. In your Databricks workspace, click **Catalog** in the left sidebar
2. Click the **External Data** button → **Credentials** tab
3. Click **Create credential**
4. Select **Storage credential**
5. Fill in:
   * **Credential Type**: Azure Managed Identity
   * **Name**: `azure_storage_credential_etl`
   * **Access Connector Resource ID**: Paste the Resource ID from Azure Portal
6. Click **Create**

### Option B: Using the Databricks SDK (run the cell below)

Before running it, make sure the access connector Resource ID in the cell below is the actual Resource ID you copied from the Azure Portal.

In [0]:
# Create storage credential using Databricks SDK
# This requires the databricks-sdk package
#
# The cell is safe to re-run: when a credential with this name already exists it is
# reported and reused, instead of calling create() again and ending up in the
# generic error branch.

STORAGE_CREDENTIAL_NAME = "azure_storage_credential_etl"
# Resource ID of the Access Connector for Azure Databricks (copied from the Azure Portal).
ACCESS_CONNECTOR_ID = "/subscriptions/9cc5a0bc-59a0-41c7-9f55-81c0cbefd7ca/resourceGroups/rg-databricks/providers/Microsoft.Databricks/accessConnectors/access_connector_databricks"

try:
    # Imported inside the try so a missing SDK produces the install hint below.
    from databricks.sdk import WorkspaceClient
    from databricks.sdk.service.catalog import AzureManagedIdentity

    # Initialize the workspace client (uses notebook context for auth)
    w = WorkspaceClient()

    # Look for an existing credential with the same name before creating one.
    credential = next(
        (c for c in w.storage_credentials.list() if c.name == STORAGE_CREDENTIAL_NAME),
        None,
    )

    if credential is not None:
        print("✓ Storage credential already exists - skipping creation.")
    else:
        # Create the storage credential
        credential = w.storage_credentials.create(
            name=STORAGE_CREDENTIAL_NAME,
            azure_managed_identity=AzureManagedIdentity(
                access_connector_id=ACCESS_CONNECTOR_ID
            ),
            comment="Storage credential for ETL data lake"
        )
        print("✓ Storage credential created successfully!")

    print(f"  Name: {credential.name}")
    print(f"  ID: {credential.id}")

except ImportError:
    print("⚠ Databricks SDK not installed.")
    print("Install it with: %pip install databricks-sdk")
    print("\nAlternatively, use the CLI or UI method described in Cell 5.")

except Exception as e:
    # Best-effort cell: report the failure and the usual causes rather than abort the notebook.
    print(f"✗ Error creating storage credential: {e}")
    print("\nPossible reasons:")
    print("1. You don't have CREATE STORAGE CREDENTIAL privilege (need metastore admin)")
    print("2. The access connector doesn't exist in Azure Portal")
    print("3. The access connector ID is incorrect")
    print("\nPlease use the UI method or contact your workspace administrator.")

Storage credential created successfully!
  Name: azure_storage_credential_etl
  ID: bcd1d9c0-8d1f-42c0-9a71-1f83d4d7e649

In [0]:
# Confirm that at least one storage credential is visible
# (useful when the credential was created through the UI).
try:
    credentials = spark.sql("SHOW STORAGE CREDENTIALS").collect()
    if not credentials:
        print("⚠ No storage credentials found.")
        print("Please create one using the UI (see Cell 4) or SQL (Cell 5)")
    else:
        print("✓ Available Storage Credentials:")
        for credential_row in credentials:
            # Print the first column of each returned row.
            print(f"  - {credential_row[0]}")
except Exception as err:
    # Report the failure and point at the likely permission issue.
    print(f"Error: {err}")
    print("\nYou may need metastore admin privileges to view/create storage credentials.")
    print("Please contact your workspace administrator.")

In [0]:
%sql
-- Create one Unity Catalog external location per layer of the data lake
-- (raw, bronze, silver, gold, checkpoint).
-- Prerequisite: the storage credential 'azure_storage_credential_etl' must already exist.
-- Every statement uses IF NOT EXISTS, so an existing location is not re-created
-- when the cell is run again.
-- NOTE(review): the URLs are literals matching the default widget values
-- (storage account 'etldatalakeabhi', container 'datalake'); they are not derived
-- from the widgets, so update them by hand if those values change.

-- RAW layer
CREATE EXTERNAL LOCATION IF NOT EXISTS etl_raw_location
URL 'abfss://datalake@etldatalakeabhi.dfs.core.windows.net/raw/'
WITH (STORAGE CREDENTIAL azure_storage_credential_etl)
COMMENT 'Raw data landing zone';

-- BRONZE layer
CREATE EXTERNAL LOCATION IF NOT EXISTS etl_bronze_location
URL 'abfss://datalake@etldatalakeabhi.dfs.core.windows.net/bronze/'
WITH (STORAGE CREDENTIAL azure_storage_credential_etl)
COMMENT 'Bronze layer - raw ingested data';

-- SILVER layer
CREATE EXTERNAL LOCATION IF NOT EXISTS etl_silver_location
URL 'abfss://datalake@etldatalakeabhi.dfs.core.windows.net/silver/'
WITH (STORAGE CREDENTIAL azure_storage_credential_etl)
COMMENT 'Silver layer - cleaned and validated data';

-- GOLD layer
CREATE EXTERNAL LOCATION IF NOT EXISTS etl_gold_location
URL 'abfss://datalake@etldatalakeabhi.dfs.core.windows.net/gold/'
WITH (STORAGE CREDENTIAL azure_storage_credential_etl)
COMMENT 'Gold layer - business-level aggregates';

-- CHECKPOINT location
CREATE EXTERNAL LOCATION IF NOT EXISTS etl_checkpoint_location
URL 'abfss://datalake@etldatalakeabhi.dfs.core.windows.net/checkpoint/'
WITH (STORAGE CREDENTIAL azure_storage_credential_etl)
COMMENT 'Checkpoint location for streaming jobs';

-- Verify all external locations were created
SHOW EXTERNAL LOCATIONS;

In [0]:
%sql
-- Grant the named user privileges on each of the five external locations
-- (raw, bronze, silver, gold, checkpoint) so they can create tables and
-- read/write data in these locations.
-- NOTE(review): the grantee is a hard-coded individual account and ALL PRIVILEGES
-- is the broadest grant; consider a group principal and a narrower privilege
-- set. Confirm with the workspace administrator.

GRANT ALL PRIVILEGES ON EXTERNAL LOCATION etl_raw_location TO `tripathiabhi@hotmail.com`;
GRANT ALL PRIVILEGES ON EXTERNAL LOCATION etl_bronze_location TO `tripathiabhi@hotmail.com`;
GRANT ALL PRIVILEGES ON EXTERNAL LOCATION etl_silver_location TO `tripathiabhi@hotmail.com`;
GRANT ALL PRIVILEGES ON EXTERNAL LOCATION etl_gold_location TO `tripathiabhi@hotmail.com`;
GRANT ALL PRIVILEGES ON EXTERNAL LOCATION etl_checkpoint_location TO `tripathiabhi@hotmail.com`;

In [0]:
# Test that you can now access the storage through Unity Catalog
# This should work without any spark.conf.set() calls
#
# The account and container are read from the notebook widgets rather than
# hard-coded, so the check targets the configured storage and does not overwrite
# `storage_account` with a literal for the cells that follow.

storage_account = dbutils.widgets.get("storage_account_name")
container = dbutils.widgets.get("container_name")

raw_dir = f"abfss://{container}@{storage_account}.dfs.core.windows.net/raw/"

# Test listing the raw directory
try:
    files = dbutils.fs.ls(raw_dir)
    print("✓ Successfully accessed storage!")
    print(f"\nFound {len(files)} items in raw/ directory:")
    for entry in files[:10]:  # Show first 10 items
        print(f"  - {entry.name}")
except Exception as e:
    # Best-effort check: print the error plus a troubleshooting checklist.
    print(f"✗ Error accessing storage: {e}")
    print("\nTroubleshooting:")
    print("1. Make sure you completed all Azure Portal steps (Cell 3)")
    print("2. Verify storage credential was created (Cell 6)")
    print("3. Verify external locations were created (Cell 7)")
    print("4. Check that permissions were granted (Cell 8)")

In [0]:
# Updated storage paths - no authentication needed!
# Unity Catalog handles authentication automatically through external locations
#
# The account and container are read from the notebook widgets rather than
# hard-coded, so the paths used by the rest of the pipeline follow the widget
# values instead of being silently reset to literals by this cell.

storage_account = dbutils.widgets.get("storage_account_name")
container = dbutils.widgets.get("container_name")

# All layers share the same container root.
adls_root = f"abfss://{container}@{storage_account}.dfs.core.windows.net"

raw_path = f"{adls_root}/raw/"
bronze_path = f"{adls_root}/bronze/"
silver_path = f"{adls_root}/silver/"
gold_path = f"{adls_root}/gold/"
checkpoint_path = f"{adls_root}/checkpoint/"

print("✓ Storage paths configured:")
print(f"RAW: {raw_path}")
print(f"BRONZE: {bronze_path}")
print(f"SILVER: {silver_path}")
print(f"GOLD: {gold_path}")
print(f"CHECKPOINT: {checkpoint_path}")
print("\n✓ No spark.conf.set() needed - Unity Catalog handles authentication!")
print("\nYou can now use these paths in your ETL pipeline without any authentication code.")