# Databricks App — Interactive Retail Analytics

Deploys a Gradio web app as a Databricks App with four tabs:
Customer 360, Revenue Explorer, Product Analytics, and Executive KPIs.

Queries run via the Databricks SDK Statement Execution API (no extra SQL driver needed).

**Prereqs**: Run notebooks 00–08 first.

## 1 — Configuration

In [None]:
import os

# Resolve the Unity Catalog names used throughout this notebook.
CATALOG = spark.catalog.currentCatalog()
GOLD    = f"{CATALOG}.retail_gold"
SILVER  = f"{CATALOG}.retail_silver"

# SQL warehouse for the app to use. An optional `warehouse_id` notebook widget
# takes precedence; without it the first usable warehouse is auto-detected.
# dbutils.widgets.get raises when the widget is not defined, so any failure is
# treated as "no widget" (the concrete exception type is runtime-specific).
try:
    WAREHOUSE_ID = dbutils.widgets.get("warehouse_id").strip()
except Exception:
    WAREHOUSE_ID = ""
if not WAREHOUSE_ID:
    # Fallback: pick the first available warehouse via SDK
    from databricks.sdk import WorkspaceClient
    w = WorkspaceClient()
    # wh.state is an Enum, so str(wh.state) yields "State.RUNNING" and never
    # equals "RUNNING"; compare on the enum's value instead.
    wh_list = [
        wh for wh in w.warehouses.list()
        if getattr(wh.state, "value", str(wh.state)) in ("RUNNING", "STOPPED")
    ]
    WAREHOUSE_ID = wh_list[0].id if wh_list else ""
    print(f"Auto-detected warehouse: {WAREHOUSE_ID}")
if not WAREHOUSE_ID:
    # Surface the problem here instead of failing later during deployment.
    print("⚠ No SQL warehouse found — set the `warehouse_id` widget before deploying.")

APP_NAME = "retail-analytics-app"
# The app sources live in the current user's workspace folder.
APP_DIR  = "/Workspace/Users/{}/apps/retail_analytics".format(
    spark.sql("SELECT current_user()").collect()[0][0]
)

print(f"Catalog      : {CATALOG}")
print(f"Gold         : {GOLD}")
print(f"Warehouse    : {WAREHOUSE_ID}")
print(f"App Dir      : {APP_DIR}")

## 2 — Create App Directory

In [None]:
import os
# Make sure the workspace folder that will hold the app sources exists.
# Safe to re-run: an existing directory is left untouched.
if not os.path.isdir(APP_DIR):
    os.makedirs(APP_DIR, exist_ok=True)
print(f"✓ App directory: {APP_DIR}")

## 3 — Write the Gradio App Code

The app queries the Gold and Silver tables through the Databricks SDK Statement Execution API and renders the results with Gradio.

In [None]:
# Write app.py by assembling its source one line at a time, which avoids nested string-literal escaping issues
import base64

# Build the app source code programmatically, one list entry per generated
# line, so the notebook never has to nest multi-line string literals.
#
# Notes on the generated app:
#   * Every user-supplied value that ends up in SQL (region, start/end month,
#     sort column) is validated against an allow-list or a strict pattern first.
#   * run_query() checks the statement state, so a failed or timed-out
#     statement is reported as an error instead of an empty result.
#   * repr(CATALOG) is used for the embedded default so an unusual catalog
#     name cannot break the generated string literal.
_lines = [
    "import os",
    "import re",
    "import logging",
    "",
    "import gradio as gr",
    "import pandas as pd",
    "from databricks.sdk import WorkspaceClient",
    "",
    "logging.basicConfig(level=logging.INFO)",
    "logger = logging.getLogger(__name__)",
    "",
    "CATALOG = os.getenv('CATALOG', " + repr(CATALOG) + ")",
    "GOLD = CATALOG + '.retail_gold'",
    "SILVER = CATALOG + '.retail_silver'",
    "WAREHOUSE_ID = os.getenv('DATABRICKS_WAREHOUSE_ID', '')",
    "",
    "# Allow-lists / patterns for values that are concatenated into SQL.",
    "REGIONS = ['AMERICA', 'EUROPE', 'ASIA', 'AFRICA', 'MIDDLE EAST']",
    "SORT_COLUMNS = ['net_revenue', 'margin_pct', 'return_rate_pct', 'orders']",
    "MONTH_RE = re.compile('[0-9]{4}-(0[1-9]|1[0-2])')",
    "",
    "def run_query(sql):",
    "    # Returns a DataFrame; any failure comes back as a one-column 'error' frame.",
    "    try:",
    "        w = WorkspaceClient()",
    "        resp = w.statement_execution.execute_statement(",
    "            warehouse_id=WAREHOUSE_ID, statement=sql, wait_timeout='30s')",
    "        status = resp.status",
    "        state = status.state.value if status and status.state else 'UNKNOWN'",
    "        if state != 'SUCCEEDED':",
    "            detail = status.error.message if status and status.error else 'no result within 30s'",
    "            raise RuntimeError('Statement ' + state + ': ' + str(detail))",
    "        if resp.result and resp.manifest:",
    "            cols = [c.name for c in resp.manifest.schema.columns]",
    "            rows = resp.result.data_array or []",
    "            return pd.DataFrame(rows, columns=cols)",
    "        return pd.DataFrame()",
    "    except Exception as e:",
    "        logger.error('Query failed: %s', e)",
    "        return pd.DataFrame({'error': [str(e)]})",
    "",
    "def customer_lookup(customer_id):",
    "    try:",
    "        cid = int(customer_id)",
    "    except (ValueError, TypeError):",
    "        return 'Enter a valid customer ID (integer).', None",
    "    sql = 'SELECT c.customer_key, c.customer_name, c.market_segment, '",
    "    sql += 'c.nation_name, c.region_name, c.balance_tier, '",
    "    sql += 'r.rfm_segment, r.rfm_score, ROUND(r.monetary,2) as lifetime_value, '",
    "    sql += 'r.frequency as total_orders, r.recency_days, '",
    "    sql += 'ROUND(r.avg_order_value,2) as avg_order_value '",
    "    sql += 'FROM ' + SILVER + '.dim_customer c '",
    "    sql += 'LEFT JOIN ' + GOLD + '.gold_customer_rfm r '",
    "    sql += 'ON c.customer_key = r.customer_key '",
    "    sql += 'WHERE c.customer_key = ' + str(cid)",
    "    profile = run_query(sql)",
    "    if 'error' in profile.columns:",
    "        return 'Query error: ' + str(profile['error'].iloc[0]), None",
    "    if profile.empty:",
    "        return 'Customer ' + str(cid) + ' not found.', None",
    "    r = profile.iloc[0]",
    "    md = '## Customer #' + str(r.get('customer_key','')) + ' - ' + str(r.get('customer_name',''))",
    "    md += '\\n\\n| Attribute | Value |\\n|---|---|'",
    "    md += '\\n| Segment | ' + str(r.get('market_segment','')) + ' |'",
    "    md += '\\n| Region | ' + str(r.get('region_name','')) + ' (' + str(r.get('nation_name','')) + ') |'",
    "    md += '\\n| RFM Segment | ' + str(r.get('rfm_segment','')) + ' |'",
    "    md += '\\n| Lifetime Value | ' + str(r.get('lifetime_value','')) + ' |'",
    "    md += '\\n| Total Orders | ' + str(r.get('total_orders','')) + ' |'",
    "    md += '\\n| Avg Order Value | ' + str(r.get('avg_order_value','')) + ' |'",
    "    md += '\\n| Recency (days) | ' + str(r.get('recency_days','')) + ' |'",
    "    return md, profile",
    "",
    "def revenue_explorer(region, start_month, end_month):",
    "    start_month = (start_month or '').strip()",
    "    end_month = (end_month or '').strip()",
    "    # Reject anything that is not a known region or a yyyy-MM month before building SQL.",
    "    if region != 'ALL' and region not in REGIONS:",
    "        return 'Unknown region.', None",
    "    for label, value in (('Start', start_month), ('End', end_month)):",
    "        if value and not MONTH_RE.fullmatch(value):",
    "            return label + ' month must use the yyyy-MM format.', None",
    "    w = 'WHERE 1=1'",
    "    if region != 'ALL':",
    "        w += \" AND region = '\" + region + \"'\"",
    "    if start_month:",
    "        w += \" AND year_month >= '\" + start_month + \"'\"",
    "    if end_month:",
    "        w += \" AND year_month <= '\" + end_month + \"'\"",
    "    sql = 'SELECT year_month, region, ROUND(SUM(net_revenue),0) as net_revenue, '",
    "    sql += 'SUM(num_orders) as orders, ROUND(AVG(profit_margin_pct),1) as margin_pct '",
    "    sql += 'FROM ' + GOLD + '.gold_monthly_sales ' + w",
    "    sql += ' GROUP BY year_month, region ORDER BY year_month, region'",
    "    df = run_query(sql)",
    "    if 'error' in df.columns:",
    "        return 'Query error: ' + str(df['error'].iloc[0]), df",
    "    if 'net_revenue' not in df.columns:",
    "        return 'No data returned.', df",
    "    total = pd.to_numeric(df['net_revenue'], errors='coerce').sum()",
    "    return '**Revenue**: ${:,.0f} | **Rows**: {}'.format(total, len(df)), df",
    "",
    "def product_analytics(sort_by, top_n):",
    "    # sort_by goes into ORDER BY verbatim, so only allow known column aliases.",
    "    if sort_by not in SORT_COLUMNS:",
    "        return pd.DataFrame({'error': ['Unsupported sort column.']})",
    "    sql = 'SELECT brand, price_band, ROUND(SUM(net_revenue),0) as net_revenue, '",
    "    sql += 'ROUND(AVG(profit_margin_pct),1) as margin_pct, '",
    "    sql += 'ROUND(AVG(return_rate_pct),1) as return_rate_pct, '",
    "    sql += 'SUM(num_orders) as orders '",
    "    sql += 'FROM ' + GOLD + '.gold_product_performance '",
    "    sql += 'GROUP BY brand, price_band ORDER BY ' + sort_by + ' DESC '",
    "    sql += 'LIMIT ' + str(int(top_n))",
    "    return run_query(sql)",
    "",
    "def executive_kpis():",
    "    sql = 'SELECT year_quarter, total_orders, active_customers, '",
    "    sql += 'ROUND(gross_order_value,0) as gross_order_value, '",
    "    sql += 'ROUND(avg_order_value,0) as avg_order_value, '",
    "    sql += 'ROUND(revenue_per_customer,0) as rev_per_customer, '",
    "    sql += 'qoq_revenue_growth_pct '",
    "    sql += 'FROM ' + GOLD + '.gold_executive_summary ORDER BY year_quarter'",
    "    return run_query(sql)",
    "",
    "with gr.Blocks(title='Retail Analytics', theme=gr.themes.Soft()) as app:",
    "    gr.Markdown('# Retail Analytics Dashboard')",
    "    gr.Markdown('Powered by **Databricks Lakehouse**')",
    "    with gr.Tab('Customer 360'):",
    "        gr.Markdown('### Look up any customer by ID')",
    "        with gr.Row():",
    "            ci = gr.Textbox(label='Customer ID', placeholder='e.g. 42', scale=1)",
    "            cb = gr.Button('Look Up', variant='primary', scale=1)",
    "        cm = gr.Markdown()",
    "        ct = gr.Dataframe(label='Raw Profile')",
    "        cb.click(customer_lookup, inputs=ci, outputs=[cm, ct])",
    "    with gr.Tab('Revenue Explorer'):",
    "        gr.Markdown('### Monthly revenue by region')",
    "        with gr.Row():",
    "            rd = gr.Dropdown(choices=['ALL'] + REGIONS, value='ALL', label='Region')",
    "            sm = gr.Textbox(label='Start (yyyy-MM)', value='1995-01')",
    "            em = gr.Textbox(label='End (yyyy-MM)', value='1997-12')",
    "            rb = gr.Button('Query', variant='primary')",
    "        rs = gr.Markdown()",
    "        rt = gr.Dataframe(label='Revenue Data')",
    "        rb.click(revenue_explorer, inputs=[rd, sm, em], outputs=[rs, rt])",
    "    with gr.Tab('Product Analytics'):",
    "        gr.Markdown('### Product performance by brand')",
    "        with gr.Row():",
    "            sd = gr.Dropdown(choices=SORT_COLUMNS, value='net_revenue', label='Sort By')",
    "            tn = gr.Slider(minimum=5, maximum=50, value=20, step=5, label='Top N')",
    "            pb = gr.Button('Query', variant='primary')",
    "        pt = gr.Dataframe(label='Product Performance')",
    "        pb.click(product_analytics, inputs=[sd, tn], outputs=pt)",
    "    with gr.Tab('Executive KPIs'):",
    "        gr.Markdown('### Quarterly executive summary')",
    "        eb = gr.Button('Load KPIs', variant='primary')",
    "        et = gr.Dataframe(label='Quarterly KPIs')",
    "        eb.click(executive_kpis, outputs=et)",
    "",
    "if __name__ == '__main__':",
    "    logger.info('Starting Retail Analytics App (catalog=%s)', CATALOG)",
    "    app.launch(server_name='0.0.0.0', server_port=int(os.getenv('PORT', '8000')))",
]

app_code = "\n".join(_lines) + "\n"
app_file = f"{APP_DIR}/app.py"

# Fail here, not at deploy time, if the generated source has a syntax error.
# compile() only parses the code; nothing in it is executed or imported.
compile(app_code, app_file, "exec")

with open(app_file, "w", encoding="utf-8") as f:
    f.write(app_code)
print(f"✓ App code written to: {app_file}")
print(f"  Lines: {len(_lines)}")

## 4 — Write App Configuration

In [None]:
import json

# app.yaml tells the Databricks Apps runtime how to start the app and which
# environment variables to inject.
#
# Values are emitted through json.dumps so they are always double-quoted
# scalars (valid YAML). Unquoted, an ID made only of digits or shaped like
# "123e456" would be parsed as a number rather than a string.
if not WAREHOUSE_ID:
    print("⚠ WAREHOUSE_ID is empty — the app will not be able to run queries.")

config_content = f"""command:
- python
- app.py
env:
- name: CATALOG
  value: {json.dumps(CATALOG)}
- name: DATABRICKS_WAREHOUSE_ID
  value: {json.dumps(WAREHOUSE_ID)}
"""

config_file = f"{APP_DIR}/app.yaml"

with open(config_file, "w", encoding="utf-8") as f:
    f.write(config_content)
print(f"✓ App config: {config_file}")
print(config_content)

In [None]:
# Python dependencies for the app, one package spec per line.
# databricks-sdk is listed even though it is pre-installed in the app runtime.
_packages = ["gradio>=4.0", "databricks-sdk", "pandas"]
requirements = "\n".join(_packages) + "\n"

req_file = APP_DIR + "/requirements.txt"

with open(req_file, "w") as out:
    out.write(requirements)
print(f"✓ Requirements: {req_file}")

## 5 — Deploy the Databricks App

In [None]:
%pip install databricks-sdk --upgrade --quiet

In [None]:
import requests, time, json

# Re-init variables (kernel may restart between cells)
CATALOG  = spark.catalog.currentCatalog()
GOLD     = f"{CATALOG}.retail_gold"
SILVER   = f"{CATALOG}.retail_silver"
APP_NAME = "retail-analytics-app"
APP_DIR  = "/Workspace/Users/{}/apps/retail_analytics".format(
    spark.sql("SELECT current_user()").collect()[0][0]
)

# Re-detect warehouse. The `warehouse_id` widget wins so the warehouse attached
# to the app is the same one that was written to app.yaml; dbutils.widgets.get
# raises when the widget does not exist, which is treated as "not set".
try:
    WAREHOUSE_ID = dbutils.widgets.get("warehouse_id").strip()
except Exception:
    WAREHOUSE_ID = ""
if not WAREHOUSE_ID:
    from databricks.sdk import WorkspaceClient as _WC
    _w = _WC()
    # wh.state is an Enum: str() gives "State.RUNNING", so compare on .value.
    _wh = [
        wh for wh in _w.warehouses.list()
        if getattr(wh.state, "value", str(wh.state)) in ("RUNNING", "STOPPED")
    ]
    WAREHOUSE_ID = _wh[0].id if _wh else ""
if not WAREHOUSE_ID:
    print("⚠ No SQL warehouse found — app creation is likely to fail.")

db_token = dbutils.notebook.entry_point.getDbutils().notebook().getContext().apiToken().get()
db_host  = spark.conf.get("spark.databricks.workspaceUrl")
headers  = {"Authorization": f"Bearer {db_token}", "Content-Type": "application/json"}
base     = f"https://{db_host}/api/2.0/apps"

HTTP_TIMEOUT  = 60   # seconds per request; a stalled call must not hang the notebook
POLL_SECONDS  = 10   # pause between deployment status checks
POLL_ATTEMPTS = 30   # 30 x 10s = up to 5 minutes

# ── Step A: Create the app via REST API ───────────────────────────────────────
print("Creating app...")
resp = requests.post(base, headers=headers, timeout=HTTP_TIMEOUT, json={
    "name": APP_NAME,
    "description": "Retail analytics dashboard — Gold-layer Delta tables",
    "resources": [
        {"name": "sql-warehouse", "sql_warehouse": {"id": WAREHOUSE_ID, "permission": "CAN_USE"}},
    ],
})
if resp.status_code in (200, 201):
    app_data = resp.json()
    print(f"✓ App created: {APP_NAME}")
    print(f"  App URL: {app_data.get('url', 'pending...')}")
elif resp.status_code == 409:
    print(f"○ App '{APP_NAME}' already exists — reusing it")
else:
    print(f"⚠ App creation ({resp.status_code}): {resp.text[:300]}")

# ── Step B: Deploy source code via REST API ───────────────────────────────────
# NOTE(review): a freshly created app may still be starting its compute here;
# if the deployment request is rejected for that reason, re-run this cell.
print("\nDeploying app...")
resp = requests.post(f"{base}/{APP_NAME}/deployments", headers=headers, timeout=HTTP_TIMEOUT, json={
    "source_code_path": APP_DIR,
    "mode": "SNAPSHOT",
})
if resp.status_code in (200, 201):
    dep = resp.json()
    dep_id = dep.get("deployment_id", "")
    print(f"✓ Deployment started: {dep_id}")

    # Poll until deployment completes (up to 5 min)
    for i in range(POLL_ATTEMPTS):
        time.sleep(POLL_SECONDS)
        elapsed = (i + 1) * POLL_SECONDS  # the sleep above has already happened
        status_resp = requests.get(
            f"{base}/{APP_NAME}/deployments/{dep_id}", headers=headers, timeout=HTTP_TIMEOUT
        )
        if status_resp.status_code != 200:
            print(f"  [{elapsed}s] Status check returned HTTP {status_resp.status_code} — retrying")
            continue
        status = status_resp.json().get("status", {})
        state = status.get("state", "")
        print(f"  [{elapsed}s] Status: {state}")
        if state == "SUCCEEDED":
            # Get the app URL
            app_resp = requests.get(f"{base}/{APP_NAME}", headers=headers, timeout=HTTP_TIMEOUT)
            if app_resp.status_code == 200:
                app_url = app_resp.json().get("url", "")
                # Only add a scheme when the API value lacks one, to avoid
                # printing "https://https://...".
                if app_url and not app_url.startswith(("http://", "https://")):
                    app_url = f"https://{app_url}"
                print(f"\n✓ App deployed successfully!")
                print(f"  URL: {app_url}")
            break
        if state in ("FAILED", "CANCELLED"):
            # Both are terminal: stop polling instead of waiting out the timeout.
            print(f"\n✗ Deployment {state.lower()}: {status.get('message', '')}")
            break
    else:
        print("  Deployment still in progress — check Compute → Apps in the UI")
else:
    print(f"⚠ Deployment ({resp.status_code}): {resp.text[:300]}")
    print(f"\nManual deployment:")
    print(f"  1. Go to Compute → Apps → click '{APP_NAME}'")
    print(f"  2. Create a new deployment with source: {APP_DIR}")

## 5b — Grant App Service Principal Access to Data

The Databricks App runs under its own service principal. It needs explicit grants to read your Unity Catalog tables.

In [None]:
# Look up the app's service principal
import requests, json

db_token = dbutils.notebook.entry_point.getDbutils().notebook().getContext().apiToken().get()
db_host  = spark.conf.get("spark.databricks.workspaceUrl")
headers  = {"Authorization": f"Bearer {db_token}", "Content-Type": "application/json"}

resp = requests.get(
    f"https://{db_host}/api/2.0/apps/{APP_NAME}",
    headers=headers,
    timeout=60,  # do not let a stalled API call hang the notebook
)
if resp.status_code == 200:
    app_info = resp.json()
else:
    # An error payload has none of the fields read below; say why they are missing.
    print(f"⚠ App lookup ({resp.status_code}): {resp.text[:300]}")
    app_info = {}
sp_name = app_info.get("service_principal_name", "")
sp_id   = app_info.get("service_principal_id", "")
# Unity Catalog GRANT statements identify a service principal by its
# application (client) ID rather than its display name. Fall back to the name
# only when the API response does not carry the client ID.
sp_principal = app_info.get("service_principal_client_id") or sp_name

print(f"App service principal: {sp_name} (id={sp_id})")

if sp_principal:
    # Grant the service principal access to the catalog and schemas
    grants = [
        f"GRANT USE CATALOG ON CATALOG {CATALOG} TO `{sp_principal}`",
        f"GRANT USE SCHEMA ON SCHEMA {CATALOG}.retail_gold TO `{sp_principal}`",
        f"GRANT USE SCHEMA ON SCHEMA {CATALOG}.retail_silver TO `{sp_principal}`",
        f"GRANT SELECT ON SCHEMA {CATALOG}.retail_gold TO `{sp_principal}`",
        f"GRANT SELECT ON SCHEMA {CATALOG}.retail_silver TO `{sp_principal}`",
    ]
    failed = 0
    for sql in grants:
        # Best effort: keep going so one failing grant does not block the rest.
        try:
            spark.sql(sql)
            print(f"  ✓ {sql.split('GRANT ')[1][:60]}")
        except Exception as e:
            failed += 1
            print(f"  ⚠ {sql.split('GRANT ')[1][:40]}... → {str(e)[:80]}")

    # Only report success when every statement actually went through.
    if failed:
        print(f"\n⚠ {failed} of {len(grants)} grants failed — fix them before using the app.")
    else:
        print(f"\n✓ Permissions granted. Restart the app or try again.")
else:
    print("⚠ Could not find app service principal. Grant permissions manually:")
    print(f"  GRANT USE CATALOG ON CATALOG {CATALOG} TO `<app_service_principal>`")
    print(f"  GRANT SELECT ON SCHEMA {CATALOG}.retail_gold TO `<app_service_principal>`")
    print(f"  GRANT SELECT ON SCHEMA {CATALOG}.retail_silver TO `<app_service_principal>`")

## 6 — Verify App Files

In [None]:
import os

# List every entry in the app folder with its size; non-files are shown as 0 bytes.
print(f"App directory contents ({APP_DIR}):")
for entry in os.listdir(APP_DIR):
    path = os.path.join(APP_DIR, entry)
    if os.path.isfile(path):
        n_bytes = os.path.getsize(path)
    else:
        n_bytes = 0
    print(f"  {entry:<25} {n_bytes:>8} bytes")

## 7 — Quick Smoke Test (Query Gold from notebook)

In [None]:
# Run notebook-side versions of the app's three main lookups against the same
# tables, so missing tables or columns show up before anyone opens the app.
print("Smoke test — queries that the app executes:\n")

# Check 1: single-customer profile joined with its RFM row.
print("1. Customer lookup (ID=42):")
customer_sql = f"""
    SELECT c.customer_key, c.customer_name, c.market_segment, c.region_name,
           r.rfm_segment, r.rfm_score, ROUND(r.monetary, 2) as ltv
    FROM {SILVER}.dim_customer c
    LEFT JOIN {GOLD}.gold_customer_rfm r ON c.customer_key = r.customer_key
    WHERE c.customer_key = 42
"""
customer_df = spark.sql(customer_sql)
display(customer_df)

# Check 2: monthly revenue for one region over one year.
print("\n2. Revenue summary (AMERICA, 1996):")
revenue_sql = f"""
    SELECT year_month, region, ROUND(SUM(net_revenue), 0) as net_revenue
    FROM {GOLD}.gold_monthly_sales
    WHERE region = 'AMERICA' AND year_month >= '1996-01' AND year_month <= '1996-12'
    GROUP BY year_month, region
    ORDER BY year_month
"""
revenue_df = spark.sql(revenue_sql)
display(revenue_df)

# Check 3: brands ranked by total net revenue.
print("\n3. Top 5 brands:")
brands_sql = f"""
    SELECT brand, ROUND(SUM(net_revenue), 0) as revenue
    FROM {GOLD}.gold_product_performance
    GROUP BY brand
    ORDER BY revenue DESC
    LIMIT 5
"""
brands_df = spark.sql(brands_sql)
display(brands_df)

---
App deployed. Open the URL printed above to use it.

This is the last notebook — the full pipeline is now live.