# Create and Validate KPI Dashboard Views (Databricks SQL)
This notebook installs dependencies, connects to Databricks SQL using environment variables, executes the SQL in `sql/dashboard_views.sql`, validates the created views, queries them, plots KPIs, and saves CSV outputs.

## 1) Set Up Python Dependencies
Import the required packages; if any import fails, install them with pip into the current kernel's environment and import again.

In [None]:
# Auto-install dependencies if missing and import
import os, re, sys, subprocess, pathlib

def ensure_packages(pkgs):
    """Install the given packages with pip, using the running interpreter.

    Args:
        pkgs: A single requirement string or an iterable of requirement
            strings (e.g. ``["pandas", "matplotlib"]``). ``None``, an empty
            string, or an empty iterable means "nothing to install".

    Returns:
        None.

    Raises:
        Exception: Whatever ``subprocess.check_call`` raised (for example
            ``subprocess.CalledProcessError`` on a non-zero pip exit). The
            error is printed first and then re-raised unchanged.
    """
    # A bare string would otherwise be unpacked character by character below.
    if pkgs is None or isinstance(pkgs, str):
        requirements = [pkgs] if pkgs else []
    else:
        requirements = list(pkgs)

    if not requirements:
        # pip exits with an error when given no requirement, so skip the call.
        return

    try:
        # sys.executable targets the interpreter that is running this code.
        subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", *requirements])
    except Exception as e:
        print("Package install failed:", e)
        raise

# Ensure core packages: import them, installing on demand if one is missing.
try:
    from databricks import sql as dbsql  # noqa: F401
    import pandas as pd  # noqa: F401
    import matplotlib.pyplot as plt  # noqa: F401
except ImportError:
    # Only a failed import triggers an install; any other error raised while
    # importing would not be fixed by pip and is left to propagate.
    import importlib

    ensure_packages(["databricks-sql-connector", "pandas", "matplotlib", "python-dotenv"])
    # Packages installed while the interpreter is running may not be noticed
    # by the import system until its finder caches are invalidated.
    importlib.invalidate_caches()
    from databricks import sql as dbsql
    import pandas as pd
    import matplotlib.pyplot as plt

from datetime import datetime, timedelta

# Report the versions of the key libraries in use
print("pandas:", pd.__version__)
try:
    import databricks.sql as _tmp
except Exception as e:
    print("databricks-sql-connector not imported:", e)
else:
    # Fall back to a fixed marker when the module exposes no __version__.
    print("databricks-sql-connector:", getattr(_tmp, "__version__", "installed"))

ModuleNotFoundError: No module named 'pandas'

## 2) Configure Databricks SQL Connection (env vars)
We read credentials from environment variables (or a local .env).

In [None]:
# Optional: load credentials from a .env file. The path defaults to the
# project-root .env and can be overridden through the DOTENV_PATH variable
# (a leading "~" in the override is expanded).
dotenv_file = os.path.expanduser(
    os.getenv("DOTENV_PATH", "/Users/kritan/data-monorepo/.env")
)
try:
    from dotenv import load_dotenv
    load_dotenv(dotenv_path=dotenv_file)
except Exception as e:
    # Best effort: if this fails, the values must already be present in the
    # process environment.
    print(".env load skipped:", e)

# The tip below shows the host as a bare hostname. Tolerate a pasted workspace
# URL by dropping the scheme and any trailing slash.
# NOTE(review): presumably the connector expects no scheme here - confirm.
DATABRICKS_HOST = re.sub(
    r"^https?://", "", (os.getenv("DATABRICKS_HOST") or "").strip(), flags=re.IGNORECASE
).rstrip("/")
DATABRICKS_HTTP_PATH = os.getenv("DATABRICKS_HTTP_PATH")
DATABRICKS_TOKEN = os.getenv("DATABRICKS_TOKEN")

# Fail early, before any connection attempt, when a setting is absent or empty.
if not DATABRICKS_HOST or not DATABRICKS_HTTP_PATH or not DATABRICKS_TOKEN:
    raise RuntimeError(
        "Missing required env vars. Set DATABRICKS_HOST, DATABRICKS_HTTP_PATH, DATABRICKS_TOKEN.\n"
        f"Tip: create {dotenv_file} with:\n"
        "DATABRICKS_HOST=adb-xxxx.azuredatabricks.net\n"
        "DATABRICKS_HTTP_PATH=/sql/1.0/warehouses/xxxxx\n"
        "DATABRICKS_TOKEN=xxxxxxxx\n"
    )

# The token is deliberately never printed; the HTTP path is truncated.
print("Using host:", DATABRICKS_HOST)
print("Using http_path:", DATABRICKS_HTTP_PATH[:16] + "...")

## 3) Open Connection
Create a Databricks SQL connection and a cursor.

In [None]:
# Open the connection from the configured settings, then create the cursor
# from it.
conn = dbsql.connect(server_hostname=DATABRICKS_HOST,
                     http_path=DATABRICKS_HTTP_PATH,
                     access_token=DATABRICKS_TOKEN)
cur = conn.cursor()
print("Connected.")

## 4) Execute SQL to Create Views from File
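Read the repository SQL file and execute its statements one at a time. Re-running this cell is only idempotent if the statements themselves are (e.g. `CREATE OR REPLACE VIEW`).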

In [None]:
sql_path = "/Users/kritan/data-monorepo/real-time-lakehouse/sql/dashboard_views.sql"
text = pathlib.Path(sql_path).read_text(encoding="utf-8")

# Drop whole-line "--" comments. Statements are then separated wherever a
# semicolon is followed only by whitespace up to a newline or the end of text.
lines = [ln for ln in text.splitlines() if not ln.strip().startswith("--")]
cleaned = "\n".join(lines)

stmts = []
for chunk in re.split(r";\s*(?:\n|$)", cleaned):
    chunk = chunk.strip()
    if chunk:
        stmts.append(chunk)

# Run every statement even if an earlier one fails, collecting the failures
# so they can be reported together afterwards.
executed = 0
errors = []
for stmt in stmts:
    preview = stmt[:120].replace("\n", " ")
    try:
        print("Executing:\n", preview, "...")
        cur.execute(stmt)
    except Exception as exc:
        print("Error executing statement:", exc)
        errors.append((stmt, str(exc)))
    else:
        executed += 1

print(f"Executed {executed} statements. Errors: {len(errors)}")
if errors:
    # Only the first five failures are echoed.
    for idx, (failed_sql, msg) in enumerate(errors[:5], start=1):
        print(f"[{idx}] {msg}\nSQL: {failed_sql[:200]}...")
    raise RuntimeError("One or more SQL statements failed. See logs above.")

## 5) Validate Created Views
Confirm the expected views exist.

In [None]:
# (catalog, schema, view name) triples that are expected to exist.
views_to_check = [
    ("demo", "ecommerce_rt", "v_kpi_timeseries"),
    ("demo", "ecommerce_rt", "v_kpi_last_15m"),
]

rows = []
missing = []
for cat, sch, name in views_to_check:
    cur.execute(f"USE CATALOG {cat}")
    cur.execute(f"USE SCHEMA {sch}")
    # Match on catalog and schema as well as name: a same-named view elsewhere
    # must neither satisfy the check nor inflate the number of matches.
    cur.execute("""
        SELECT table_catalog, table_schema, table_name
        FROM system.information_schema.views
        WHERE table_catalog = ?
          AND table_schema = ?
          AND table_name = ?
    """, (cat, sch, name))
    found = cur.fetchall()
    if found:
        rows.extend(found)
    else:
        missing.append(f"{cat}.{sch}.{name}")

print("Found views:")
for r in rows:
    print(r)

# Decide per view rather than by comparing totals, so the error can name
# exactly which views are absent.
if missing:
    raise RuntimeError(
        "Expected views not found. Verify your SQL warehouse has access to catalog/schema and that the gold table exists."
        "\nMissing: " + ", ".join(missing)
    )

## 6) Query v_kpi_timeseries
Load recent KPI time series into a pandas DataFrame.

In [None]:
limit_n = 500

# Select the catalog and schema, then fetch the newest `limit_n` windows.
for use_stmt in ("USE CATALOG demo", "USE SCHEMA ecommerce_rt"):
    cur.execute(use_stmt)
cur.execute(f"""
    SELECT window_start, window_end, orders, gmv, active_users, conversion_rate
    FROM v_kpi_timeseries
    ORDER BY window_start DESC
    LIMIT {limit_n}
""")
cols = [column[0] for column in cur.description]
rows = cur.fetchall()

# The query returns newest-first; reorder so rows run oldest to newest.
ts_df = (
    pd.DataFrame(rows, columns=cols)
    .sort_values("window_start")
    .reset_index(drop=True)
)
ts_df.head()

## 7) Query v_kpi_last_15m with Parameterized Window
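Recompute the last-window aggregates directly from `gold_kpis` (rather than reading `v_kpi_last_15m`) so the window length in minutes can be customized.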

In [None]:
minutes = 15  # length of the trailing aggregation window, in minutes

for use_stmt in ("USE CATALOG demo", "USE SCHEMA ecommerce_rt"):
    cur.execute(use_stmt)

# Aggregate every gold_kpis row whose window started within the last
# `minutes` minutes.
window_sql = f"""
    SELECT
      SUM(orders) AS orders_agg,
      SUM(gmv) AS gmv_agg,
      AVG(conversion_rate) AS conversion_agg,
      MAX(window_start) AS last_window
    FROM gold_kpis
    WHERE window_start >= now() - INTERVAL {minutes} minutes
"""
cur.execute(window_sql)
cols = [column[0] for column in cur.description]
rows = cur.fetchall()

last_df = pd.DataFrame(data=rows, columns=cols)
last_df

## 8) Plot KPIs Over Time
Plot orders and GMV; GMV on secondary axis.

In [None]:
# Plot orders (left axis) and GMV (right axis) against window_start.
if ts_df.empty:
    print("No timeseries data to plot.")
else:
    fig, ax1 = plt.subplots(figsize=(10, 5))
    ax2 = ax1.twinx()  # second y-axis sharing the same x-axis

    # Keep the line handles so both series can share a single legend.
    (orders_line,) = ax1.plot(ts_df["window_start"], ts_df["orders"], color="tab:blue", label="orders")
    (gmv_line,) = ax2.plot(ts_df["window_start"], ts_df["gmv"], color="tab:orange", label="gmv")

    ax1.set_title("Orders and GMV over time")
    ax1.set_xlabel("window_start")
    ax1.set_ylabel("orders", color="tab:blue")
    ax2.set_ylabel("gmv", color="tab:orange")
    ax1.grid(True, alpha=0.3)
    # The labels passed to plot() are only shown once a legend is drawn. One
    # legend built from both handles covers the two axes.
    ax2.legend(handles=[orders_line, gmv_line], loc="upper left")
    fig.tight_layout()
    plt.show()

## 9) Save Query Results to CSV
Write the time series and last-window KPIs as CSV files to the project's `out/` directory.

In [None]:
out_dir = pathlib.Path("/Users/kritan/data-monorepo/real-time-lakehouse/out")
out_dir.mkdir(parents=True, exist_ok=True)

# (global name of the DataFrame, output file name, message when nothing is saved)
exports = (
    ("ts_df", "kpi_timeseries.csv", "Timeseries DataFrame empty; skipped save."),
    ("last_df", "kpi_last_window.csv", "Last-window DataFrame empty; skipped save."),
)
for frame_name, file_name, skip_message in exports:
    # Look the frame up by name so that a frame which was never created is
    # handled the same way as an empty one.
    if frame_name not in globals() or globals()[frame_name].empty:
        print(skip_message)
        continue
    target = out_dir / file_name
    globals()[frame_name].to_csv(target, index=False)
    print("Saved:", target)

## 10) Basic Assertions for Data Quality
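Simple checks to fail fast if data is unexpected. The cursor and connection are closed at the end of this cell, so re-run section 3 before querying again.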

In [None]:
# Data-quality checks on the query results. The cursor and connection are
# always closed afterwards, whether or not an assertion fails.
try:
    if 'ts_df' in globals() and not ts_df.empty:
        assert (ts_df['orders'] >= 0).all(), "orders must be non-negative"
        assert (ts_df['gmv'] >= 0).all(), "gmv must be non-negative"
        assert ((ts_df['conversion_rate'] >= 0) & (ts_df['conversion_rate'] <= 1)).all(), "conversion_rate must be in [0,1]"
    else:
        print("Warning: ts_df is empty; skipping KPI assertions.")

    if 'last_df' in globals() and not last_df.empty:
        agg_row = last_df.iloc[0]
        agg_cols = ['orders_agg', 'gmv_agg', 'conversion_agg']
        # SQL aggregates over zero matching rows come back as NULL. Comparing
        # None with a number raises TypeError, so detect that case explicitly.
        null_cols = [c for c in agg_cols if pd.isna(agg_row[c])]
        if len(null_cols) == len(agg_cols):
            print("Warning: last-window aggregates are all NULL (no rows in window); skipping aggregate assertions.")
        else:
            assert not null_cols, f"aggregates must not be NULL: {null_cols}"
            assert agg_row['orders_agg'] >= 0, "orders_agg must be non-negative"
            assert agg_row['gmv_agg'] >= 0, "gmv_agg must be non-negative"
            assert 0 <= agg_row['conversion_agg'] <= 1, "conversion_agg must be in [0,1]"
    else:
        print("Warning: last_df is empty; skipping aggregate assertions.")
finally:
    # Best-effort cleanup: a failure to close is reported but never masks an
    # assertion error raised above.
    try:
        cur.close()
    except Exception as exc:
        print("Warning: could not close cursor:", exc)
    try:
        conn.close()
    except Exception as exc:
        print("Warning: could not close connection:", exc)
print("Done.")