In [0]:
# Source notebook: https://oportun-sandbox.cloud.databricks.com/editor/notebooks/3331349926406355?o=2282150804257627
# AI/BI Dashboard Setup for All-Purpose Cluster Cost Analysis
import html
from datetime import datetime, timedelta

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from pyspark.sql.functions import col, sum as spark_sum, avg, count, countDistinct, round as spark_round

# Configure visualization settings for AI/BI Dashboard
plt.style.use('default')
sns.set_palette('Set2')

# Dashboard Header
# The emoji and bullet separators are written as HTML character references
# (&#x...; / &bull;) so the markup stays pure ASCII: the previous literal
# characters had been corrupted into mojibake by a mis-decoded save/export.
displayHTML("""
<div style='background: linear-gradient(135deg, #1e3c72 0%, #2a5298 100%); padding: 30px; border-radius: 15px; margin-bottom: 25px; box-shadow: 0 8px 32px rgba(0,0,0,0.1);'>
    <h1 style='color: white; text-align: center; margin: 0; font-size: 2.8em; font-weight: 300;'>
        &#x1F680; All-Purpose Cluster Cost Analytics Dashboard
    </h1>
    <p style='color: rgba(255,255,255,0.9); text-align: center; margin: 15px 0 0 0; font-size: 1.3em; font-weight: 300;'>
        AI-Powered Insights &bull; Cost Optimization &bull; Performance Analytics
    </p>
    <div style='text-align: center; margin-top: 20px;'>
        <span style='background: rgba(255,255,255,0.2); padding: 8px 16px; border-radius: 20px; color: white; font-size: 0.9em;'>
            &#x1F4CA; Real-time Data &bull; &#x1F4A1; Smart Recommendations &bull; &#x1F4C8; Trend Analysis
        </span>
    </div>
</div>
""")

# Verify data availability
# Every table read by the dashboard queries in this notebook is probed, so the
# "all required tables" banner is only shown when each of them can be queried.
BILLING_SCHEMA = "ex_dash_temp.billing_forecast"

# Tables reported with a row count only (no date range is computed for them).
ROW_COUNT_ONLY_TABLES = [
    "user_total_cost",
    "cluster_total_cost",
    "instance_total_cost",
    "user_opportunities",
    "cluster_opportunities",
]

# Flipped to True only after the validation query has run and been displayed.
data_sources_ok = False

try:
    # The base table additionally reports the date range it covers.
    base_select = f"""
    SELECT
        'all_purpose_base' as table_name,
        COUNT(*) as record_count,
        MIN(usage_date) as start_date,
        MAX(usage_date) as end_date
    FROM {BILLING_SCHEMA}.all_purpose_base"""

    row_count_selects = [
        f"""
    SELECT
        '{table}' as table_name,
        COUNT(*) as record_count,
        NULL as start_date,
        NULL as end_date
    FROM {BILLING_SCHEMA}.{table}"""
        for table in ROW_COUNT_ONLY_TABLES
    ]

    # One result row per table; a missing table makes spark.sql raise.
    tables_check = spark.sql(
        "\n    UNION ALL\n".join([base_select, *row_count_selects])
    )

    display(tables_check)

    displayHTML("""
    <div style='background: #d4edda; border: 1px solid #c3e6cb; padding: 15px; border-radius: 8px; margin: 20px 0;'>
        <h4 style='color: #155724; margin: 0 0 10px 0;'>&#x2705; Data Source Validation</h4>
        <p style='color: #155724; margin: 0;'>All required tables are available and ready for dashboard visualization.</p>
    </div>
    """)
    data_sources_ok = True

except Exception as e:
    # Best-effort: show the problem instead of aborting the notebook. The
    # message is HTML-escaped because exception text can contain characters
    # such as '<' or '&' that would otherwise break the rendered banner.
    displayHTML(f"""
    <div style='background: #f8d7da; border: 1px solid #f5c6cb; padding: 15px; border-radius: 8px; margin: 20px 0;'>
        <h4 style='color: #721c24; margin: 0 0 10px 0;'>&#x26A0;&#xFE0F; Data Source Issue</h4>
        <p style='color: #721c24; margin: 0;'>Please run the source notebook first: {html.escape(str(e))}</p>
    </div>
    """)

# Only report success when the validation above actually passed.
if data_sources_ok:
    print("\U0001F3AF AI/BI Dashboard initialized successfully!")
else:
    print("\u26A0\uFE0F AI/BI Dashboard initialized, but data source validation failed - see the message above.")

In [0]:
%sql
-- Executive Summary KPIs for AI/BI Dashboard
-- Emits one row per KPI with the columns (metric, value, unit, type).
-- The four aggregates are computed over the whole base table in a single
-- one-row CTE and then unpivoted into rows, in the order listed below.
WITH kpi AS (
  SELECT
    ROUND(SUM(total_cost_usd), 2)                              AS total_cost,
    COUNT(DISTINCT principal_email)                            AS active_users,
    COUNT(DISTINCT cluster_id)                                 AS active_clusters,
    ROUND(SUM(total_cost_usd) / COUNT(DISTINCT usage_date), 2) AS avg_daily_cost
  FROM ex_dash_temp.billing_forecast.all_purpose_base
)
SELECT 'Total Cost' AS metric, total_cost AS value, 'USD' AS unit, 'primary' AS type
FROM kpi

UNION ALL

SELECT 'Active Users' AS metric, active_users AS value, 'users' AS unit, 'info' AS type
FROM kpi

UNION ALL

SELECT 'Active Clusters' AS metric, active_clusters AS value, 'clusters' AS unit, 'warning' AS type
FROM kpi

UNION ALL

SELECT 'Avg Daily Cost' AS metric, avg_daily_cost AS value, 'USD/day' AS unit, 'success' AS type
FROM kpi

In [0]:
%sql
-- Daily Cost Trends for Time Series Chart
-- One output row per usage_date, sorted oldest first.
SELECT
  base.usage_date,
  ROUND(SUM(base.total_cost_usd), 2)   AS daily_cost_usd,
  ROUND(SUM(base.dbus), 2)             AS daily_dbus,
  COUNT(DISTINCT base.principal_email) AS active_users,
  COUNT(DISTINCT base.cluster_id)      AS active_clusters,
  -- Mean over the individual base-table rows of the day, not over users or clusters.
  ROUND(AVG(base.total_cost_usd), 2)   AS avg_cost_per_record
FROM ex_dash_temp.billing_forecast.all_purpose_base AS base
GROUP BY base.usage_date
ORDER BY base.usage_date ASC

In [0]:
%sql
-- Top 15 Users by Total Cost with Utilization
-- Reads the per-user summary table, keeps rows with total_cost_usd > 50 and
-- returns the 15 most expensive, each tagged with a utilization category.
SELECT 
  principal_email,
  principal_type,
  ROUND(total_cost_usd, 2) as total_cost_usd,
  ROUND(avg_cpu_pct, 1) as avg_cpu_utilization,
  ROUND(avg_mem_pct, 1) as avg_memory_utilization,
  active_days,
  clusters_used,
  ROUND(telemetry_coverage_pct, 1) as telemetry_coverage,
  CASE 
    WHEN avg_cpu_pct < 10 AND avg_mem_pct < 20 THEN 'Under-utilized'
    WHEN avg_cpu_pct > 80 OR avg_mem_pct > 80 THEN 'Over-utilized'
    -- Comparisons against a NULL metric evaluate to NULL, so without this
    -- branch a user with missing CPU/memory data would fall through to ELSE
    -- and be reported as 'Well-utilized'.
    WHEN avg_cpu_pct IS NULL OR avg_mem_pct IS NULL THEN 'Unknown'
    ELSE 'Well-utilized'
  END as utilization_category
FROM ex_dash_temp.billing_forecast.user_total_cost
WHERE total_cost_usd > 50  -- Filter for meaningful costs
ORDER BY total_cost_usd DESC
LIMIT 15

In [0]:
%sql
-- Cluster Utilization Distribution for Pie/Donut Chart
-- Step 1 (CTE): label every cluster row that has a CPU metric with a category.
-- Step 2: aggregate per category and compute each category's share of the
--         total cost across all categories.
WITH categorized AS (
  SELECT
    CASE
      WHEN avg_cpu_pct < 10 AND avg_mem_pct < 20 THEN 'Under-utilized'
      WHEN avg_cpu_pct > 80 OR avg_mem_pct > 80 THEN 'Over-utilized'
      ELSE 'Well-utilized'
    END AS utilization_category,
    total_cost_usd,
    avg_cpu_pct,
    avg_mem_pct
  FROM ex_dash_temp.billing_forecast.cluster_total_cost
  WHERE avg_cpu_pct IS NOT NULL
)
SELECT
  utilization_category,
  COUNT(*)                      AS cluster_count,
  ROUND(SUM(total_cost_usd), 2) AS total_cost,
  ROUND(AVG(avg_cpu_pct), 1)    AS avg_cpu_utilization,
  ROUND(AVG(avg_mem_pct), 1)    AS avg_memory_utilization,
  -- The window SUM over the grouped SUMs is the grand total of all categories.
  ROUND(SUM(total_cost_usd) * 100.0 / SUM(SUM(total_cost_usd)) OVER (), 1) AS cost_percentage
FROM categorized
GROUP BY utilization_category
ORDER BY total_cost DESC

In [0]:
%sql
-- Instance Type Cost and Performance Analysis
-- Returns the 20 most expensive instance types with total_cost_usd > 100,
-- plus per-cluster and per-core cost ratios.
SELECT 
  instance_type,
  unique_clusters as cluster_count,
  ROUND(total_cost_usd, 2) as total_cost_usd,
  -- NULLIF turns a zero divisor into NULL so the ratio becomes NULL instead
  -- of failing the whole query with DIVIDE_BY_ZERO when ANSI mode is enabled.
  ROUND(total_cost_usd / NULLIF(unique_clusters, 0), 2) as avg_cost_per_cluster,
  ROUND(avg_cpu_pct, 1) as avg_cpu_utilization,
  ROUND(avg_mem_pct, 1) as avg_memory_utilization,
  core_count,
  memory_gb,
  ROUND(telemetry_coverage_pct, 1) as telemetry_coverage,
  photon_enabled_pct,
  ROUND(total_cost_usd / NULLIF(core_count, 0), 2) as cost_per_core
FROM ex_dash_temp.billing_forecast.instance_total_cost
WHERE total_cost_usd > 100  -- Filter for significant costs
ORDER BY total_cost_usd DESC
LIMIT 20

In [0]:
%sql
-- Workspace Cost Distribution
-- One row per workspace_name, most expensive first.
SELECT 
  workspace_name,
  COUNT(DISTINCT principal_email) as unique_users,
  COUNT(DISTINCT cluster_id) as unique_clusters,
  ROUND(SUM(total_cost_usd), 2) as total_cost_usd,
  -- Cost per distinct usage day. A plain AVG(total_cost_usd) would average
  -- over individual base-table rows (several per day), which is not a daily
  -- figure; this matches how 'Avg Daily Cost' is computed in the KPI query.
  ROUND(SUM(total_cost_usd) / NULLIF(COUNT(DISTINCT usage_date), 0), 2) as avg_daily_cost,
  -- Share of the grand total across all workspaces; NULL when that total is 0.
  ROUND(SUM(total_cost_usd) * 100.0 / NULLIF(SUM(SUM(total_cost_usd)) OVER (), 0), 1) as cost_percentage,
  MIN(usage_date) as first_usage_date,
  MAX(usage_date) as last_usage_date
FROM ex_dash_temp.billing_forecast.all_purpose_base
GROUP BY workspace_name
ORDER BY total_cost_usd DESC

In [0]:
%sql
-- Cost Optimization Summary by Priority
-- Combines the user-level and cluster-level opportunity tables into one
-- result with a row per (optimization_level, opportunity_priority).
-- The UNION is wrapped in a CTE so the final ORDER BY unambiguously applies
-- to the combined result rather than to the last SELECT branch.
WITH opportunity_summary AS (
  SELECT 
    'User Level' as optimization_level,
    opportunity_priority,
    COUNT(*) as opportunity_count,
    ROUND(SUM(total_cost_usd), 2) as current_cost,
    ROUND(SUM(validated_savings), 2) as potential_savings,
    ROUND(AVG(avg_cpu_pct), 1) as avg_cpu_utilization,
    ROUND(AVG(avg_mem_pct), 1) as avg_memory_utilization,
    -- NULLIF yields NULL instead of a DIVIDE_BY_ZERO error (ANSI mode) when
    -- a priority group has zero total cost.
    ROUND(SUM(validated_savings) * 100.0 / NULLIF(SUM(total_cost_usd), 0), 1) as savings_percentage
  FROM ex_dash_temp.billing_forecast.user_opportunities
  GROUP BY opportunity_priority

  UNION ALL

  SELECT 
    'Cluster Level' as optimization_level,
    opportunity_priority,
    COUNT(*) as opportunity_count,
    ROUND(SUM(total_cost_usd), 2) as current_cost,
    ROUND(SUM(validated_savings), 2) as potential_savings,
    ROUND(AVG(avg_cpu_pct), 1) as avg_cpu_utilization,
    ROUND(AVG(avg_mem_pct), 1) as avg_memory_utilization,
    ROUND(SUM(validated_savings) * 100.0 / NULLIF(SUM(total_cost_usd), 0), 1) as savings_percentage
  FROM ex_dash_temp.billing_forecast.cluster_opportunities
  GROUP BY opportunity_priority
)
SELECT
  optimization_level,
  opportunity_priority,
  opportunity_count,
  current_cost,
  potential_savings,
  avg_cpu_utilization,
  avg_memory_utilization,
  savings_percentage
FROM opportunity_summary
-- High, Medium, Low in that order; any other priority value sorts last.
ORDER BY optimization_level, 
  CASE opportunity_priority
    WHEN 'High' THEN 1
    WHEN 'Medium' THEN 2
    WHEN 'Low' THEN 3
    ELSE 4
  END