# Databricks Insight Agent - Example Analysis

This notebook demonstrates how to use the Databricks Insight Agent for enterprise security and network analysis.

## Overview

The Databricks Insight Agent is a multi-agent system that analyzes Databricks system tables to provide actionable insights on:
- Network activity patterns
- Security threats and anomalies
- Operational performance
- Compliance monitoring

## Prerequisites

- Databricks workspace with system table access
- OpenAI API key
- Required Python packages installed (`databricks-sdk`, `dspy-ai`, `mlflow`, `pandas`, `openai`, `structlog`)

In [None]:
# Install required packages (if not already installed)
# %pip install databricks-sdk dspy-ai mlflow pandas openai structlog

In [None]:
# Import required libraries
import sys
import os
from pathlib import Path
import pandas as pd
import asyncio
from datetime import datetime

# Add the project's source directory to the import path.
# The path is resolved to an absolute one so the sys.path entry keeps working
# if the working directory changes later, and so re-running this cell never
# appends a second, differently-spelled entry for the same directory.
project_path = Path("../src").resolve()
if str(project_path) not in sys.path:
    sys.path.append(str(project_path))

print("Libraries imported successfully")

In [None]:
# Setup configuration
from utils.config import config
from utils.logging import setup_logging

# Setup logging
setup_logging(log_level="INFO")

# Configure credentials.
# Each value is taken from the environment variable of the same name when it is
# set, so real secrets do not have to be pasted into the notebook. The
# placeholder literals are only a fallback and must be replaced (or the
# variables exported) before running against a real workspace.
config.set(
    'DATABRICKS_HOST',
    os.environ.get('DATABRICKS_HOST', 'https://your-workspace.cloud.databricks.com'),
)
config.set(
    'DATABRICKS_TOKEN',
    os.environ.get('DATABRICKS_TOKEN', 'your-databricks-token'),
)
config.set(
    'OPENAI_API_KEY',
    os.environ.get('OPENAI_API_KEY', 'your-openai-api-key'),
)

print("Configuration setup complete")

In [None]:
# Initialize agents
from agents.orchestrator import OrchestratorAgent
from agents.reporting import ReportingAgent

# Create the orchestrator and the reporting agent (both with default arguments).
# Later cells reach the individual agents through the orchestrator
# (orchestrator.data_agent, .network_agent, .security_agent, .scoring_system)
# and use `reporting` to render the final report.
orchestrator = OrchestratorAgent()
reporting = ReportingAgent()

print("Agents initialized")

## Data Ingestion

First, let's collect data from Databricks system tables.

In [None]:
# Test data ingestion
async def test_ingestion():
    """Connect to Databricks and collect a two-hour sample of data.

    Uses the module-level ``orchestrator``. Prints the connection status and
    one line per collected entry with its record count.

    Returns:
        The mapping returned by ``collect_all_data``, or ``None`` when
        ``connect()`` reports failure.
    """
    # Bail out early if the connection attempt is reported as unsuccessful.
    if not await orchestrator.data_agent.connect():
        print("Failed to connect to Databricks")
        return None
    print("Connected to Databricks successfully")

    # Sample window: the last 2 hours.
    collected = await orchestrator.data_agent.collect_all_data(hours_back=2)

    print("Collected data:")
    for source_name, frame in collected.items():
        record_count = len(frame)
        print(f"  {source_name}: {record_count} records")

    return collected

# Run data ingestion
data = await test_ingestion()

## Network Analysis

Analyze network patterns and connectivity.

In [None]:
# Test network analysis
async def test_network_analysis():
    """Run the network agent over the audit logs collected earlier.

    Reads the module-level ``data`` and ``orchestrator``. Prints the risk
    score, the analyzed-event count and up to five key findings.

    Returns:
        The result of ``analyze_network_activity``, or ``None`` when ``data``
        is ``None`` or has no ``'audit_logs'`` entry.
    """
    audit_available = data is not None and 'audit_logs' in data
    if not audit_available:
        print("No audit data available")
        return None

    events = data['audit_logs']
    print(f"Analyzing {len(events)} audit events...")

    outcome = await orchestrator.network_agent.analyze_network_activity(events)

    print("Network Analysis Results:")
    risk_score = outcome.get('risk_score', 'N/A')
    print(f"  Risk Score: {risk_score}/10")
    analyzed_count = outcome.get('analyzed_events', 0)
    print(f"  Events Analyzed: {analyzed_count}")

    # Findings are optional; show at most the first five.
    if 'key_findings' in outcome:
        print("\nKey Findings:")
        top_findings = outcome['key_findings'][:5]
        for item in top_findings:
            print(f"  - {item}")

    return outcome

# Run network analysis
network_results = await test_network_analysis()

## Security Analysis

Analyze security threats and compliance.

In [None]:
# Test security analysis
async def test_security_analysis():
    """Run the security agent over the audit logs collected earlier.

    Reads the module-level ``data`` and ``orchestrator``. Prints the threat
    level, the analyzed-event count and up to five security findings.

    Returns:
        The result of ``analyze_security_threats``, or ``None`` when ``data``
        is ``None`` or has no ``'audit_logs'`` entry.
    """
    audit_available = data is not None and 'audit_logs' in data
    if not audit_available:
        print("No audit data available")
        return None

    events = data['audit_logs']
    print(f"Analyzing security in {len(events)} audit events...")

    outcome = await orchestrator.security_agent.analyze_security_threats(events)

    print("Security Analysis Results:")
    threat_level = outcome.get('threat_level', 'UNKNOWN')
    print(f"  Threat Level: {threat_level}")
    analyzed_count = outcome.get('analyzed_events', 0)
    print(f"  Events Analyzed: {analyzed_count}")

    # Findings are optional; show at most the first five.
    if 'security_findings' in outcome:
        print("\nSecurity Findings:")
        top_findings = outcome['security_findings'][:5]
        for item in top_findings:
            print(f"  - {item}")

    return outcome

# Run security analysis
security_results = await test_security_analysis()

## Full Analysis Workflow

Run the complete analysis workflow.

In [None]:
# Run complete analysis
async def run_full_analysis():
    """Run the orchestrator's complete workflow and print a summary.

    Uses the module-level ``orchestrator`` with a two-hour look-back window.
    Prints the elapsed time, the overall threat level, the data summary
    counts and up to five recommendations.

    Returns:
        The result of ``orchestrator.run_full_analysis``.
    """
    # Local import so this cell does not depend on another cell's imports.
    import time

    print("Running full analysis workflow...")

    # Time with a monotonic clock: subtracting naive datetime.now() values
    # gives a wrong duration if the wall clock is adjusted (DST change, NTP
    # step) while the analysis is running.
    started = time.perf_counter()

    # Run analysis
    results = await orchestrator.run_full_analysis(hours_back=2)

    duration = time.perf_counter() - started

    print(f"\nAnalysis completed in {duration:.2f} seconds")
    print(f"Overall Threat Level: {results.get('overall_threat_level', 'UNKNOWN')}")

    # Display key metrics (missing counters are shown as 0)
    data_summary = results.get('data_summary', {})
    print("\nData Summary:")
    print(f"  Audit Events: {data_summary.get('audit_events', 0)}")
    print(f"  Clusters: {data_summary.get('clusters', 0)}")
    print(f"  Queries: {data_summary.get('queries', 0)}")

    # Display at most the first five recommendations
    recommendations = results.get('recommendations', [])
    if recommendations:
        print("\nRecommendations:")
        for rec in recommendations[:5]:
            print(f"  - {rec}")

    return results

# Run full analysis
full_results = await run_full_analysis()

## Generate Report

Create a comprehensive report from the analysis.

In [None]:
# Generate report
async def generate_report():
    """Generate the full report and print a short preview of it.

    Reads the module-level ``full_results`` and ``reporting``. Does nothing
    beyond printing a notice when ``full_results`` is ``None``. Otherwise it
    asks the reporting agent for a ``"full"`` report, prints the file path
    found under ``['metadata']['file_path']`` in the result, and previews up
    to the first 1000 characters of that file.

    Returns:
        None.
    """
    if full_results is None:
        print("No analysis results available")
        return

    print("Generating analysis report...")

    # Generate full report
    report_result = await reporting.generate_report(full_results, report_type="full")

    print(f"Report generated: {report_result['metadata']['file_path']}")

    # Display report summary
    preview_limit = 1000
    report_path = Path(report_result['metadata']['file_path'])
    if not report_path.exists():
        # Say so explicitly instead of silently showing no preview.
        print(f"Report file not found: {report_path}")
        return

    with open(report_path, 'r') as f:
        # Read one character past the limit: enough to tell whether the report
        # is longer than the preview without loading the whole file.
        head = f.read(preview_limit + 1)

    # Only mark the preview as truncated when text was actually cut off.
    truncated = len(head) > preview_limit
    print("\nReport Preview:")
    print("=" * 50)
    print(head[:preview_limit] + ("..." if truncated else ""))
    print("=" * 50)

# Generate report
await generate_report()

## Evaluation & Scoring

Evaluate the quality of the analysis.

In [None]:
# Evaluate analysis quality
async def evaluate_analysis():
    """Score the full analysis results and print the evaluation.

    Reads the module-level ``full_results`` and ``orchestrator``. Prints the
    overall score, one performance score per agent and up to three
    improvement recommendations. Only prints a notice when ``full_results``
    is ``None``.

    Returns:
        None.
    """
    if full_results is None:
        print("No analysis results to evaluate")
        return

    print("Evaluating analysis quality...")

    scorer = orchestrator.scoring_system
    verdict = await scorer.evaluate_analysis_results(full_results)

    overall = verdict.get('overall_score', 0)
    print(f"Overall Score: {overall:.2f}/10")

    per_agent = verdict.get('agent_scores', {})
    print("\nAgent Scores:")
    for agent_name, details in per_agent.items():
        performance = details.get('performance_score', 0)
        print(f"  {agent_name}: {performance}/10")

    # Recommendations are optional; show at most the first three.
    suggestions = verdict.get('recommendations', [])
    if suggestions:
        print("\nImprovement Recommendations:")
        for suggestion in suggestions[:3]:
            print(f"  - {suggestion}")

# Evaluate analysis
await evaluate_analysis()

## Summary

This notebook demonstrated:
1. Data ingestion from Databricks system tables
2. Network pattern analysis
3. Security threat detection
4. Full workflow orchestration
5. Report generation
6. Quality evaluation

## Next Steps

- Configure production credentials
- Set up scheduled analysis jobs
- Customize analysis parameters
- Integrate with alerting systems
- Deploy to Databricks Jobs for automated monitoring