# Intelligent LLM Data Conversation Demo

This notebook demonstrates how to use the Big Data Migrator's LLM conversation system for data analysis tasks. The system combines a local LLM with data context understanding to provide intelligent insights and recommendations.

In [None]:
# Import required libraries
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from dotenv import load_dotenv

# Make the `app` package imported below resolvable by appending the parent of
# the current working directory (as an absolute path) to the module search path.
# NOTE(review): assumes the notebook is run from a directory directly under the
# project root — confirm.
sys.path.append(os.path.abspath('..'))

# Import Big Data Migrator components
from app.llm.conversation_system import LLMConversationSystem
from app.llm.online_llm_fallback import OnlineLLMConfig
from app.memory.memory_monitor import MemoryMonitor
from app.memory.resource_optimizer import ResourceOptimizer

## 1. Initialize the LLM Conversation System

First, we initialize the LLM conversation system. Its settings are read from environment variables (loaded from `../config/.env`), with built-in defaults for the local LLM URL and model.

In [None]:
# Load environment variables from the project's config/.env file
load_dotenv(os.path.join("../config", ".env"))

# Memory monitoring: the optimizer is built on top of the monitor, and both
# are handed to the conversation system below
memory_monitor = MemoryMonitor()
resource_optimizer = ResourceOptimizer(memory_monitor)

# Online LLM fallback (optional). It is requested via ENABLE_ONLINE_FALLBACK
# and additionally needs an API key before a configuration can be built.
ENABLE_ONLINE_FALLBACK = os.getenv("ENABLE_ONLINE_FALLBACK", "false").lower() == "true"
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")

online_llm_config = None
if ENABLE_ONLINE_FALLBACK and OPENAI_API_KEY:
    online_llm_config = OnlineLLMConfig(
        api_key=OPENAI_API_KEY,
        model=os.getenv("ONLINE_LLM_MODEL", "gpt-4o")
    )
elif ENABLE_ONLINE_FALLBACK:
    # Requested but unusable: say so instead of silently passing a
    # contradictory (enabled, no config) pair to the conversation system.
    print("ENABLE_ONLINE_FALLBACK is true but OPENAI_API_KEY is not set; online fallback is disabled.")

# Initialize the conversation system
llm_system = LLMConversationSystem(
    local_llm_url=os.getenv("LOCAL_LLM_URL", "http://localhost:1234/v1"),
    local_llm_model=os.getenv("LOCAL_LLM_MODEL", "CodeLlama-34B-Instruct"),
    memory_monitor=memory_monitor,
    resource_optimizer=resource_optimizer,
    online_llm_config=online_llm_config,
    # Enable the fallback only when a configuration actually exists, so later
    # cells that check `llm_system.enable_online_fallback` never try to use
    # an online LLM that was never configured.
    enable_online_fallback=online_llm_config is not None
)

# Check connection to the local LLM
connection_status = llm_system.llm_client.check_connection()
print(f"Connection status: {connection_status}")

## 2. Create Sample Data for Analysis

Let's create some sample data to demonstrate the system's capabilities.

In [None]:
# Build three small, related synthetic datasets (sales, products, customers)
# and write them to CSV files in the 'sample_data' directory.
np.random.seed(42)  # fixed seed so every run generates the same data

num_records = 1000
NUM_PRODUCTS = 20         # product ids run from 1 to NUM_PRODUCTS
FIRST_CUSTOMER_ID = 101   # customer ids start here
NUM_CUSTOMERS = 150       # customer ids run from 101 to 250

product_id_range = range(1, NUM_PRODUCTS + 1)
customer_id_range = range(FIRST_CUSTOMER_ID, FIRST_CUSTOMER_ID + NUM_CUSTOMERS)

# Generate the sales columns.
# The order of the random draws determines the generated values; keep it stable.
product_ids = np.random.randint(product_id_range.start, product_id_range.stop, size=num_records)
customer_ids = np.random.randint(customer_id_range.start, customer_id_range.stop, size=num_records)
quantities = np.random.randint(1, 10, size=num_records)
unit_prices = np.random.uniform(10.0, 1000.0, size=num_records).round(2)
order_dates = pd.date_range(start='2024-01-01', end='2024-05-01', periods=num_records)

# Create sales dataframe
sales_df = pd.DataFrame({
    'order_id': range(1, num_records + 1),
    'product_id': product_ids,
    'customer_id': customer_ids,
    'quantity': quantities,
    'unit_price': unit_prices,
    'order_date': order_dates,
    # Round to cents: the raw float product can carry binary floating-point
    # artifacts (e.g. 100.10999999999999) that would end up in the CSV.
    'total_amount': (quantities * unit_prices).round(2)
})

# Create product dataframe (one row per product id referenced by the sales)
product_categories = ['Electronics', 'Furniture', 'Clothing', 'Books', 'Food']
product_df = pd.DataFrame({
    'product_id': product_id_range,
    'product_name': [f'Product {i}' for i in product_id_range],
    'category': np.random.choice(product_categories, size=NUM_PRODUCTS),
    'supplier_id': np.random.randint(1, 6, size=NUM_PRODUCTS)
})

# Create customer dataframe (one row per customer id referenced by the sales)
customer_df = pd.DataFrame({
    'customer_id': customer_id_range,
    'customer_name': [f'Customer {i}' for i in customer_id_range],
    'city': np.random.choice(['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix'], size=NUM_CUSTOMERS),
    'segment': np.random.choice(['Consumer', 'Corporate', 'Home Office'], size=NUM_CUSTOMERS)
})

# Every foreign key in the sales data must resolve to a product / customer row
assert set(sales_df['product_id']) <= set(product_df['product_id'])
assert set(sales_df['customer_id']) <= set(customer_df['customer_id'])

# Save dataframes to CSV
os.makedirs('sample_data', exist_ok=True)
for csv_name, frame in (
    ('sales.csv', sales_df),
    ('products.csv', product_df),
    ('customers.csv', customer_df),
):
    frame.to_csv(os.path.join('sample_data', csv_name), index=False)

print(f"Created sample datasets with {len(sales_df)} sales records, {len(product_df)} products, and {len(customer_df)} customers")
print("Files saved in 'sample_data' directory")

## 3. Create a Conversation with Data Context

Now we'll create a conversation that includes our sample data files.

In [None]:
# Absolute paths of the CSV files to attach to the conversation
data_files = list(map(os.path.abspath, (
    'sample_data/sales.csv',
    'sample_data/products.csv',
    'sample_data/customers.csv',
)))

# Open a new conversation over those files and keep its id for later cells
conversation_title = "Sample Sales Data Analysis"
conversation_id = llm_system.create_conversation(
    data_files=data_files,
    title=conversation_title,
)

print(f"Created conversation with ID: {conversation_id}")

## 4. Ask Questions About the Data

Let's ask some questions about our data and see the LLM's responses.

In [None]:
# Question 1: which products bring in the most revenue?
top_products_question = "What are the top 5 selling products by total revenue?"
response = llm_system.add_message(
    conversation_id=conversation_id,
    message=top_products_question,
)

# The reply text is stored under the "response" key
print(response["response"])

In [None]:
# Question 2: revenue by customer segment, split by city
segment_question = "Which customer segment generates the most revenue? Break it down by city."
response = llm_system.add_message(
    conversation_id=conversation_id,
    message=segment_question,
)

# The reply text is stored under the "response" key
print(response["response"])

## 5. Get Intelligent Guidance

The system can generate guidance based on the data and conversation context.

In [None]:
# Ask the system for guidance on this conversation
guidance = llm_system.generate_guidance(conversation_id)

# Each section is printed as a heading followed by one bullet per entry;
# a section missing from the guidance dict is treated as empty.
guidance_sections = (
    ("=== Suggested Questions ===", "questions"),
    ("\n=== Data Exploration Suggestions ===", "suggestions"),
    ("\n=== Improvement Recommendations ===", "improvements"),
)
for heading, section_key in guidance_sections:
    print(heading)
    for entry in guidance.get(section_key, []):
        print(f"- {entry['content']}")

## 6. Data Validation and Relationship Detection

Let's ask about data quality and relationships between our datasets.

In [None]:
# Question: are there data quality problems in the attached files?
quality_question = "Are there any data quality issues I should be aware of in these datasets?"
response = llm_system.add_message(
    conversation_id=conversation_id,
    message=quality_question,
)

# The reply text is stored under the "response" key
print(response["response"])

In [None]:
# Question: how do the three datasets relate, and how should they be joined?
relationship_question = "What are the relationships between these three datasets? How should I join them?"
response = llm_system.add_message(
    conversation_id=conversation_id,
    message=relationship_question,
)

# The reply text is stored under the "response" key
print(response["response"])

## 7. Schema Optimization with Online LLM Fallback (Optional)

If online LLM fallback is enabled, we can use it for complex schema optimization.

In [None]:
# Schema optimization is only attempted when the system reports that online
# fallback is enabled; otherwise print setup instructions.
if not llm_system.enable_online_fallback:
    print("Online LLM fallback is not enabled. To use schema optimization:")
    print("1. Set ENABLE_ONLINE_FALLBACK=true in your .env file")
    print("2. Set a valid OPENAI_API_KEY")
else:
    print("Starting schema optimization with online LLM...")
    schema_results = llm_system.optimize_schema_with_fallback(conversation_id)

    print("\n=== Schema Optimization Results ===")

    # Both result sections are optional; show only the ones that are present
    if "optimized_schema" in schema_results:
        print("\nOptimized Schema:")
        print(schema_results["optimized_schema"])

    if "recommendations" in schema_results:
        print("\nRecommendations:")
        for recommendation in schema_results["recommendations"]:
            print(f"- {recommendation}")

## 8. Generate SQL Queries

The LLM can also help generate SQL queries for data analysis.

In [None]:
# Request: a SQL query for the top 3 products by revenue in each city
sql_request = "Could you generate a SQL query to find the top 3 products by revenue for each city?"
response = llm_system.add_message(
    conversation_id=conversation_id,
    message=sql_request,
)

# The reply text is stored under the "response" key
print(response["response"])

## 9. Review Conversation History

Finally, let's retrieve and display the conversation history.

In [None]:
# Look up the stored conversation through the conversation manager
conversation = llm_system.conversation_manager.get_conversation(conversation_id)

print(f"Conversation Title: {conversation.title}")
# The timestamps are converted with unit='s', i.e. read as seconds
for label, timestamp in (
    ("Created at", conversation.created_at),
    ("Updated at", conversation.updated_at),
):
    print(f"{label}: {pd.to_datetime(timestamp, unit='s')}")
print(f"Number of messages: {len(conversation.messages)}")

print("\nData files:")
for data_file in conversation.data_files:
    print(f"- {data_file}")

# Fall back to a placeholder when the summary is empty or missing
print("\nConversation Summary:")
print(conversation.context_summary or "No summary available")