# Data Analysis Service Testing Notebook

This notebook tests the data analysis functionality using Gemini for insights and recommendations.

In [None]:
import sys
import os
import json
from datetime import datetime, timedelta
import random

# Add backend to path
sys.path.append(os.path.join(os.getcwd(), '..', 'backend'))

from dotenv import load_dotenv
load_dotenv('../.env')

## Generate Sample Data

In [None]:
# Generate sample query results for testing
def generate_sample_sales_data(num_rows: int = 20):
    """Generate random sales rows for exercising the analysis service.

    Args:
        num_rows: How many rows to produce. Defaults to 20, the size that
            was previously hard-coded. Zero or a negative value yields an
            empty list.

    Returns:
        A list of dicts with the keys ``product_name``, ``category``,
        ``sales_amount`` (100-2000, rounded to two places),
        ``quantity_sold`` (1-50) and ``sale_date`` (``YYYY-MM-DD``, between
        today and 90 days ago).
    """
    products = ['Laptop', 'Mouse', 'Keyboard', 'Monitor', 'Headphones']
    categories = ['Electronics', 'Accessories', 'Hardware']
    # Read the clock once so every row is dated relative to the same moment.
    now = datetime.now()

    # Dict entries are evaluated top to bottom, so the order of the random
    # draws per row is the same as in a plain append loop.
    return [
        {
            'product_name': random.choice(products),
            'category': random.choice(categories),
            'sales_amount': round(random.uniform(100, 2000), 2),
            'quantity_sold': random.randint(1, 50),
            'sale_date': (now - timedelta(days=random.randint(0, 90))).strftime('%Y-%m-%d'),
        }
        for _ in range(num_rows)
    ]

def generate_sample_customer_data(num_customers: int = 15):
    """Generate random customer rows for exercising the analysis service.

    Args:
        num_customers: How many customers to produce. Defaults to 15, the
            size that was previously hard-coded. Zero or a negative value
            yields an empty list.

    Returns:
        A list of dicts with the keys ``customer_id`` (1-based, sequential),
        ``customer_name`` (``"Customer <id>"``), ``region``,
        ``total_orders`` (1-20), ``total_spent`` (500-10000, rounded to two
        places) and ``last_order_date`` (``YYYY-MM-DD``, between today and
        180 days ago).
    """
    regions = ['North', 'South', 'East', 'West']
    # Read the clock once so every row is dated relative to the same moment.
    now = datetime.now()

    return [
        {
            'customer_id': customer_id,
            'customer_name': f'Customer {customer_id}',
            'region': random.choice(regions),
            'total_orders': random.randint(1, 20),
            'total_spent': round(random.uniform(500, 10000), 2),
            'last_order_date': (now - timedelta(days=random.randint(0, 180))).strftime('%Y-%m-%d'),
        }
        for customer_id in range(1, num_customers + 1)
    ]

# Build both datasets once; later cells reuse these module-level names.
sales_data = generate_sample_sales_data()
customer_data = generate_sample_customer_data()

# Preview the leading rows of each dataset as indented JSON.
for heading, preview in (
    ("Sample Sales Data (first 5 rows):", sales_data[:5]),
    ("\nSample Customer Data (first 3 rows):", customer_data[:3]),
):
    print(heading)
    for row in preview:
        print(json.dumps(row, indent=2))

## Test Data Analysis Service

In [None]:
from data_analyst.services import DataAnalysisService

# Create the service under test; every later cell reuses this instance.
analysis_service = DataAnalysisService()

# Smoke test: analyze the generated sales rows.
analysis_result = analysis_service.analyze_query_results(
    query_description="Sales data for the last 3 months by product and category",
    query_results=sales_data,
)

print("=== Sales Data Analysis ===", analysis_result, sep="\n")

## Test Customer Data Analysis

In [None]:
# Run the same analysis entry point against the customer rows.
customer_analysis = analysis_service.analyze_query_results(
    query_description="Customer analysis showing total orders, spending, and regional distribution",
    query_results=customer_data,
)

print("=== Customer Data Analysis ===", customer_analysis, sep="\n")

## Test Follow-up Query Suggestions

In [None]:
# Ask the service for follow-up queries, given a SQL query and a prose
# summary of its (hand-written, illustrative) results.
followup_suggestions = analysis_service.suggest_followup_queries(
    results_summary="Laptops had the highest sales at $15,000, followed by Monitors at $8,500. Keyboards had the lowest sales at $2,100.",
    original_query="SELECT product_name, SUM(sales_amount) as total_sales FROM sales GROUP BY product_name",
)

print("=== Follow-up Query Suggestions ===", followup_suggestions, sep="\n")

## Test Async Analysis

In [None]:
import asyncio

async def test_async_analysis():
    """Run the async analysis entry point on the first ten sales rows.

    Reads the module-level ``analysis_service`` and ``sales_data`` and
    returns whatever ``analyze_query_results_async`` resolves to.
    """
    # Only a ten-row slice of the sales data is sent through the async path.
    return await analysis_service.analyze_query_results_async(
        query_description="Top 10 sales transactions analysis",
        query_results=sales_data[:10],
    )

# Run async test
async_analysis = await test_async_analysis()
print("=== Async Analysis Result ===")
print(async_analysis)

## Test Different Analysis Scenarios

In [None]:
# Edge case: hand the service an empty result set.
empty_analysis = analysis_service.analyze_query_results(
    query_description="Query that returned no results",
    query_results=[],
)

print("=== Empty Results Analysis ===", empty_analysis, sep="\n")
print(f"\n{'=' * 50}\n")

In [None]:
# Edge case: a result set holding exactly one aggregate row.
single_result = [
    {"metric": "total_revenue", "value": 125000.50, "period": "2024-Q1"},
]

single_analysis = analysis_service.analyze_query_results(
    query_description="Total revenue for Q1 2024",
    query_results=single_result,
)

print("=== Single Result Analysis ===", single_analysis, sep="\n")
print(f"\n{'=' * 50}\n")

## Test Time Series Analysis

In [None]:
# Generate time series data
def generate_time_series_data(days: int = 30):
    """Generate one synthetic revenue row per day, ending yesterday.

    Args:
        days: Length of the window in days. Defaults to 30, the size that
            was previously hard-coded in two places. Zero or a negative
            value yields an empty list.

    Returns:
        A list of dicts in chronological order with the keys ``date``
        (``YYYY-MM-DD``), ``day_of_week`` (full weekday name),
        ``daily_revenue`` (rounded to two places) and ``orders_count``
        (20-80).
    """
    data = []
    # The window starts ``days`` days ago, so the final row is yesterday.
    base_date = datetime.now() - timedelta(days=days)

    for offset in range(days):
        date = base_date + timedelta(days=offset)
        # Simulate weekly patterns and trends
        base_value = 1000 + (offset * 10)  # Growth trend
        # weekday() >= 5 means Saturday or Sunday: scale revenue down to 70%.
        weekend_factor = 0.7 if date.weekday() >= 5 else 1.0
        # +/-20% noise on top of the trend and weekend effect.
        daily_revenue = round(base_value * weekend_factor * random.uniform(0.8, 1.2), 2)

        data.append({
            'date': date.strftime('%Y-%m-%d'),
            'day_of_week': date.strftime('%A'),
            'daily_revenue': daily_revenue,
            'orders_count': random.randint(20, 80),
        })

    return data

time_series_data = generate_time_series_data()

# Feed the synthetic daily series through the analysis service.
time_analysis = analysis_service.analyze_query_results(
    query_description="Daily revenue and order count for the last 30 days",
    query_results=time_series_data,
)

print("=== Time Series Analysis ===", time_analysis, sep="\n")

## Test Different Temperature Settings

In [None]:
# Run the same analysis at several temperature values to compare the output.
temperatures = [0.2, 0.5, 0.8]
test_data = customer_data[:5]

for temp in temperatures:
    print(f"\n=== Analysis with Temperature {temp} ===")

    try:
        result = analysis_service.analyze_query_results(
            temperature=temp,
            query_description="Top 5 customers by spending analysis",
            query_results=test_data,
        )
        print(result)
    except Exception as e:
        # Best-effort sweep: report the failure and continue with the next value.
        print(f"Error with temperature {temp}: {e}")

    # Blank line followed by a divider between runs.
    print()
    print("-" * 30)

## Comprehensive Analysis Workflow

In [None]:
# Simulate a complete analysis workflow
def comprehensive_analysis_workflow():
    """Chain the service calls into one end-to-end run, printing each stage.

    Stages: analyze the sales rows, request follow-up queries seeded with a
    ``[:200]`` slice of that analysis, then analyze the customer rows. Reads
    the module-level ``analysis_service``, ``sales_data`` and
    ``customer_data``; returns nothing.
    """
    print("=== Comprehensive Analysis Workflow ===")

    # Stage 1: overall analysis of the sales rows.
    print("\n1. Initial Data Analysis")
    sales_overview = analysis_service.analyze_query_results(
        query_description="Monthly sales performance by product category",
        query_results=sales_data,
    )
    print(sales_overview)

    # Stage 2: follow-up suggestions, using the head of stage 1's output
    # as the results summary.
    print("\n2. Follow-up Query Suggestions")
    summary_excerpt = sales_overview[:200]
    suggestions = analysis_service.suggest_followup_queries(
        original_query="SELECT category, SUM(sales_amount) FROM sales GROUP BY category",
        results_summary=summary_excerpt,
    )
    print(suggestions)

    # Stage 3: analysis of the customer rows.
    print("\n3. Customer Segmentation Analysis")
    segmentation = analysis_service.analyze_query_results(
        query_description="Customer segmentation based on purchase behavior and geography",
        query_results=customer_data,
    )
    print(segmentation)

    print("\n=== Workflow Complete ===")

# Run the comprehensive workflow; it prints each stage's output and its
# return value (None) is not used.
comprehensive_analysis_workflow()