# Data Archaeologist - Environment Setup and Validation

This notebook validates connections to all database environments (staging, production, backup) and provides utility functions for exploring schemas and tables.

**Author:** Data Archaeologist Team  
**Version:** 2.0  
**Date:** 2025-08-28

In [None]:
# Import required libraries
import sys
import json
import pandas as pd
import numpy as np
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Add project root to path
# The notebook reads its configuration from '../config.json', i.e. it runs one
# level below the project root. Path('.').parent evaluates to Path('.') again,
# so the parent has to be taken from the absolute working directory instead.
PROJECT_ROOT = Path.cwd().parent
sys.path.insert(0, str(PROJECT_ROOT))

# Import the project helpers; a failure is reported but does not abort the
# cell, so the message below is the only signal that later cells will not work.
try:
    from data_archaeologist.core.database_connection import DatabaseConnection
    from scripts.database_summary_real import get_table_summary, test_database_connection
    print("✅ All imports successful")
except ImportError as e:
    print(f"❌ Import error: {e}")
    print("Please ensure you're running from the project root directory")

In [None]:
# Load configuration
config_file = '../config.json'

# Start from empty defaults so the names exist even when loading fails;
# later cells then iterate over nothing instead of raising NameError.
config = {}
environments = []

try:
    # Read the JSON file as UTF-8 rather than with the platform's locale encoding.
    with open(config_file, 'r', encoding='utf-8') as f:
        config = json.load(f)

    environments = list(config['environments'].keys())
    print("✅ Configuration loaded successfully")
    print(f"Available environments: {environments}")

    # Display environment details ('host' and 'database' are required keys,
    # 'description' is optional)
    for env_name, env_config in config['environments'].items():
        print(f"\n{env_name.title()}:")
        print(f"  Host: {env_config['host']}")
        print(f"  Database: {env_config['database']}")
        print(f"  Description: {env_config.get('description', 'No description')}")

except Exception as e:
    # Notebook boundary: report any load/parse/shape problem and carry on.
    print(f"❌ Configuration error: {e}")

In [None]:
# Connectivity check: probe each configured environment once and record the outcome
print("Testing database connections...")
print("=" * 50)

db_connection = DatabaseConnection(config_file)
connection_results = {}

for environment in environments:
    print(f"\nTesting {environment}...")
    try:
        success = test_database_connection(db_connection, environment)
        # Store both a human-readable status and the raw flag used by later cells.
        connection_results[environment] = dict(
            status='Connected' if success else 'Failed',
            success=success,
        )
        print(
            f"  ✅ {environment}: Connected successfully"
            if success
            else f"  ❌ {environment}: Connection failed"
        )
    except Exception as e:
        # A raised error counts as a failed connection; keep its text as the status.
        connection_results[environment] = dict(
            status=f'Error: {str(e)}',
            success=False,
        )
        print(f"  ❌ {environment}: {str(e)}")

# Summary
successful_connections = len(
    [outcome for outcome in connection_results.values() if outcome['success']]
)
print(f"\n📊 Connection Summary: {successful_connections}/{len(environments)} successful")

In [None]:
# Utility functions for exploration

def list_schemas_per_environment():
    """Print the schemas of every reachable environment with their table counts.

    Iterates over the module-level ``environments``. An environment whose entry
    in ``connection_results`` is not marked successful is reported as skipped.
    For the others, ``information_schema.tables`` is queried through
    ``db_connection.execute_query`` and one line per schema is printed.
    Query errors are caught and printed under the environment's heading.

    Returns:
        None. All output goes to stdout.
    """
    print("Schemas per Environment")
    print("=" * 50)

    # information_schema.tables names its schema column ``table_schema``;
    # aliasing it to ``schema_name`` keeps the row keys read below unchanged.
    # The statement is identical for every environment, so build it once.
    query = """
    SELECT table_schema AS schema_name,
           COUNT(*) AS table_count
    FROM information_schema.tables
    WHERE table_schema NOT IN ('information_schema', 'pg_catalog', 'pg_toast')
    GROUP BY table_schema
    ORDER BY table_schema
    """

    for environment in environments:
        if not connection_results[environment]['success']:
            print(f"\n{environment.title()}: Connection failed - skipping")
            continue

        # Print the heading first so an error line below is attributable
        # to its environment.
        print(f"\n{environment.title()}:")
        try:
            results = db_connection.execute_query(environment, query)

            if results:
                for row in results:
                    print(f"  📁 {row['schema_name']} ({row['table_count']} tables)")
            else:
                print("  No user schemas found")

        except Exception as e:
            print(f"  ❌ Error: {str(e)}")

def get_environment_summary():
    """Return a DataFrame with one overview row per configured environment.

    Each row carries the columns ``Environment``, ``Status``, ``Schemas``,
    ``Tables`` and ``Total Size``. Environments whose connection test did not
    succeed get status ``Failed``; environments whose table summary raises get
    an ``Error: ...`` status holding the first 50 characters of the message.
    In both cases the counts are 0 and the size is ``N/A``.

    Returns:
        pandas.DataFrame built from the collected rows.
    """
    print("Environment Summary")
    print("=" * 50)

    def _row(env_name, status, schemas=0, tables=0, size='N/A'):
        # One place that fixes the column names and their order.
        return {
            'Environment': env_name.title(),
            'Status': status,
            'Schemas': schemas,
            'Tables': tables,
            'Total Size': size,
        }

    gib = 1024**3
    mib = 1024**2
    rows = []

    for env_name in environments:
        if not connection_results[env_name]['success']:
            rows.append(_row(env_name, 'Failed'))
            continue

        try:
            tables = get_table_summary(db_connection, env_name)

            # Aggregate counts and the combined size over all tables.
            table_count = len(tables)
            schema_count = len({entry['schema'] for entry in tables})
            size_in_bytes = sum(entry['size_bytes'] for entry in tables)

            # Pick the largest unit the total reaches; KB is the floor.
            if size_in_bytes >= gib:
                divisor, unit = gib, 'GB'
            elif size_in_bytes >= mib:
                divisor, unit = mib, 'MB'
            else:
                divisor, unit = 1024, 'KB'
            size_text = f"{size_in_bytes / divisor:.2f} {unit}"

            rows.append(
                _row(env_name, 'Connected', schema_count, table_count, size_text)
            )

        except Exception as e:
            rows.append(_row(env_name, f'Error: {str(e)[:50]}...'))

    return pd.DataFrame(rows)

# Announce the helpers defined in this cell, one per output line.
print(
    "✅ Utility functions defined:",
    "  - list_schemas_per_environment()",
    "  - get_environment_summary()",
    sep="\n",
)

In [None]:
# List schemas per environment: prints each environment's schemas with their
# table counts; environments whose connection test failed are reported as skipped.
list_schemas_per_environment()

In [None]:
# Get environment summary table: one row per environment with its status,
# schema count, table count and total size.
summary_df = get_environment_summary()
# NOTE(review): display() is not imported in this notebook; presumably the
# IPython/Jupyter built-in that renders the DataFrame as cell output.
display(summary_df)

## Setup Complete ✅

The environment setup is now complete. You can proceed to the other notebooks:

- **01_layer1_physical_map.ipynb** - Interactive exploration of table sizes, columns, and data distributions
- **02_layer2_logical_blueprint.ipynb** - Primary keys, foreign keys, and relationship analysis
- **03_layer3_business_story.ipynb** - Business insights and process discovery
- **04_multi_env_parallel_run.ipynb** - Parallel analysis across all environments

### Next Steps:
1. Verify all connections are working (every environment should show `Connected` in the summary table above)
2. Choose an environment to explore in detail
3. Run the appropriate analysis notebook based on your needs