In [None]:
# First, let's create the complete workflow using PostgreSQL database
import json
import os
from pathlib import Path

import pandas as pd

# Import required modules for SQL validation
import great_expectations as gx

# Create the Data Context; every later cell reaches Great Expectations
# through this `context` object.
context = gx.get_context()


In [None]:
# Connect to the data: register the GX workshop Postgres database, expose one
# of its tables as an asset, then pull a batch from that asset.
connection_string = "postgresql+psycopg2://try_gx:try_gx@postgres.workshops.greatexpectations.io/gx_example_db"

# Data source -> table asset -> batch request, each built from the previous one.
data_source = context.sources.add_postgres(
    "postgres db",
    connection_string=connection_string,
)
data_asset = data_source.add_table_asset(
    name="taxi data",
    table_name="nyc_taxi_data",
)
batch_request = data_asset.build_batch_request()

# Resolve the request into its batches and keep the first one.
batch_list = data_asset.get_batch_list_from_batch_request(batch_request)
batch = batch_list[0]

In [None]:
# Create (or reset) the expectation suite. add_or_update keeps this cell
# re-runnable against a context that already holds a suite with this name.
suite = context.add_or_update_expectation_suite(
    expectation_suite_name="sql_validation_suite"
)

# First expectation: passenger_count between 1 and 6
expectation1 = gx.core.ExpectationConfiguration(
    expectation_type='expect_column_values_to_be_between',
    kwargs={'column': 'passenger_count', 'min_value': 1, 'max_value': 6}
)
suite.add_expectation(expectation1)

# Second expectation: fare_amount >= 0 (no upper bound given)
expectation2 = gx.core.ExpectationConfiguration(
    expectation_type='expect_column_values_to_be_between',
    kwargs={'column': 'fare_amount', 'min_value': 0}
)
suite.add_expectation(expectation2)

# add_expectation() only changes the in-memory suite object. The checkpoint
# refers to the suite by name, so the suite has to be written back to the
# context for these two expectations to be part of the validation run.
suite = context.add_or_update_expectation_suite(expectation_suite=suite)

In [None]:
# Checkpoint definition: validate the taxi asset against the suite, then
# store the validation result and refresh Data Docs.
# NOTE(review): batch_request is passed as a plain dict of names rather than
# the `batch_request` object built earlier - confirm this resolves correctly
# for a fluent data source.
checkpoint_config = dict(
    name='sql_checkpoint',
    config_version=1.0,
    class_name='Checkpoint',
    run_name_template='%Y%m%d-%H%M%S-sql-run',
    expectation_suite_name=suite.expectation_suite_name,
    batch_request=dict(
        datasource_name=data_source.name,
        data_asset_name=data_asset.name,
    ),
    action_list=[
        dict(
            name='store_validation_result',
            action=dict(class_name='StoreValidationResultAction'),
        ),
        dict(
            name='update_data_docs',
            action=dict(class_name='UpdateDataDocsAction'),
        ),
    ],
)

# Register the checkpoint with the context under the name given above.
checkpoint = context.add_checkpoint(**checkpoint_config)

In [None]:
# Execute the checkpoint by name, then report overall success and the
# statistics exposed by the result object.
checkpoint_result = context.run_checkpoint(checkpoint_name=checkpoint.name)

print("Success: {}".format(checkpoint_result.success))
print("Statistics: {}".format(checkpoint_result.get_statistics()))

In [None]:
# Documentation access demo: list the Data Docs sites, inspect the suite,
# re-run the checkpoint and open Data Docs. The three steps after the site
# lookup are wrapped separately so one failure is reported without stopping
# the others.

# Data Docs site descriptors for this context
docs_sites = context.get_docs_sites_urls()

# Suites and checkpoints are named independently: the suite created earlier is
# 'sql_validation_suite', while 'sql_checkpoint' is the checkpoint. Looking the
# suite up under the checkpoint's name would land in the except branch.
try:
    suite = context.get_expectation_suite('sql_validation_suite')
    print(f"Suite Name: {suite.expectation_suite_name}")
    print(f"Number of Expectations: {len(suite.expectations)}")
except Exception as e:
    print(f"Error accessing suite: {e}")

# Re-run the checkpoint by name and report whether it passed
try:
    checkpoint_result = context.run_checkpoint(checkpoint_name='sql_checkpoint')
    print(f"Checkpoint Success: {checkpoint_result.success}")
except Exception as e:
    print(f"Error accessing checkpoint: {e}")

# Open Data Docs in browser (best effort - e.g. no browser on a remote kernel)
try:
    context.open_data_docs()
except Exception as e:
    print(f"Could not open browser: {e}")


In [None]:
# Database demo with separate profiling and manual suites.
# Output folders for this Great Expectations demo; the three sub-folders all
# hang off output_dir.
output_dir = Path("notebooks/great_expectations/outputs")
profiling_dir = output_dir.joinpath("profiling")
manual_dir = output_dir.joinpath("manual")
reports_dir = output_dir.joinpath("reports")

# Create whatever is missing; already-existing folders are left alone.
for dir_path in (output_dir, profiling_dir, manual_dir, reports_dir):
    dir_path.mkdir(parents=True, exist_ok=True)


# Fluent API Method 1: get-or-create the PostgreSQL Data Source
data_source_name = 'postgres_demo'
try:
    # Reuse the data source if the context already knows it
    data_source = context.get_datasource(data_source_name)
    print(f'✅ Using existing PostgreSQL Data Source: {data_source.name}')
except Exception:
    # Lookup failed, so register the data source. `Exception` rather than a
    # bare `except:` so that KeyboardInterrupt / SystemExit still propagate
    # instead of being treated as "data source missing".
    connection_string = "postgresql+psycopg2://try_gx:try_gx@postgres.workshops.greatexpectations.io/gx_example_db"
    data_source = context.sources.add_postgres(data_source_name, connection_string=connection_string)
    print(f'✅ Created new PostgreSQL Data Source: {data_source.name}')

# Fluent API Method 2: get-or-create the Table Data Asset
asset_name = 'nyc_taxi_data'
try:
    # Reuse the asset if the data source already has one with this name
    data_asset = data_source.get_asset(asset_name)
    print(f'✅ Using existing Table Asset: {data_asset.name}')
except Exception:
    # Lookup failed, so add the asset. `Exception` rather than a bare
    # `except:` so that KeyboardInterrupt / SystemExit still propagate.
    data_asset = data_source.add_table_asset(name=asset_name, table_name="nyc_taxi_data")
    print(f'✅ Created new Table Asset: {data_asset.name}')

# Fluent API Method 3: ask the asset for a batch request
batch_request = data_asset.build_batch_request()
print("✅ Fluent Batch Request created")

# Fluent API Method 4: resolve the request into batches and keep the first
batch_list = data_asset.get_batch_list_from_batch_request(batch_request)
batch = batch_list[0]
print("✅ Fluent Batch created: {}".format(batch.id))

# Create SEPARATE expectation suites for different purposes
print("\n📋 Creating Separate Expectation Suites:")
print("-" * 50)


def _get_or_create_suite(suite_name, label):
    """Return the suite called `suite_name`, creating it if the lookup fails.

    Args:
        suite_name: Name to look up / register in `context`.
        label: Human-readable tag ('Profiling' / 'Manual') used only in the
            progress message.

    Returns:
        Whatever `context.get_expectation_suite` or
        `context.add_expectation_suite` returned.
    """
    try:
        existing = context.get_expectation_suite(suite_name)
    except Exception:
        # `Exception` rather than a bare `except:` so that KeyboardInterrupt /
        # SystemExit are not mistaken for "suite missing".
        created = context.add_expectation_suite(suite_name)
        print(f'✅ Created new {label} Suite: {created.expectation_suite_name}')
        return created
    print(f'✅ Using existing {label} Suite: {existing.expectation_suite_name}')
    return existing


# Suite 1: backs the "profiling" validator and checkpoint
profiling_suite_name = 'database_profiling_suite'
profiling_suite = _get_or_create_suite(profiling_suite_name, 'Profiling')

# Suite 2: For Manual Expectations (Business Rules)
manual_suite_name = 'database_manual_suite'
manual_suite = _get_or_create_suite(manual_suite_name, 'Manual')

# "Profiling" step, run against the separate profiling suite.
print("\n🔍 Using GX Profiling Tool (Separate Suite):")
print("-" * 50)

# NOTE(review): `profiler` is whatever context.get_validator() returns, and no
# expect_* call is made on it in this cell, so validate() below can only check
# what the profiling suite already holds - confirm whether an actual profiler
# or data assistant was intended here.
profiler = context.get_validator(
    expectation_suite_name=profiling_suite.expectation_suite_name,
    batch_request=batch_request,
)

print("✅ Profiler created: {}".format(profiler))

# Validate the batch against the profiling suite
print("🔄 Running automatic data profiling...")
profiler_result = profiler.validate()
print("✅ Profiling completed")

# Write the suite attached to the validator back to the context
profiler.save_expectation_suite()
print("✅ Profiling suite saved")

# Manual (business-rule) expectations live in their own suite.
print("\n🔧 Adding Manual Expectations (Separate Suite):")
print("-" * 50)

# Validator bound to the same batch request, but to the manual suite
validator = context.get_validator(
    expectation_suite_name=manual_suite.expectation_suite_name,
    batch_request=batch_request,
)

# Business rules: passenger_count within 1..6, fare_amount at least 0,
# trip_distance never null.
validator.expect_column_values_to_be_between(column='passenger_count', min_value=1, max_value=6)
validator.expect_column_values_to_be_between(column='fare_amount', min_value=0)
validator.expect_column_values_to_not_be_null(column='trip_distance')

# Count is read from the suite attached to the validator
print("✅ Added {} manual expectations".format(len(validator.get_expectation_suite().expectations)))

# Write the manual suite back to the context
validator.save_expectation_suite()
print("✅ Manual expectation suite saved")

# Create SEPARATE checkpoints for different suites
print("\n🎯 Creating Separate Checkpoints:")
print("-" * 50)


def _get_or_create_checkpoint(checkpoint_name, suite_name, label):
    """Return the checkpoint called `checkpoint_name`, creating it if the lookup fails.

    Args:
        checkpoint_name: Name to look up / register in `context`.
        suite_name: Expectation suite a newly created checkpoint validates against.
        label: Human-readable tag ('Profiling' / 'Manual') used in the progress
            messages and, lower-cased, in the run name template.

    Returns:
        Whatever `context.get_checkpoint` or `context.add_checkpoint` returned.
    """
    try:
        existing = context.get_checkpoint(checkpoint_name)
    except Exception:
        # `Exception` rather than a bare `except:` so that KeyboardInterrupt /
        # SystemExit are not mistaken for "checkpoint missing".
        config = {
            'name': checkpoint_name,
            'config_version': 1.0,
            'class_name': 'Checkpoint',
            'run_name_template': f'%Y%m%d-%H%M%S-{label.lower()}-run',
            'expectation_suite_name': suite_name,
            'batch_request': {
                'datasource_name': data_source.name,
                'data_asset_name': data_asset.name
            },
            # After validating: store the result, then refresh Data Docs
            'action_list': [
                {
                    'name': 'store_validation_result',
                    'action': {'class_name': 'StoreValidationResultAction'}
                },
                {
                    'name': 'update_data_docs',
                    'action': {'class_name': 'UpdateDataDocsAction'}
                }
            ]
        }
        created = context.add_checkpoint(**config)
        print(f'✅ Created new {label} Checkpoint: {created.name}')
        return created
    print(f'✅ Using existing {label} Checkpoint: {existing.name}')
    return existing


# Checkpoint 1: For Profiling Suite
profiling_checkpoint_name = 'database_profiling_checkpoint'
profiling_checkpoint = _get_or_create_checkpoint(
    profiling_checkpoint_name,
    profiling_suite.expectation_suite_name,
    'Profiling',
)

# Checkpoint 2: For Manual Suite
manual_checkpoint_name = 'database_manual_checkpoint'
manual_checkpoint = _get_or_create_checkpoint(
    manual_checkpoint_name,
    manual_suite.expectation_suite_name,
    'Manual',
)

# Execute the two checkpoints one after the other, reporting each outcome.
print("\n🔄 Running Separate Checkpoints:")
print("-" * 50)

# Profiling checkpoint first
print("🔄 Running Profiling Checkpoint...")
profiling_result = context.run_checkpoint(checkpoint_name=profiling_checkpoint.name)
print("✅ Profiling Checkpoint completed: {}".format(profiling_result.success))

# Then the manual checkpoint
print("🔄 Running Manual Checkpoint...")
manual_result = context.run_checkpoint(checkpoint_name=manual_checkpoint.name)
print("✅ Manual Checkpoint completed: {}".format(manual_result.success))

# Documentation access: Data Docs sites, then suite and checkpoint details.
print("\n📚 Documentation Access Methods:")
print("-" * 40)

# One entry per Data Docs site; each entry is read by 'site_name' / 'site_url'
docs_sites = context.get_docs_sites_urls()
print("✅ Data Documentation Sites: {}".format(len(docs_sites)))
for site in docs_sites:
    print("  Site Name: {}".format(site['site_name']))
    print("  Site URL: {}".format(site['site_url']))

# Re-read both suites from the context so the counts reflect what was saved
print("\n📋 Profiling Suite Details:")
profiling_suite = context.get_expectation_suite(profiling_suite_name)
print("✅ Suite Name: {}".format(profiling_suite.expectation_suite_name))
print("✅ Number of Expectations: {}".format(len(profiling_suite.expectations)))

print("\n📋 Manual Suite Details:")
manual_suite = context.get_expectation_suite(manual_suite_name)
print("✅ Suite Name: {}".format(manual_suite.expectation_suite_name))
print("✅ Number of Expectations: {}".format(len(manual_suite.expectations)))

# Outcome and statistics of both checkpoint runs
print("\n📊 Profiling Checkpoint Results:")
print("✅ Success: {}".format(profiling_result.success))
print("✅ Statistics: {}".format(profiling_result.get_statistics()))

print("\n📊 Manual Checkpoint Results:")
print("✅ Success: {}".format(manual_result.success))
print("✅ Statistics: {}".format(manual_result.get_statistics()))

# Save organized outputs to Great Expectations specific directories
print(f"\n💾 Saving Great Expectations Outputs:")
print("-" * 40)


def _serializable_stats(stats):
    """Pick the summary counters out of a checkpoint statistics mapping.

    Args:
        stats: Mapping returned by `<checkpoint result>.get_statistics()`.

    Returns:
        A plain dict with the five counters written to the summary files;
        counters missing from `stats` default to 0 (0.0 for the percentage).
    """
    return {
        "data_asset_count": stats.get("data_asset_count", 0),
        "validation_result_count": stats.get("validation_result_count", 0),
        "successful_validation_count": stats.get("successful_validation_count", 0),
        "unsuccessful_validation_count": stats.get("unsuccessful_validation_count", 0),
        "successful_validation_percent": stats.get("successful_validation_percent", 0.0)
    }


# Save profiling results
profiling_stats = profiling_result.get_statistics()
serializable_profiling_stats = _serializable_stats(profiling_stats)

# Same shape as manual_summary below, so the two JSON files can be compared
profiling_summary = {
    "suite_name": profiling_suite.expectation_suite_name,
    "expectations_count": len(profiling_suite.expectations),
    "checkpoint_success": profiling_result.success,
    "statistics": serializable_profiling_stats,
    "batch_id": batch.id,
    "data_source": data_source.name,
    "data_asset": data_asset.name
}

with open(profiling_dir / "profiling_summary.json", "w", encoding="utf-8") as f:
    json.dump(profiling_summary, f, indent=2)
print(f"✅ Profiling summary saved to: {profiling_dir / 'profiling_summary.json'}")

# Save manual results
manual_stats = manual_result.get_statistics()
serializable_manual_stats = _serializable_stats(manual_stats)

manual_summary = {
    "suite_name": manual_suite.expectation_suite_name,
    "expectations_count": len(manual_suite.expectations),
    "checkpoint_success": manual_result.success,
    "statistics": serializable_manual_stats,
    "batch_id": batch.id,
    "data_source": data_source.name,
    "data_asset": data_asset.name
}

with open(manual_dir / "manual_summary.json", "w", encoding="utf-8") as f:
    json.dump(manual_summary, f, indent=2)
print(f"✅ Manual summary saved to: {manual_dir / 'manual_summary.json'}")

# Build the Markdown validation report from the objects produced above
# (directories, data source/asset/batch, both suites, both checkpoint results
# and the Data Docs site list).
report_content = f"""
# Great Expectations Demo Validation Report

## Project Information
- **Project Root**: {Path.cwd()}
- **Great Expectations Output Directory**: {output_dir}
- **Profiling Results**: {profiling_dir}
- **Manual Results**: {manual_dir}
- **Reports**: {reports_dir}
- **Notebook**: notebooks/great_expectations/demo.ipynb

## Database Integration
- **Data Source**: {data_source.name}
- **Data Asset**: {data_asset.name}
- **Batch ID**: {batch.id}
- **Connection**: PostgreSQL (Official GX Workshop Database)

## Expectation Suites

### Profiling Suite (Automatic Data Quality Analysis)
- **Name**: {profiling_suite.expectation_suite_name}
- **Expectations**: {len(profiling_suite.expectations)}
- **Checkpoint Success**: {profiling_result.success}
- **Purpose**: Automatic data profiling and quality insights

### Manual Suite (Business Rules)
- **Name**: {manual_suite.expectation_suite_name}
- **Expectations**: {len(manual_suite.expectations)}
- **Checkpoint Success**: {manual_result.success}
- **Purpose**: Custom business rules and validation logic

## Validation Results

### Profiling Checkpoint Results
- **Success**: {profiling_result.success}
- **Statistics**: {profiling_result.get_statistics()}

### Manual Checkpoint Results
- **Success**: {manual_result.success}
- **Statistics**: {manual_result.get_statistics()}

## Documentation
- **Data Docs Sites**: {len(docs_sites)}
- **Interactive HTML**: Available via context.open_data_docs()
- **Site URL**: {docs_sites[0]['site_url'] if docs_sites else 'N/A'}

## Fluent API Methods Demonstrated
- ✅ context.sources.add_postgres() - PostgreSQL data source
- ✅ data_source.add_table_asset() - Table asset creation
- ✅ data_asset.build_batch_request() - Batch request building
- ✅ data_asset.get_batch_list_from_batch_request() - Batch list creation
- ✅ context.add_expectation_suite() - Expectation suite creation
- ✅ context.get_validator() - Validator creation (GX Automatic Validator Tool)
- ✅ validator.validate() - Profiling and validation execution
- ✅ validator.expect_*() - Manual expectation addition
- ✅ validator.save_expectation_suite() - Suite saving
- ✅ context.add_checkpoint() - Checkpoint creation
- ✅ context.run_checkpoint() - Checkpoint execution
- ✅ context.get_docs_sites_urls() - Documentation site access
- ✅ context.get_expectation_suite() - Suite details retrieval

## Next Steps
1. Review validation results in Data Docs
2. Customize expectations for your specific use case
3. Integrate with your data pipeline
4. Set up automated validation schedules
5. Explore additional expectation types

## File Structure
```
notebooks/great_expectations/
├── demo.ipynb                    # This notebook
└── outputs/                      # Generated outputs
    ├── profiling/                # Profiling results
    │   └── profiling_summary.json
    ├── manual/                   # Manual validation results
    │   └── manual_summary.json
    └── reports/                  # Generated reports
        └── validation_report.md
```

Generated on: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}
"""

# The report contains non-ASCII characters (check marks, box-drawing tree), so
# the encoding is pinned to UTF-8; relying on the platform default would raise
# UnicodeEncodeError where that default cannot represent them.
with open(reports_dir / "validation_report.md", "w", encoding="utf-8") as f:
    f.write(report_content)
print(f"✅ Comprehensive report saved to: {reports_dir / 'validation_report.md'}")

# Build a small README that indexes the generated output files and points at
# the first Data Docs site, when there is one.
index_content = f"""
# Great Expectations Demo Outputs

This directory contains outputs generated from the Great Expectations demo notebook.

## Directory Structure
- **profiling/** - Automatic data profiling results
- **manual/** - Manual validation results  
- **reports/** - Comprehensive reports and documentation

## Files Generated
- `profiling/profiling_summary.json` - Profiling suite summary
- `manual/manual_summary.json` - Manual suite summary
- `reports/validation_report.md` - Complete validation report

## Data Docs
Interactive HTML documentation is available at:
{docs_sites[0]['site_url'] if docs_sites else 'Run the notebook to generate Data Docs'}

Generated on: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}
"""

# UTF-8 is pinned rather than left to the platform default: the interpolated
# site URL is not guaranteed to be ASCII.
with open(output_dir / "README.md", "w", encoding="utf-8") as f:
    f.write(index_content)
print(f"✅ Index file saved to: {output_dir / 'README.md'}")


# Best-effort: try to open Data Docs, and fall back to a hint if that fails.
print("\n🌐 Opening Data Docs in Browser...")
try:
    context.open_data_docs()
except Exception as e:
    print("Note: Could not open browser automatically: {}".format(e))
    print("You can manually open the URL shown above in your browser")
else:
    print("✅ Data Docs opened in browser")

