# Step 01: Execute Data Extraction Batch

This notebook executes SQL scripts to extract data and generate CSV import files for Moody's.

**Tasks:**
- Retrieve the Data Extraction batch created in Stage 01 / Step 03
- Review batch configuration and job details
- Execute SQL extraction scripts for each portfolio
- Save results to CSV files in files/data/
- Display results and update job status

## 1) Setup

In [None]:
%load_ext autoreload
%autoreload 2

from helpers.notebook_setup import initialize_notebook_context
from helpers import ux
from helpers.batch import submit_batch, get_batch_jobs, read_batch, validate_batch
from helpers.database import execute_query
from helpers.irp_integration import IRPClient
from helpers.constants import BatchType

# Validation state shared across the cells of this notebook. The execution and
# completion cells check these instead of raising, so the notebook can finish
# gracefully when batch validation fails.
validation_failed, validation_errors = False, []

In [None]:
# Build the notebook context and the step-tracking handle for this notebook
context, step = initialize_notebook_context('Step_01_Execute_Data_Extraction.ipynb')

# Show which cycle / stage / step this run belongs to
ux.header("Data Extraction Execution")
for label, attr in (("Cycle", "cycle_name"), ("Stage", "stage_name"), ("Step", "step_name")):
    # Attributes are read one at a time, immediately before each line is printed
    ux.info(f"{label}: {getattr(context, attr)}")
ux.success(f"\u2713 Step tracking initialized for '{context.step_name}'")

## 2) Retrieve Data Extraction Batch

In [None]:
# Retrieve Data Extraction batch from Stage_01/Step_03
ux.subheader("Retrieve Data Extraction Batch")

# Most recent COMPLETED run of stage 1 / step 3 for the current cycle
query = """
    SELECT sr.id, sr.step_id, sr.run_num, sr.output_data, sr.completed_ts
    FROM irp_step_run sr
    INNER JOIN irp_step s ON sr.step_id = s.id
    INNER JOIN irp_stage sg ON s.stage_id = sg.id
    INNER JOIN irp_cycle c ON sg.cycle_id = c.id
    WHERE c.cycle_name = %s
      AND sg.stage_num = 1
      AND s.step_num = 3
      AND sr.status = 'COMPLETED'
    ORDER BY sr.completed_ts DESC
    LIMIT 1
"""

result = execute_query(query, (context.cycle_name,))

if result.empty:
    raise ValueError("Batch creation step not found - please complete Stage_01/Step_03 first")

output_data = result.iloc[0]['output_data']

# The step run may have been completed without output data (NULL column), or
# the value may not be a mapping at all. Fail with an actionable message
# instead of an AttributeError on `.get`.
if not hasattr(output_data, 'get'):
    raise ValueError(
        "Stage_01/Step_03 completed without usable output data "
        f"(got {type(output_data).__name__}) - please re-run Stage_01/Step_03"
    )

# `or {}` also covers a 'batches' key that is present but null
batches = output_data.get('batches') or {}

if BatchType.DATA_EXTRACTION not in batches:
    raise ValueError(f"Data Extraction batch not found. Available: {list(batches.keys())}")

extraction_batch_id = int(batches[BatchType.DATA_EXTRACTION])

ux.success(f"\u2713 Retrieved Data Extraction batch: ID={extraction_batch_id}")
step.log(f"Retrieved Data Extraction batch: ID={extraction_batch_id}")

## 3) Review Batch Configuration

In [None]:
# Verify batch status and display job information
from itertools import islice

ux.subheader("Verify Batch Status")

# Only this many jobs are listed in the "Job Details" table
max_jobs_shown = 10

# Read batch details
batch = read_batch(extraction_batch_id)

batch_info = [
    ["Batch ID", batch['id']],
    ["Batch Type", batch['batch_type']],
    ["Status", batch['status']],
    ["Created", batch['created_ts'].strftime('%Y-%m-%d %H:%M:%S')]
]
ux.table(batch_info, headers=["Property", "Value"])

# Get jobs in batch
jobs = get_batch_jobs(extraction_batch_id)
job_count = len(jobs)

ux.info(f"\nTotal jobs: {job_count}")

# Build the rows for the job table. Configuration is looked up only for the
# jobs that are actually displayed, so a large batch does not trigger one
# query per job just to discard the result.
config_query = "SELECT job_configuration_data FROM irp_job_configuration WHERE id = %s"
job_rows = []
for job in islice(jobs, max_jobs_shown):
    config_result = execute_query(config_query, (job['job_configuration_id'],))
    # Fall back to an empty config (all columns 'N/A') when no row is found
    job_config_data = config_result.iloc[0]['job_configuration_data'] if not config_result.empty else {}

    job_rows.append([
        job['id'],
        job_config_data.get('Portfolio', 'N/A'),
        job_config_data.get('Import File', 'N/A'),
        job_config_data.get('accounts_import_file', 'N/A'),
        job_config_data.get('locations_import_file', 'N/A'),
        job['status']
    ])

ux.subheader("Job Details")
ux.table(job_rows, headers=['Job ID', 'Portfolio', 'Import File', 'Account CSV', 'Location CSV', 'Status'])
if job_count > max_jobs_shown:
    ux.info(f"... and {job_count - max_jobs_shown} more jobs")

# Validate batch before submission
ux.subheader("Validate Batch")
validation_errors = validate_batch(extraction_batch_id)

# Recompute the flag on every run of this cell. Setting it only on failure
# would leave a stale True behind when the cell is re-run after the batch has
# been fixed, and the execution cell would keep skipping.
validation_failed = bool(validation_errors)

if validation_failed:
    ux.error("\u2717 Batch validation failed:")
    for error in validation_errors:
        ux.error(f"  {error}")
    step.log(f"Batch validation failed: {len(validation_errors)} error(s)")
else:
    ux.success("\u2713 Batch validation passed")
    step.log(f"Verified batch: {job_count} jobs ready for execution")

## 4) Execute Data Extraction

In [None]:
# Run the extraction batch, unless validation already failed
if not validation_failed:
    ux.subheader("Execute Data Extraction")

    # Describe the run up front; these lines are purely informational
    for line in (
        "",
        "Execution Process:",
        "  - Each job executes a SQL script to extract data from SQL Server",
        "  - SQL scripts are located in workspace/sql/import_files/{Cycle Type}/",
        "  - Script naming: 2_Create_{Import File}_Moodys_ImportFile.sql",
        "  - Connection: ASSURANT / Database: DW_EXP_MGMT_USER",
        "  - Output: Account and Location CSV files saved to files/data/",
        "",
    ):
        ux.info(line)
    ux.warning("\u26a0 Note: Large datasets may take significant time to process")
    ux.info("")

    ux.info("\nExecuting extraction scripts...")

    # step_id associates the submitted batch with this step
    result = submit_batch(extraction_batch_id, IRPClient(), step_id=step.step_id)

    # Summary of the submission
    ux.success("\n\u2713 Batch execution completed")
    ux.info(f"  Executed: {result['submitted_jobs']} jobs")
    ux.info(f"  Status: {result['batch_status']}")

    # List the CSV files reported by each successful job
    ux.subheader("Generated CSV Files")
    for job_result in result.get('jobs', []):
        response = job_result.get('response', {})
        if response.get('status') != 'SUCCESS':
            continue
        account_rows = response.get('account_rows', 0)
        location_rows = response.get('location_rows', 0)
        ux.success(f"  \u2713 Job {job_result['job_id']}: {account_rows:,} accounts, {location_rows:,} locations")
        for csv_path in response.get('csv_files', []):
            ux.info(f"      \u2192 {csv_path}")

    # A job counts as failed when its result carries an 'error' entry
    failed_jobs = [j for j in result.get('jobs', []) if 'error' in j]
    failed_count = len(failed_jobs)
    if failed_jobs:
        ux.warning(f"\n\u26a0 {failed_count} job(s) failed to execute")
        for job_result in failed_jobs:
            ux.error(f"  Job {job_result['job_id']}: {job_result['error']}")

    step.log(f"Batch executed: {result['submitted_jobs']} jobs, {failed_count} failed")
else:
    # Nothing was submitted; leave neutral values for the completion cell
    ux.warning("\u23ed Skipping execution due to validation failure")
    result = None
    failed_count = 0

## 5) Complete Step Execution

In [None]:
# Record the final outcome of this step: validation failure, job failures, or success
ux.header("Step Completion")

rule = "=" * 60

if validation_failed:
    # Validation did not pass: flag the step run as FAILED with the collected errors
    from helpers.step import update_step_run
    from helpers.constants import StepStatus

    error_message = "\n".join(validation_errors)
    update_step_run(step.run_id, StepStatus.FAILED, error_message=error_message)

    for banner_line in ("\n" + rule, "BATCH VALIDATION FAILED", rule):
        ux.error(banner_line)
    ux.info(f"\nBatch ID: {extraction_batch_id}")
    ux.error(f"\nValidation errors ({len(validation_errors)}):")
    for message in validation_errors:
        ux.error(f"  {message}")
    ux.info("\nPlease fix the validation errors and retry.")

elif failed_count > 0:
    # At least one job reported an error: flag the step run as FAILED with per-job details
    failed_job_errors = []
    for j in result.get('jobs', []):
        if 'error' in j:
            failed_job_errors.append(f"Job {j['job_id']}: {j['error']}")
    error_message = f"{failed_count} job(s) failed to execute:\n" + "\n".join(failed_job_errors)

    from helpers.step import update_step_run
    from helpers.constants import StepStatus
    update_step_run(step.run_id, StepStatus.FAILED, error_message=error_message)

    for banner_line in ("\n" + rule, "DATA EXTRACTION BATCH EXECUTION FAILED", rule):
        ux.error(banner_line)
    ux.info(f"\nBatch ID: {extraction_batch_id}")
    ux.info(f"Executed: {result['submitted_jobs']} job(s)")
    ux.error(f"Failed: {failed_count} job(s)")
    ux.info("\nFailed jobs:")
    for message in failed_job_errors:
        ux.error(f"  {message}")
    ux.info("\nPlease review the errors and resubmit failed jobs.")

else:
    # No validation errors and no failed jobs: complete the step with a summary payload
    output_data = {
        'batch_id': extraction_batch_id,
        'batch_type': batch['batch_type'],
        'batch_status': result['batch_status'],
        'submitted_jobs': result['submitted_jobs'],
        'failed_jobs': failed_count
    }
    step.complete(output_data)

    for banner_line in ("\n" + rule, "DATA EXTRACTION BATCH EXECUTED SUCCESSFULLY", rule):
        ux.success(banner_line)
    ux.info(f"\nExecuted {result['submitted_jobs']} job(s)")
    ux.info(f"Batch status: {result['batch_status']}")
    ux.info("CSV files saved to: files/data/")
    ux.info("\nNext: Proceed to Step 02 Control Totals for validation")