# Step 03: Create Batches

This notebook creates batches and jobs from the loaded configuration.

**Tasks:**
- Verify configuration is loaded and valid
- Identify batch types to create based on configuration data
- Preview databases and job configurations
- Create batches (currently EDM Creation only; other batch types are planned)
- Display batch and job summaries

In [None]:
%load_ext autoreload
%autoreload 2

# Import required modules
import sys
import os
from pathlib import Path

# Determine the notebook's actual location.
# JupyterLab doesn't always set cwd correctly, so fall back to searching the
# standard workspace layout when cwd is not inside an Active_ cycle.
cwd = Path.cwd()

if 'Active_' in str(cwd):
    # Working directory is set correctly, construct path to THIS notebook
    notebook_path = cwd / 'Step_03_Create_Batches.ipynb'
else:
    # Working directory is not set correctly (e.g., /home/jovyan)
    # Look for Active_ directories in the standard workspace structure
    workspace = Path.home() / 'workspace'

    if not workspace.exists():
        raise RuntimeError("Workspace directory not found")

    workflows = workspace / 'workflows'

    # Keep directories only (a stray file named Active_* is not a cycle) and
    # sort them: glob() makes no ordering promise, so without the sort the
    # "first" match could differ from run to run when several cycles exist.
    active_dirs = sorted(
        candidate for candidate in workflows.glob('Active_*') if candidate.is_dir()
    )

    if not active_dirs:
        # No active cycle found
        raise RuntimeError("No Active_ cycle directory found in workspace/workflows/")

    # Use the first Active_ directory (in sorted order) and point to THIS notebook
    notebook_path = active_dirs[0] / 'notebooks' / 'Stage_01_Setup' / 'Step_03_Create_Batches.ipynb'

print(f"Notebook path: {notebook_path}")

# Put the directory four levels above the notebook file on sys.path.
# NOTE(review): presumably this is where the `helpers` package imported below
# lives — confirm against the repository layout.
workspace_path = notebook_path.parent.parent.parent.parent
if str(workspace_path) not in sys.path:
    sys.path.insert(0, str(workspace_path))

from helpers import ux
from helpers.context import WorkContext
from helpers.configuration import read_configuration
from helpers.batch import create_batch
from helpers.database import execute_query
from helpers.step import Step

## 1) Initialize Context

In [None]:
# Set up the work context for this notebook
ux.header("Batch Creation")

# WorkContext is given the notebook path explicitly, as a plain string
notebook_location = str(notebook_path)
context = WorkContext(notebook_path=notebook_location)

# Echo the cycle / stage / step names exposed by the context
ux.info(f"Cycle: {context.cycle_name}")
ux.info(f"Stage: {context.stage_name}")
ux.info(f"Step: {context.step_name}")

## 2) Initialize Step Tracking

In [None]:
# Set up execution tracking for this step
ux.header("Step Execution Tracking")

try:
    # Building the Step object auto-starts it if it was not already executed
    step = Step(context)

    if step.executed:
        ux.warning("⚠ This step has already been executed")
        ux.info(f"Message: {step.status_message}")

        # Show what is recorded about the most recent run, when there is one
        from helpers.step import get_last_step_run
        previous_run = get_last_step_run(step.step_id)
        if previous_run:
            ux.info(f"Last run: #{previous_run['run_num']}")
            ux.info(f"Status: {previous_run['status']}")
            finished_at = previous_run['completed_ts']
            if finished_at:
                ux.info(f"Completed: {finished_at.strftime('%Y-%m-%d %H:%M:%S')}")

        # The user may force a re-run; declining stops the notebook here
        if not ux.yes_no("Do you want to re-run this step?"):
            ux.info("Step execution skipped")
            raise SystemExit("Step already completed")

        ux.info("Re-running step...")
        step.start(force=True)

    ux.success(f"✓ Step tracking initialized for '{context.step_name}'")

except Exception as exc:
    # SystemExit is not an Exception subclass, so the deliberate stop above
    # passes straight through this handler and halts notebook execution.
    ux.error(f"✗ Failed to initialize step tracking: {str(exc)}")
    raise

## 3) Verify Configuration

In [None]:
# Verify that the cycle has a configuration in a usable state.
# Defines cycle_id, config_id, config_status and config_created at notebook
# level; config_id is read by later cells.
ux.header("Configuration Verification")

try:
    # Get cycle ID
    cycle_result = execute_query(
        "SELECT id FROM irp_cycle WHERE cycle_name = %s",
        (context.cycle_name,)
    )

    if cycle_result.empty:
        raise ValueError(f"Cycle not found: {context.cycle_name}")

    cycle_id = int(cycle_result.iloc[0]['id'])  # Convert numpy.int64 to Python int

    # Get the most recently created configuration for this cycle
    config_result = execute_query(
        "SELECT id, status, created_ts FROM irp_configuration WHERE cycle_id = %s ORDER BY created_ts DESC LIMIT 1",
        (cycle_id,)
    )

    if config_result.empty:
        # Give the user actionable guidance here; the failure itself is
        # recorded once, by the handler at the bottom of this cell.
        ux.error("✗ No configuration found for this cycle")
        ux.info("Please complete Step 02: Validate Configuration File first")
        raise ValueError("No configuration found for cycle")

    latest_config = config_result.iloc[0]
    config_id = int(latest_config['id'])  # Convert numpy.int64 to Python int
    config_status = latest_config['status']
    config_created = latest_config['created_ts']

    # Verify status is VALID or ACTIVE
    if config_status not in ('VALID', 'ACTIVE'):
        ux.error(f"✗ Configuration status is '{config_status}' (expected VALID or ACTIVE)")
        raise ValueError(f"Configuration must be VALID or ACTIVE, found: {config_status}")

    # Display configuration summary
    config_info = [
        ["Configuration ID", config_id],
        ["Status", config_status],
        ["Created", config_created.strftime('%Y-%m-%d %H:%M:%S')],
    ]
    ux.table(config_info, headers=["Property", "Value"])
    ux.success("✓ Configuration verified")

    step.log(f"Configuration verified: ID={config_id}, Status={config_status}")

except Exception as e:
    # Single place where the step is marked failed. Previously the branches
    # above also called step.fail() before raising, so the step was failed
    # twice for the same error.
    ux.error(f"✗ Configuration verification failed: {str(e)}")
    step.fail(f"Configuration verification failed: {str(e)}")
    raise

## 4) Identify Batch Types to Create

In [None]:
# Analyze the configuration to determine which batch types are needed.
# Defines `databases` and `batch_types_to_create`, which later cells read.
ux.header("Batch Type Identification")

try:
    # Read configuration data
    config_data = read_configuration(config_id)

    # Extract the configuration_data field; `or {}` also covers the case where
    # the key is present but holds None, which would break the .get() below.
    configuration_data = config_data.get('configuration_data') or {}

    # Identify batch types based on configuration content
    batch_types_info = []
    batch_types_to_create = []

    # Check for EDM Creation (Databases sheet)
    databases = configuration_data.get('Databases', [])
    if databases:
        batch_types_info.append(["EDM Creation", len(databases), "One job per database"])
        batch_types_to_create.append('EDM Creation')

    # Future: Add more batch types here
    # portfolios = configuration_data.get('Portfolios', [])
    # if portfolios:
    #     batch_types_info.append(["Portfolio Creation", len(portfolios), "One job per portfolio"])
    #     batch_types_to_create.append('Portfolio Creation')

    # analyses = configuration_data.get('Analysis Table', [])
    # if analyses:
    #     batch_types_info.append(["Analysis", len(analyses), "One job per analysis"])
    #     batch_types_to_create.append('Analysis')

    if not batch_types_info:
        # Nothing to create: explain why, then let the handler below record
        # the failure (it is the only place that calls step.fail()).
        ux.warning("⚠ No batch types identified from configuration")
        ux.info("Configuration may not contain required data sheets (Databases, Portfolios, etc.)")
        raise ValueError("No batch types identified in configuration")

    # Display identified batch types
    ux.info("Batch types identified from configuration:")
    ux.table(batch_types_info, headers=["Batch Type", "Row Count", "Description"])
    ux.success(f"✓ Found {len(batch_types_to_create)} batch type(s) to create")

    step.log(f"Identified {len(batch_types_to_create)} batch type(s): {', '.join(batch_types_to_create)}")

except Exception as e:
    # Previously the empty case also called step.fail() before raising, so the
    # step was failed twice for the same error.
    ux.error(f"✗ Batch type identification failed: {str(e)}")
    step.fail(f"Batch type identification failed: {str(e)}")
    raise

## 5) Preview: EDM Creation Batch

In [None]:
# Show which databases will be turned into jobs by the EDM Creation batch
ux.header("EDM Creation Batch Preview")

if 'EDM Creation' not in batch_types_to_create:
    ux.info("EDM Creation batch not needed (no databases in configuration)")
else:
    database_count = len(databases)
    ux.info(f"This batch will create {database_count} job(s), one for each database:")
    ux.info("")

    # One single-column row per database; entries without a name show 'N/A'
    database_rows = [[entry.get('Database', 'N/A')] for entry in databases]
    ux.table(database_rows, headers=["Database"])

    ux.info("")
    for note in (
        "Each database will become one job with configuration containing:",
        "  - Metadata from configuration file",
        "  - Database-specific fields (Database, Version, EDM_Type, etc.)",
    ):
        ux.info(note)

    step.log(f"Previewed EDM Creation batch: {database_count} databases")

## 6) Create Batches

In [None]:
# Create one batch per identified batch type, after user confirmation.
# Defines `created_batches` ({batch_type: batch_id}), which later cells read.
ux.header("Batch Creation")

# Confirm with user
batch_summary = ", ".join(batch_types_to_create)
ux.info(f"Ready to create batches: {batch_summary}")
proceed = ux.yes_no("Create these batches?")

if not proceed:
    ux.info("Batch creation cancelled by user")
    step.log("User cancelled batch creation")
    # SystemExit stops the notebook without going through the error handler below
    raise SystemExit("User cancelled batch creation")

# Create batches
created_batches = {}

try:
    for batch_type in batch_types_to_create:
        ux.subheader(f"Creating batch: {batch_type}")

        # Create batch (this will create jobs atomically).
        # Convert to a plain int immediately so the same value is stored AND
        # sent as the query parameter below; previously only the stored copy
        # was converted and the raw (possibly numpy) value went to the query.
        batch_id = int(create_batch(
            batch_type=batch_type,
            configuration_id=config_id,
            step_id=step.step_id
        ))

        created_batches[batch_type] = batch_id

        # Get job count for this batch
        job_count_result = execute_query(
            "SELECT COUNT(*) as count FROM irp_job WHERE batch_id = %s",
            (batch_id,)
        )
        job_count = int(job_count_result.iloc[0]['count'])

        ux.success(f"✓ Batch created: ID={batch_id}")
        ux.info(f"  Jobs created: {job_count}")

        step.log(f"Created batch '{batch_type}': ID={batch_id}, Jobs={job_count}")

    ux.success(f"\n✓ All batches created successfully ({len(created_batches)} total)")

except Exception as e:
    ux.error(f"✗ Batch creation failed: {str(e)}")
    step.fail(f"Batch creation failed: {str(e)}")
    raise

## 7) Display Batch Summary

In [None]:
# Display a summary of all created batches and compute `total_jobs`.
ux.header("Batch Summary")

# The Step Completion cell reads total_jobs, so it must exist even when this
# display-only cell fails or there are no batches. Previously it was only
# assigned inside the try/if below, so a swallowed error here surfaced later
# as a NameError that failed the whole step.
# NOTE: if the summary query itself fails, total_jobs stays 0.
total_jobs = 0

try:
    # Get batch details
    batch_ids = list(created_batches.values())

    if batch_ids:
        # Build query to get all batches (one %s placeholder per batch ID)
        placeholders = ', '.join(['%s'] * len(batch_ids))
        batch_query = f"""
            SELECT
                b.id,
                b.batch_type,
                b.status,
                b.created_ts,
                COUNT(j.id) as job_count
            FROM irp_batch b
            LEFT JOIN irp_job j ON b.id = j.batch_id
            WHERE b.id IN ({placeholders})
            GROUP BY b.id, b.batch_type, b.status, b.created_ts
            ORDER BY b.created_ts
        """

        batch_results = execute_query(batch_query, tuple(batch_ids))

        # Compute the total before formatting rows, so a formatting problem
        # cannot leave a partially accumulated count behind.
        total_jobs = sum(int(count) for count in batch_results['job_count'])

        # Display batch information
        batch_rows = [
            [
                batch['batch_type'],
                batch['id'],
                batch['status'],
                int(batch['job_count']),
                batch['created_ts'].strftime('%Y-%m-%d %H:%M:%S'),
            ]
            for _, batch in batch_results.iterrows()
        ]

        ux.table(batch_rows, headers=["Batch Type", "Batch ID", "Status", "Jobs", "Created"])

        ux.info(f"\nTotal batches: {len(batch_ids)}")
        ux.info(f"Total jobs: {total_jobs}")

        step.log(f"Batch summary: {len(batch_ids)} batches, {total_jobs} total jobs")

except Exception as e:
    ux.error(f"✗ Failed to display batch summary: {str(e)}")
    # Don't fail step, this is just display
    step.log(f"Warning: Failed to display batch summary: {str(e)}", level="WARNING")

## 8) Preview Job Configurations

In [None]:
# Preview job configurations for the EDM Creation batch (display only)
ux.header("Job Configuration Preview")

try:
    if 'EDM Creation' in created_batches:
        edm_batch_id = created_batches['EDM Creation']

        ux.subheader("EDM Creation Jobs (first 5)")

        # Get job configurations. ORDER BY makes "first 5" well defined:
        # LIMIT without an ordering may return any five rows.
        job_config_query = """
            SELECT
                jc.id,
                jc.job_configuration_data,
                j.id as job_id,
                j.status
            FROM irp_job_configuration jc
            INNER JOIN irp_job j ON jc.id = j.job_configuration_id
            WHERE jc.batch_id = %s
            ORDER BY j.id
            LIMIT 5
        """

        job_configs = execute_query(job_config_query, (edm_batch_id,))

        if not job_configs.empty:
            # Display job configuration details
            job_rows = []
            for _, job_config in job_configs.iterrows():
                # Named job_data (not config_data) so the notebook-level
                # config_data from the batch-type cell is not overwritten.
                job_data = job_config['job_configuration_data']
                job_rows.append([
                    job_config['job_id'],
                    job_data.get('Database', 'N/A'),
                    job_config['status']
                ])

            ux.table(job_rows, headers=["Job ID", "Database", "Status"])

            ux.info("\nEach job configuration contains:")
            ux.info("  - Full metadata from configuration file")
            ux.info("  - Database-specific fields from Databases sheet")
            ux.info("  - Additional fields: Description, Connection details, etc.")
        else:
            ux.warning("No job configurations found")

    step.log("Job configuration preview displayed")

except Exception as e:
    ux.error(f"✗ Failed to preview job configurations: {str(e)}")
    # Don't fail step, this is just display
    step.log(f"Warning: Failed to preview jobs: {str(e)}", level="WARNING")

## 9) Complete Step Execution

In [None]:
# Mark the step as complete and report the outcome
ux.header("Step Completion")

try:
    # Payload handed to step.complete(): what was created and from which configuration
    output_data = dict(
        configuration_id=config_id,
        batches=created_batches,  # {batch_type: batch_id}
        batch_types_created=batch_types_to_create,
        total_job_count=total_jobs,
    )

    step.complete(output_data)

    # Closing banner and next-step guidance
    banner = "=" * 60
    ux.success("\n" + banner)
    ux.success("✓ BATCHES CREATED SUCCESSFULLY")
    ux.success(banner)

    batch_total = len(created_batches)
    ux.info(f"\nCreated {batch_total} batch(es) with {total_jobs} total job(s)")
    ux.info("Batches are in INITIATED status and ready for submission")
    ux.info("\nNext: Stage 02 will handle batch submission and job monitoring")

except Exception as exc:
    failure_text = str(exc)
    ux.error(f"✗ Step completion failed: {failure_text}")
    step.fail(failure_text)
    raise