# Configuration Issues Debug

This notebook debugs configuration issues using the actual `PipelineConfig` class.
It tests configuration loading, validation, and usage in the pipeline components.

In [None]:
# Import configuration modules
import sys
sys.path.append('/home/kosaraju/mgpt-serve/mgpt_eval')

from models.config_models import PipelineConfig
import os
import yaml
from pathlib import Path

In [None]:
# Probe the known configuration file locations and record which ones exist.
print("Discovering configuration files...")

# Candidate YAML files: the main pipeline config followed by the examples.
config_paths = [
    "/home/kosaraju/mgpt-serve/mgpt_eval/configs/pipeline_config.yaml",
    "/home/kosaraju/mgpt-serve/mgpt_eval/configs/examples/config_embeddings_only.yaml",
    "/home/kosaraju/mgpt-serve/mgpt_eval/configs/examples/config_evaluation_only.yaml",
    "/home/kosaraju/mgpt-serve/mgpt_eval/configs/examples/config_training_from_embeddings.yaml",
    "/home/kosaraju/mgpt-serve/mgpt_eval/configs/examples/custom_medical_config.yaml",
]

# Paths that exist on disk, kept in the same order as `config_paths`.
existing_configs = []
for config_path in config_paths:
    if not os.path.exists(config_path):
        print(f"✗ Missing: {os.path.basename(config_path)}")
        continue
    existing_configs.append(config_path)
    print(f"✓ Found: {os.path.basename(config_path)}")

print(f"\nTotal configs found: {len(existing_configs)}")

In [None]:
# Load every discovered configuration file in two separately reported stages:
#   1. raw YAML parsing
#   2. PipelineConfig construction
# Each stage has its own try block so that a failure is attributed to the
# stage that actually raised it (a single shared try would label file-read
# errors and empty-YAML errors as PipelineConfig failures).
import traceback

print("Testing individual configuration loading...")

# Top-level sections reported as "components" when present and truthy.
component_names = ('job', 'input', 'api', 'embedding', 'classification', 'evaluation')

for config_path in existing_configs:
    config_name = os.path.basename(config_path)
    print(f"\n--- Testing {config_name} ---")

    # Stage 1: raw YAML parsing.
    try:
        with open(config_path, 'r', encoding='utf-8') as f:
            yaml_content = yaml.safe_load(f)
    except yaml.YAMLError as e:
        print(f"  ✗ YAML parsing failed: {e}")
        continue
    except (OSError, UnicodeDecodeError) as e:
        # The file vanished, is unreadable, or is not UTF-8 text.
        print(f"  ✗ Could not read file: {e}")
        continue

    print("  ✓ YAML parsing successful")
    if isinstance(yaml_content, dict):
        print(f"    Top-level keys: {list(yaml_content.keys())}")
    else:
        # An empty file parses to None, and a bare list or scalar is valid
        # YAML too; none of these have keys to list.
        print(f"    ⚠️  Top-level YAML value is {type(yaml_content).__name__}, not a mapping")

    # Stage 2: PipelineConfig construction.
    try:
        config = PipelineConfig.from_yaml(config_path)
    except Exception as e:
        print(f"  ✗ PipelineConfig loading failed: {e}")
        # Full traceback for more detail on validation errors.
        traceback.print_exc()
        continue

    print("  ✓ PipelineConfig loading successful")
    job = getattr(config, 'job', None)
    print(f"    Job name: {getattr(job, 'job_name', 'Missing')}")

    # A component counts as present when the attribute exists and is truthy.
    components = [name for name in component_names if getattr(config, name, None)]
    print(f"    Components: {components}")

In [None]:
# Validate the main pipeline configuration section by section and print a
# pass / warn / fail line for each check.
print("Testing configuration validation...")


def dataset_spec_status(dataset_path, train_path, test_path):
    """Classify how the input dataset is specified.

    Args:
        dataset_path: Path of a single dataset, or a falsy value if unset.
        train_path: Path of the training split, or a falsy value if unset.
        test_path: Path of the test split, or a falsy value if unset.

    Returns:
        'conflict' if a single dataset is combined with any train/test path,
        'valid' if exactly one complete mode is given (a single dataset, or
        both train and test paths), and 'missing' otherwise (nothing given,
        or only one of train/test).
    """
    has_single = bool(dataset_path)
    has_split = bool(train_path and test_path)
    # A single dataset plus even one split path is ambiguous, so it is
    # flagged as a conflict rather than silently accepted.
    if has_single and (train_path or test_path):
        return 'conflict'
    if has_single or has_split:
        return 'valid'
    return 'missing'


# Test with main pipeline config
main_config_path = "/home/kosaraju/mgpt-serve/mgpt_eval/configs/pipeline_config.yaml"

if os.path.exists(main_config_path):
    try:
        config = PipelineConfig.from_yaml(main_config_path)
        print("✓ Main config loaded successfully")

        # Validate specific fields
        print("\nValidation checks:")

        # Job configuration
        if hasattr(config, 'job') and config.job:
            print("  ✓ Job config present")
            print(f"    Job name: {getattr(config.job, 'job_name', 'Missing')}")
            print(f"    Output dir: {getattr(config.job, 'output_base_dir', 'Missing')}")
            print(f"    Split ratio: {getattr(config.job, 'split_ratio', 'Missing')}")
        else:
            print("  ✗ Job config missing")

        # Input configuration
        if hasattr(config, 'input') and config.input:
            print("  ✓ Input config present")

            # Check input data specification
            dataset_path = getattr(config.input, 'dataset_path', None)
            train_path = getattr(config.input, 'train_dataset_path', None)
            test_path = getattr(config.input, 'test_dataset_path', None)

            print(f"    Dataset path: {dataset_path}")
            print(f"    Train path: {train_path}")
            print(f"    Test path: {test_path}")

            # Check for conflicting or missing specifications
            status = dataset_spec_status(dataset_path, train_path, test_path)
            if status == 'conflict':
                print("    ⚠️  Both single and split datasets specified")
            elif status == 'missing':
                print("    ✗ No valid dataset specification")
            else:
                print("    ✓ Valid dataset specification")
        else:
            print("  ✗ Input config missing")

        # API configuration
        if hasattr(config, 'api') and config.api:
            print("  ✓ API config present")
            print(f"    Host: {getattr(config.api, 'host', 'Missing')}")
            print(f"    Port: {getattr(config.api, 'port', 'Missing')}")
        else:
            print("  ✗ API config missing")

        # Evaluation configuration (no "missing" line is printed for it)
        if hasattr(config, 'evaluation') and config.evaluation:
            print("  ✓ Evaluation config present")

            if hasattr(config.evaluation, 'target_word') and config.evaluation.target_word:
                target_codes = getattr(config.evaluation.target_word, 'target_codes', [])
                print(f"    Target codes: {target_codes}")

                # `not target_codes` already covers both None and an empty
                # list, so no separate length check is needed.
                if not target_codes:
                    print("    ⚠️  Target codes list is empty")
                else:
                    print(f"    ✓ Target codes properly configured ({len(target_codes)} codes)")

    except Exception as e:
        print(f"✗ Main config validation failed: {e}")
else:
    print(f"Main config file not found: {main_config_path}")

In [None]:
# Exercise the ways pipeline code reads values off a loaded config: plain
# attribute access, getattr with a fallback, hasattr probing, and a guarded
# walk into nested sections. Each pattern reports its own success or failure.
print("Testing configuration field access patterns...")

if existing_configs:
    # The first discovered file serves as the sample configuration.
    test_config_path = existing_configs[0]

    try:
        config = PipelineConfig.from_yaml(test_config_path)
        print(f"Testing with: {os.path.basename(test_config_path)}")

        print("\nAccess pattern tests:")

        # Pattern 1: direct attribute access.
        try:
            job_name = config.job.job_name
            print(f"  ✓ Direct access: config.job.job_name = {job_name}")
        except AttributeError as exc:
            print(f"  ✗ Direct access failed: {exc}")

        # Pattern 2: getattr with a fallback value.
        try:
            split_ratio = getattr(config.job, 'split_ratio', 'Not found')
            print(f"  ✓ Getattr access: split_ratio = {split_ratio}")
        except Exception as exc:
            print(f"  ✗ Getattr access failed: {exc}")

        # Pattern 3: hasattr probe, then read the attribute only if present.
        try:
            has_embedding = hasattr(config, 'embedding')
            print(f"  ✓ Hasattr check: has embedding = {has_embedding}")

            if has_embedding:
                print(f"    Embedding config: {config.embedding}")
        except Exception as exc:
            print(f"  ✗ Hasattr check failed: {exc}")

        # Pattern 4: nested access, stopping at the first absent/empty level.
        try:
            evaluation = getattr(config, 'evaluation', None)
            if not evaluation:
                print("  - No evaluation config")
            elif not getattr(evaluation, 'target_word', None):
                print("  - No target_word config")
            else:
                target_codes = evaluation.target_word.target_codes
                print(f"  ✓ Nested access: target_codes = {target_codes}")
        except Exception as exc:
            print(f"  ✗ Nested access failed: {exc}")

    except Exception as exc:
        print(f"Configuration loading failed: {exc}")

In [None]:
# Load every discovered configuration and compare settings across files:
# API host/port agreement, plus a side-by-side listing of target codes.
print("Testing configuration consistency...")

# Successfully loaded configs keyed by file name; files that fail to load
# are reported and left out.
configs = {}
for config_path in existing_configs:
    try:
        configs[os.path.basename(config_path)] = PipelineConfig.from_yaml(config_path)
    except Exception as exc:
        print(f"Failed to load {os.path.basename(config_path)}: {exc}")

print(f"\nLoaded {len(configs)} configurations for comparison")

if len(configs) <= 1:
    print("Not enough configurations to compare")
else:
    # Only configs that carry a non-empty `api` section take part.
    api_sections = {
        name: cfg.api
        for name, cfg in configs.items()
        if hasattr(cfg, 'api') and cfg.api
    }
    api_hosts = {name: getattr(api, 'host', None) for name, api in api_sections.items()}
    api_ports = {name: getattr(api, 'port', None) for name, api in api_sections.items()}

    print("\nAPI Configuration Comparison:")
    print(f"  Hosts: {api_hosts}")
    print(f"  Ports: {api_ports}")

    # More than one distinct value means the files disagree.
    unique_hosts = set(api_hosts.values())
    unique_ports = set(api_ports.values())

    if len(unique_hosts) <= 1:
        print("  ✓ Consistent API hosts")
    else:
        print(f"  ⚠️  Inconsistent API hosts: {unique_hosts}")

    if len(unique_ports) <= 1:
        print("  ✓ Consistent API ports")
    else:
        print(f"  ⚠️  Inconsistent API ports: {unique_ports}")

    # Collect target codes from configs that define evaluation.target_word.
    target_codes_configs = {}
    for name, cfg in configs.items():
        evaluation = getattr(cfg, 'evaluation', None)
        if evaluation and getattr(evaluation, 'target_word', None):
            target_codes_configs[name] = evaluation.target_word.target_codes

    if target_codes_configs:
        print("\nTarget Codes Comparison:")
        for name, codes in target_codes_configs.items():
            print(f"  {name}: {codes}")

In [None]:
# Check that each pipeline component can be imported and constructed from a
# loaded configuration. Every component is tried independently, so one
# failure does not stop the remaining checks.
print("Testing configuration with pipeline components...")

if configs:
    # Use the first loaded configuration as the sample.
    config_name = next(iter(configs))
    config = configs[config_name]
    print(f"Testing with: {config_name}")

    # Embedding pipeline
    try:
        from pipelines.embedding_pipeline import EmbeddingPipeline
        embedding_pipeline = EmbeddingPipeline(config)
    except Exception as exc:
        print(f"  ✗ EmbeddingPipeline config error: {exc}")
    else:
        print("  ✓ EmbeddingPipeline accepts config")

    # Classification pipeline
    try:
        from pipelines.classification_pipeline import ClassificationPipeline
        classification_pipeline = ClassificationPipeline(config)
    except Exception as exc:
        print(f"  ✗ ClassificationPipeline config error: {exc}")
    else:
        print("  ✓ ClassificationPipeline accepts config")

    # Target word evaluator
    try:
        from evaluation.target_word_evaluator import TargetWordEvaluator
        target_evaluator = TargetWordEvaluator(config)
    except Exception as exc:
        print(f"  ✗ TargetWordEvaluator config error: {exc}")
    else:
        print("  ✓ TargetWordEvaluator accepts config")

    # Evaluation pipeline
    try:
        from pipelines.evaluation_pipeline import EvaluationPipeline
        evaluation_pipeline = EvaluationPipeline(config)
    except Exception as exc:
        print(f"  ✗ EvaluationPipeline config error: {exc}")
    else:
        print("  ✓ EvaluationPipeline accepts config")

    # End-to-end pipeline
    try:
        from pipelines.end_to_end_pipeline import EndToEndPipeline
        e2e_pipeline = EndToEndPipeline(config)
    except Exception as exc:
        print(f"  ✗ EndToEndPipeline config error: {exc}")
    else:
        print("  ✓ EndToEndPipeline accepts config")

In [None]:
# Debug the split_ratio issue: report which config section exposes
# split_ratio, then check that its use matches the dataset mode (a single
# dataset needs a split ratio; pre-split train/test datasets should not
# set one).
print("=== Split Ratio Issue Debug ===")

print("Checking split_ratio placement in configurations...")

for config_name, config in configs.items():
    print(f"\n{config_name}:")

    # Resolve both sections once; None when the attribute is absent.
    job_cfg = getattr(config, 'job', None)
    input_cfg = getattr(config, 'input', None)

    # Placement check: does the job section expose split_ratio at all?
    if job_cfg:
        if hasattr(job_cfg, 'split_ratio'):
            print(f"  ✓ split_ratio in job config: {job_cfg.split_ratio}")
        else:
            print("  ✗ split_ratio NOT in job config")

    # The remaining checks all need an input section.
    if not input_cfg:
        continue

    # Placement check: does the input section expose split_ratio?
    if hasattr(input_cfg, 'split_ratio'):
        print(f"  ! split_ratio found in input config: {input_cfg.split_ratio}")
        print("    This might be the issue - should it be in job config instead?")
    else:
        print("  - split_ratio not in input config")

    # Check input data specification
    dataset_path = getattr(input_cfg, 'dataset_path', None)
    train_path = getattr(input_cfg, 'train_dataset_path', None)
    test_path = getattr(input_cfg, 'test_dataset_path', None)

    print("  Input data specification:")
    print(f"    Single dataset: {bool(dataset_path)}")
    print(f"    Split datasets: {bool(train_path and test_path)}")

    # "Specified" is decided from the value, not from attribute existence:
    # hasattr() is also true for an attribute that exists but holds None
    # (presumably the case for optional fields on the config model -- TODO
    # confirm against models.config_models). getattr on a None section
    # safely yields None as well.
    split_ratio_set = any(
        getattr(section, 'split_ratio', None) is not None
        for section in (job_cfg, input_cfg)
    )

    # Logic check
    if dataset_path and (train_path or test_path):
        print("    ⚠️  CONFLICT: Both single and split datasets specified")
    elif dataset_path:
        print("    ✓ Single dataset mode - split_ratio should be used")
        if not split_ratio_set:
            print("    ✗ ERROR: Single dataset mode but no split_ratio found")
    elif train_path and test_path:
        print("    ✓ Split dataset mode - split_ratio should NOT be used")
        if split_ratio_set:
            print("    ⚠️  WARNING: Split dataset mode but split_ratio is specified")
    else:
        print("    ✗ ERROR: No valid dataset specification")