# Classification Pipeline Debug

This notebook debugs the classification pipeline using the actual `ClassificationPipeline` class.
It tests classification training, prediction, and evaluation using the real pipeline methods.

In [None]:
# Import pipeline modules
import sys
sys.path.append('/home/kosaraju/mgpt-serve/mgpt_eval')

from models.config_models import PipelineConfig
from pipelines.classification_pipeline import ClassificationPipeline
from pipelines.embedding_pipeline import EmbeddingPipeline
import pandas as pd
import numpy as np

In [None]:
# Load configuration
# Parse the example YAML into a PipelineConfig, then echo two fields as a
# quick sanity check that the expected file was picked up.
config_path = "/home/kosaraju/mgpt-serve/mgpt_eval/configs/examples/config_training_from_embeddings.yaml"
config = PipelineConfig.from_yaml(config_path)

print(
    f"Config loaded: {config.job.job_name}",
    f"Classification config: {config.classification}",
    sep="\n",
)

In [None]:
# Initialize classification pipeline
# Build the object under test from the loaded configuration.
classification_pipeline = ClassificationPipeline(config)
print("Classification pipeline initialized")

# List every public callable so later cells know which entry points exist.
public_methods = [
    name
    for name in dir(classification_pipeline)
    if not name.startswith('_') and callable(getattr(classification_pipeline, name))
]
print(f"Pipeline methods: {public_methods}")

In [None]:
# Create training and test data
def _build_frame(prefix, claims, labels, repeats):
    """Tile `claims` and `labels` `repeats` times and attach sequential ids.

    Returns a DataFrame with columns 'mcid' (``<prefix>_NNN``), 'claims' and
    'label', in that order.
    """
    tiled_claims = claims * repeats
    return pd.DataFrame({
        'mcid': [f'{prefix}_{idx:03d}' for idx in range(len(tiled_claims))],
        'claims': tiled_claims,
        'label': labels * repeats,
    })


# Five hand-written claim strings repeated four times -> 20 training rows.
train_data = _build_frame(
    'TRAIN',
    [
        'N6320 G0378 |eoc| Z91048 M1710',
        'E119 A1234 |eoc| B5678 C9012',
        'Z03818 D3456 |eoc| F7890 G1234',
        'H5678 I9012 |eoc| J1234 K5678',
        'L9012 M3456 |eoc| N6320 O7890',
    ],
    [1, 1, 1, 0, 1],
    repeats=4,
)

# Five different claim strings repeated twice -> 10 test rows.
test_data = _build_frame(
    'TEST',
    [
        'P1234 Q5678 |eoc| E119 R9012',
        'S3456 T7890 |eoc| U1234 V5678',
        'W9012 X3456 |eoc| Z03818 Y7890',
        'Z1234 A5678 |eoc| B9012 C3456',
        'D7890 E1234 |eoc| N6320 F5678',
    ],
    [1, 0, 1, 0, 1],
    repeats=2,
)

# Report split sizes first, then the class balance of each split.
print(f"Training data: {len(train_data)} samples")
print(f"Test data: {len(test_data)} samples")
print(f"Train labels distribution: {train_data['label'].value_counts().to_dict()}")
print(f"Test labels distribution: {test_data['label'].value_counts().to_dict()}")

In [None]:
# Generate embeddings for classification (using mock embeddings for testing)
print("Generating embeddings for classification...")

# Prefer the real EmbeddingPipeline; any failure falls through to mock vectors
# so the downstream cells still have arrays to work with.
try:
    embedding_pipeline = EmbeddingPipeline(config)
    train_embeddings = embedding_pipeline.generate_embeddings(train_data)
    test_embeddings = embedding_pipeline.generate_embeddings(test_data)
    print("✓ Real embeddings generated")
    print(f"  Train embeddings: {train_embeddings.shape}")
    print(f"  Test embeddings: {test_embeddings.shape}")

except Exception as exc:
    print(f"Real embeddings failed: {exc}")
    print("Using mock embeddings...")

    # Seed the global NumPy RNG so the mock vectors are the same on every run.
    # The train array is drawn before the test array; that order fixes the values.
    np.random.seed(42)
    mock_dim = 768
    train_embeddings = np.random.randn(len(train_data), mock_dim)
    test_embeddings = np.random.randn(len(test_data), mock_dim)
    print("✓ Mock embeddings created")
    print(f"  Train embeddings: {train_embeddings.shape}")
    print(f"  Test embeddings: {test_embeddings.shape}")

In [None]:
# Test classification training
print("Testing classification training...")

# Probe for a training entry point: `train` is preferred, `fit` is the
# fallback, and if neither exists the public callables are listed instead.
try:
    if hasattr(classification_pipeline, 'train'):
        models = classification_pipeline.train(train_embeddings, train_data['label'])
        print("✓ Training completed")
        # A dict result is summarised by its keys; anything else by its type.
        if isinstance(models, dict):
            trained_summary = list(models.keys())
        else:
            trained_summary = type(models)
        print(f"  Models trained: {trained_summary}")

    elif hasattr(classification_pipeline, 'fit'):
        classification_pipeline.fit(train_embeddings, train_data['label'])
        print("✓ Fit completed")

    else:
        print("No train/fit method found")
        available_methods = [
            name
            for name in dir(classification_pipeline)
            if not name.startswith('_') and callable(getattr(classification_pipeline, name))
        ]
        print(f"Available methods: {available_methods}")

except Exception as exc:
    # Keep the notebook running, but show the full traceback for debugging.
    print(f"✗ Training failed: {exc}")
    import traceback
    traceback.print_exc()

In [None]:
# Test classification prediction
print("Testing classification prediction...")

# `predict` is tried first; `evaluate` is only used when `predict` is absent.
try:
    if hasattr(classification_pipeline, 'predict'):
        predictions = classification_pipeline.predict(test_embeddings)
        print("✓ Predictions completed")
        # Array-likes report `.shape`; other containers report their length.
        if hasattr(predictions, 'shape'):
            predictions_size = predictions.shape
        else:
            predictions_size = len(predictions)
        print(f"  Predictions shape: {predictions_size}")
        print(f"  Predictions: {predictions[:5]}")

    elif hasattr(classification_pipeline, 'evaluate'):
        results = classification_pipeline.evaluate(test_embeddings, test_data['label'])
        print("✓ Evaluation completed")
        print(f"  Results: {results}")

    else:
        print("No predict/evaluate method found")

except Exception as exc:
    # Keep the notebook running, but show the full traceback for debugging.
    print(f"✗ Prediction failed: {exc}")
    import traceback
    traceback.print_exc()

In [None]:
# Test full pipeline run
print("Testing full classification pipeline run...")

# Call the pipeline's main `run` entry point with the raw frames and describe
# whatever comes back.
try:
    results = classification_pipeline.run(train_data, test_data)
    print("✓ Pipeline run completed")
    print(f"  Results type: {type(results)}")

    if isinstance(results, dict):
        print(f"  Results keys: {list(results.keys())}")
        # For each entry show its type, plus a shape or length where available.
        for entry_name, entry in results.items():
            print(f"    {entry_name}: {type(entry)}")
            if hasattr(entry, 'shape'):
                print(f"      Shape: {entry.shape}")
            elif isinstance(entry, (list, dict)):
                print(f"      Length: {len(entry)}")
    else:
        print("  Results keys: Not a dict")

except Exception as exc:
    # Keep the notebook running, but show the full traceback for debugging.
    print(f"✗ Pipeline run failed: {exc}")
    import traceback
    traceback.print_exc()

In [None]:
# Test with existing embeddings
print("Testing with pre-existing embeddings...")

# Two possible entry points are probed: loading from a file path, or (only
# if loading is unavailable) injecting the in-memory arrays directly.
try:
    can_load_from_disk = hasattr(classification_pipeline, 'load_embeddings')

    if can_load_from_disk:
        # Placeholder path relative to the working directory; a failed load is
        # caught and reported by the handler below.
        embeddings_path = "test_embeddings.pkl"
        embeddings = classification_pipeline.load_embeddings(embeddings_path)
        print(f"✓ Embeddings loaded from {embeddings_path}")

    elif hasattr(classification_pipeline, 'set_embeddings'):
        classification_pipeline.set_embeddings(train_embeddings, test_embeddings)
        print("✓ Embeddings set directly")

    else:
        print("No embedding loading/setting methods found")

except Exception as exc:
    # Failure is reported as a one-line message; no traceback is printed here.
    print(f"Embedding loading test: {exc}")

In [None]:
# Debug classification pipeline internals
# Introspects the pipeline object without assuming any attribute exists, so the
# cell reports what is missing instead of stopping on the first AttributeError.
print("=== Classification Pipeline Debug ===")

# Check pipeline attributes
for attr_label in ('config', 'logger', 'models'):
    print(f"Pipeline {attr_label}: {hasattr(classification_pipeline, attr_label)}")

# Check configuration
pipeline_config = getattr(classification_pipeline, 'config', None)
if pipeline_config is not None:
    # Resolve the section defensively *before* printing it: reading
    # `config.classification` directly would raise when the attribute is
    # absent, which made the later hasattr() guard ineffective.
    classification_cfg = getattr(pipeline_config, 'classification', None)
    print(f"\nConfig classification: {classification_cfg}")
    if classification_cfg:
        # Each field is optional from this cell's point of view; a missing one
        # is shown as '<not set>' rather than aborting the remaining output.
        for field_label, field_name in (
            ("Classification models", "models"),
            ("CV folds", "cross_validation_folds"),
            ("Hyperparameter search", "hyperparameter_search"),
        ):
            print(f"{field_label}: {getattr(classification_cfg, field_name, '<not set>')}")

# Check all methods and attributes
all_attrs = [attr for attr in dir(classification_pipeline) if not attr.startswith('_')]
methods, properties = [], []
for attr in all_attrs:
    # Resolve each attribute once (the value decides which list it goes in).
    try:
        attr_value = getattr(classification_pipeline, attr)
    except Exception:
        # A property getter may raise; still list the name, as a non-callable,
        # so one failing attribute does not hide all the others.
        properties.append(attr)
        continue
    (methods if callable(attr_value) else properties).append(attr)

print(f"\nMethods: {methods}")
print(f"Properties: {properties}")

In [None]:
# Test individual classifier methods if available
print("Testing individual classifier methods...")

# Optional per-model training entry points the pipeline may expose.
classifier_methods = ['train_logistic_regression', 'train_svm', 'train_random_forest']

for method_name in classifier_methods:
    # Report absent methods and move on to the next candidate.
    if not hasattr(classification_pipeline, method_name):
        print(f"Method not found: {method_name}")
        continue

    print(f"Found method: {method_name}")
    try:
        trainer = getattr(classification_pipeline, method_name)
        result = trainer(train_embeddings, train_data['label'])
        print(f"  ✓ {method_name} completed: {type(result)}")
    except Exception as exc:
        # One failing trainer must not stop the remaining ones from running.
        print(f"  ✗ {method_name} failed: {exc}")