# Embedding Pipeline Debug

This notebook debugs the embedding pipeline using the actual `EmbeddingPipeline` class.
It tests embedding generation step by step using the real pipeline methods.

In [None]:
# Import pipeline modules
import sys
# Add the mgpt_eval directory to the module search path; presumably this is
# what lets the `models` and `pipelines` imports below resolve.
# NOTE(review): this is an absolute, user-specific path -- the notebook will
# not import on another machine unless it is adjusted.
sys.path.append('/home/kosaraju/mgpt-serve/mgpt_eval')

# Project-local modules (only importable after the sys.path tweak above).
from models.config_models import PipelineConfig
from pipelines.embedding_pipeline import EmbeddingPipeline
from models.data_models import DataSample, DataBatch
# Third-party libraries.
import pandas as pd
import numpy as np

In [None]:
# Load configuration
# Parse the embeddings-only example YAML into a PipelineConfig; `config` is
# consumed by the pipeline-construction cell that follows.
config_path = "/home/kosaraju/mgpt-serve/mgpt_eval/configs/examples/config_embeddings_only.yaml"
config = PipelineConfig.from_yaml(config_path)

# Echo the job name first, then the embedding section, as a load sanity check.
loaded_job_name = config.job.job_name
print(f"Config loaded: {loaded_job_name}")
embedding_settings = config.embedding
print(f"Embedding config: {embedding_settings}")

In [None]:
# Initialize embedding pipeline
# Build the pipeline from the loaded config; later cells use `embedding_pipeline`.
embedding_pipeline = EmbeddingPipeline(config)
print("Embedding pipeline initialized")

# List every public (non-underscore) callable attribute so it is clear which
# entry points this pipeline object actually exposes.
public_methods = []
for member_name in dir(embedding_pipeline):
    if member_name.startswith('_'):
        continue
    if callable(getattr(embedding_pipeline, member_name)):
        public_methods.append(member_name)
print(f"Pipeline methods: {public_methods}")

In [None]:
# Create test dataset
# Five hand-written records, one (mcid, claims, label) tuple per row. Every
# claims string contains an '|eoc|' token between two groups of codes.
sample_rows = [
    ('EMB_001', 'N6320 G0378 |eoc| Z91048 M1710', 1),
    ('EMB_002', 'E119 A1234 |eoc| B5678 C9012', 1),
    ('EMB_003', 'Z03818 D3456 |eoc| F7890 G1234', 1),
    ('EMB_004', 'H5678 I9012 |eoc| J1234 K5678', 0),
    ('EMB_005', 'L9012 M3456 |eoc| N6320 O7890', 1),
]
test_data = pd.DataFrame(sample_rows, columns=['mcid', 'claims', 'label'])

print(f"Test data created: {len(test_data)} samples")
print(test_data.head())

In [None]:
# Test single sample embedding
# Runs the first row's claims string through `generate_embedding` when the
# pipeline exposes such a method, and reports the size of what comes back.
print("Testing single sample embedding...")
sample_claims = test_data.iloc[0]['claims']
print(f"Sample claims: {sample_claims}")

try:
    # The single-sample entry point is optional, so probe for it before calling.
    if hasattr(embedding_pipeline, 'generate_embedding'):
        embedding = embedding_pipeline.generate_embedding(sample_claims)
        # Prefer .shape when the result has one; otherwise fall back to len().
        print(f"✓ Single embedding generated: shape {embedding.shape if hasattr(embedding, 'shape') else len(embedding)}")
    else:
        print("No single sample embedding method found")
except Exception as e:
    print(f"✗ Single embedding failed: {e}")
    # Show the full stack, as the batch and run cells do: the message alone
    # does not say where inside the pipeline the failure happened.
    import traceback
    traceback.print_exc()

In [None]:
# Test batch embedding generation
# Feeds the whole test frame to `generate_embeddings` and summarises the
# result; any exception is reported together with its traceback.
print("Testing batch embedding generation...")

try:
    # Use the main pipeline method
    embeddings = embedding_pipeline.generate_embeddings(test_data)
    print("✓ Batch embeddings generated")

    shape_text = embeddings.shape if hasattr(embeddings, 'shape') else 'No shape attr'
    print(f"  Shape: {shape_text}")
    print(f"  Type: {type(embeddings)}")

    # Only peek at values when the result has a two-dimensional shape.
    is_two_dimensional = hasattr(embeddings, 'shape') and len(embeddings.shape) == 2
    if is_two_dimensional:
        first_row = embeddings[0]
        print(f"  First embedding sample (first 5 dims): {first_row[:5]}")

except Exception as batch_error:
    import traceback
    print(f"✗ Batch embedding failed: {batch_error}")
    traceback.print_exc()

In [None]:
# Test pipeline run method
# Calls `embedding_pipeline.run` on the test frame and, when the result is a
# dict, prints each entry's type together with a short size description.
print("Testing embedding pipeline run method...")


def describe_value(value):
    """Return a short size description of ``value`` for the results summary.

    Returns:
        ``'scalar'`` when ``value`` has no ``__len__`` attribute;
        ``value.shape`` when it has ``__len__`` and a ``shape`` attribute;
        ``len(value)`` otherwise.
    """
    if not hasattr(value, '__len__'):
        return 'scalar'
    # Check for `shape` BEFORE touching len(): passing len(value) as a getattr
    # default evaluates it eagerly, and 0-d numpy arrays expose __len__ while
    # len() on them raises TypeError.
    if hasattr(value, 'shape'):
        return value.shape
    return len(value)


try:
    # Use the main run method
    results = embedding_pipeline.run(test_data)
    print("✓ Pipeline run completed")
    print(f"  Results type: {type(results)}")
    print(f"  Results keys: {list(results.keys()) if isinstance(results, dict) else 'Not a dict'}")

    if isinstance(results, dict):
        for key, value in results.items():
            print(f"    {key}: {type(value)} - {describe_value(value)}")

except Exception as e:
    print(f"✗ Pipeline run failed: {e}")
    import traceback
    traceback.print_exc()

In [None]:
# Debug embedding pipeline internals
# Introspects the pipeline object: which expected attributes exist, what the
# API/embedding config sections hold, and how its public names split into
# callables versus plain attributes.
print("=== Embedding Pipeline Debug ===")

# Check pipeline attributes
print(f"Pipeline config: {hasattr(embedding_pipeline, 'config')}")
print(f"Pipeline logger: {hasattr(embedding_pipeline, 'logger')}")
print(f"Pipeline API client: {hasattr(embedding_pipeline, 'api_client')}")

# Check configuration
if hasattr(embedding_pipeline, 'config'):
    pipeline_config = embedding_pipeline.config
    print(f"\nConfig API: {pipeline_config.api}")
    print(f"Config embedding: {pipeline_config.embedding}")


def _is_callable_attr(name):
    """Return True when the named pipeline attribute is callable."""
    return callable(getattr(embedding_pipeline, name))


# Check all methods and attributes (public names only, i.e. no leading underscore)
all_attrs = [name for name in dir(embedding_pipeline) if name[:1] != '_']
methods = list(filter(_is_callable_attr, all_attrs))
properties = [name for name in all_attrs if not _is_callable_attr(name)]

print(f"\nMethods: {methods}")
print(f"Properties: {properties}")

In [None]:
# Test with different data formats
# Exercises the optional `process_sample` / `process_batch` entry points with
# DataSample and DataBatch inputs. Each probe is isolated in its own try block
# so a failure in one does not prevent the other from running.
print("Testing different data formats...")

# Test with DataSample objects
try:
    sample = DataSample(
        mcid="SAMPLE_001",
        claims="N6320 G0378 |eoc| Z91048 M1710",
        label=1
    )
    print(f"DataSample created: {sample}")

    # Check if pipeline can handle DataSample
    if hasattr(embedding_pipeline, 'process_sample'):
        result = embedding_pipeline.process_sample(sample)
        print(f"✓ DataSample processed: {type(result)}")
    else:
        print("No process_sample method found")

except Exception as e:
    print(f"✗ DataSample processing failed: {e}")
    # Show the full stack, as the batch and run cells do, so the failing
    # call inside the model/pipeline code can be located.
    import traceback
    traceback.print_exc()

# Test with DataBatch
try:
    # NOTE(review): the batch is built from raw DataFrame rows (pandas Series),
    # not DataSample objects -- confirm DataBatch accepts that, otherwise this
    # probe fails before process_batch is ever reached.
    batch = DataBatch(samples=[test_data.iloc[i] for i in range(min(3, len(test_data)))])
    print(f"DataBatch created with {len(batch.samples)} samples")

    if hasattr(embedding_pipeline, 'process_batch'):
        result = embedding_pipeline.process_batch(batch)
        print(f"✓ DataBatch processed: {type(result)}")
    else:
        print("No process_batch method found")

except Exception as e:
    print(f"✗ DataBatch processing failed: {e}")
    # Same reasoning as above: keep the traceback for diagnosis.
    import traceback
    traceback.print_exc()