# 🧠 Embedding Pipeline Debug Testing

## Purpose
This notebook tests the embedding pipeline step-by-step using the actual pipeline code to debug embedding generation, checkpoint mechanisms, and data processing issues.

## What This Tests
- Embedding pipeline initialization and configuration
- Data loading and batch processing
- API communication for embedding generation
- Checkpoint saving and resume functionality
- Memory usage and performance monitoring
- Error handling and retry mechanisms
- Output file generation and validation

In [None]:
# Import required libraries
import sys
import os
import yaml
import json
import time
import traceback
import pandas as pd
import numpy as np
from pathlib import Path
from typing import List, Dict, Any, Optional

# Add the mgpt_eval directory to Python path.
# When the notebook runs from a directory named 'examples', the package root is
# taken to be that directory's parent; otherwise the current directory is used.
mgpt_eval_path = Path.cwd().parent if Path.cwd().name == 'examples' else Path.cwd()
# Inserted at index 0 so the project-local imports below are searched for under
# this path before any other sys.path entry.
sys.path.insert(0, str(mgpt_eval_path))

# Import actual pipeline modules (must come after the sys.path change above)
from models.config_models import PipelineConfig
from models.data_models import DataSample, DataBatch
from pipelines.embedding_pipeline import EmbeddingPipeline
from utils.logging_utils import setup_logging

print(f"✅ Working directory: {Path.cwd()}")
print(f"✅ MGPT-eval path: {mgpt_eval_path}")
print(f"✅ Imports successful")

# Setup logging for debugging.
# NOTE(review): setup_logging is imported above but is not called in this cell;
# logging is configured here with logging.basicConfig at DEBUG level instead.
import logging
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger('embedding_debug')

## Step 1: Create Test Data and Configuration
Set up test data and configuration for embedding pipeline testing.

In [None]:
# Create test data for embedding pipeline
def create_embedding_test_data(num_samples=20):
    """Build a small synthetic claims dataset for exercising the embedding stage.

    Args:
        num_samples: Number of rows to generate.

    Returns:
        A DataFrame with three columns: ``mcid`` (``TEST_EMBED_000``,
        ``TEST_EMBED_001``, ...), ``claims`` (ten fixed claim strings reused
        cyclically) and ``label`` (alternating 0 and 1, starting at 0).
    """
    # Ten fixed claim strings; row i receives pattern i modulo ten.
    claim_patterns = (
        'N6320 G0378 |eoc| Z91048 M1710',
        'E119 76642 |eoc| K9289 O0903',
        'I10 E785 |eoc| Z1239 M549',
        'E119 N6320 |eoc| K9289 76642',
        'O0903 Z91048 |eoc| M1710 G0378',
        'K9289 I10 |eoc| E785 N6320',
        'Z1239 E119 |eoc| 76642 M549',
        'M1710 O0903 |eoc| G0378 Z91048',
        'E785 K9289 |eoc| I10 N6320',
        '76642 Z1239 |eoc| E119 M549',
    )
    row_numbers = range(num_samples)

    columns = {
        'mcid': [f'TEST_EMBED_{row:03d}' for row in row_numbers],
        'claims': [claim_patterns[row % len(claim_patterns)] for row in row_numbers],
        'label': [row % 2 for row in row_numbers],
    }
    return pd.DataFrame(columns)

# Create the test dataset and write it under <mgpt_eval_path>/examples
test_df = create_embedding_test_data(20)
test_data_path = mgpt_eval_path / "examples" / "embedding_test_data.csv"
# DataFrame.to_csv does not create missing parent directories, so make sure the
# target folder exists first (same mkdir pattern the other steps use for their
# output locations).
test_data_path.parent.mkdir(parents=True, exist_ok=True)
test_df.to_csv(test_data_path, index=False)

print(f"📊 Created test data: {test_data_path}")
print(f"📊 Shape: {test_df.shape}")
print(f"📋 First 3 samples:")
print(test_df.head(3))
print(f"📊 Label distribution: {test_df['label'].value_counts().to_dict()}")

In [None]:
# Create test configuration for embedding pipeline
def create_embedding_test_config(data_path: str):
    """Assemble the raw settings dictionary for an embeddings-only test run.

    Only the ``embeddings`` pipeline stage is switched on. Batch sizes, timeout
    and sequence length are kept small, and logging is set to DEBUG. Output and
    checkpoint locations are placed under ``mgpt_eval_path / "examples"``.

    Args:
        data_path: Location of the input CSV; stored as ``str(data_path)``.

    Returns:
        A nested dict with the sections ``input``, ``job``, ``model_api``,
        ``pipeline_stages``, ``data_processing``, ``embedding_generation``,
        ``classification``, ``evaluation``, ``target_word_evaluation``,
        ``output`` and ``logging``.
    """
    dataset_location = str(data_path)
    examples_root = mgpt_eval_path / "examples"

    # Values that appear in more than one section.
    seed = 42
    train_fraction = 0.8
    log_format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'

    input_section = dict(dataset_path=dataset_location, split_ratio=train_fraction)

    job_section = dict(
        name='embedding_pipeline_test',
        output_dir=str(examples_root / "test_outputs"),
        random_seed=seed,
    )

    # ⚠️ base_url must be updated to point at your API server.
    # Small batch, short timeout and few retries keep debugging runs quick.
    model_api_section = dict(
        base_url='http://localhost:8000',
        batch_size=4,
        timeout=30,
        max_retries=2,
    )

    # Embeddings only; every other stage is disabled.
    disabled_stages = (
        'classification',
        'evaluation',
        'target_word_eval',
        'summary_report',
        'method_comparison',
    )
    stages_section = {'embeddings': True, **dict.fromkeys(disabled_stages, False)}

    # Short sequences for faster processing; JSON output for easier debugging.
    data_processing_section = dict(
        random_seed=seed,
        max_sequence_length=256,
        include_mcid=True,
        output_format='json',
        train_test_split=train_fraction,
    )

    # Tiny batches with frequent checkpoints; tokenizer_path may not be used.
    embedding_section = dict(
        batch_size=3,
        save_interval=2,
        checkpoint_dir=str(examples_root / "test_checkpoints"),
        resume_from_checkpoint=True,
        tokenizer_path='/app/tokenizer',
    )

    # Minimal settings for the stages that are switched off.
    classification_section = dict(
        models=['logistic_regression'],
        cross_validation=dict(n_folds=2, scoring='accuracy', n_jobs=1),
    )
    evaluation_section = dict(
        metrics=['accuracy'],
        visualization=dict(generate_plots=False),
    )
    target_word_section = dict(enable=False, target_codes=['E119'])

    output_section = dict(
        embeddings_dir='embeddings',
        models_dir='models',
        metrics_dir='metrics',
        logs_dir='logs',
        save_best_model_only=False,
        model_format='pickle',
    )

    # Verbose logging for debugging.
    logging_section = dict(
        level='DEBUG',
        console_level='DEBUG',
        format=log_format,
        file='logs/embedding_test.log',
    )

    return {
        'input': input_section,
        'job': job_section,
        'model_api': model_api_section,
        'pipeline_stages': stages_section,
        'data_processing': data_processing_section,
        'embedding_generation': embedding_section,
        'classification': classification_section,
        'evaluation': evaluation_section,
        'target_word_evaluation': target_word_section,
        'output': output_section,
        'logging': logging_section,
    }

# Create and validate test configuration
test_config_dict = create_embedding_test_config(test_data_path)

try:
    # Constructing PipelineConfig from the raw dict is the validation step;
    # any exception it raises is reported by the except branch below.
    test_config = PipelineConfig(**test_config_dict)
    print(f"✅ Test configuration validated")
    
    # Echo the settings that matter most for the embedding steps that follow.
    print(f"\n🔧 Configuration summary:")
    print(f"   Job name: {test_config.job.name}")
    print(f"   Output dir: {test_config.job.output_dir}")
    print(f"   API URL: {test_config.model_api.base_url}")
    print(f"   API batch size: {test_config.model_api.batch_size}")
    print(f"   Processing batch size: {test_config.embedding_generation.batch_size}")
    print(f"   Checkpoint interval: {test_config.embedding_generation.save_interval}")
    print(f"   Max sequence length: {test_config.data_processing.max_sequence_length}")
    
except Exception as e:
    print(f"❌ Configuration validation failed: {e}")
    print(f"\nError details:")
    traceback.print_exc()
    # Later cells check test_config before running, so None makes them skip.
    test_config = None

## Step 2: Initialize Embedding Pipeline
Create and initialize the embedding pipeline using the actual pipeline code.

In [None]:
# Initialize embedding pipeline
def initialize_embedding_pipeline(config: PipelineConfig):
    """Create the working directories and construct the ``EmbeddingPipeline``.

    Args:
        config: Validated pipeline configuration. Its job, output and
            embedding-generation sections supply the directory locations.

    Returns:
        ``(pipeline, output_dir)`` on success, where ``output_dir`` is
        ``<job.output_dir>/<job.name>``; ``(None, None)`` if anything raises.
    """

    def _ensure_dir(path):
        # Create the directory (and any missing parents) and hand the path back.
        path.mkdir(parents=True, exist_ok=True)
        return path

    print(f"\n🚀 Initializing embedding pipeline...")

    try:
        # Job root first, then the embeddings folder beneath it, then the
        # separately configured checkpoint folder.
        job_root = _ensure_dir(Path(config.job.output_dir) / config.job.name)
        vectors_dir = _ensure_dir(job_root / config.output.embeddings_dir)
        ckpt_dir = _ensure_dir(Path(config.embedding_generation.checkpoint_dir))

        print(f"   📁 Output directory: {job_root}")
        print(f"   📁 Embeddings directory: {vectors_dir}")
        print(f"   📁 Checkpoint directory: {ckpt_dir}")

        # Build the real pipeline object from the project code.
        built_pipeline = EmbeddingPipeline(config)

        print(f"   ✅ Pipeline initialized successfully")
        return built_pipeline, job_root

    except Exception as err:
        print(f"   ❌ Pipeline initialization failed: {err}")
        print(f"\nError details:")
        traceback.print_exc()
        return None, None

# Build the pipeline only when the configuration validated; otherwise leave
# both globals as None so the following steps skip themselves.
if not test_config:
    print(f"⚠️  Skipping pipeline initialization due to config issues")
    embedding_pipeline = output_directory = None
else:
    embedding_pipeline, output_directory = initialize_embedding_pipeline(test_config)

## Step 3: Test Data Loading and Preprocessing
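Test data loading by mirroring the embedding pipeline's steps — read the CSV, make a stratified train/test split, and wrap the training rows in a `DataBatch` — and then exercise the batch accessor methods.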

In [None]:
# Test data loading using pipeline methods
def test_pipeline_data_loading(pipeline: EmbeddingPipeline, config: PipelineConfig):
    """Replay the pipeline's data-loading steps on the configured dataset.

    Reads the CSV at ``config.input.dataset_path``, performs a label-stratified
    train/test split, wraps the training rows in a ``DataBatch`` and prints a
    short summary obtained through the batch accessor methods.

    Args:
        pipeline: Accepted for symmetry with the other step tests; not used here.
        config: Supplies the dataset path, split ratio and random seed.

    Returns:
        ``(train_batch, test_dataframe)`` on success; ``(None, None)`` if any
        step raises.
    """
    print(f"\n📊 Testing pipeline data loading...")

    try:
        source_csv = config.input.dataset_path
        print(f"   📂 Loading data from: {source_csv}")

        # Same read the pipeline performs: a plain CSV load.
        frame = pd.read_csv(source_csv)
        print(f"   ✅ Data loaded: {frame.shape}")

        # Imported here so a missing scikit-learn is reported by this step only.
        from sklearn.model_selection import train_test_split

        holdout_fraction = 1 - config.input.split_ratio
        train_rows, holdout_rows = train_test_split(
            frame,
            test_size=holdout_fraction,
            random_state=config.job.random_seed,
            stratify=frame['label'],
        )
        print(f"   ✅ Data split: train={len(train_rows)}, test={len(holdout_rows)}")

        # One DataSample per training row, gathered into a single DataBatch.
        train_batch = DataBatch(samples=[
            DataSample(
                mcid=record['mcid'],
                claims=record['claims'],
                label=int(record['label']),
            )
            for _, record in train_rows.iterrows()
        ])
        print(f"   ✅ Train batch created: {len(train_batch.samples)} samples")

        # Exercise the batch accessors.
        batch_mcids = train_batch.get_mcids()
        batch_claims = train_batch.get_claims()
        batch_labels = train_batch.get_labels()

        print(f"   📋 Sample MCIDs: {batch_mcids[:3]}...")
        print(f"   📋 Sample claims: {batch_claims[0][:50]}...")
        print(f"   📋 Label distribution: {pd.Series(batch_labels).value_counts().to_dict()}")

        return train_batch, holdout_rows

    except Exception as err:
        print(f"   ❌ Data loading failed: {err}")
        print(f"\nError details:")
        traceback.print_exc()
        return None, None

# Run the data-loading step only when both the pipeline and the config exist;
# otherwise leave both results as None.
if not (embedding_pipeline and test_config):
    print(f"⚠️  Skipping data loading test")
    train_data_batch = test_data_df = None
else:
    train_data_batch, test_data_df = test_pipeline_data_loading(embedding_pipeline, test_config)

## Step 4: Test API Communication
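Test API communication by posting a two-sample request directly to the `/embeddings_batch` endpoint of the configured API. If the server cannot be reached, the step falls back to mock embeddings.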

In [None]:
# Test API communication with small batches
def test_api_communication(pipeline: EmbeddingPipeline, batch: DataBatch, config: PipelineConfig):
    """Send a two-sample request to the embeddings endpoint and report the outcome.

    The request is posted with ``requests`` directly to
    ``{config.model_api.base_url}/embeddings_batch`` with a JSON body of the
    form ``{"texts": [...]}``; the response is expected to carry the vectors
    under an ``embeddings`` key.

    Args:
        pipeline: Accepted for symmetry with the other step tests; not used here.
        batch: Batch whose first two samples are sent. ``None`` skips the test.
        config: Supplies the API base URL and request timeout.

    Returns:
        ``True`` when the server answers HTTP 200 with exactly one embedding per
        submitted claim; the string ``"mock"`` when the server cannot be reached
        (mock embeddings are printed instead); ``False`` in every other case.
    """
    print(f"\n🌐 Testing API communication...")

    if batch is None:
        print(f"   ❌ No data batch available for testing")
        return False

    try:
        # Test with a very small batch (first 2 samples)
        test_samples = batch.samples[:2]
        test_claims = [sample.claims for sample in test_samples]
        test_mcids = [sample.mcid for sample in test_samples]

        print(f"   🧪 Testing with {len(test_claims)} claims")
        print(f"   📋 MCIDs: {test_mcids}")
        print(f"   📋 Claims: {[claim[:30] + '...' for claim in test_claims]}")

        # ⚠️ Note: This will call the actual API - make sure your server is running!

        # Imported *before* the request's try block: the except clause below
        # names requests.exceptions.ConnectionError, so a failed import inside
        # that try would surface as an unrelated unbound-name error instead of
        # the real ImportError.
        import requests

        try:
            api_url = f"{config.model_api.base_url}/embeddings_batch"
            payload = {"texts": test_claims}

            # Start the clock right before the request so the reported time
            # covers only the HTTP round trip.
            start_time = time.time()
            response = requests.post(
                api_url,
                json=payload,
                timeout=config.model_api.timeout,
                headers={"Content-Type": "application/json"}
            )
            response_time = time.time() - start_time

            if response.status_code != 200:
                print(f"   ❌ API error: {response.status_code}")
                print(f"   📄 Response: {response.text[:200]}")
                return False

            embeddings = response.json().get('embeddings', [])

            print(f"   ✅ API call successful")
            print(f"   ⏱️  Response time: {response_time:.2f}s")
            print(f"   📊 Received {len(embeddings)} embeddings")
            if embeddings:
                print(f"   📊 Embedding dimension: {len(embeddings[0])}")
                print(f"   📊 Sample values: {embeddings[0][:3]}...")

            # A 200 response that does not carry one vector per claim is not a
            # usable result, so it must not be reported as a pass.
            if len(embeddings) != len(test_claims):
                print(f"   ❌ Expected {len(test_claims)} embeddings, received {len(embeddings)}")
                return False

            return True

        except requests.exceptions.ConnectionError:
            print(f"   ⚠️  API server not available - using mock data for testing")

            # Create mock embeddings for testing pipeline logic
            mock_embeddings = [
                [0.1, 0.2, 0.3] * 256,  # 768-dim mock embedding
                [0.4, 0.5, 0.6] * 256
            ]

            print(f"   🎭 Using mock embeddings: {len(mock_embeddings)} embeddings")
            print(f"   📊 Mock embedding dimension: {len(mock_embeddings[0])}")

            return "mock"  # Special value telling later steps to run in mock mode

        except Exception as api_error:
            print(f"   ❌ API call failed: {api_error}")
            return False

    except Exception as e:
        print(f"   ❌ API test failed: {e}")
        print(f"\nError details:")
        traceback.print_exc()
        return False

# Probe the API only when the pipeline, the training batch and the config all
# exist; a skipped probe is recorded as False.
if not (embedding_pipeline and train_data_batch and test_config):
    print(f"⚠️  Skipping API communication test")
    api_test_result = False
else:
    api_test_result = test_api_communication(embedding_pipeline, train_data_batch, test_config)

## Step 5: Test Batch Processing Logic
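Simulate the pipeline's batch loop: slice the training batch into processing batches, generate placeholder embeddings for each, and report the points at which a checkpoint would be saved.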

In [None]:
# Test batch processing logic
def test_batch_processing(pipeline: EmbeddingPipeline, batch: DataBatch, config: PipelineConfig, use_mock=False):
    """Simulate the batch loop over ``batch`` with placeholder embeddings.

    The samples are walked in slices of ``config.embedding_generation.batch_size``.
    No API call is made: in mock mode each claim gets a 768-value vector seeded
    from a CRC32 of its text; otherwise every claim gets a constant 0.1 vector.
    A checkpoint message is printed every ``save_interval`` batches and after
    the last batch; nothing is written to disk.

    Args:
        pipeline: Accepted for symmetry with the other step tests; not used here.
        batch: Batch whose ``samples`` are processed. ``None`` skips the test.
        config: Supplies ``embedding_generation.batch_size`` and ``save_interval``.
        use_mock: Force mock mode. Mock mode is also enabled when a module-level
            ``api_test_result`` exists and equals ``"mock"``.

    Returns:
        A dict with the accumulated ``mcids``, ``labels`` and ``embeddings``
        lists, or ``None`` when ``batch`` is ``None`` or any step raises.
    """
    import zlib

    mock_dimension = 768

    print(f"\n⚙️ Testing batch processing logic...")

    if batch is None:
        print(f"   ❌ No data batch available")
        return None

    try:
        batch_size = config.embedding_generation.batch_size
        save_interval = config.embedding_generation.save_interval

        # Fail with a clear message instead of a ZeroDivisionError (zero) or a
        # silently empty run (negative values).
        if batch_size < 1 or save_interval < 1:
            raise ValueError(
                f"batch_size and save_interval must be >= 1 "
                f"(got batch_size={batch_size}, save_interval={save_interval})"
            )

        samples = batch.samples
        total_samples = len(samples)

        print(f"   📦 Processing batch size: {batch_size}")
        print(f"   💾 Save interval: {save_interval}")
        print(f"   📊 Total samples: {total_samples}")

        # Ceiling division: a trailing partial slice still counts as a batch.
        total_batches = (total_samples + batch_size - 1) // batch_size
        print(f"   📊 Expected batches: {total_batches}")

        # The notebook stores the API probe outcome in a global; look it up
        # defensively so the function also works when that cell has not run.
        mock_mode = bool(use_mock) or globals().get('api_test_result') == "mock"

        all_embeddings = []
        all_mcids = []
        all_labels = []

        for batch_idx, start_idx in enumerate(range(0, total_samples, batch_size)):
            batch_samples = samples[start_idx:start_idx + batch_size]
            end_idx = start_idx + len(batch_samples)

            batch_claims = [sample.claims for sample in batch_samples]
            batch_mcids = [sample.mcid for sample in batch_samples]
            batch_labels = [sample.label for sample in batch_samples]

            print(f"\n   🔄 Processing batch {batch_idx + 1}/{total_batches}")
            print(f"      Samples: {len(batch_samples)} ({start_idx}-{end_idx-1})")
            print(f"      MCIDs: {batch_mcids}")

            if mock_mode:
                # CRC32 rather than built-in hash(): str hashes are randomized
                # per interpreter process, so hash() would give different
                # "deterministic" vectors on every kernel restart.
                batch_embeddings = []
                for claim in batch_claims:
                    embedding_seed = zlib.crc32(str(claim).encode('utf-8')) % 1000
                    batch_embeddings.append(
                        [(embedding_seed + j) / 1000.0 for j in range(mock_dimension)]
                    )

                print(f"      🎭 Generated {len(batch_embeddings)} mock embeddings")
            else:
                # Real API call would go here; constant vectors stand in for it.
                print(f"      🌐 Would call API with {len(batch_claims)} claims")
                batch_embeddings = [[0.1] * mock_dimension for _ in batch_claims]

            all_embeddings.extend(batch_embeddings)
            all_mcids.extend(batch_mcids)
            all_labels.extend(batch_labels)

            # Checkpoint is only simulated: a real one would carry the three
            # accumulated lists plus the processed-sample count and batch index.
            if (batch_idx + 1) % save_interval == 0 or batch_idx == total_batches - 1:
                print(f"      💾 Checkpoint: {len(all_embeddings)} embeddings saved")

        print(f"\n   ✅ Batch processing completed")
        print(f"   📊 Total embeddings: {len(all_embeddings)}")
        print(f"   📊 Embedding dimension: {len(all_embeddings[0]) if all_embeddings else 0}")
        print(f"   📋 MCIDs: {all_mcids}")
        print(f"   📋 Labels: {all_labels}")

        return {
            'mcids': all_mcids,
            'labels': all_labels,
            'embeddings': all_embeddings
        }

    except Exception as e:
        print(f"   ❌ Batch processing failed: {e}")
        print(f"\nError details:")
        traceback.print_exc()
        return None

# Run the batch simulation only when the pipeline, the training batch and the
# config all exist. Mock mode is used unless the API probe returned True.
if not (embedding_pipeline and train_data_batch and test_config):
    print(f"⚠️  Skipping batch processing test")
    batch_results = None
else:
    use_mock_data = api_test_result in ("mock", False)
    batch_results = test_batch_processing(embedding_pipeline, train_data_batch, test_config, use_mock_data)

## Step 6: Test Checkpoint and Resume Functionality
Test the checkpoint saving and resume mechanisms.

In [None]:
# Test checkpoint and resume functionality
def test_checkpoint_resume(config: PipelineConfig, results_data: Dict):
    """Write a partial checkpoint, read it back and walk through the resume logic.

    A checkpoint holding the first five samples of ``results_data`` is saved as
    ``test_checkpoint.json`` in ``config.embedding_generation.checkpoint_dir``,
    reloaded, used to compute how many samples would remain to be processed,
    and compared against the source MCIDs. The file is removed afterwards,
    whether or not the test succeeded.

    Args:
        config: Supplies the checkpoint directory and job name; its ``dict()``
            output is fingerprinted into the checkpoint.
        results_data: Dict with ``mcids``, ``labels`` and ``embeddings`` lists.
            ``None`` skips the test.

    Returns:
        ``True`` when the checkpoint round-trips and its MCIDs match the source
        data; ``False`` when ``results_data`` is ``None``, the MCIDs differ, or
        any step raises.
    """
    import zlib

    print(f"\n💾 Testing checkpoint and resume functionality...")

    if results_data is None:
        print(f"   ❌ No results data available for testing")
        return False

    # Set once the path is known so the finally clause can always clean up.
    checkpoint_file = None

    try:
        checkpoint_dir = Path(config.embedding_generation.checkpoint_dir)
        checkpoint_dir.mkdir(parents=True, exist_ok=True)

        # Test checkpoint saving
        print(f"   💾 Testing checkpoint saving...")

        checkpoint_file = checkpoint_dir / "test_checkpoint.json"

        total_samples = len(results_data['mcids'])

        # Simulate an interrupted run: only the first five samples are stored.
        # All three lists are cut at the same point and the progress figures
        # describe that same prefix, so the checkpoint is internally consistent.
        partial_embeddings = results_data['embeddings'][:5]
        processed = len(partial_embeddings)
        completion = 100.0 * processed / total_samples if total_samples else 0.0

        checkpoint_data = {
            'job_info': {
                'job_name': config.job.name,
                'timestamp': time.time(),
                # CRC32 instead of built-in hash(): str hashes are randomized
                # per process, so hash() could never match across a restart.
                'config_hash': zlib.crc32(str(config.dict()).encode('utf-8'))
            },
            'progress': {
                'total_samples': total_samples,
                'processed_samples': processed,
                'current_batch': 1,  # placeholder; batches are not tracked here
                'completion_percentage': completion
            },
            'data': {
                'mcids': results_data['mcids'][:processed],
                'labels': results_data['labels'][:processed],
                'embeddings': partial_embeddings
            },
            'metadata': {
                'embedding_dimension': len(results_data['embeddings'][0]) if results_data['embeddings'] else 0,
                'format_version': '1.0'
            }
        }

        # Save checkpoint
        with open(checkpoint_file, 'w', encoding='utf-8') as f:
            json.dump(checkpoint_data, f, indent=2)

        print(f"      ✅ Checkpoint saved: {checkpoint_file}")
        print(f"      📊 File size: {checkpoint_file.stat().st_size / 1024:.1f} KB")

        # Test checkpoint loading
        print(f"   📂 Testing checkpoint loading...")

        with open(checkpoint_file, 'r', encoding='utf-8') as f:
            loaded_checkpoint = json.load(f)

        print(f"      ✅ Checkpoint loaded successfully")
        print(f"      📊 Job: {loaded_checkpoint['job_info']['job_name']}")
        print(f"      📊 Processed: {loaded_checkpoint['progress']['processed_samples']} samples")
        print(f"      📊 Embeddings in checkpoint: {len(loaded_checkpoint['data']['embeddings'])}")
        print(f"      📊 Embedding dimension: {loaded_checkpoint['metadata']['embedding_dimension']}")

        # Test resume logic simulation
        print(f"   🔄 Testing resume logic...")

        checkpoint_samples = len(loaded_checkpoint['data']['embeddings'])
        remaining_samples = total_samples - checkpoint_samples

        print(f"      📊 Total samples: {total_samples}")
        print(f"      📊 Checkpoint samples: {checkpoint_samples}")
        print(f"      📊 Remaining samples: {remaining_samples}")

        if remaining_samples > 0:
            print(f"      🔄 Would resume processing from sample {checkpoint_samples + 1}")
            print(f"      📋 Next MCIDs to process: {results_data['mcids'][checkpoint_samples:checkpoint_samples+3]}")
        else:
            print(f"      ✅ All samples already processed in checkpoint")

        # Test data consistency
        print(f"   🔍 Testing data consistency...")

        original_mcids = results_data['mcids'][:checkpoint_samples]
        checkpoint_mcids = loaded_checkpoint['data']['mcids'][:checkpoint_samples]

        mcids_consistent = original_mcids == checkpoint_mcids
        if mcids_consistent:
            print(f"      ✅ MCID consistency check passed")
        else:
            print(f"      ❌ MCID consistency check failed")
            print(f"         Original: {original_mcids[:3]}...")
            print(f"         Checkpoint: {checkpoint_mcids[:3]}...")

        # A failed consistency check must fail the test, not just print.
        return mcids_consistent

    except Exception as e:
        print(f"   ❌ Checkpoint test failed: {e}")
        print(f"\nError details:")
        traceback.print_exc()
        return False

    finally:
        # Runs on success and on failure so a crashed test never leaves its
        # scratch checkpoint behind in the checkpoint directory.
        if checkpoint_file is not None:
            try:
                checkpoint_file.unlink()
                print(f"   🧹 Cleaned up test checkpoint")
            except FileNotFoundError:
                # The failure happened before the file was created.
                pass
            except OSError as cleanup_error:
                print(f"   ⚠️  Could not remove test checkpoint: {cleanup_error}")

# Run the checkpoint step only when a config and batch results are available;
# a skipped step is recorded as False.
if not (test_config and batch_results):
    print(f"⚠️  Skipping checkpoint test")
    checkpoint_test_result = False
else:
    checkpoint_test_result = test_checkpoint_resume(test_config, batch_results)

## Step 7: Test Output File Generation
Test the final output file generation and format validation.

In [None]:
# Test output file generation
def test_output_generation(config: PipelineConfig, results_data: Dict, output_dir: Path):
    """Write the results as JSON and CSV, read them back and verify the round trip.

    Files are created in ``output_dir / config.output.embeddings_dir``. The JSON
    file is written only when ``config.data_processing.output_format`` is
    ``"json"``; the CSV file (one ``emb_<j>`` column per embedding component)
    is always written. Every file written is removed before returning, whether
    or not the test succeeded.

    Args:
        config: Supplies the embeddings sub-directory, output format and job name.
        results_data: Dict with ``mcids``, ``labels`` and ``embeddings`` lists.
        output_dir: Job output directory. ``None`` skips the test.

    Returns:
        ``True`` when the CSV round trip reproduces the embedding count,
        dimension and values; ``False`` when inputs are missing, an integrity
        check fails, or any step raises.
    """
    print(f"\n📄 Testing output file generation...")

    if results_data is None or output_dir is None:
        print(f"   ❌ No results data or output directory available")
        return False

    # Each path is registered *before* the file is opened so the finally clause
    # can remove it even when a later step raises.
    written_files = []

    try:
        # Create embeddings output directory
        embeddings_dir = output_dir / config.output.embeddings_dir
        embeddings_dir.mkdir(parents=True, exist_ok=True)

        output_format = config.data_processing.output_format
        print(f"   📊 Output format: {output_format}")
        print(f"   📁 Output directory: {embeddings_dir}")

        mcids = results_data['mcids']
        labels = results_data['labels']
        embeddings = results_data['embeddings']

        # Stays None unless the JSON branch runs; used for the size comparison.
        json_size = None

        # Test JSON output format
        if output_format == "json":
            print(f"   📝 Testing JSON output format...")

            train_output_file = embeddings_dir / "train_embeddings.json"
            written_files.append(train_output_file)

            output_data = {
                'job_info': {
                    'job_name': config.job.name,
                    'timestamp': time.time(),
                    'total_samples': len(mcids)
                },
                'data': {
                    'mcids': mcids,
                    'labels': labels,
                    'embeddings': embeddings
                },
                'metadata': {
                    'embedding_dimension': len(embeddings[0]) if embeddings else 0,
                    'format': 'json',
                    'version': '1.0'
                }
            }

            with open(train_output_file, 'w') as f:
                json.dump(output_data, f, indent=2)

            json_size = train_output_file.stat().st_size
            print(f"      ✅ JSON file saved: {train_output_file}")
            print(f"      📊 File size: {json_size / 1024:.1f} KB")
            if mcids:  # avoid dividing by zero on an empty result set
                print(f"      📊 Size per sample: {json_size / len(mcids):.0f} bytes")

            with open(train_output_file, 'r') as f:
                loaded_data = json.load(f)

            print(f"      ✅ JSON file loaded successfully")
            print(f"      📊 Samples in file: {len(loaded_data['data']['mcids'])}")
            print(f"      📊 Embedding dimension: {loaded_data['metadata']['embedding_dimension']}")

        # Test CSV output format
        print(f"   📝 Testing CSV output format...")

        train_csv_file = embeddings_dir / "train_embeddings.csv"
        written_files.append(train_csv_file)

        # Flatten each embedding into emb_0, emb_1, ... columns.
        csv_data = []
        for mcid, label, embedding in zip(mcids, labels, embeddings):
            row = {'mcid': mcid, 'label': label}
            row.update((f'emb_{j}', emb_val) for j, emb_val in enumerate(embedding))
            csv_data.append(row)

        csv_df = pd.DataFrame(csv_data)
        csv_df.to_csv(train_csv_file, index=False)

        csv_size = train_csv_file.stat().st_size
        extra_columns = max(0, len(csv_df.columns) - 5)  # never report a negative count
        print(f"      ✅ CSV file saved: {train_csv_file}")
        print(f"      📊 File size: {csv_size / 1024:.1f} KB")
        print(f"      📊 CSV shape: {csv_df.shape}")
        print(f"      📊 CSV columns: {list(csv_df.columns)[:5]}... (+{extra_columns} more)")

        loaded_csv = pd.read_csv(train_csv_file)
        print(f"      ✅ CSV file loaded successfully")
        print(f"      📊 Loaded shape: {loaded_csv.shape}")

        # Compare file sizes
        if json_size is not None:
            print(f"\n   📊 Format comparison:")
            print(f"      JSON: {json_size / 1024:.1f} KB")
            print(f"      CSV:  {csv_size / 1024:.1f} KB")
            print(f"      Ratio: CSV is {csv_size / json_size:.1f}x the size of JSON")

        # Test file integrity
        print(f"   🔍 Testing file integrity...")

        embedding_cols = [col for col in loaded_csv.columns if col.startswith('emb_')]
        reconstructed_embeddings = loaded_csv[embedding_cols].values.tolist()

        count_matches = len(reconstructed_embeddings) == len(embeddings)
        if count_matches:
            print(f"      ✅ Embedding count matches: {len(reconstructed_embeddings)}")
        else:
            print(f"      ❌ Embedding count mismatch: {len(reconstructed_embeddings)} vs {len(embeddings)}")

        dimension_matches = (
            len(reconstructed_embeddings) > 0
            and len(reconstructed_embeddings[0]) == len(embeddings[0])
        )
        if dimension_matches:
            print(f"      ✅ Embedding dimension matches: {len(reconstructed_embeddings[0])}")
        else:
            print(f"      ❌ Embedding dimension mismatch")

        # Matching shapes do not prove matching contents, so compare the values
        # too. A tolerance is used because text serialization of floats need
        # not be bit-exact.
        values_match = False
        if count_matches and dimension_matches:
            values_match = bool(np.allclose(
                np.asarray(reconstructed_embeddings, dtype=float),
                np.asarray(embeddings, dtype=float),
                equal_nan=True,
            ))
            if values_match:
                print(f"      ✅ Embedding values match")
            else:
                print(f"      ❌ Embedding values differ after CSV round trip")

        # A failed integrity check must fail the test, not just print.
        return count_matches and dimension_matches and values_match

    except Exception as e:
        print(f"   ❌ Output generation test failed: {e}")
        print(f"\nError details:")
        traceback.print_exc()
        return False

    finally:
        # Runs on success and on failure so no scratch output is left behind.
        removed_any = False
        for written_path in written_files:
            try:
                written_path.unlink()
                removed_any = True
            except FileNotFoundError:
                # The failure happened before this file was created.
                continue
            except OSError as cleanup_error:
                print(f"   ⚠️  Could not remove {written_path}: {cleanup_error}")
        if removed_any:
            print(f"   🧹 Cleaned up test output files")

# Run the output-file step only when a config, batch results and an output
# directory are all available; a skipped step is recorded as False.
if not (test_config and batch_results and output_directory):
    print(f"⚠️  Skipping output generation test")
    output_test_result = False
else:
    output_test_result = test_output_generation(test_config, batch_results, output_directory)

## Step 8: Memory and Performance Analysis
Analyze memory usage and performance characteristics.

In [None]:
# Test memory usage and performance
def test_memory_performance(results_data: Dict, config: PipelineConfig):
    """Test memory usage and performance characteristics."""
    
    print(f"\n🔍 Testing memory usage and performance...")
    
    if results_data is None:
        print(f"   ❌ No results data available")
        return
    
    try:
        import sys
        
        # Calculate memory usage
        embeddings = results_data['embeddings']
        mcids = results_data['mcids']
        labels = results_data['labels']
        
        # Memory analysis
        print(f"   📊 Memory usage analysis:")
        
        # Calculate sizes
        embeddings_size = sys.getsizeof(embeddings)
        mcids_size = sys.getsizeof(mcids)
        labels_size = sys.getsizeof(labels)
        
        # Estimate individual embedding size
        if embeddings:
            single_embedding_size = sys.getsizeof(embeddings[0])
            embedding_dimension = len(embeddings[0])
            
            print(f"      🧠 Embeddings list: {embeddings_size / 1024:.1f} KB")
            print(f"      🧠 MCIDs list: {mcids_size / 1024:.1f} KB")
            print(f"      🧠 Labels list: {labels_size / 1024:.1f} KB")
            print(f"      🧠 Single embedding: {single_embedding_size} bytes")
            print(f"      🧠 Total memory: {(embeddings_size + mcids_size + labels_size) / 1024:.1f} KB")
            
            # Extrapolate for larger datasets
            samples_per_mb = (1024 * 1024) // single_embedding_size
            print(f"      📊 Approx samples per MB: {samples_per_mb}")
            
            # Calculate memory for different dataset sizes
            for dataset_size in [1000, 10000, 100000]:
                estimated_mb = (dataset_size * single_embedding_size) / (1024 * 1024)
                print(f"      📊 {dataset_size:,} samples ≈ {estimated_mb:.1f} MB")
        
        # Performance analysis
        print(f"\n   ⚡ Performance analysis:")
        
        batch_size = config.embedding_generation.batch_size
        api_batch_size = config.model_api.batch_size
        
        print(f"      📦 Processing batch size: {batch_size}")
        print(f"      🌐 API batch size: {api_batch_size}")
        
        # Calculate batching efficiency
        if batch_size <= api_batch_size:
            api_calls_per_batch = 1
            efficiency = "Optimal (1 API call per processing batch)"
        else:
            api_calls_per_batch = (batch_size + api_batch_size - 1) // api_batch_size
            efficiency = f"Suboptimal ({api_calls_per_batch} API calls per processing batch)"
        
        print(f"      📊 API calls per processing batch: {api_calls_per_batch}")
        print(f"      📊 Efficiency: {efficiency}")
        
        # Estimate processing time for larger datasets
        samples_processed = len(embeddings)
        if samples_processed > 0:
            # Assume 1 second per API call (rough estimate)
            estimated_time_per_sample = api_calls_per_batch / batch_size  # API calls per sample
            
            print(f"\n   ⏱️  Time estimates (assuming 1s per API call):")
            for dataset_size in [100, 1000, 10000]:
                total_batches = (dataset_size + batch_size - 1) // batch_size
                total_api_calls = total_batches * api_calls_per_batch
                estimated_minutes = total_api_calls / 60
                
                print(f"      📊 {dataset_size:,} samples: {total_api_calls} API calls ≈ {estimated_minutes:.1f} minutes")
        
        # Configuration recommendations
        print(f"\n   💡 Configuration recommendations:")
        
        if batch_size > api_batch_size:
            recommended_batch_size = api_batch_size
            print(f"      🔧 Consider reducing batch_size to {recommended_batch_size} for optimal API usage")
        elif batch_size < api_batch_size // 2:
            recommended_batch_size = api_batch_size
            print(f"      🔧 Consider increasing batch_size to {recommended_batch_size} for better throughput")
        else:
            print(f"      ✅ Current batch_size ({batch_size}) is well-optimized")
        
        # Memory recommendations
        if embeddings:
            total_memory_kb = (embeddings_size + mcids_size + labels_size) / 1024
            if total_memory_kb > 100 * 1024:  # > 100 MB
                print(f"      🔧 Consider using CSV output format for large datasets (more memory efficient)")
            if config.embedding_generation.save_interval > 100:
                print(f"      🔧 Consider more frequent checkpoints (save_interval < 100) for large datasets")
        
    except Exception as e:
        print(f"   ❌ Memory/performance analysis failed: {e}")
        print(f"\nError details:")
        traceback.print_exc()

# Run the memory/performance analysis only when both the batch results and
# the test configuration from the earlier cells are available.
if not (batch_results and test_config):
    print("⚠️  Skipping memory/performance analysis")
else:
    test_memory_performance(batch_results, test_config)

## Step 9: Summary and Recommendations
Summarize all test results and provide actionable recommendations.

In [None]:
# Print the summary banner for the whole debugging session
print("\n📋 Embedding Pipeline Debug Test Summary:")
print("=" * 55)

# Map each check to its outcome; insertion order is the reporting order
outcome_by_check = {
    "Configuration Validation": test_config is not None,
    "Pipeline Initialization": embedding_pipeline is not None,
    "Data Loading": train_data_batch is not None,
    "API Communication": api_test_result in [True, "mock"],
    "Batch Processing": batch_results is not None,
    "Checkpoint Functionality": checkpoint_test_result,
    "Output Generation": output_test_result,
}
test_results = list(outcome_by_check.items())

# Split the check names by outcome and derive the success percentage
passed_tests, failed_tests = [], []
for check_name, outcome in test_results:
    (passed_tests if outcome else failed_tests).append(check_name)
success_rate = len(passed_tests) / len(test_results) * 100

print("\n🎯 Test Results Summary:")
for check_name, outcome in test_results:
    badge = "✅ PASS" if outcome else "❌ FAIL"
    print(f"   {badge} {check_name}")

print(f"\n📊 Overall Success Rate: {success_rate:.0f}%")

# Explain the API outcome and report what batch processing produced
print("\n🔍 Detailed Analysis:")

# Ordered (outcome, messages) table; the first outcome equal to
# api_test_result wins, and nothing is printed when none matches.
api_outcome_messages = (
    ("mock", (
        "   ⚠️  API testing used mock data (server not available)",
        "      🔧 Start your MGPT server and rerun for full API testing",
    )),
    (True, (
        "   ✅ API communication successful with real server",
    )),
    (False, (
        "   ❌ API communication failed",
        "      🔧 Check server status and configuration",
    )),
)
for expected_outcome, messages in api_outcome_messages:
    if api_test_result == expected_outcome:
        for message in messages:
            print(message)
        break

if batch_results:
    produced_embeddings = batch_results['embeddings']
    num_embeddings = len(produced_embeddings)
    # Dimension is taken from the first vector; 0 when nothing was produced
    embedding_dim = len(produced_embeddings[0]) if produced_embeddings else 0
    print(f"   📊 Successfully processed {num_embeddings} samples")
    print(f"   📊 Embedding dimension: {embedding_dim}")

# Turn the success rate into advice: >= 85% is healthy, 60-85% needs
# attention, anything lower needs troubleshooting.
print("\n💡 Recommendations:")

ready_for_next_stage = success_rate >= 85
if ready_for_next_stage:
    print("   🎉 Embedding pipeline is working well!")
    if api_test_result != "mock":
        print("   ➡️  Next: Test classification pipeline with generated embeddings")
        print("   ➡️  Or: Run with larger dataset to test scalability")
    else:
        print("   ➡️  Next: Start your MGPT server and test with real API")
elif success_rate < 60:
    print("   🚨 Significant issues detected - troubleshooting needed")
    print("   🔧 Address configuration and API connectivity issues first")
else:
    print("   ⚠️  Most components working, but some issues need attention")
    print("   🔧 Focus on fixing the failed tests listed above")

# Echo the settings used in this run as a YAML-like template
if test_config:
    print("\n🔧 Configuration for next steps:")
    print("   # Update this configuration with your actual values")
    print("   input:")
    print(f'     dataset_path: "{test_data_path}"')
    print(f"     split_ratio: {test_config.input.split_ratio}")
    print("   model_api:")
    print(f'     base_url: "{test_config.model_api.base_url}"')
    print(f"     batch_size: {test_config.model_api.batch_size}")
    print("   embedding_generation:")
    print(f"     batch_size: {test_config.embedding_generation.batch_size}")
    print(f"     save_interval: {test_config.embedding_generation.save_interval}")

# Follow-up checklist: move on when healthy, otherwise fix and re-run
print("\n📚 Next testing steps:")
if ready_for_next_stage:
    next_steps = (
        "   1. Run test_04_Classification_Pipeline_Debug.ipynb",
        "   2. Test with your actual dataset",
        "   3. Run full end-to-end pipeline",
    )
else:
    next_steps = (
        "   1. Fix failed tests in this notebook",
        "   2. Verify API server is running and accessible",
        "   3. Re-run this notebook until all tests pass",
    )
for step in next_steps:
    print(step)

# Cleanup: remove the temporary data file and the scratch directories.
# Each artifact is handled in its own try block so that a failure on one of
# them (e.g. a permission error) is reported as a warning without preventing
# the remaining artifacts from being removed.
import shutil

print(f"\n🧹 Cleaning up test files...")

try:
    if test_data_path.exists():
        test_data_path.unlink()
        print(f"   🗑️  Removed: {test_data_path.name}")
except Exception as e:
    print(f"   ⚠️  Cleanup warning: {e}")

# Clean up any remaining test directories
test_output_dir = mgpt_eval_path / "examples" / "test_outputs"
test_checkpoint_dir = mgpt_eval_path / "examples" / "test_checkpoints"
for leftover_dir in (test_output_dir, test_checkpoint_dir):
    try:
        if leftover_dir.exists():
            shutil.rmtree(leftover_dir)
            print(f"   🗑️  Removed: {leftover_dir.name} directory")
    except Exception as e:
        print(f"   ⚠️  Cleanup warning: {e}")

print(f"\n✅ Embedding pipeline debugging complete!")

## 🔧 Debug Cell (Run if needed)
Use this cell to test specific scenarios or debug issues found above.

In [None]:
# Debug cell - modify as needed for specific testing.
#
# Every example below is commented out: uncomment the one you need and adapt it.
# The examples rely on names created by the earlier cells (test_config,
# train_data_batch, embedding_pipeline, batch_results), so run those cells first.
# NOTE: the summary cell deletes the temporary test data file and the
# test_outputs / test_checkpoints directories; re-run the setup cells if an
# example needs them.

# Example: Test with different batch sizes
# NOTE(review): assumes PipelineConfig is a pydantic model exposing
# .copy(deep=True) (pydantic v2 calls this .model_copy(deep=True)) - TODO confirm.
# The result is stored under a separate name so the batch_results used by the
# other cells is not overwritten.
# if test_config and train_data_batch:
#     print("Testing different batch sizes:")
#     for batch_size in [1, 2, 4, 8]:
#         test_config_copy = test_config.copy(deep=True)
#         test_config_copy.embedding_generation.batch_size = batch_size
#         print(f"\nBatch size {batch_size}:")
#         debug_results = test_batch_processing(embedding_pipeline, train_data_batch, test_config_copy, True)

# Example: Test API with custom claims
# if embedding_pipeline:
#     custom_claims = ["E119 I10 N6320", "K9289 Z1239 M549"]
#     print(f"Testing API with custom claims: {custom_claims}")
#     # Add your custom API test here

# Example: Test memory usage with larger datasets
# Multiplying a list repeats references to the same inner objects, so this
# mainly exercises the report with larger sample counts.
# if batch_results:
#     print("Testing memory scaling:")
#     for multiplier in [10, 100, 1000]:
#         scaled_data = {
#             'mcids': batch_results['mcids'] * multiplier,
#             'labels': batch_results['labels'] * multiplier,
#             'embeddings': batch_results['embeddings'] * multiplier
#         }
#         test_memory_performance(scaled_data, test_config)

# Example: Debug specific configuration issues
# if test_config:
#     print("Current configuration:")
#     print(f"API URL: {test_config.model_api.base_url}")
#     print(f"API batch size: {test_config.model_api.batch_size}")
#     print(f"Processing batch size: {test_config.embedding_generation.batch_size}")
#     print(f"Save interval: {test_config.embedding_generation.save_interval}")

# Reminder printed whenever this cell is run
print("💡 Use this cell to run custom embedding pipeline tests and debug specific issues.")