# 🤖 Classification Pipeline Debug Testing

## Purpose
This notebook tests the classification pipeline step-by-step using the actual pipeline code to debug classifier training, hyperparameter search, cross-validation, and model evaluation issues.

## What This Tests
- Classification pipeline initialization and configuration
- Embedding data loading and validation
- Classifier training (Logistic Regression, SVM, Random Forest)
- Hyperparameter search and cross-validation
- Model serialization and loading
- Performance metrics calculation
- Memory usage and training time analysis
- Error handling for edge cases

In [None]:
# Import required libraries
import sys
import os
import yaml
import json
import time
import pickle
import traceback
import pandas as pd
import numpy as np
from pathlib import Path
from typing import List, Dict, Any, Optional, Tuple

# Resolve the mgpt_eval root: when the notebook is launched from the
# examples/ folder the root is one level up, otherwise it is the working
# directory itself.
mgpt_eval_path = Path.cwd()
if mgpt_eval_path.name == 'examples':
    mgpt_eval_path = mgpt_eval_path.parent
# Put the root first on sys.path so the project-local imports below resolve.
sys.path.insert(0, str(mgpt_eval_path))

# Import actual pipeline modules
from models.config_models import PipelineConfig
from models.data_models import DataSample, DataBatch
from pipelines.classification_pipeline import ClassificationPipeline
from utils.logging_utils import setup_logging

# Import ML libraries
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.preprocessing import StandardScaler

# Echo the resolved locations so path problems are visible right away
for _label, _location in (("Working directory", Path.cwd()), ("MGPT-eval path", mgpt_eval_path)):
    print(f"✅ {_label}: {_location}")
print(f"✅ Imports successful")

# Configure root logging at DEBUG level and create the notebook's own logger
import logging
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.DEBUG,
)
logger = logging.getLogger('classification_debug')

## Step 1: Create Test Embeddings Data
Create mock embeddings data to test the classification pipeline.

In [None]:
# Create test embeddings data for classification
def create_classification_test_data(num_samples=50, embedding_dim=768, seed=42):
    """Create synthetic, labelled embeddings for classification pipeline testing.

    Labels alternate 0, 1, 0, 1, ... and every embedding is drawn from a
    normal distribution with standard deviation 0.3 whose mean depends on the
    class (-0.1 for class 0, +0.1 for class 1).

    Args:
        num_samples: Number of samples to generate.
        embedding_dim: Length of each embedding vector.
        seed: Value passed to ``np.random.seed`` before sampling. This reseeds
            NumPy's global random state. Two calls that share a seed produce
            the same leading samples, so datasets that must not overlap
            (e.g. train vs. test) need distinct seeds.

    Returns:
        Dict with the keys ``'mcids'`` (``CLASS_TEST_###`` strings),
        ``'labels'`` (0/1 ints) and ``'embeddings'`` (lists of floats), all of
        length ``num_samples``.
    """
    np.random.seed(seed)  # For reproducible results

    mcids = [f'CLASS_TEST_{i:03d}' for i in range(num_samples)]
    labels = [i % 2 for i in range(num_samples)]  # Alternating 0, 1

    # Class 0: embeddings centered around -0.1 (negative bias)
    # Class 1: embeddings centered around +0.1 (positive bias)
    class_means = {0: -0.1, 1: 0.1}
    embeddings = [
        np.random.normal(class_means[label], 0.3, embedding_dim).tolist()
        for label in labels
    ]

    return {
        'mcids': mcids,
        'labels': labels,
        'embeddings': embeddings
    }

# Create train and test embeddings with different seeds. With a shared seed
# the test set would be an exact copy of the first training samples, which
# makes every test-set metric meaningless.
train_data = create_classification_test_data(40, 768, seed=42)  # 40 training samples
test_data = create_classification_test_data(20, 768, seed=43)   # 20 test samples

# Prefix the test MCIDs so they can never collide with the training MCIDs
test_data['mcids'] = [f'TEST_{mcid}' for mcid in test_data['mcids']]

# Report the size of each split, the embedding width, then the label balance
for _split_label, _split in (("training", train_data), ("test", test_data)):
    print(f"📊 Created {_split_label} data: {len(_split['mcids'])} samples")
print(f"📊 Embedding dimension: {len(train_data['embeddings'][0])}")
for _split_label, _split in (("Train", train_data), ("Test", test_data)):
    print(f"📊 {_split_label} label distribution: {pd.Series(_split['labels']).value_counts().to_dict()}")

# Compare the overall mean of each class to confirm the two classes differ
train_embeddings_array = np.array(train_data['embeddings'])
_train_label_array = np.array(train_data['labels'])
class_0_mean = train_embeddings_array[_train_label_array == 0].mean()
class_1_mean = train_embeddings_array[_train_label_array == 1].mean()

print(f"\n🔍 Embedding pattern verification:")
print(f"   Class 0 mean: {class_0_mean:.3f}")
print(f"   Class 1 mean: {class_1_mean:.3f}")
print(f"   Difference: {abs(class_1_mean - class_0_mean):.3f} (should be > 0.1 for good separation)")

In [None]:
# Save embeddings to files for testing
def save_embeddings_data(data: Dict, file_path: Path, format_type='json'):
    """Save embeddings data to ``file_path`` in the specified format.

    Args:
        data: Dict with the parallel lists ``'mcids'``, ``'labels'`` and
            ``'embeddings'``.
        file_path: Destination file; it is created or overwritten.
        format_type: ``'json'`` writes a document with ``job_info``, ``data``
            and ``metadata`` sections; ``'csv'`` writes one row per sample
            with the columns ``mcid``, ``label``, ``emb_0`` ... ``emb_N``.

    Raises:
        ValueError: If ``format_type`` is neither ``'json'`` nor ``'csv'``.
            Nothing is written in that case.
    """
    # Reject unknown formats up front instead of silently writing nothing.
    if format_type not in ('json', 'csv'):
        raise ValueError(
            f"Unsupported format_type {format_type!r}; expected 'json' or 'csv'"
        )

    if format_type == 'json':
        # JSON format (pipeline default)
        embeddings = data['embeddings']
        output_data = {
            'job_info': {
                'job_name': 'classification_test',
                'timestamp': time.time(),
                'total_samples': len(data['mcids'])
            },
            'data': {
                'mcids': data['mcids'],
                'labels': data['labels'],
                'embeddings': embeddings
            },
            'metadata': {
                # An empty dataset has no first row to measure; report 0.
                'embedding_dimension': len(embeddings[0]) if embeddings else 0,
                'format': 'json',
                'version': '1.0'
            }
        }

        with open(file_path, 'w', encoding='utf-8') as f:
            json.dump(output_data, f, indent=2)

    else:
        # CSV format (alternative format): one flat row per sample
        csv_data = [
            {
                'mcid': mcid,
                'label': label,
                **{f'emb_{i}': emb_val for i, emb_val in enumerate(embedding)},
            }
            for mcid, label, embedding in zip(data['mcids'], data['labels'], data['embeddings'])
        ]

        pd.DataFrame(csv_data).to_csv(file_path, index=False)

# Create test directory and save embeddings
test_embeddings_dir = mgpt_eval_path / "examples" / "test_classification_data"
# parents=True: the intermediate "examples" directory is not guaranteed to
# exist (mgpt_eval_path falls back to the current working directory), and
# without it mkdir raises FileNotFoundError.
test_embeddings_dir.mkdir(parents=True, exist_ok=True)

train_embeddings_path = test_embeddings_dir / "train_embeddings.json"
test_embeddings_path = test_embeddings_dir / "test_embeddings.json"

save_embeddings_data(train_data, train_embeddings_path, 'json')
save_embeddings_data(test_data, test_embeddings_path, 'json')

# Report where the files went and how large they are on disk
print(f"💾 Saved train embeddings: {train_embeddings_path}")
print(f"💾 Saved test embeddings: {test_embeddings_path}")
print(f"📊 Train file size: {train_embeddings_path.stat().st_size / 1024:.1f} KB")
print(f"📊 Test file size: {test_embeddings_path.stat().st_size / 1024:.1f} KB")

## Step 2: Create Classification Configuration
Set up configuration for classification pipeline testing.

In [None]:
# Create test configuration for classification pipeline
def create_classification_test_config(train_emb_path: str, test_emb_path: str):
    """Build the raw configuration dict used for classification pipeline testing.

    The configuration enables the classification, evaluation and summary
    report stages, disables embedding generation, and keeps the
    hyperparameter grids and cross-validation small so a run finishes quickly.

    Args:
        train_emb_path: Location of the training embeddings file; stored in
            the config as ``str(train_emb_path)``.
        test_emb_path: Location of the test embeddings file; stored in the
            config as ``str(test_emb_path)``.

    Returns:
        A plain nested dict with the sections ``input``, ``job``,
        ``model_api``, ``pipeline_stages``, ``data_processing``,
        ``embedding_generation``, ``classification``, ``evaluation``,
        ``target_word_evaluation``, ``output`` and ``logging``.
    """
    input_section = {
        'train_embeddings_path': str(train_emb_path),
        'test_embeddings_path': str(test_emb_path),
    }

    # Outputs land under the module-level mgpt_eval_path.
    job_section = {
        'name': 'classification_pipeline_test',
        'output_dir': str(mgpt_eval_path / "examples" / "test_classification_outputs"),
        'random_seed': 42,
    }

    model_api_section = {
        'base_url': 'http://localhost:8000',  # Not used for classification
        'batch_size': 32,
        'timeout': 300,
        'max_retries': 3,
    }

    # Only classification, evaluation and the summary report are switched on.
    pipeline_stages_section = {
        'embeddings': False,
        'classification': True,
        'evaluation': True,
        'target_word_eval': False,
        'summary_report': True,
        'method_comparison': False,
    }

    data_processing_section = {
        'random_seed': 42,
        'max_sequence_length': 512,
        'include_mcid': True,
        'output_format': 'json',
        'train_test_split': 0.8,
    }

    embedding_generation_section = {
        'batch_size': 16,
        'save_interval': 100,
        'checkpoint_dir': 'outputs/checkpoints',
        'resume_from_checkpoint': True,
        'tokenizer_path': '/app/tokenizer',
    }

    # Few folds and a single worker keep the run fast and easy to debug.
    cross_validation_settings = {
        'n_folds': 3,
        'scoring': 'roc_auc',
        'n_jobs': 1,
    }

    # Deliberately tiny search grids: this is a smoke test, not a tuning run.
    hyperparameter_grids = {
        'logistic_regression': {
            'C': [0.1, 1.0],
            'penalty': ['l2'],
            'solver': ['liblinear'],
        },
        'svm': {
            'C': [1.0],
            'kernel': ['rbf'],
            'gamma': ['scale'],
        },
        'random_forest': {
            'n_estimators': [50],
            'max_depth': [10],
            'min_samples_split': [2],
        },
    }

    classification_section = {
        'models': ['logistic_regression', 'svm', 'random_forest'],  # Test all classifiers
        'cross_validation': cross_validation_settings,
        'hyperparameter_search': hyperparameter_grids,
    }

    evaluation_section = {
        'metrics': ['accuracy', 'precision', 'recall', 'f1_score', 'roc_auc', 'confusion_matrix'],
        'visualization': {
            'generate_plots': False,  # Skip plots for testing
            'plot_formats': ['png'],
            'dpi': 150,
        },
    }

    target_word_section = {
        'enable': False,
        'target_codes': ['E119'],
    }

    output_section = {
        'embeddings_dir': 'embeddings',
        'models_dir': 'models',
        'metrics_dir': 'metrics',
        'logs_dir': 'logs',
        'save_best_model_only': False,  # Save all models for testing
        'model_format': 'pickle',
    }

    logging_section = {
        'level': 'DEBUG',
        'console_level': 'INFO',
        'format': '%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        'file': 'logs/classification_test.log',
    }

    return {
        'input': input_section,
        'job': job_section,
        'model_api': model_api_section,
        'pipeline_stages': pipeline_stages_section,
        'data_processing': data_processing_section,
        'embedding_generation': embedding_generation_section,
        'classification': classification_section,
        'evaluation': evaluation_section,
        'target_word_evaluation': target_word_section,
        'output': output_section,
        'logging': logging_section,
    }

# Build the raw configuration dict, then validate it with PipelineConfig
test_config_dict = create_classification_test_config(train_embeddings_path, test_embeddings_path)

# Each summary value is looked up lazily, right before its line is printed,
# so a missing attribute still surfaces after the preceding lines are shown.
_summary_fields = (
    ("Job name", lambda cfg: cfg.job.name),
    ("Train embeddings", lambda cfg: cfg.input.train_embeddings_path),
    ("Test embeddings", lambda cfg: cfg.input.test_embeddings_path),
    ("Classifiers", lambda cfg: cfg.classification.models),
    ("CV folds", lambda cfg: cfg.classification.cross_validation.n_folds),
    ("CV scoring", lambda cfg: cfg.classification.cross_validation.scoring),
)

try:
    test_config = PipelineConfig(**test_config_dict)
    print(f"✅ Test configuration validated")

    print(f"\n🔧 Configuration summary:")
    for _field_label, _field_getter in _summary_fields:
        print(f"   {_field_label}: {_field_getter(test_config)}")

except Exception as e:
    # Later cells check `test_config` for truthiness before running.
    print(f"❌ Configuration validation failed: {e}")
    print(f"\nError details:")
    traceback.print_exc()
    test_config = None

## Step 3: Test Embeddings Data Loading
Test loading the train and test embeddings JSON files referenced by the configuration, and validate them before training.

In [None]:
# Test embeddings data loading
def _load_embeddings_split(split_name: str, path) -> Dict[str, Any]:
    """Read one embeddings JSON file and print a short summary of it.

    Args:
        split_name: Lower-case split label used in the printed messages
            ('train' or 'test').
        path: Location of the JSON file to read.

    Returns:
        Dict with the raw ``'mcids'``, ``'labels'`` and ``'embeddings'`` lists
        plus the file's ``'metadata'`` dict (empty when the file has none).
    """
    print(f"   📄 Loading {split_name} embeddings: {path}")

    with open(path, 'r', encoding='utf-8') as f:
        raw = json.load(f)

    # Two layouts are accepted: the wrapped one ({'data': {...}, 'metadata':
    # {...}}) and the direct one with the three lists at the top level.
    if 'data' in raw:
        payload = raw['data']
        metadata = raw.get('metadata', {})
    else:
        payload = raw
        metadata = {}

    mcids = payload['mcids']
    labels = payload['labels']
    embeddings = payload['embeddings']

    print(f"      ✅ {split_name.capitalize()} data loaded: {len(mcids)} samples")
    print(f"      📊 Embedding dimension: {len(embeddings[0]) if embeddings else 0}")
    print(f"      📊 Label distribution: {pd.Series(labels).value_counts().to_dict()}")

    return {
        'mcids': mcids,
        'labels': labels,
        'embeddings': embeddings,
        'metadata': metadata
    }


def test_embeddings_loading(config: PipelineConfig):
    """Load and validate the train/test embeddings referenced by ``config``.

    Reads ``config.input.train_embeddings_path`` and
    ``config.input.test_embeddings_path``, prints the outcome of several
    sanity checks (list lengths, embedding dimensions, binary labels, enough
    samples per class for ``config.classification.cross_validation.n_folds``)
    and converts the data to numpy arrays. Failed checks are only reported;
    they do not abort loading.

    Args:
        config: Pipeline configuration providing the two input paths and the
            cross-validation fold count.

    Returns:
        ``{'train': {...}, 'test': {...}}`` where each entry holds ``'mcids'``,
        ``'X'`` (embeddings array), ``'y'`` (labels array) and ``'metadata'``,
        or ``None`` if any step raised an exception (the traceback is printed).
    """

    print(f"\n📂 Testing embeddings data loading...")

    try:
        train = _load_embeddings_split('train', config.input.train_embeddings_path)
        test = _load_embeddings_split('test', config.input.test_embeddings_path)

        # Validation checks
        print(f"   🔍 Data validation:")

        # The three lists are parallel; a length mismatch means a corrupt file
        for split_name, split in (('train', train), ('test', test)):
            sizes = (len(split['mcids']), len(split['labels']), len(split['embeddings']))
            if len(set(sizes)) > 1:
                print(f"      ❌ Length mismatch in {split_name} data: "
                      f"mcids={sizes[0]}, labels={sizes[1]}, embeddings={sizes[2]}")

        # Check dimension consistency (dimension is taken from the first row)
        train_dim = len(train['embeddings'][0]) if train['embeddings'] else 0
        test_dim = len(test['embeddings'][0]) if test['embeddings'] else 0

        if train_dim == test_dim and train_dim > 0:
            print(f"      ✅ Embedding dimensions match: {train_dim}")
        else:
            print(f"      ❌ Embedding dimension mismatch: train={train_dim}, test={test_dim}")

        # Check label format
        train_unique_labels = set(train['labels'])
        test_unique_labels = set(test['labels'])

        if train_unique_labels.issubset({0, 1}) and test_unique_labels.issubset({0, 1}):
            print(f"      ✅ Valid binary labels")
        else:
            print(f"      ❌ Invalid labels found: train={train_unique_labels}, test={test_unique_labels}")

        # Check for minimum samples per class: the smallest class is compared
        # against the configured number of CV folds
        n_folds = config.classification.cross_validation.n_folds
        min_class_count = pd.Series(train['labels']).value_counts().min()

        if min_class_count >= n_folds:
            print(f"      ✅ Sufficient samples for CV: min={min_class_count}, folds={n_folds}")
        else:
            print(f"      ⚠️  Warning: May not have enough samples for CV: min={min_class_count}, folds={n_folds}")

        # Convert to numpy arrays for ML processing
        X_train = np.array(train['embeddings'])
        y_train = np.array(train['labels'])
        X_test = np.array(test['embeddings'])
        y_test = np.array(test['labels'])

        print(f"      ✅ Data converted to numpy arrays")
        print(f"      📊 Train shape: X={X_train.shape}, y={y_train.shape}")
        print(f"      📊 Test shape: X={X_test.shape}, y={y_test.shape}")

        return {
            'train': {
                'mcids': train['mcids'],
                'X': X_train,
                'y': y_train,
                'metadata': train['metadata']
            },
            'test': {
                'mcids': test['mcids'],
                'X': X_test,
                'y': y_test,
                'metadata': test['metadata']
            }
        }

    except Exception as e:
        # Debug helper: report the failure and let the notebook continue
        print(f"   ❌ Data loading failed: {e}")
        print(f"\nError details:")
        traceback.print_exc()
        return None

# Run the loading check only when the configuration validated
if not test_config:
    print(f"⚠️  Skipping data loading test due to config issues")
    loaded_data = None
else:
    loaded_data = test_embeddings_loading(test_config)

## Step 4: Test Individual Classifiers
Test each classifier individually to debug training and hyperparameter search.

In [None]:
# Test individual classifiers
def test_individual_classifier(classifier_name: str, config: PipelineConfig, data: Dict):
    """Test a single classifier with cross-validation and hyperparameter search.

    The classifier is first fitted with its base settings, then scored with
    cross-validation, then (when a grid is configured for it) tuned with
    ``GridSearchCV``. The resulting estimator is evaluated on the test split.

    Args:
        classifier_name: One of ``'logistic_regression'``, ``'svm'`` or
            ``'random_forest'``.
        config: Pipeline configuration; ``job.random_seed``,
            ``classification.hyperparameter_search`` and
            ``classification.cross_validation`` are read.
        data: ``{'train': {'X', 'y'}, 'test': {'X', 'y'}}`` as returned by
            ``test_embeddings_loading``, or ``None``.

    Returns:
        Dict with the keys ``classifier_name``, ``best_estimator``,
        ``best_params``, ``best_cv_score``, ``test_metrics``,
        ``confusion_matrix``, ``training_time``, ``predictions`` and
        ``probabilities``; ``None`` when no data is available, the classifier
        name is unknown, or any step raised (the traceback is printed).
    """

    print(f"\n🤖 Testing {classifier_name} classifier...")

    if data is None:
        print(f"   ❌ No data available for testing")
        return None

    try:
        X_train = data['train']['X']
        y_train = data['train']['y']
        X_test = data['test']['X']
        y_test = data['test']['y']

        print(f"   📊 Training on {X_train.shape[0]} samples, {X_train.shape[1]} features")
        print(f"   📊 Testing on {X_test.shape[0]} samples")

        # Get hyperparameter grid for this classifier
        param_grid = config.classification.hyperparameter_search.get(classifier_name, {})
        cv_config = config.classification.cross_validation

        print(f"   🔧 Hyperparameter grid: {param_grid}")
        print(f"   🔧 CV config: {cv_config.n_folds} folds, scoring={cv_config.scoring}")

        # The timer covers everything from here on: base fit, CV, grid search
        # and test-set evaluation
        start_time = time.time()

        # Factories are used so only the requested estimator is constructed
        seed = config.job.random_seed
        classifier_factories = {
            'logistic_regression': lambda: LogisticRegression(random_state=seed, max_iter=1000),
            # probability=True is needed for predict_proba on SVC
            'svm': lambda: SVC(random_state=seed, probability=True),
            'random_forest': lambda: RandomForestClassifier(random_state=seed),
        }

        factory = classifier_factories.get(classifier_name)
        if factory is None:
            print(f"   ❌ Unknown classifier: {classifier_name}")
            return None
        base_classifier = factory()

        print(f"   ✅ Base classifier initialized: {type(base_classifier).__name__}")

        # Test without hyperparameter search first (faster)
        print(f"   🧪 Testing base classifier without hyperparameter search...")

        base_classifier.fit(X_train, y_train)
        train_score = base_classifier.score(X_train, y_train)
        test_score = base_classifier.score(X_test, y_test)

        print(f"      ✅ Base training score: {train_score:.3f}")
        print(f"      ✅ Base test score: {test_score:.3f}")

        # Test cross-validation
        print(f"   🔄 Testing cross-validation...")

        cv_scores = cross_val_score(
            base_classifier,
            X_train,
            y_train,
            cv=cv_config.n_folds,
            scoring=cv_config.scoring,
            n_jobs=cv_config.n_jobs
        )

        print(f"      ✅ CV scores: {cv_scores}")
        print(f"      ✅ CV mean: {cv_scores.mean():.3f} ± {cv_scores.std():.3f}")

        # Defaults used when no grid is configured: keep the base classifier
        best_estimator = base_classifier
        best_params = {}
        best_cv_score = cv_scores.mean()

        if param_grid:
            print(f"   🔍 Testing hyperparameter search...")

            grid_search = GridSearchCV(
                base_classifier,
                param_grid,
                cv=cv_config.n_folds,
                scoring=cv_config.scoring,
                n_jobs=cv_config.n_jobs,
                verbose=0
            )

            grid_search.fit(X_train, y_train)

            best_estimator = grid_search.best_estimator_
            best_params = grid_search.best_params_
            best_cv_score = grid_search.best_score_

            print(f"      ✅ Best parameters: {best_params}")
            print(f"      ✅ Best CV score: {best_cv_score:.3f}")
            print(f"      📊 Grid search tested {len(grid_search.cv_results_['params'])} combinations")

        # Test final model on test set
        print(f"   📊 Testing final model on test set...")

        test_predictions = best_estimator.predict(X_test)
        test_probabilities = None

        # Prefer positive-class probabilities; fall back to decision scores
        if hasattr(best_estimator, 'predict_proba'):
            test_probabilities = best_estimator.predict_proba(X_test)[:, 1]
        elif hasattr(best_estimator, 'decision_function'):
            test_probabilities = best_estimator.decision_function(X_test)

        # Calculate metrics
        metrics = {
            'accuracy': accuracy_score(y_test, test_predictions),
            'precision': precision_score(y_test, test_predictions, average='binary', zero_division=0),
            'recall': recall_score(y_test, test_predictions, average='binary', zero_division=0),
            'f1_score': f1_score(y_test, test_predictions, average='binary', zero_division=0)
        }

        if test_probabilities is not None:
            try:
                metrics['roc_auc'] = roc_auc_score(y_test, test_probabilities)
            except ValueError:
                metrics['roc_auc'] = None  # May fail if only one class in test set

        # Pin the label order so the matrix is always 2x2. Without `labels`,
        # a test set or prediction vector containing a single class yields a
        # 1x1 matrix and the TN/FP/FN/TP indexing below raises IndexError.
        cm = confusion_matrix(y_test, test_predictions, labels=[0, 1])

        training_time = time.time() - start_time

        print(f"      ✅ Final test metrics:")
        for metric, value in metrics.items():
            if value is not None:
                print(f"         {metric}: {value:.3f}")

        print(f"      📊 Confusion matrix:")
        print(f"         TN={cm[0,0]}, FP={cm[0,1]}")
        print(f"         FN={cm[1,0]}, TP={cm[1,1]}")

        print(f"      ⏱️  Total training time: {training_time:.2f}s")

        return {
            'classifier_name': classifier_name,
            'best_estimator': best_estimator,
            'best_params': best_params,
            'best_cv_score': best_cv_score,
            'test_metrics': metrics,
            'confusion_matrix': cm,
            'training_time': training_time,
            'predictions': test_predictions,
            'probabilities': test_probabilities
        }

    except Exception as e:
        # Debug helper: report the failure and let the notebook continue
        print(f"   ❌ Classifier testing failed: {e}")
        print(f"\nError details:")
        traceback.print_exc()
        return None

# Collect one result dict per classifier that trained successfully
classifier_results = {}

if not (test_config and loaded_data):
    print(f"⚠️  Skipping classifier tests due to config or data issues")
else:
    for classifier_name in test_config.classification.models:
        result = test_individual_classifier(classifier_name, test_config, loaded_data)
        # A falsy result means the classifier test failed; leave it out
        if not result:
            continue
        classifier_results[classifier_name] = result

## Step 5: Test Model Serialization and Loading
Test saving and loading trained models.

In [None]:
# Test model serialization and loading
def test_model_serialization(results: Dict, config: PipelineConfig):
    """Test saving and loading trained models with pickle.

    Every estimator in ``results`` is pickled to
    ``<output_dir>/<job name>/<models_dir>/<classifier>_model.pkl``, loaded
    back, and compared with the original (predictions on the test split and
    ``get_params()``). All files written here are removed again before the
    function returns, whether or not the test succeeded.

    The prediction comparison reads the module-level ``loaded_data`` produced
    by the data-loading step; it is skipped when that value is falsy.

    Args:
        results: Mapping of classifier name to a result dict containing
            ``'best_estimator'``.
        config: Pipeline configuration; ``job.output_dir``, ``job.name`` and
            ``output.models_dir`` are read.

    Returns:
        ``True`` when every model was saved and reloaded without an
        exception, ``False`` when ``results`` is empty or any step raised.
    """

    print(f"\n💾 Testing model serialization and loading...")

    if not results:
        print(f"   ❌ No trained models available for testing")
        return False

    # Every file this function creates, so `finally` can remove them all
    written_files = []

    try:
        # Create models directory
        output_dir = Path(config.job.output_dir) / config.job.name
        models_dir = output_dir / config.output.models_dir
        models_dir.mkdir(parents=True, exist_ok=True)

        print(f"   📁 Models directory: {models_dir}")

        serialization_results = {}

        for classifier_name, result in results.items():
            print(f"\n   💾 Testing {classifier_name} serialization...")

            model = result['best_estimator']
            model_file = models_dir / f"{classifier_name}_model.pkl"
            written_files.append(model_file)

            # Test saving
            start_time = time.time()

            with open(model_file, 'wb') as f:
                pickle.dump(model, f)

            save_time = time.time() - start_time
            file_size = model_file.stat().st_size

            print(f"      ✅ Model saved: {model_file}")
            print(f"      📊 File size: {file_size / 1024:.1f} KB")
            print(f"      ⏱️  Save time: {save_time:.3f}s")

            # Test loading. pickle.load is acceptable here only because the
            # file was written a few lines above by this same function.
            start_time = time.time()

            with open(model_file, 'rb') as f:
                loaded_model = pickle.load(f)

            load_time = time.time() - start_time

            print(f"      ✅ Model loaded successfully")
            print(f"      ⏱️  Load time: {load_time:.3f}s")

            # Test that loaded model works
            if loaded_data:
                X_test = loaded_data['test']['X']
                original_predictions = model.predict(X_test)
                loaded_predictions = loaded_model.predict(X_test)

                if np.array_equal(original_predictions, loaded_predictions):
                    print(f"      ✅ Loaded model predictions match original")
                else:
                    print(f"      ❌ Loaded model predictions differ from original")

                # Test model parameters
                if hasattr(model, 'get_params') and hasattr(loaded_model, 'get_params'):
                    if model.get_params() == loaded_model.get_params():
                        print(f"      ✅ Model parameters preserved")
                    else:
                        print(f"      ⚠️  Some model parameters may differ")

            serialization_results[classifier_name] = {
                'file_path': model_file,
                'file_size_kb': file_size / 1024,
                'save_time': save_time,
                'load_time': load_time,
                'loaded_successfully': True
            }

        # Summary of serialization results
        print(f"\n   📊 Serialization summary:")
        total_size = sum(r['file_size_kb'] for r in serialization_results.values())
        print(f"      📊 Total models size: {total_size:.1f} KB")
        print(f"      📊 Models saved: {len(serialization_results)}")

        # Test loading all models at once (memory test)
        print(f"\n   🧠 Testing loading all models simultaneously...")
        all_models = {}
        start_time = time.time()

        for classifier_name, result in serialization_results.items():
            with open(result['file_path'], 'rb') as f:
                all_models[classifier_name] = pickle.load(f)

        load_all_time = time.time() - start_time

        print(f"      ✅ All models loaded: {len(all_models)} models in {load_all_time:.3f}s")

        return True

    except Exception as e:
        # Debug helper: report the failure and let the notebook continue
        print(f"   ❌ Model serialization test failed: {e}")
        print(f"\nError details:")
        traceback.print_exc()
        return False

    finally:
        # Cleanup test files, also after a failure part-way through
        for model_file in written_files:
            try:
                model_file.unlink()
            except FileNotFoundError:
                pass  # the file was never created (opening it failed)
            except OSError as cleanup_error:
                print(f"   ⚠️  Could not remove {model_file}: {cleanup_error}")

        if written_files:
            print(f"   🧹 Cleaned up test model files")

# Serialization needs both trained models and a validated configuration
if not (classifier_results and test_config):
    print(f"⚠️  Skipping model serialization test")
    serialization_success = False
else:
    serialization_success = test_model_serialization(classifier_results, test_config)

## Step 6: Test Edge Cases and Error Handling
Test various edge cases that might cause classification issues.

In [None]:
# Test edge cases and error handling
def test_classification_edge_cases():
    """Exercise LogisticRegression on five awkward inputs and print the outcome.

    The cases are: single-class labels, perfectly separable data, a four
    sample dataset, more features than samples, and features containing
    NaN/Inf. Each case runs inside its own ``try`` block so one failure does
    not stop the others. Data is drawn from NumPy's global random state.
    Nothing is returned; results are only printed.
    """

    print(f"\n🧪 Testing classification edge cases...")

    # Test case 1: Single class data
    print(f"\n1. Testing single class data...")
    try:
        # Create data with only one class
        single_class_X = np.random.normal(0, 1, (20, 10))
        single_class_y = np.ones(20)  # All class 1

        lr = LogisticRegression(random_state=42)

        try:
            lr.fit(single_class_X, single_class_y)
        except ValueError as e:
            # Refusing to fit on a single class is the expected behaviour,
            # so it is reported as a pass rather than as a failed test.
            print(f"   ✅ Single class training correctly rejected: {str(e)[:50]}...")
            # No fitted model: reuse the constant labels as stand-in
            # predictions so the ROC-AUC check below still runs.
            predictions = single_class_y
        else:
            predictions = lr.predict(single_class_X)
            unique_predictions = np.unique(predictions)

            print(f"   ✅ Single class training completed")
            print(f"   📊 Unique predictions: {unique_predictions}")

        # Try to calculate ROC-AUC (should fail gracefully)
        try:
            roc_auc = roc_auc_score(single_class_y, predictions)
            print(f"   ⚠️  ROC-AUC calculated: {roc_auc} (unexpected)")
        except ValueError as e:
            print(f"   ✅ ROC-AUC correctly failed: {str(e)[:50]}...")

    except Exception as e:
        print(f"   ❌ Single class test failed: {e}")

    # Test case 2: Perfect separation
    print(f"\n2. Testing perfectly separable data...")
    try:
        # Create perfectly separable data
        perfect_X = np.vstack([
            np.random.normal(-2, 0.1, (20, 2)),  # Class 0: far left
            np.random.normal(2, 0.1, (20, 2))    # Class 1: far right
        ])
        perfect_y = np.hstack([np.zeros(20), np.ones(20)])

        lr = LogisticRegression(random_state=42)
        lr.fit(perfect_X, perfect_y)

        train_accuracy = lr.score(perfect_X, perfect_y)
        print(f"   ✅ Perfect separation training completed")
        print(f"   📊 Training accuracy: {train_accuracy:.3f}")

        if train_accuracy > 0.95:
            print(f"   ✅ Data is indeed perfectly/nearly separable")

    except Exception as e:
        print(f"   ❌ Perfect separation test failed: {e}")

    # Test case 3: Very small dataset
    print(f"\n3. Testing very small dataset...")
    try:
        # Create tiny dataset
        tiny_X = np.random.normal(0, 1, (4, 5))  # Only 4 samples
        tiny_y = np.array([0, 1, 0, 1])

        lr = LogisticRegression(random_state=42)
        lr.fit(tiny_X, tiny_y)

        # Try cross-validation (should handle small dataset)
        try:
            cv_scores = cross_val_score(lr, tiny_X, tiny_y, cv=2)  # Only 2 folds possible
            print(f"   ✅ Small dataset training completed")
            print(f"   📊 CV scores: {cv_scores}")
        except ValueError as e:
            print(f"   ⚠️  CV failed on small dataset: {str(e)[:50]}...")

    except Exception as e:
        print(f"   ❌ Small dataset test failed: {e}")

    # Test case 4: High-dimensional data (more features than samples)
    print(f"\n4. Testing high-dimensional data...")
    try:
        # Create data with more features than samples
        high_dim_X = np.random.normal(0, 1, (10, 50))  # 10 samples, 50 features
        high_dim_y = np.array([0, 1] * 5)

        lr = LogisticRegression(random_state=42, max_iter=1000)
        lr.fit(high_dim_X, high_dim_y)

        train_accuracy = lr.score(high_dim_X, high_dim_y)
        print(f"   ✅ High-dimensional training completed")
        print(f"   📊 Training accuracy: {train_accuracy:.3f}")

        # The features are pure noise, so a high training score here can
        # only come from memorising the ten samples
        if train_accuracy > 0.9:
            print(f"   ⚠️  Possible overfitting detected (high accuracy on small sample)")

    except Exception as e:
        print(f"   ❌ High-dimensional test failed: {e}")

    # Test case 5: NaN/Inf values
    print(f"\n5. Testing data with NaN/Inf values...")
    try:
        # Create data with problematic values
        problematic_X = np.random.normal(0, 1, (20, 5))
        problematic_X[0, 0] = np.nan  # NaN value
        problematic_X[1, 1] = np.inf  # Inf value
        problematic_y = np.array([0, 1] * 10)

        lr = LogisticRegression(random_state=42)

        try:
            lr.fit(problematic_X, problematic_y)
            print(f"   ⚠️  Training with NaN/Inf succeeded (unexpected)")
        except ValueError as e:
            print(f"   ✅ Training correctly failed with NaN/Inf: {str(e)[:50]}...")

    except Exception as e:
        print(f"   ❌ NaN/Inf test failed: {e}")

    print(f"\n   📊 Edge case testing completed")

# Run edge case tests
test_classification_edge_cases()

## Step 7: Performance and Memory Analysis
Analyze training performance and memory usage.

In [None]:
# Test performance and memory analysis
def test_classification_performance(results: Dict, config: PipelineConfig):
    """Report accuracy, timing, and memory statistics for trained classifiers.

    Prints a per-classifier metrics table, the best and fastest performers,
    extrapolations of training time and data memory to larger datasets, and
    a few configuration recommendations.

    Args:
        results: Mapping of classifier name to a result dict. Each result
            must provide ``'training_time'`` (printed as seconds) and
            ``'test_metrics'`` containing ``'accuracy'``, ``'precision'``,
            ``'recall'`` and ``'f1_score'``; ``'roc_auc'`` is optional.
        config: Pipeline configuration. Only
            ``config.classification.cross_validation.n_folds`` is read.

    Returns:
        None. All output is printed. Any exception raised during the
        analysis is caught, reported together with a traceback, and not
        re-raised.

    Note:
        The notebook-level global ``loaded_data`` is read for the scaling
        and memory estimates. When truthy it must expose
        ``['train']['X']`` and ``['test']['X']`` objects that support
        ``len()`` and ``.nbytes``. Both estimates scale linearly with the
        number of samples and the memory figure only covers those arrays.
    """

    print(f"\n⚡ Classification performance analysis...")

    if not results:
        print(f"   ❌ No results available for analysis")
        return

    try:
        # Performance comparison
        print(f"   📊 Classifier performance comparison:")

        performance_data = []
        for classifier_name, result in results.items():
            metrics = result['test_metrics']
            # Compare against None (not truthiness) so that a legitimate
            # ROC-AUC of 0.0 is displayed instead of being reported as N/A.
            roc_auc = metrics.get('roc_auc')
            performance_data.append({
                'Classifier': classifier_name,
                'Accuracy': f"{metrics['accuracy']:.3f}",
                'Precision': f"{metrics['precision']:.3f}",
                'Recall': f"{metrics['recall']:.3f}",
                'F1-Score': f"{metrics['f1_score']:.3f}",
                'ROC-AUC': f"{roc_auc:.3f}" if roc_auc is not None else 'N/A',
                'Training Time': f"{result['training_time']:.2f}s"
            })

        performance_df = pd.DataFrame(performance_data)
        print("")
        print(performance_df.to_string(index=False))

        # Find best performers; each value is a (name, result) pair.
        best_accuracy = max(results.items(), key=lambda x: x[1]['test_metrics']['accuracy'])
        best_f1 = max(results.items(), key=lambda x: x[1]['test_metrics']['f1_score'])
        fastest = min(results.items(), key=lambda x: x[1]['training_time'])
        slowest = max(results.items(), key=lambda x: x[1]['training_time'])

        print(f"\n   🏆 Performance highlights:")
        print(f"      Best accuracy: {best_accuracy[0]} ({best_accuracy[1]['test_metrics']['accuracy']:.3f})")
        print(f"      Best F1-score: {best_f1[0]} ({best_f1[1]['test_metrics']['f1_score']:.3f})")
        print(f"      Fastest training: {fastest[0]} ({fastest[1]['training_time']:.2f}s)")

        # Training time analysis
        print(f"\n   ⏱️  Training time analysis:")

        total_time = sum(r['training_time'] for r in results.values())
        print(f"      Total training time: {total_time:.2f}s")

        # Estimate scaling: total time of all classifiers divided by the
        # number of training samples, extrapolated linearly.
        current_samples = len(loaded_data['train']['X']) if loaded_data else 0
        if current_samples > 0:
            time_per_sample = total_time / current_samples

            print(f"      Time per sample: {time_per_sample:.4f}s")
            print(f"      Estimated time for larger datasets:")

            for dataset_size in [1000, 10000, 100000]:
                estimated_time = time_per_sample * dataset_size
                # Pick the most readable unit: seconds, minutes or hours.
                if estimated_time < 60:
                    print(f"         {dataset_size:,} samples: {estimated_time:.1f}s")
                elif estimated_time < 3600:
                    print(f"         {dataset_size:,} samples: {estimated_time/60:.1f}m")
                else:
                    print(f"         {dataset_size:,} samples: {estimated_time/3600:.1f}h")

        # Memory analysis
        print(f"\n   🧠 Memory analysis:")

        if loaded_data:
            train_memory = loaded_data['train']['X'].nbytes / 1024 / 1024  # MB
            test_memory = loaded_data['test']['X'].nbytes / 1024 / 1024   # MB

            print(f"      Training data: {train_memory:.2f} MB")
            print(f"      Test data: {test_memory:.2f} MB")
            print(f"      Total data: {train_memory + test_memory:.2f} MB")

            # Estimate memory for larger datasets
            bytes_per_sample = loaded_data['train']['X'].nbytes / len(loaded_data['train']['X'])
            print(f"      Memory per sample: {bytes_per_sample / 1024:.1f} KB")

            print(f"      Estimated memory for larger datasets:")
            for dataset_size in [1000, 10000, 100000]:
                estimated_mb = (bytes_per_sample * dataset_size) / 1024 / 1024
                if estimated_mb < 1024:
                    print(f"         {dataset_size:,} samples: {estimated_mb:.1f} MB")
                else:
                    print(f"         {dataset_size:,} samples: {estimated_mb/1024:.1f} GB")

        # Configuration recommendations
        print(f"\n   💡 Configuration recommendations:")

        # CV recommendations: flag fewer than 10 samples per fold.
        current_folds = config.classification.cross_validation.n_folds
        if current_samples and current_samples < current_folds * 10:
            recommended_folds = max(2, current_samples // 10)
            print(f"      🔧 Consider reducing CV folds to {recommended_folds} for small datasets")

        # Hyperparameter search recommendations: triggered as soon as the
        # slowest classifier needs more than one minute.
        if slowest[1]['training_time'] > 60:
            print(f"      🔧 Consider reducing hyperparameter search space for faster training")

        # Model selection recommendations
        accuracy_diff = max(r['test_metrics']['accuracy'] for r in results.values()) - \
                       min(r['test_metrics']['accuracy'] for r in results.values())

        if accuracy_diff < 0.05:  # Less than 5% difference
            print(f"      📊 All classifiers perform similarly - consider using the fastest ({fastest[0]})")
        else:
            print(f"      📊 Significant performance differences - use best performer ({best_accuracy[0]})")

    except Exception as e:
        # Debug notebook: report the failure and keep the notebook running.
        print(f"   ❌ Performance analysis failed: {e}")
        print(f"\nError details:")
        traceback.print_exc()

# Run performance analysis only when both the trained results and the
# configuration are available; otherwise report that the step is skipped.
if not (classifier_results and test_config):
    print(f"⚠️  Skipping performance analysis")
else:
    test_classification_performance(classifier_results, test_config)

## Step 8: Summary and Recommendations
Summarize all test results and provide actionable recommendations.

In [None]:
# Summarize classification pipeline test results
print("\n📋 Classification Pipeline Debug Test Summary:")
print("=" * 55)

# Collect (test name, passed) pairs from the variables set by earlier cells.
test_results = [
    ("Configuration Validation", test_config is not None),
    ("Embeddings Data Loading", loaded_data is not None),
    # bool() rather than len() > 0: the other cells guard this variable by
    # truthiness, so tolerate None here instead of raising TypeError.
    ("Classifier Training", bool(classifier_results)),
    ("Model Serialization", serialization_success),
]

# Calculate success rate (percentage of passed checks)
passed_tests = [name for name, result in test_results if result]
failed_tests = [name for name, result in test_results if not result]
success_rate = len(passed_tests) / len(test_results) * 100

print(f"\n🎯 Test Results Summary:")
for test_name, result in test_results:
    status = "✅ PASS" if result else "❌ FAIL"
    print(f"   {status} {test_name}")

print(f"\n📊 Overall Success Rate: {success_rate:.0f}%")

# Detailed analysis: count of trained classifiers, the most accurate one,
# and a coarse rating of the mean test accuracy.
print(f"\n🔍 Detailed Analysis:")

if not classifier_results:
    print(f"   ❌ No classifiers were successfully trained")
else:
    print(f"   ✅ Successfully trained {len(classifier_results)} classifiers")

    # (name, result) pair with the highest test accuracy
    best_performer = max(
        classifier_results.items(),
        key=lambda x: x[1]['test_metrics']['accuracy'],
    )
    best_name, best_result = best_performer

    print(f"   🏆 Best performer: {best_name} (accuracy: {best_result['test_metrics']['accuracy']:.3f})")

    # Rate the average accuracy to check whether results look reasonable
    accuracies = [r['test_metrics']['accuracy'] for r in classifier_results.values()]
    avg_accuracy = np.mean(accuracies)

    if avg_accuracy > 0.9:
        accuracy_verdict = f"   📊 Excellent performance: average accuracy {avg_accuracy:.3f}"
    elif avg_accuracy > 0.7:
        accuracy_verdict = f"   📊 Good performance: average accuracy {avg_accuracy:.3f}"
    elif avg_accuracy > 0.5:
        accuracy_verdict = f"   📊 Moderate performance: average accuracy {avg_accuracy:.3f}"
    else:
        accuracy_verdict = f"   ⚠️  Low performance: average accuracy {avg_accuracy:.3f} - check data quality"
    print(accuracy_verdict)

# Recommendations based on results: the advice tier is selected from the
# overall success rate (below 50, 50 up to 75, 75 and above).
print(f"\n💡 Recommendations:")

if success_rate < 50:
    recommendation_lines = (
        "   🚨 Significant issues detected - troubleshooting needed",
        "   🔧 Address data loading and configuration issues first",
    )
elif success_rate < 75:
    recommendation_lines = (
        "   ⚠️  Most components working, but some issues need attention",
        "   🔧 Focus on fixing the failed tests listed above",
    )
else:
    recommendation_lines = (
        "   🎉 Classification pipeline is working well!",
        "   ➡️  Next: Test evaluation pipeline to generate metrics and plots",
        "   ➡️  Or: Test with real embeddings from your embedding pipeline",
    )
for recommendation_line in recommendation_lines:
    print(recommendation_line)

# Configuration recommendations: echo the settings used by this run in a
# YAML-like layout.
if test_config and classifier_results:
    print(f"\n🔧 Configuration for next steps:")
    print(f"   # Use this configuration for evaluation pipeline testing")
    print(f"   input:")
    input_settings = test_config.input
    print(f"     train_embeddings_path: \"{input_settings.train_embeddings_path}\"")
    print(f"     test_embeddings_path: \"{input_settings.test_embeddings_path}\"")
    print(f"   classification:")
    classification_settings = test_config.classification
    print(f"     models: {classification_settings.models}")
    print(f"     cross_validation:")
    cv_settings = classification_settings.cross_validation
    print(f"       n_folds: {cv_settings.n_folds}")
    print(f"       scoring: \"{cv_settings.scoring}\"")

print(f"\n📚 Next testing steps:")
if success_rate >= 75:
    next_steps = (
        "   1. Run test_05_Target_Word_Evaluation_Debug.ipynb",
        "   2. Run test_06_Evaluation_Pipeline_Debug.ipynb",
        "   3. Test with real embeddings from your embedding pipeline",
    )
else:
    next_steps = (
        "   1. Fix failed tests in this notebook",
        "   2. Verify embeddings data format and quality",
        "   3. Re-run this notebook until all tests pass",
    )
for next_step in next_steps:
    print(next_step)

# Cleanup: best-effort removal of the directories created by this notebook.
# Each directory is handled in its own try block so that a failure on one
# does not prevent the other from being removed.
print(f"\n🧹 Cleaning up test files...")
import shutil

# Clean up test embeddings directory
try:
    if test_embeddings_dir.exists():
        shutil.rmtree(test_embeddings_dir)
        print(f"   🗑️  Removed: {test_embeddings_dir}")
except Exception as e:
    # Cleanup problems are reported as warnings only; they never abort the run.
    print(f"   ⚠️  Cleanup warning: {e}")

# Clean up test outputs directory
try:
    test_output_dir = mgpt_eval_path / "examples" / "test_classification_outputs"
    if test_output_dir.exists():
        shutil.rmtree(test_output_dir)
        print(f"   🗑️  Removed: test_classification_outputs directory")
except Exception as e:
    print(f"   ⚠️  Cleanup warning: {e}")

print(f"\n✅ Classification pipeline debugging complete!")

## 🔧 Debug Cell (Run if needed)
Use this cell to test specific scenarios or debug issues found above.

In [None]:
# Debug cell - modify as needed for specific testing
# Each example below is independent: uncomment one block at a time. They rely
# on variables created by earlier cells (loaded_data, classifier_results).

# Example: Test with different hyperparameter grids
# NOTE: the 'l1' penalty is not supported by LogisticRegression's default
# 'lbfgs' solver, so the example selects 'liblinear', which handles both
# 'l1' and 'l2'.
# if loaded_data:
#     print("Testing custom hyperparameter grid:")
#     custom_grid = {'C': [0.01, 0.1, 1.0, 10.0], 'penalty': ['l1', 'l2']}
#     
#     lr = LogisticRegression(random_state=42, max_iter=1000, solver='liblinear')
#     grid_search = GridSearchCV(lr, custom_grid, cv=3, scoring='roc_auc')
#     grid_search.fit(loaded_data['train']['X'], loaded_data['train']['y'])
#     
#     print(f"Best params: {grid_search.best_params_}")
#     print(f"Best score: {grid_search.best_score_:.3f}")

# Example: Test feature scaling
# The scaler is fitted on the training split only and then applied to the
# test split.
# if loaded_data:
#     print("Testing with feature scaling:")
#     scaler = StandardScaler()
#     X_train_scaled = scaler.fit_transform(loaded_data['train']['X'])
#     X_test_scaled = scaler.transform(loaded_data['test']['X'])
#     
#     lr = LogisticRegression(random_state=42)
#     lr.fit(X_train_scaled, loaded_data['train']['y'])
#     
#     scaled_accuracy = lr.score(X_test_scaled, loaded_data['test']['y'])
#     print(f"Scaled accuracy: {scaled_accuracy:.3f}")

# Example: Analyze feature importance (for Random Forest)
# NOTE(review): assumes each result dict stores the fitted model under
# 'best_estimator' - confirm against the training cell.
# np.argsort is ascending, so the last 10 indices are the 10 largest
# importances, listed from smallest to largest.
# if 'random_forest' in classifier_results:
#     rf_model = classifier_results['random_forest']['best_estimator']
#     if hasattr(rf_model, 'feature_importances_'):
#         importances = rf_model.feature_importances_
#         top_features = np.argsort(importances)[-10:]  # Top 10 features
#         print(f"Top 10 feature importances: {importances[top_features]}")
#         print(f"Top 10 feature indices: {top_features}")

# Example: Test cross-validation stability
# NOTE: an integer cv uses unshuffled folds, so repeated runs would produce
# identical scores. A shuffled StratifiedKFold with a different seed per run
# is used so that the runs actually differ.
# if loaded_data:
#     from sklearn.model_selection import StratifiedKFold
#     print("Testing CV stability across multiple runs:")
#     lr = LogisticRegression(random_state=42)
#     
#     for run in range(3):
#         splitter = StratifiedKFold(n_splits=3, shuffle=True, random_state=run)
#         cv_scores = cross_val_score(lr, loaded_data['train']['X'], loaded_data['train']['y'], 
#                                   cv=splitter, scoring='accuracy')
#         print(f"Run {run + 1}: {cv_scores.mean():.3f} ± {cv_scores.std():.3f}")

print("💡 Use this cell to run custom classification tests and debug specific issues.")