# 📊 Data Loading & Validation Testing

## Purpose
This notebook tests data loading, validation, and preprocessing step-by-step to debug data format issues, validate CSV structure, and ensure proper train/test splitting.

## What This Tests
- CSV file loading and validation
- Required column checking (mcid, claims, label)
- Data type validation and conversion
- Train/test splitting logic
- Data preprocessing and cleaning
- Edge cases (empty data, missing values, invalid labels)
- Configuration-based data loading

In [None]:
# Import required libraries
import sys
import os
import yaml
import json
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split

# Add the mgpt_eval directory to Python path.
# When the current working directory is named "examples", its parent is used as
# the package root; otherwise the working directory itself is used.
mgpt_eval_path = Path.cwd().parent if Path.cwd().name == 'examples' else Path.cwd()
# Inserted at index 0 so this directory is searched before other sys.path entries.
sys.path.insert(0, str(mgpt_eval_path))

# Import actual pipeline modules (resolved through the sys.path entry added above)
from models.config_models import PipelineConfig
from models.data_models import DataSample, DataBatch
from utils.logging_utils import setup_logging

# Reaching these lines means none of the imports above raised.
print(f"✅ Working directory: {Path.cwd()}")
print(f"✅ MGPT-eval path: {mgpt_eval_path}")
print(f"✅ Imports successful")

## Step 1: Create Sample Test Data
First, we'll create sample test data to ensure we have valid data for testing.

In [None]:
# Create sample test data with various scenarios
def create_sample_data():
    """Build the 15-row sample claims frame used throughout this notebook.

    Returns:
        A DataFrame with the columns ``mcid`` (``CLAIM_001`` .. ``CLAIM_015``),
        ``claims`` (space-separated codes containing an ``|eoc|`` marker) and
        ``label`` (0 or 1).
    """
    # Each record pairs a claims string with its binary label
    records = [
        ('N6320 G0378 |eoc| Z91048 M1710', 1),
        ('E119 76642 |eoc| K9289 O0903', 0),
        ('I10 E785 |eoc| Z1239 M549', 1),
        ('E119 N6320 |eoc| K9289 76642', 1),
        ('O0903 Z91048 |eoc| M1710 G0378', 0),
        ('K9289 I10 |eoc| E785 N6320', 1),
        ('Z1239 E119 |eoc| 76642 M549', 0),
        ('M1710 O0903 |eoc| G0378 Z91048', 0),
        ('E785 K9289 |eoc| I10 N6320', 1),
        ('76642 Z1239 |eoc| E119 M549', 1),
        ('G0378 M1710 |eoc| O0903 Z91048', 0),
        ('N6320 E785 |eoc| I10 K9289', 1),
        ('M549 76642 |eoc| Z1239 E119', 0),
        ('Z91048 G0378 |eoc| M1710 O0903', 0),
        ('K9289 N6320 |eoc| E785 I10', 1),
    ]
    claim_texts, labels = zip(*records)

    return pd.DataFrame({
        # Identifiers are sequential and zero-padded to three digits
        'mcid': [f'CLAIM_{number:03d}' for number in range(1, len(records) + 1)],
        'claims': list(claim_texts),
        'label': list(labels),
    })

# Materialise the sample frame and persist it as a CSV under examples/
sample_df = create_sample_data()
sample_data_path = mgpt_eval_path.joinpath("examples", "sample_test_data.csv")
sample_df.to_csv(sample_data_path, index=False)

# Report where the file went and what it contains
print(f"📄 Created sample data: {sample_data_path}")
print(f"📊 Data shape: {sample_df.shape}")
print(f"📋 Columns: {sample_df.columns.tolist()}")
print("\n🔍 First 3 rows:")
print(sample_df.iloc[:3])
print("\n📊 Label distribution:")
print(sample_df['label'].value_counts())

## Step 2: Load Configuration
Load configuration and test data path resolution.

In [None]:
# Read the full-pipeline template that serves as the base configuration
config_path = mgpt_eval_path.joinpath("configs", "templates", "04_full_pipeline.yaml")

with config_path.open('r') as stream:
    config_dict = yaml.safe_load(stream)

# Point the template at the sample CSV generated for this notebook
config_dict['input']['dataset_path'] = str(sample_data_path)

# Build the validated config object; `config` ends up None when validation fails
try:
    config = PipelineConfig(**config_dict)
    print("✅ Configuration loaded and validated")

    input_config = config.input
    print("\n📁 Input Configuration:")
    print(f"  Dataset path: {input_config.dataset_path}")

    # Optional fields are printed only when the input model exposes them
    optional_fields = (
        ('split_ratio', 'Split ratio'),
        ('train_dataset_path', 'Train dataset'),
        ('test_dataset_path', 'Test dataset'),
    )
    for field_name, caption in optional_fields:
        if hasattr(input_config, field_name):
            print(f"  {caption}: {getattr(input_config, field_name)}")

except Exception as e:
    print(f"❌ Configuration validation failed: {e}")
    config = None

## Step 3: Test Basic Data Loading
Test the basic CSV loading functionality.

In [None]:
# Test basic data loading function (mimics pipeline data loading)
def load_and_validate_data(file_path: str):
    """Load a claims CSV and run the structural checks used in this notebook.

    Hard failures (unreadable CSV, missing required columns, empty dataset,
    labels other than 0/1) print a message and return ``None``. Soft problems
    (null values, a single label class, duplicate MCIDs, blank claims) only
    print a warning and the frame is still returned.

    Args:
        file_path: Path of the CSV file to load.

    Returns:
        The loaded ``pandas.DataFrame``, or ``None`` if a hard check failed.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
    """

    print(f"📂 Loading data from: {file_path}")

    # Check if file exists
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"Dataset file not found: {file_path}")

    # Load CSV
    try:
        df = pd.read_csv(file_path)
        print(f"✅ CSV loaded successfully")
        print(f"📊 Shape: {df.shape}")
    except Exception as e:
        print(f"❌ Failed to load CSV: {e}")
        return None

    # Check required columns
    required_columns = ['mcid', 'claims', 'label']
    missing_columns = [col for col in required_columns if col not in df.columns]

    if missing_columns:
        print(f"❌ Missing required columns: {missing_columns}")
        print(f"📋 Available columns: {list(df.columns)}")
        return None
    print(f"✅ All required columns present: {required_columns}")

    # Check data types and basic validation
    print(f"\n🔍 Data validation:")

    # Check for empty dataframe
    if len(df) == 0:
        print(f"❌ Dataset is empty")
        return None
    print(f"✅ Dataset has {len(df)} rows")

    # Check for null values (warning only)
    null_counts = df.isnull().sum()
    if null_counts.any():
        print(f"⚠️  Null values found:")
        for col, count in null_counts.items():
            if count > 0:
                print(f"   {col}: {count} null values")
    else:
        print(f"✅ No null values found")

    # Check label values
    unique_labels = df['label'].unique()
    try:
        labels_for_display = sorted(unique_labels)
    except TypeError:
        # Mixed types (e.g. strings alongside NaN) cannot be ordered; show them
        # unsorted so the invalid-label check below can still reject the data.
        labels_for_display = list(unique_labels)
    print(f"📊 Unique labels: {labels_for_display}")

    if not all(label in [0, 1] for label in unique_labels):
        print(f"❌ Invalid label values. Expected 0 and 1, found: {unique_labels}")
        return None
    print(f"✅ Valid binary labels")

    # Check if all labels are the same (warning only)
    if len(unique_labels) == 1:
        print(f"⚠️  All labels are the same value: {unique_labels[0]}")
        print(f"   This will cause issues with classification")

    # Check MCID uniqueness (warning only)
    duplicate_mcids = df['mcid'].duplicated().sum()
    if duplicate_mcids > 0:
        print(f"⚠️  Found {duplicate_mcids} duplicate MCIDs")
    else:
        print(f"✅ All MCIDs are unique")

    # Check claims format. A column holding only numeric codes is parsed as a
    # number by read_csv and has no .str accessor, so convert to text first.
    # Nulls are dropped here because they are already reported above.
    non_null_claims = df['claims'].dropna().astype(str)
    empty_claims = non_null_claims.str.strip().eq('').sum()
    if empty_claims > 0:
        print(f"⚠️  Found {empty_claims} empty claims")
    else:
        print(f"✅ No empty claims found")

    return df

# Test data loading: run the validator on the sample CSV at sample_data_path.
# loaded_df is None when the loader rejected the file.
loaded_df = load_and_validate_data(str(sample_data_path))

## Step 4: Test Train/Test Splitting
Test the train/test splitting logic used by the pipeline.

In [None]:
# Test train/test splitting (mimics pipeline splitting logic)
def test_train_test_split(df, split_ratio=0.8, random_seed=42):
    """Split ``df`` into stratified train/test frames and report the outcome.

    Args:
        df: DataFrame with a ``label`` column, or ``None``.
        split_ratio: Fraction of rows assigned to the training set; the test
            set receives ``1 - split_ratio``.
        random_seed: Seed passed to ``train_test_split`` as ``random_state``.

    Returns:
        ``(train_df, test_df)`` on success, or ``(None, None)`` when ``df`` is
        ``None`` or the split raised an exception (the error is printed).
    """

    # Computed once and reused for both the report and the split itself
    test_size = 1 - split_ratio

    print(f"\n🔄 Testing train/test split:")
    # ':g' hides binary floating-point noise (1 - 0.8 is 0.19999999999999996)
    print(f"   Split ratio: {split_ratio} (train) / {test_size:g} (test)")
    print(f"   Random seed: {random_seed}")

    if df is None:
        print(f"❌ Cannot split: DataFrame is None")
        return None, None

    try:
        # Check if stratification is possible
        label_counts = df['label'].value_counts()
        min_class_count = label_counts.min()
        min_test_samples = int(min_class_count * test_size)

        print(f"📊 Label distribution: {dict(label_counts)}")
        print(f"📊 Minimum class count: {min_class_count}")
        print(f"📊 Expected test samples per class: ~{min_test_samples}")

        if min_test_samples < 1:
            print(f"⚠️  Warning: Very small test set for minority class")
            print(f"   Consider using a smaller split ratio or more data")

        # Perform stratified split
        train_df, test_df = train_test_split(
            df,
            test_size=test_size,
            random_state=random_seed,
            stratify=df['label']  # Maintain label distribution
        )

        print(f"\n✅ Split successful:")
        print(f"   Train set: {len(train_df)} samples")
        print(f"   Test set: {len(test_df)} samples")

        # Verify label distributions
        train_dist = train_df['label'].value_counts(normalize=True).sort_index()
        test_dist = test_df['label'].value_counts(normalize=True).sort_index()
        original_dist = df['label'].value_counts(normalize=True).sort_index()

        print(f"\n📊 Label distribution comparison:")
        print(f"   Original: {dict(original_dist.round(3))}")
        print(f"   Train:    {dict(train_dist.round(3))}")
        print(f"   Test:     {dict(test_dist.round(3))}")

        # Check if distributions are similar (within 5%)
        dist_diff = abs(train_dist - test_dist).max()
        if dist_diff > 0.05:
            print(f"⚠️  Warning: Label distributions differ by {dist_diff:.3f}")
        else:
            print(f"✅ Label distributions are well balanced")

        return train_df, test_df

    except Exception as e:
        # Any failure (e.g. a class too small to stratify) is reported, not raised
        print(f"❌ Split failed: {e}")
        return None, None

# Test splitting with configuration values.
# Bind the results up front so later cells that read train_df / test_df do not
# hit a NameError when the split is skipped.
train_df, test_df = None, None

if loaded_df is not None and config:
    split_ratio = getattr(config.input, 'split_ratio', 0.8)

    # Prefer a seed on the job section; otherwise fall back to the
    # data_processing section (the summary cell reads
    # config.data_processing.random_seed), and finally to 42.
    random_seed = getattr(config.job, 'random_seed', None)
    if random_seed is None:
        random_seed = getattr(getattr(config, 'data_processing', None), 'random_seed', 42)

    train_df, test_df = test_train_test_split(loaded_df, split_ratio, random_seed)
else:
    print("⚠️  Skipping split test due to data loading or config issues")

## Step 5: Test Data Model Validation
Test the Pydantic data models used by the pipeline.

In [None]:
# Test DataSample and DataBatch models
def test_data_models(df):
    """Exercise the DataSample / DataBatch models against rows of ``df``.

    Prints the outcome of three stages: building one DataSample from the first
    row, building a DataBatch from up to three rows and calling its accessor
    methods, and probing two edge cases (empty claims, label value 2). Each
    stage catches its own exceptions, so a failure is reported and the next
    stage still runs. Returns None.
    """

    def _to_sample(record):
        # The label is coerced to a plain int before being handed to the model
        return DataSample(
            mcid=record['mcid'],
            claims=record['claims'],
            label=int(record['label']),
        )

    print("\n🧪 Testing Pydantic data models:")

    if df is None or not len(df):
        print("❌ No data available for testing")
        return

    # Stage 1: a single sample built from the first row
    print("\n1. Testing DataSample model:")
    try:
        sample = _to_sample(df.iloc[0])

        print("   ✅ DataSample created successfully")
        print(f"   📋 MCID: {sample.mcid}")
        print(f"   📋 Claims: {sample.claims[:50]}...")
        print(f"   📋 Label: {sample.label}")

    except Exception as e:
        print(f"   ❌ DataSample creation failed: {e}")

    # Stage 2: a batch built from the leading rows (at most three)
    print("\n2. Testing DataBatch model:")
    try:
        head_count = min(3, len(df))
        samples = [_to_sample(record) for _, record in df.head(head_count).iterrows()]
        batch = DataBatch(samples=samples)

        print("   ✅ DataBatch created successfully")
        print(f"   📊 Batch size: {len(batch.samples)}")
        print(f"   📋 MCIDs: {[s.mcid for s in batch.samples]}")
        print(f"   📋 Labels: {[s.label for s in batch.samples]}")

        # Accessor methods exposed by the batch
        mcids = batch.get_mcids()
        claims = batch.get_claims()
        labels = batch.get_labels()

        print("   ✅ Batch methods work correctly")
        print(f"   📊 MCIDs extracted: {len(mcids)}")
        print(f"   📊 Claims extracted: {len(claims)}")
        print(f"   📊 Labels extracted: {len(labels)}")

    except Exception as e:
        print(f"   ❌ DataBatch creation failed: {e}")

    # Stage 3: inputs the model might be expected to refuse
    print("\n3. Testing edge cases:")

    probes = (
        (
            dict(mcid="TEST_EMPTY", claims="", label=0),
            "   ⚠️  Empty claims accepted (may cause issues downstream)",
            "Empty claims",
        ),
        (
            # Label 2 is outside the 0/1 values used by the sample data
            dict(mcid="TEST_INVALID", claims="E119 I10", label=2),
            "   ⚠️  Invalid label (2) accepted",
            "Invalid label",
        ),
    )
    for fields, accepted_message, subject in probes:
        try:
            DataSample(**fields)
            print(accepted_message)
        except Exception as e:
            print(f"   ✅ {subject} rejected: {e}")

# Test data models: only meaningful when the loader returned a frame
# (loaded_df is None when load_and_validate_data rejected the file).
if loaded_df is not None:
    test_data_models(loaded_df)
else:
    print("⚠️  Skipping data model tests due to loading issues")

## Step 6: Test Edge Cases and Error Scenarios
Test various problematic data scenarios to ensure robust error handling.

In [None]:
# Create problematic test datasets
def create_problematic_datasets():
    """Write six deliberately flawed CSV files under ``examples/``.

    Returns:
        A dict mapping a scenario name (``missing_columns``, ``invalid_labels``,
        ``null_values``, ``single_class``, ``empty``, ``duplicate_mcids``) to the
        ``Path`` of the CSV written for that scenario.
    """
    out_dir = mgpt_eval_path / "examples"

    # (scenario name, file name, frame) in the order the files are written
    scenarios = [
        (
            # None of the three required column names is present
            'missing_columns',
            "test_missing_columns.csv",
            pd.DataFrame({
                'id': ['C001', 'C002'],
                'text': ['E119 I10', 'N6320 K9289'],
                'target': [1, 0],
            }),
        ),
        (
            # Contains the label value 2
            'invalid_labels',
            "test_invalid_labels.csv",
            pd.DataFrame({
                'mcid': ['C001', 'C002', 'C003'],
                'claims': ['E119 I10', 'N6320 K9289', 'Z1239 M549'],
                'label': [1, 2, 0],
            }),
        ),
        (
            # One missing mcid and one missing claims value
            'null_values',
            "test_null_values.csv",
            pd.DataFrame({
                'mcid': ['C001', None, 'C003'],
                'claims': ['E119 I10', 'N6320 K9289', None],
                'label': [1, 0, 1],
            }),
        ),
        (
            # Every row carries the same label
            'single_class',
            "test_single_class.csv",
            pd.DataFrame({
                'mcid': ['C001', 'C002', 'C003'],
                'claims': ['E119 I10', 'N6320 K9289', 'Z1239 M549'],
                'label': [1, 1, 1],
            }),
        ),
        (
            # Header row only, no data rows
            'empty',
            "test_empty.csv",
            pd.DataFrame(columns=['mcid', 'claims', 'label']),
        ),
        (
            # 'C001' appears twice
            'duplicate_mcids',
            "test_duplicate_mcids.csv",
            pd.DataFrame({
                'mcid': ['C001', 'C001', 'C002'],
                'claims': ['E119 I10', 'N6320 K9289', 'Z1239 M549'],
                'label': [1, 0, 1],
            }),
        ),
    ]

    written = {}
    for scenario, file_name, frame in scenarios:
        target = out_dir / file_name
        frame.to_csv(target, index=False)
        written[scenario] = target

    return written

# Create and test problematic datasets
print("🧪 Creating problematic test datasets:")
problematic_datasets = create_problematic_datasets()

for dataset_name, dataset_path in problematic_datasets.items():
    print(f"   📄 {dataset_name}: {dataset_path.name}")

print(f"\n🔍 Testing error handling:")

# Test each problematic dataset. A None result means the loader rejected the
# file; a returned frame means the issue was only reported as a warning.
for dataset_name, dataset_path in problematic_datasets.items():
    print(f"\n--- Testing {dataset_name} ---")
    try:
        result_df = load_and_validate_data(str(dataset_path))
        if result_df is not None:
            print(f"   ⚠️  Dataset loaded despite issues (may cause problems later)")
        else:
            print(f"   ✅ Dataset correctly rejected")
    except Exception as e:
        print(f"   ✅ Exception caught: {e}")

# Cleanup test files. Deletion stays best-effort, but a failure is reported
# instead of silently swallowed, and only OS-level errors are caught so that
# an interrupt is not suppressed.
print(f"\n🧹 Cleaning up test files...")
for dataset_path in problematic_datasets.values():
    try:
        dataset_path.unlink()
        print(f"   🗑️  Deleted: {dataset_path.name}")
    except OSError as e:
        print(f"   ⚠️  Could not delete {dataset_path.name}: {e}")

## Step 7: Test Alternative Input Configurations
Test different input configuration scenarios (separate train/test files, embeddings files).

In [None]:
# Test separate train/test file configuration
def test_separate_files_config():
    """Validate a configuration that points at separate train and test CSVs.

    Writes the notebook-level ``train_df`` / ``test_df`` frames to temporary
    CSV files under ``examples/``, builds a ``PipelineConfig`` that references
    them, reloads both files through ``load_and_validate_data``, and finally
    removes the temporary files. All outcomes are printed; returns None.
    """

    print(f"\n🔄 Testing separate train/test files configuration:")

    # The split cell may not have produced these globals; treat an unbound
    # name the same as a failed split rather than raising NameError.
    try:
        train_split, test_split = train_df, test_df
    except NameError:
        train_split = test_split = None

    if train_split is None or test_split is None:
        print(f"   ❌ No train/test data available")
        return

    # Save separate files
    train_path = mgpt_eval_path / "examples" / "test_train.csv"
    test_path = mgpt_eval_path / "examples" / "test_test.csv"

    train_split.to_csv(train_path, index=False)
    test_split.to_csv(test_path, index=False)

    print(f"   📄 Train file: {train_path} ({len(train_split)} samples)")
    print(f"   📄 Test file: {test_path} ({len(test_split)} samples)")

    # Test configuration with separate files
    separate_config = {
        'input': {
            'train_dataset_path': str(train_path),
            'test_dataset_path': str(test_path)
            # Note: split_ratio should be ignored
        },
        'job': {'name': 'test_separate_files', 'output_dir': 'outputs'},
        'model_api': {'base_url': 'http://localhost:8000', 'batch_size': 32, 'timeout': 300, 'max_retries': 3},
        'pipeline_stages': {'embeddings': True, 'classification': True, 'evaluation': True, 'target_word_eval': False, 'summary_report': True, 'method_comparison': False},
        'data_processing': {'random_seed': 42, 'max_sequence_length': 512, 'include_mcid': True, 'output_format': 'json'},
        'embedding_generation': {'batch_size': 16, 'save_interval': 100, 'checkpoint_dir': 'outputs/checkpoints', 'resume_from_checkpoint': True, 'tokenizer_path': '/app/tokenizer'},
        'classification': {'models': ['logistic_regression'], 'cross_validation': {'n_folds': 3, 'scoring': 'roc_auc', 'n_jobs': 1}},
        'evaluation': {'metrics': ['accuracy', 'precision', 'recall', 'f1_score'], 'visualization': {'generate_plots': False}},
        'target_word_evaluation': {'enable': False, 'target_codes': ['E119']},
        'output': {'embeddings_dir': 'outputs/embeddings', 'models_dir': 'outputs/models', 'metrics_dir': 'outputs/metrics', 'logs_dir': 'outputs/logs'},
        'logging': {'level': 'INFO', 'console_level': 'INFO', 'format': '%(asctime)s - %(levelname)s - %(message)s', 'file': 'outputs/logs/pipeline.log'}
    }

    try:
        # Constructing the model is the validation step; the object is not used further
        PipelineConfig(**separate_config)
        print(f"   ✅ Separate files configuration validated")

        # Test loading both files
        train_loaded = load_and_validate_data(str(train_path))
        test_loaded = load_and_validate_data(str(test_path))

        if train_loaded is not None and test_loaded is not None:
            print(f"   ✅ Both files loaded successfully")
            print(f"   📊 Train: {len(train_loaded)} samples")
            print(f"   📊 Test: {len(test_loaded)} samples")
        else:
            print(f"   ❌ Failed to load one or both files")

    except Exception as e:
        print(f"   ❌ Configuration validation failed: {e}")

    # Cleanup: attempt each file independently so one failure does not leave
    # the other behind, and report failures instead of swallowing them.
    cleaned = True
    for temp_path in (train_path, test_path):
        try:
            temp_path.unlink()
        except OSError as e:
            cleaned = False
            print(f"   ⚠️  Could not delete {temp_path.name}: {e}")
    if cleaned:
        print(f"   🧹 Cleaned up test files")

# Test embeddings file configuration
def test_embeddings_config():
    """Validate a configuration that points at pre-computed embedding files.

    Writes the same three-sample mock embeddings JSON to a train and a test
    file under ``examples/``, builds a ``PipelineConfig`` that references
    them, reads both files back, and finally removes them. All outcomes are
    printed; returns None.
    """

    print(f"\n🔄 Testing embeddings files configuration:")

    # Create mock embeddings data
    mock_embeddings_data = {
        'mcids': ['C001', 'C002', 'C003'],
        'labels': [1, 0, 1],
        'embeddings': [
            [0.1, 0.2, 0.3] * 256,  # 768-dim mock embedding
            [0.4, 0.5, 0.6] * 256,
            [0.7, 0.8, 0.9] * 256
        ]
    }

    # Save mock embeddings
    train_emb_path = mgpt_eval_path / "examples" / "test_train_embeddings.json"
    test_emb_path = mgpt_eval_path / "examples" / "test_test_embeddings.json"

    with open(train_emb_path, 'w') as f:
        json.dump(mock_embeddings_data, f)

    with open(test_emb_path, 'w') as f:
        json.dump(mock_embeddings_data, f)

    print(f"   📄 Train embeddings: {train_emb_path}")
    print(f"   📄 Test embeddings: {test_emb_path}")

    # Test embeddings configuration
    embeddings_config = {
        'input': {
            'train_embeddings_path': str(train_emb_path),
            'test_embeddings_path': str(test_emb_path)
        },
        'job': {'name': 'test_embeddings', 'output_dir': 'outputs'},
        'model_api': {'base_url': 'http://localhost:8000', 'batch_size': 32, 'timeout': 300, 'max_retries': 3},
        'pipeline_stages': {'embeddings': False, 'classification': True, 'evaluation': True, 'target_word_eval': False, 'summary_report': True, 'method_comparison': False},
        'data_processing': {'random_seed': 42, 'max_sequence_length': 512, 'include_mcid': True, 'output_format': 'json'},
        'embedding_generation': {'batch_size': 16, 'save_interval': 100, 'checkpoint_dir': 'outputs/checkpoints', 'resume_from_checkpoint': True, 'tokenizer_path': '/app/tokenizer'},
        'classification': {'models': ['logistic_regression'], 'cross_validation': {'n_folds': 3, 'scoring': 'roc_auc', 'n_jobs': 1}},
        'evaluation': {'metrics': ['accuracy', 'precision', 'recall', 'f1_score'], 'visualization': {'generate_plots': False}},
        'target_word_evaluation': {'enable': False, 'target_codes': ['E119']},
        'output': {'embeddings_dir': 'outputs/embeddings', 'models_dir': 'outputs/models', 'metrics_dir': 'outputs/metrics', 'logs_dir': 'outputs/logs'},
        'logging': {'level': 'INFO', 'console_level': 'INFO', 'format': '%(asctime)s - %(levelname)s - %(message)s', 'file': 'outputs/logs/pipeline.log'}
    }

    try:
        # Constructing the model is the validation step; the object is not used further
        PipelineConfig(**embeddings_config)
        print(f"   ✅ Embeddings configuration validated")

        # Test loading embeddings
        with open(train_emb_path, 'r') as f:
            train_emb_data = json.load(f)

        with open(test_emb_path, 'r') as f:
            test_emb_data = json.load(f)

        print(f"   ✅ Embeddings loaded successfully")
        print(f"   📊 Train embeddings: {len(train_emb_data['embeddings'])} samples")
        # Report the test file as well so the second load is actually checked
        print(f"   📊 Test embeddings: {len(test_emb_data['embeddings'])} samples")
        print(f"   📊 Embedding dimension: {len(train_emb_data['embeddings'][0])}")

    except Exception as e:
        print(f"   ❌ Configuration or loading failed: {e}")

    # Cleanup: attempt each file independently so one failure does not leave
    # the other behind, and report failures instead of swallowing them.
    cleaned = True
    for temp_path in (train_emb_path, test_emb_path):
        try:
            temp_path.unlink()
        except OSError as e:
            cleaned = False
            print(f"   ⚠️  Could not delete {temp_path.name}: {e}")
    if cleaned:
        print(f"   🧹 Cleaned up test files")

# Run alternative configuration tests.
# Both functions print their own results and return nothing.
test_separate_files_config()
test_embeddings_config()

## Step 8: Performance Testing with Large Data
Test data loading performance with larger datasets.

In [None]:
# Test performance with larger datasets
def test_large_data_performance():
    """Time generation, save, load and split for synthetic datasets.

    For each size in 100, 500, 1000 and 5000 rows, builds a frame with
    alternating 0/1 labels, writes it to a temporary CSV under ``examples/``,
    reloads it through ``load_and_validate_data``, splits it 80/20 with
    ``test_train_test_split``, prints the timings, and deletes the CSV.
    Returns None.
    """
    # Local import: the notebook's import cell does not bring in `time`
    import time

    def _per_second(count, seconds):
        # A very fast step can measure as 0 seconds; avoid ZeroDivisionError
        return count / seconds if seconds > 0 else float('inf')

    print(f"\n⚡ Testing data loading performance:")

    dataset_sizes = [100, 500, 1000, 5000]

    for size in dataset_sizes:
        print(f"\n📊 Testing with {size} samples:")

        # Generate large dataset. perf_counter is monotonic and has a higher
        # resolution than time.time(), which suits short intervals.
        start_time = time.perf_counter()

        large_data = {
            'mcid': [f'CLAIM_{i:06d}' for i in range(size)],
            'claims': ['E119 I10 N6320 K9289 |eoc| Z1239 M549 76642'] * size,
            'label': [i % 2 for i in range(size)]  # Alternating 0, 1
        }

        large_df = pd.DataFrame(large_data)
        generation_time = time.perf_counter() - start_time

        # Save to file
        large_file_path = mgpt_eval_path / "examples" / f"test_large_{size}.csv"
        start_time = time.perf_counter()
        large_df.to_csv(large_file_path, index=False)
        save_time = time.perf_counter() - start_time

        # Test loading (the measured time includes the validation checks)
        start_time = time.perf_counter()
        loaded_large_df = load_and_validate_data(str(large_file_path))
        load_time = time.perf_counter() - start_time

        if loaded_large_df is not None:
            # Test splitting
            start_time = time.perf_counter()
            test_train_test_split(loaded_large_df, 0.8, 42)
            split_time = time.perf_counter() - start_time

            print(f"   ✅ Performance results:")
            print(f"      Generation: {generation_time:.3f}s")
            print(f"      Save: {save_time:.3f}s ({_per_second(size, save_time):.0f} samples/sec)")
            print(f"      Load: {load_time:.3f}s ({_per_second(size, load_time):.0f} samples/sec)")
            print(f"      Split: {split_time:.3f}s")
            print(f"      Memory: ~{large_df.memory_usage(deep=True).sum() / 1024 / 1024:.1f} MB")
        else:
            print(f"   ❌ Failed to load dataset of size {size}")

        # Cleanup stays best-effort, but failures are reported
        try:
            large_file_path.unlink()
        except OSError as e:
            print(f"   ⚠️  Could not delete {large_file_path.name}: {e}")

        # Brief pause between tests
        time.sleep(0.5)

# Run performance tests (prints timings for each dataset size; returns nothing)
test_large_data_performance()

## Step 9: Summary and Recommendations
Summarize all test results and provide recommendations.

In [None]:
# Summarize data validation test results
print("\n📋 Data Loading & Validation Test Summary:")
print("=" * 55)

# Check what tests passed
tests_passed = []
tests_failed = []

if config:
    tests_passed.append("Configuration validation")
else:
    tests_failed.append("Configuration validation")

if loaded_df is not None:
    tests_passed.append("Basic data loading")
else:
    tests_failed.append("Basic data loading")

# train_df / test_df are only bound when the split cell actually performed a
# split, so look them up defensively; an unbound name counts as a failed test
# instead of aborting the summary with a NameError.
split_succeeded = (
    globals().get('train_df') is not None
    and globals().get('test_df') is not None
)
if split_succeeded:
    tests_passed.append("Train/test splitting")
else:
    tests_failed.append("Train/test splitting")

# Display results
print(f"\n🎯 Test Results:")
for test in tests_passed:
    print(f"   ✅ PASS: {test}")

for test in tests_failed:
    print(f"   ❌ FAIL: {test}")

# Exactly three checks are recorded above, so the denominator is never zero
success_rate = len(tests_passed) / (len(tests_passed) + len(tests_failed)) * 100
print(f"\n📊 Overall Success Rate: {success_rate:.0f}%")

# Recommendations
print(f"\n💡 Recommendations:")

if success_rate == 100:
    print(f"   🎉 All data validation tests passed!")
    print(f"   ✅ Your data format is compatible with the pipeline")
    print(f"   ➡️  Next: Test API connectivity (if not done yet)")
    print(f"   ➡️  Then: Test embedding pipeline with this data")
else:
    print(f"   🔧 Fix the failed tests before proceeding to pipeline testing")

if loaded_df is not None:
    # Class shares as fractions of all rows; below 10% is flagged as imbalanced
    label_dist = loaded_df['label'].value_counts(normalize=True)
    if len(label_dist) < 2:
        print(f"   ⚠️  Warning: Only one class in data - add samples from other class")
    elif label_dist.min() < 0.1:
        print(f"   ⚠️  Warning: Imbalanced classes ({label_dist.to_dict()}) - consider rebalancing")
    else:
        print(f"   ✅ Good class balance: {label_dist.to_dict()}")

print(f"\n🔧 Configuration for embedding pipeline:")
if config:
    print(f"   input:")
    print(f"     dataset_path: \"{sample_data_path}\"")
    if hasattr(config.input, 'split_ratio'):
        print(f"     split_ratio: {config.input.split_ratio}")

    print(f"   data_processing:")
    print(f"     random_seed: {config.data_processing.random_seed}")
    print(f"     max_sequence_length: {config.data_processing.max_sequence_length}")
    print(f"     output_format: \"{config.data_processing.output_format}\"")

print(f"\n📚 Next steps:")
print(f"   1. Ensure your actual data follows the same format as the test data")
print(f"   2. Update the configuration with your actual data path")
print(f"   3. Run test_03_Embedding_Pipeline_Debug.ipynb")
print(f"   4. If issues persist, check the original CSV file manually")

# Cleanup sample data. Deletion stays best-effort, but a failure is reported
# and only OS-level errors are caught.
try:
    sample_data_path.unlink()
    print(f"\n🧹 Cleaned up sample test data")
except OSError as e:
    print(f"\n⚠️  Could not delete sample test data: {e}")

## 🔧 Debug Cell (Run if needed)
Use this cell to test specific data scenarios or debug issues found above.

In [None]:
# Debug cell - modify as needed for specific testing.
# Every example below is commented out; uncomment one to run it. The examples
# rely on names defined by earlier cells (load_and_validate_data, loaded_df,
# test_train_test_split), so run those cells first.

# Example: Load your actual data file
# actual_data_path = "/path/to/your/actual_data.csv"
# if os.path.exists(actual_data_path):
#     print(f"Testing actual data file: {actual_data_path}")
#     actual_df = load_and_validate_data(actual_data_path)
#     if actual_df is not None:
#         print(f"Actual data shape: {actual_df.shape}")
#         print(f"Actual label distribution: {actual_df['label'].value_counts()}")

# Example: Check specific data issues
# (regex=False matches the literal "|eoc|" marker; this avoids backslash escapes,
#  which are a SyntaxError inside f-string expressions before Python 3.12)
# if 'loaded_df' in locals() and loaded_df is not None:
#     print("Checking for specific issues:")
#     print(f"Claims with |eoc|: {loaded_df['claims'].str.contains('|eoc|', regex=False).sum()}")
#     print(f"Average claim length: {loaded_df['claims'].str.len().mean():.1f} characters")
#     print(f"Unique codes found: {len(set(' '.join(loaded_df['claims']).split()))}")

# Example: Test custom splitting ratios
# if 'loaded_df' in locals() and loaded_df is not None:
#     for ratio in [0.6, 0.7, 0.8, 0.9]:
#         print(f"\nTesting split ratio {ratio}:")
#         test_train_test_split(loaded_df, ratio, 42)

print("💡 Use this cell to run custom data validation tests and debug specific issues.")