# UCSF Embedding Validation

This notebook validates the generated UCSF embeddings against the original CSV data.
It compares patient-wise and file-wise statistics to ensure data integrity.

In [29]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import os

# Set up plotting style
plt.style.use('default')
sns.set_palette('husl')
plt.rcParams['figure.figsize'] = (12, 6)

## Load Original CSV Data

In [30]:
# Load original CSV data
csv_path = "../r2d2_audio_index_with_labels.csv"
original_df = pd.read_csv(csv_path)

print(f"Original CSV shape: {original_df.shape}")
print(f"Columns: {list(original_df.columns)}")
print("\nFirst few rows:")
print(original_df.head())

Original CSV shape: (19798, 4)
Columns: ['patientID', 'filename', 'path', 'label']

First few rows:
   patientID                                           filename  \
0  R2D202272  Lung-LeftLateralInfraScapularPosterior-N1vSYj_...   
1  R2D202272  Lung-RightInterScapularPosterior-N1vS8Fc0_Icuz...   
2  R2D202272  Lung-RightLateralInfraScapularPosterior-N1vSeD...   
3  R2D202272  Lung-RightInfraClavicularAnterior-N1vRFB6Dy_0y...   
4  R2D202272  Lung-LeftInfraScapularPosterior-N1vSM75j9I3s6e...   

                                                path        label  
0  ../../../../../../../../../../../../data/Audiu...  TB Negative  
1  ../../../../../../../../../../../../data/Audiu...  TB Negative  
2  ../../../../../../../../../../../../data/Audiu...  TB Negative  
3  ../../../../../../../../../../../../data/Audiu...  TB Negative  
4  ../../../../../../../../../../../../data/Audiu...  TB Negative  


## Load Embeddings Data

In [31]:
def load_embeddings_with_labels(embedding_path, metadata_path, max_rows=None):
    """Load embeddings and match with labels from CSV."""
    # Load embeddings
    embeddings_data = np.load(embedding_path)
    
    # Check what keys are available in the .npz file
    print(f"Available keys in {embedding_path}: {len(list(embeddings_data.keys()))} keys")
    print(f"First 5 keys: {list(embeddings_data.keys())[:5]}")
    
    # This .npz file structure has each file as a separate key
    # Extract embeddings and keys from the structure
    keys = list(embeddings_data.keys())
    
    # Check embedding shapes first
    first_embedding = embeddings_data[keys[0]]
    print(f"First embedding shape: {first_embedding.shape}")
    
    # Load metadata
    metadata = pd.read_csv(metadata_path)
    if max_rows:
        metadata = metadata.head(max_rows)
    
    # Create full key for matching
    metadata['full_key'] = metadata['patientID'] + '/' + metadata['filename']
    
    # Create label mapping
    label_map = {"TB Positive": 1, "TB Negative": 0}
    
    # Match embeddings with labels
    matched_data = []
    unmatched_count = 0
    for key in keys:
        key_str = key.decode('utf-8') if isinstance(key, bytes) else str(key)
        matching_row = metadata[metadata['full_key'] == key_str]
        
        if len(matching_row) > 0:
            label_str = matching_row.iloc[0]['label']
            label = label_map.get(label_str, -1)
            patient_id = matching_row.iloc[0]['patientID']
            filename = matching_row.iloc[0]['filename']
            
            matched_data.append({
                'key': key_str,
                'patientID': patient_id,
                'filename': filename,
                'label': label,
                'label_str': label_str,
                'embedding': embeddings_data[key]  # Store individual embedding
            })
        else:
            unmatched_count += 1
            if unmatched_count <= 5:  # Show first 5 unmatched
                print(f"No match found for key: {key_str}")
    
    if unmatched_count > 5:
        print(f"... and {unmatched_count - 5} more unmatched keys")
    
    print(f"Successfully matched {len(matched_data)} embeddings with labels")
    return pd.DataFrame(matched_data)

# Load embeddings
embedding_path = "../01_data_processing/data/audium_UCSF_embeddings.npz"
embeddings_df = load_embeddings_with_labels(embedding_path, csv_path)

print(f"Embeddings DataFrame shape: {embeddings_df.shape}")
if len(embeddings_df) > 0:
    print(f"Embedding dimension: {embeddings_df['embedding'].iloc[0].shape}")
    print("\nFirst few rows:")
    print(embeddings_df[['key', 'patientID', 'filename', 'label_str']].head())
    
    # Check label distribution
    print("\nLabel distribution:")
    print(embeddings_df['label_str'].value_counts())
else:
    print("No embeddings were successfully matched!")

Available keys in audium_UCSF_embeddings.npz: 19484 keys
First 5 keys: ['R2D202272/Lung-LeftLateralInfraScapularPosterior-N1vSYj_jSBZ6TbbOOSE.wav', 'R2D202272/Lung-RightInterScapularPosterior-N1vS8Fc0_IcuzBHHJIO.wav', 'R2D202272/Lung-RightLateralInfraScapularPosterior-N1vSeDnv1QVQAVfVvRu.wav', 'R2D202272/Lung-RightInfraClavicularAnterior-N1vRFB6Dy_0yYnGvC8V.wav', 'R2D202272/Lung-LeftInfraScapularPosterior-N1vSM75j9I3s6eprIEO.wav']
First embedding shape: (11, 1024)
Successfully matched 19484 embeddings with labels
Embeddings DataFrame shape: (19484, 6)
Embedding dimension: (11, 1024)

First few rows:
                                                 key  patientID  \
0  R2D202272/Lung-LeftLateralInfraScapularPosteri...  R2D202272   
1  R2D202272/Lung-RightInterScapularPosterior-N1v...  R2D202272   
2  R2D202272/Lung-RightLateralInfraScapularPoster...  R2D202272   
3  R2D202272/Lung-RightInfraClavicularAnterior-N1...  R2D202272   
4  R2D202272/Lung-LeftInfraScapularPosterior-N1vS...  R2D2

## File-wise Statistics Comparison

In [32]:
def calculate_file_stats(df, label_col='label', name_prefix=''):
    """Calculate file-wise statistics."""
    stats = {}
    
    # Total files
    stats[f'{name_prefix}total_files'] = len(df)
    
    # Check if the DataFrame is empty or missing the label column
    if len(df) == 0:
        stats[f'{name_prefix}positive_files'] = 0
        stats[f'{name_prefix}negative_files'] = 0
        stats[f'{name_prefix}positive_ratio'] = 0
        stats[f'{name_prefix}negative_ratio'] = 0
        return stats
    
    if label_col not in df.columns:
        print(f"Warning: Column '{label_col}' not found in DataFrame. Available columns: {list(df.columns)}")
        stats[f'{name_prefix}positive_files'] = 0
        stats[f'{name_prefix}negative_files'] = 0
        stats[f'{name_prefix}positive_ratio'] = 0
        stats[f'{name_prefix}negative_ratio'] = 0
        return stats
    
    # Label distribution
    if label_col == 'label':
        # For embeddings (numeric labels)
        positive_files = (df[label_col] == 1).sum()
        negative_files = (df[label_col] == 0).sum()
    else:
        # For original CSV (string labels)
        positive_files = (df[label_col] == 'TB Positive').sum()
        negative_files = (df[label_col] == 'TB Negative').sum()
    
    stats[f'{name_prefix}positive_files'] = positive_files
    stats[f'{name_prefix}negative_files'] = negative_files
    stats[f'{name_prefix}positive_ratio'] = positive_files / len(df) if len(df) > 0 else 0
    stats[f'{name_prefix}negative_ratio'] = negative_files / len(df) if len(df) > 0 else 0
    
    return stats

# Calculate file-wise stats
original_file_stats = calculate_file_stats(original_df, 'label', 'original_')
embeddings_file_stats = calculate_file_stats(embeddings_df, 'label', 'embeddings_')

print("FILE-WISE STATISTICS COMPARISON")
print("=" * 50)
print(f"Original CSV total files: {original_file_stats['original_total_files']}")
print(f"Embeddings total files: {embeddings_file_stats['embeddings_total_files']}")

if original_file_stats['original_total_files'] > 0:
    print(f"Files matched: {embeddings_file_stats['embeddings_total_files'] / original_file_stats['original_total_files']:.2%}")
else:
    print("Files matched: N/A (no original files)")

print()
print("TB Positive files:")
print(f"  Original: {original_file_stats['original_positive_files']} ({original_file_stats['original_positive_ratio']:.2%})")
print(f"  Embeddings: {embeddings_file_stats['embeddings_positive_files']} ({embeddings_file_stats['embeddings_positive_ratio']:.2%})")
print()
print("TB Negative files:")
print(f"  Original: {original_file_stats['original_negative_files']} ({original_file_stats['original_negative_ratio']:.2%})")
print(f"  Embeddings: {embeddings_file_stats['embeddings_negative_files']} ({embeddings_file_stats['embeddings_negative_ratio']:.2%})")

# Debug information
print("\nDEBUG INFO:")
print(f"Embeddings DataFrame columns: {list(embeddings_df.columns) if len(embeddings_df) > 0 else 'DataFrame is empty'}")
print(f"Embeddings DataFrame shape: {embeddings_df.shape}")

FILE-WISE STATISTICS COMPARISON
Original CSV total files: 19798
Embeddings total files: 19484
Files matched: 98.41%

TB Positive files:
  Original: 0 (0.00%)
  Embeddings: 2505 (12.86%)

TB Negative files:
  Original: 0 (0.00%)
  Embeddings: 16818 (86.32%)

DEBUG INFO:
Embeddings DataFrame columns: ['key', 'patientID', 'filename', 'label', 'label_str', 'embedding']
Embeddings DataFrame shape: (19484, 6)


## Patient-wise Statistics Comparison

In [33]:
def calculate_patient_stats(df, label_col='label', name_prefix=''):
    """Calculate patient-wise statistics."""
    stats = {}
    
    # Check if DataFrame is empty
    if len(df) == 0:
        stats[f'{name_prefix}total_patients'] = 0
        stats[f'{name_prefix}positive_patients'] = 0
        stats[f'{name_prefix}negative_patients'] = 0
        stats[f'{name_prefix}positive_patient_ratio'] = 0
        stats[f'{name_prefix}negative_patient_ratio'] = 0
        stats[f'{name_prefix}avg_files_per_patient'] = 0
        stats[f'{name_prefix}min_files_per_patient'] = 0
        stats[f'{name_prefix}max_files_per_patient'] = 0
        return stats, []
    
    # Check if label column exists
    if label_col not in df.columns:
        print(f"Warning: Column '{label_col}' not found in DataFrame. Available columns: {list(df.columns)}")
        stats[f'{name_prefix}total_patients'] = 0
        stats[f'{name_prefix}positive_patients'] = 0
        stats[f'{name_prefix}negative_patients'] = 0
        stats[f'{name_prefix}positive_patient_ratio'] = 0
        stats[f'{name_prefix}negative_patient_ratio'] = 0
        stats[f'{name_prefix}avg_files_per_patient'] = 0
        stats[f'{name_prefix}min_files_per_patient'] = 0
        stats[f'{name_prefix}max_files_per_patient'] = 0
        return stats, []
    
    # Group by patient
    patient_groups = df.groupby('patientID')
    
    # Total patients
    stats[f'{name_prefix}total_patients'] = len(patient_groups)
    
    # Patient label determination (any positive file makes patient positive)
    patient_labels = []
    for patient_id, group in patient_groups:
        if label_col == 'label':
            # For embeddings (numeric labels)
            has_positive = (group[label_col] == 1).any()
        else:
            # For original CSV (string labels)
            has_positive = (group[label_col] == 'TB Positive').any()
        
        patient_labels.append(1 if has_positive else 0)
    
    positive_patients = sum(patient_labels)
    negative_patients = len(patient_labels) - positive_patients
    
    stats[f'{name_prefix}positive_patients'] = positive_patients
    stats[f'{name_prefix}negative_patients'] = negative_patients
    stats[f'{name_prefix}positive_patient_ratio'] = positive_patients / len(patient_labels) if len(patient_labels) > 0 else 0
    stats[f'{name_prefix}negative_patient_ratio'] = negative_patients / len(patient_labels) if len(patient_labels) > 0 else 0
    
    # Files per patient statistics
    files_per_patient = patient_groups.size()
    stats[f'{name_prefix}avg_files_per_patient'] = files_per_patient.mean()
    stats[f'{name_prefix}min_files_per_patient'] = files_per_patient.min()
    stats[f'{name_prefix}max_files_per_patient'] = files_per_patient.max()
    
    return stats, patient_labels

# Calculate patient-wise stats
original_patient_stats, original_patient_labels = calculate_patient_stats(original_df, 'label', 'original_')
embeddings_patient_stats, embeddings_patient_labels = calculate_patient_stats(embeddings_df, 'label', 'embeddings_')

print("PATIENT-WISE STATISTICS COMPARISON")
print("=" * 50)
print(f"Original CSV total patients: {original_patient_stats['original_total_patients']}")
print(f"Embeddings total patients: {embeddings_patient_stats['embeddings_total_patients']}")

if original_patient_stats['original_total_patients'] > 0:
    print(f"Patients matched: {embeddings_patient_stats['embeddings_total_patients'] / original_patient_stats['original_total_patients']:.2%}")
else:
    print("Patients matched: N/A (no original patients)")

print()
print("TB Positive patients:")
print(f"  Original: {original_patient_stats['original_positive_patients']} ({original_patient_stats['original_positive_patient_ratio']:.2%})")
print(f"  Embeddings: {embeddings_patient_stats['embeddings_positive_patients']} ({embeddings_patient_stats['embeddings_positive_patient_ratio']:.2%})")
print()
print("TB Negative patients:")
print(f"  Original: {original_patient_stats['original_negative_patients']} ({original_patient_stats['original_negative_patient_ratio']:.2%})")
print(f"  Embeddings: {embeddings_patient_stats['embeddings_negative_patients']} ({embeddings_patient_stats['embeddings_negative_patient_ratio']:.2%})")
print()
print("Files per patient:")
print(f"  Original - Avg: {original_patient_stats['original_avg_files_per_patient']:.1f}, Min: {original_patient_stats['original_min_files_per_patient']}, Max: {original_patient_stats['original_max_files_per_patient']}")
print(f"  Embeddings - Avg: {embeddings_patient_stats['embeddings_avg_files_per_patient']:.1f}, Min: {embeddings_patient_stats['embeddings_min_files_per_patient']}, Max: {embeddings_patient_stats['embeddings_max_files_per_patient']}")

# Debug information
print("\nDEBUG INFO:")
print(f"Embeddings DataFrame columns: {list(embeddings_df.columns) if len(embeddings_df) > 0 else 'DataFrame is empty'}")
print(f"Embeddings DataFrame shape: {embeddings_df.shape}")

PATIENT-WISE STATISTICS COMPARISON
Original CSV total patients: 551
Embeddings total patients: 550
Patients matched: 99.82%

TB Positive patients:
  Original: 0 (0.00%)
  Embeddings: 128 (23.27%)

TB Negative patients:
  Original: 551 (100.00%)
  Embeddings: 422 (76.73%)

Files per patient:
  Original - Avg: 35.9, Min: 1, Max: 8975
  Embeddings - Avg: 35.4, Min: 1, Max: 8798

DEBUG INFO:
Embeddings DataFrame columns: ['key', 'patientID', 'filename', 'label', 'label_str', 'embedding']
Embeddings DataFrame shape: (19484, 6)


## Data Integrity Checks

In [34]:
def perform_integrity_checks(original_df, embeddings_df):
    """Perform comprehensive data integrity checks."""
    print("DATA INTEGRITY CHECKS")
    print("=" * 50)
    
    # Check 1: Missing files
    original_keys = set(original_df['patientID'] + '/' + original_df['filename'])
    embedding_keys = set(embeddings_df['key']) if len(embeddings_df) > 0 else set()
    
    missing_in_embeddings = original_keys - embedding_keys
    extra_in_embeddings = embedding_keys - original_keys
    
    print(f"✓ Files in original CSV: {len(original_keys)}")
    print(f"✓ Files in embeddings: {len(embedding_keys)}")
    print(f"✓ Missing from embeddings: {len(missing_in_embeddings)}")
    print(f"✓ Extra in embeddings: {len(extra_in_embeddings)}")
    
    if missing_in_embeddings:
        print("\n⚠️  Sample missing files:")
        for i, missing_file in enumerate(list(missing_in_embeddings)[:5]):
            print(f"  {i+1}. {missing_file}")
        if len(missing_in_embeddings) > 5:
            print(f"  ... and {len(missing_in_embeddings) - 5} more")
    
    # Check 2: Label consistency (sample check)
    common_keys = original_keys & embedding_keys
    label_mismatches = 0
    
    if len(common_keys) > 0:
        sample_size = min(100, len(common_keys))
        sample_keys = list(common_keys)[:sample_size]
        
        for key in sample_keys:
            original_label = original_df[original_df['patientID'] + '/' + original_df['filename'] == key]['label'].iloc[0]
            embedding_label = embeddings_df[embeddings_df['key'] == key]['label_str'].iloc[0]
            
            if original_label != embedding_label:
                label_mismatches += 1
        
        print(f"\n✓ Label consistency check (sample of {sample_size}): {sample_size - label_mismatches}/{sample_size} matches")
        if label_mismatches > 0:
            print(f"⚠️  Label mismatches found: {label_mismatches}")
    
    # Check 3: Embedding quality
    if len(embeddings_df) > 0:
        sample_embedding = embeddings_df['embedding'].iloc[0]
        
        # Check for NaN or infinite values
        nan_count = np.isnan(sample_embedding).sum()
        inf_count = np.isinf(sample_embedding).sum()
        
        print(f"\n✓ Embedding quality:")
        print(f"  Shape: {sample_embedding.shape}")
        print(f"  Data type: {sample_embedding.dtype}")
        print(f"  Mean: {sample_embedding.mean():.4f}")
        print(f"  Std: {sample_embedding.std():.4f}")
        print(f"  Min/Max: {sample_embedding.min():.4f} / {sample_embedding.max():.4f}")
        print(f"  NaN values: {nan_count}")
        print(f"  Infinite values: {inf_count}")
    
    return {
        'missing_files': len(missing_in_embeddings),
        'extra_files': len(extra_in_embeddings),
        'label_mismatches': label_mismatches,
        'common_files': len(common_keys)
    }

integrity_results = perform_integrity_checks(original_df, embeddings_df)

DATA INTEGRITY CHECKS
✓ Files in original CSV: 19798
✓ Files in embeddings: 19484
✓ Missing from embeddings: 314
✓ Extra in embeddings: 0

⚠️  Sample missing files:
  1. R2D204247/-MlslEvUEPBdW_B6NT9O.wav
  2. R2D204247/-MlsktUJhiKqHKBEBeF6.wav
  3. R2D201001/-MzDtZ5Gl6aiB5Qf-98d.wav
  4. R2D201001/-Mj3t5IcxzF53K43T4kN.wav
  5. R2D201294/-MyuyN3pkcop4NF73sPu.wav
  ... and 309 more

✓ Label consistency check (sample of 100): 100/100 matches

✓ Embedding quality:
  Shape: (11, 1024)
  Data type: float32
  Mean: -0.0156
  Std: 1.0074
  Min/Max: -3.7064 / 3.8234
  NaN values: 0
  Infinite values: 0


## Visualization Dashboard

In [35]:
def create_validation_dashboard(original_df, embeddings_df, original_file_stats, embeddings_file_stats, 
                               original_patient_stats, embeddings_patient_stats):
    """Create a comprehensive validation dashboard."""
    
    fig, axes = plt.subplots(2, 3, figsize=(18, 12))
    fig.suptitle('UCSF Embedding Validation Dashboard', fontsize=16, fontweight='bold')
    
    # 1. File count comparison
    categories = ['Total Files', 'TB Positive', 'TB Negative']
    original_counts = [original_file_stats['original_total_files'], 
                      original_file_stats['original_positive_files'],
                      original_file_stats['original_negative_files']]
    embedding_counts = [embeddings_file_stats['embeddings_total_files'],
                       embeddings_file_stats['embeddings_positive_files'],
                       embeddings_file_stats['embeddings_negative_files']]
    
    x = np.arange(len(categories))
    width = 0.35
    
    axes[0, 0].bar(x - width/2, original_counts, width, label='Original CSV', alpha=0.8)
    axes[0, 0].bar(x + width/2, embedding_counts, width, label='Embeddings', alpha=0.8)
    axes[0, 0].set_title('File Count Comparison')
    axes[0, 0].set_xlabel('Category')
    axes[0, 0].set_ylabel('Count')
    axes[0, 0].set_xticks(x)
    axes[0, 0].set_xticklabels(categories)
    axes[0, 0].legend()
    axes[0, 0].grid(True, alpha=0.3)
    
    # 2. Patient count comparison
    patient_categories = ['Total Patients', 'TB Positive', 'TB Negative']
    original_patient_counts = [original_patient_stats['original_total_patients'],
                              original_patient_stats['original_positive_patients'],
                              original_patient_stats['original_negative_patients']]
    embedding_patient_counts = [embeddings_patient_stats['embeddings_total_patients'],
                               embeddings_patient_stats['embeddings_positive_patients'],
                               embeddings_patient_stats['embeddings_negative_patients']]
    
    axes[0, 1].bar(x - width/2, original_patient_counts, width, label='Original CSV', alpha=0.8)
    axes[0, 1].bar(x + width/2, embedding_patient_counts, width, label='Embeddings', alpha=0.8)
    axes[0, 1].set_title('Patient Count Comparison')
    axes[0, 1].set_xlabel('Category')
    axes[0, 1].set_ylabel('Count')
    axes[0, 1].set_xticks(x)
    axes[0, 1].set_xticklabels(patient_categories)
    axes[0, 1].legend()
    axes[0, 1].grid(True, alpha=0.3)
    
    # 3. Label distribution pie charts
    original_labels = ['TB Positive', 'TB Negative']
    original_sizes = [original_file_stats['original_positive_files'], 
                     original_file_stats['original_negative_files']]
    
    axes[0, 2].pie(original_sizes, labels=original_labels, autopct='%1.1f%%', startangle=90)
    axes[0, 2].set_title('Original CSV\nLabel Distribution')
    
    # 4. Files per patient distribution
    original_files_per_patient = original_df.groupby('patientID').size()
    embeddings_files_per_patient = embeddings_df.groupby('patientID').size()
    
    axes[1, 0].hist(original_files_per_patient, bins=20, alpha=0.7, label='Original CSV', density=True)
    axes[1, 0].hist(embeddings_files_per_patient, bins=20, alpha=0.7, label='Embeddings', density=True)
    axes[1, 0].set_title('Files per Patient Distribution')
    axes[1, 0].set_xlabel('Files per Patient')
    axes[1, 0].set_ylabel('Density')
    axes[1, 0].legend()
    axes[1, 0].grid(True, alpha=0.3)
    
    # 5. Embedding quality metrics
    if len(embeddings_df) > 0:
        embedding_matrix = np.stack(embeddings_df['embedding'].values)
        norms = np.linalg.norm(embedding_matrix, axis=1)
        
        axes[1, 1].hist(norms, bins=30, alpha=0.7, color='green')
        axes[1, 1].set_title('Embedding Magnitude Distribution')
        axes[1, 1].set_xlabel('L2 Norm')
        axes[1, 1].set_ylabel('Frequency')
        axes[1, 1].grid(True, alpha=0.3)
        
        # Add statistics text
        mean_norm = np.mean(norms)
        std_norm = np.std(norms)
        axes[1, 1].axvline(mean_norm, color='red', linestyle='--', label=f'Mean: {mean_norm:.3f}')
        axes[1, 1].legend()
    
    # 6. Data integrity summary
    axes[1, 2].axis('off')
    
    # Create summary text
    summary_text = f"""DATA INTEGRITY SUMMARY

File Coverage:
• Original: {original_file_stats['original_total_files']} files
• Embeddings: {embeddings_file_stats['embeddings_total_files']} files
• Coverage: {embeddings_file_stats['embeddings_total_files']/original_file_stats['original_total_files']:.1%}

Patient Coverage:
• Original: {original_patient_stats['original_total_patients']} patients
• Embeddings: {embeddings_patient_stats['embeddings_total_patients']} patients
• Coverage: {embeddings_patient_stats['embeddings_total_patients']/original_patient_stats['original_total_patients']:.1%}

Label Balance (File-wise):
• TB Positive: {original_file_stats['original_positive_ratio']:.1%} → {embeddings_file_stats['embeddings_positive_ratio']:.1%}
• TB Negative: {original_file_stats['original_negative_ratio']:.1%} → {embeddings_file_stats['embeddings_negative_ratio']:.1%}

Label Balance (Patient-wise):
• TB Positive: {original_patient_stats['original_positive_patient_ratio']:.1%} → {embeddings_patient_stats['embeddings_positive_patient_ratio']:.1%}
• TB Negative: {original_patient_stats['original_negative_patient_ratio']:.1%} → {embeddings_patient_stats['embeddings_negative_patient_ratio']:.1%}"""
    
    axes[1, 2].text(0.05, 0.95, summary_text, transform=axes[1, 2].transAxes, 
                   fontsize=10, verticalalignment='top', fontfamily='monospace',
                   bbox=dict(boxstyle='round', facecolor='lightblue', alpha=0.8))
    
    plt.tight_layout()
    plt.show()

create_validation_dashboard(original_df, embeddings_df, original_file_stats, embeddings_file_stats,
                           original_patient_stats, embeddings_patient_stats)

  x = x / sx


ValueError: cannot convert float NaN to integer

ValueError: need at least one array to concatenate

<Figure size 1800x1200 with 6 Axes>

## Final Validation Report

In [None]:
def generate_validation_report(original_df, embeddings_df, original_file_stats, embeddings_file_stats,
                              original_patient_stats, embeddings_patient_stats, integrity_results):
    """Generate a comprehensive validation report."""
    
    print("\n" + "="*80)
    print("UCSF EMBEDDING VALIDATION REPORT")
    print("="*80)
    
    print("\n📊 SUMMARY")
    print("-" * 40)
    print(f"Original CSV: {original_file_stats['original_total_files']} files from {original_patient_stats['original_total_patients']} patients")
    print(f"Generated embeddings: {embeddings_file_stats['embeddings_total_files']} files from {embeddings_patient_stats['embeddings_total_patients']} patients")
    print(f"Coverage: {embeddings_file_stats['embeddings_total_files']/original_file_stats['original_total_files']:.1%} files, {embeddings_patient_stats['embeddings_total_patients']/original_patient_stats['original_total_patients']:.1%} patients")
    
    print("\n📈 FILE-WISE STATISTICS")
    print("-" * 40)
    print(f"TB Positive files: {original_file_stats['original_positive_files']} → {embeddings_file_stats['embeddings_positive_files']} ({original_file_stats['original_positive_ratio']:.1%} → {embeddings_file_stats['embeddings_positive_ratio']:.1%})")
    print(f"TB Negative files: {original_file_stats['original_negative_files']} → {embeddings_file_stats['embeddings_negative_files']} ({original_file_stats['original_negative_ratio']:.1%} → {embeddings_file_stats['embeddings_negative_ratio']:.1%})")
    
    print("\n👥 PATIENT-WISE STATISTICS")
    print("-" * 40)
    print(f"TB Positive patients: {original_patient_stats['original_positive_patients']} → {embeddings_patient_stats['embeddings_positive_patients']} ({original_patient_stats['original_positive_patient_ratio']:.1%} → {embeddings_patient_stats['embeddings_positive_patient_ratio']:.1%})")
    print(f"TB Negative patients: {original_patient_stats['original_negative_patients']} → {embeddings_patient_stats['embeddings_negative_patients']} ({original_patient_stats['original_negative_patient_ratio']:.1%} → {embeddings_patient_stats['embeddings_negative_patient_ratio']:.1%})")
    print(f"Avg files per patient: {original_patient_stats['original_avg_files_per_patient']:.1f} → {embeddings_patient_stats['embeddings_avg_files_per_patient']:.1f}")
    
    print("\n🔍 DATA INTEGRITY")
    print("-" * 40)
    print(f"Files successfully processed: {integrity_results['common_files']}/{original_file_stats['original_total_files']} ({integrity_results['common_files']/original_file_stats['original_total_files']:.1%})")
    print(f"Missing files: {integrity_results['missing_files']}")
    print(f"Extra files: {integrity_results['extra_files']}")
    print(f"Label mismatches: {integrity_results['label_mismatches']}")
    
    if len(embeddings_df) > 0:
        embedding_matrix = np.stack(embeddings_df['embedding'].values)
        print("\n🎯 EMBEDDING QUALITY")
        print("-" * 40)
        print(f"Embedding shape: {embedding_matrix.shape}")
        print(f"Data type: {embedding_matrix.dtype}")
        print(f"Mean magnitude: {np.mean(np.linalg.norm(embedding_matrix, axis=1)):.4f}")
        print(f"Std magnitude: {np.std(np.linalg.norm(embedding_matrix, axis=1)):.4f}")
        print(f"Min/Max values: {embedding_matrix.min():.4f} / {embedding_matrix.max():.4f}")
        
        # Check for problematic values
        nan_count = np.isnan(embedding_matrix).sum()
        inf_count = np.isinf(embedding_matrix).sum()
        if nan_count > 0 or inf_count > 0:
            print(f"⚠️  Issues found: {nan_count} NaN values, {inf_count} infinite values")
        else:
            print("✅ No NaN or infinite values found")
    
    print("\n✅ VALIDATION CONCLUSION")
    print("-" * 40)
    
    # Determine overall validation status
    issues = []
    if integrity_results['missing_files'] > 0:
        issues.append(f"{integrity_results['missing_files']} missing files")
    if integrity_results['label_mismatches'] > 0:
        issues.append(f"{integrity_results['label_mismatches']} label mismatches")
    if len(embeddings_df) > 0:
        embedding_matrix = np.stack(embeddings_df['embedding'].values)
        if np.isnan(embedding_matrix).sum() > 0 or np.isinf(embedding_matrix).sum() > 0:
            issues.append("problematic embedding values")
    
    if not issues:
        print("🎉 VALIDATION PASSED: Embeddings are consistent with original data")
        print("   • All files processed successfully")
        print("   • Label distributions match")
        print("   • Embedding quality is good")
    else:
        print("⚠️  VALIDATION ISSUES FOUND:")
        for issue in issues:
            print(f"   • {issue}")
    
    print("\n" + "="*80)

# Generate final report
generate_validation_report(original_df, embeddings_df, original_file_stats, embeddings_file_stats,
                          original_patient_stats, embeddings_patient_stats, integrity_results)