# Data Validation and Exploration - New UCSF R2D2 Training Dataset

This notebook performs initial data validation and exploration of the new UCSF R2D2 training dataset.

## Dataset Information
- **Metadata File**: `/Users/abelvillcaroque/data/Audium/202504_UCSF_New_Trainig_set(R2D2)/R2D2 lung sounds metadata_TRAIN_2025.05.08_v3.csv`
- **Audio Data**: `/Users/abelvillcaroque/data/Audium/202504_UCSF_New_Trainig_set(R2D2)/R2D2_Train_Data/`
- **Expected File Count**: ~20 audio files per patient on average
- **Label Column**: `Microbiologicreferencestandard` (TB Negative, TB Positive; ignore Indeterminate)
- **Patient ID Format**: R2D2NNNNN


In [None]:
import pandas as pd
import numpy as np
import os
import glob
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

# Set up paths.
# Each location defaults to the original local layout, but can be redirected
# through an environment variable so the notebook runs on another machine
# without editing this cell.
metadata_path = os.environ.get(
    'R2D2_METADATA_PATH',
    '/Users/abelvillcaroque/data/Audium/202504_UCSF_New_Trainig_set(R2D2)/R2D2 lung sounds metadata_TRAIN_2025.05.08_v3.csv',
)
audio_data_path = os.environ.get(
    'R2D2_AUDIO_DIR',
    '/Users/abelvillcaroque/data/Audium/202504_UCSF_New_Trainig_set(R2D2)/R2D2_Train_Data',
)
output_dir = os.environ.get(
    'R2D2_OUTPUT_DIR',
    '/Users/abelvillcaroque/git/github/audiumhealth/hear/audium_notebooks/09_202504_UCSF_New_Training_set(R2D2)',
)

## 1. Load and Inspect Metadata

In [None]:
# Read the metadata CSV into a DataFrame
metadata = pd.read_csv(metadata_path)

# Overview printed one fact per line: dimensions, column names, then a
# heading for the preview table rendered below
for overview_line in (
    f"Metadata Shape: {metadata.shape}",
    f"\nColumns: {list(metadata.columns)}",
    f"\nFirst few rows:",
):
    print(overview_line)

# Last expression of the cell, so the notebook displays the leading rows
metadata.head()

In [None]:
# Check StudyID format
print("StudyID Format Analysis:")
print(f"Total patients: {len(metadata)}")
print(f"Unique StudyIDs: {metadata['StudyID'].nunique()}")
print(f"StudyID format examples: {metadata['StudyID'].head(10).tolist()}")

# Check for R2D2 pattern (literal "R2D2" followed by exactly five digits).
# na=False makes a missing StudyID count as non-matching. Without it the
# result contains NaN, which breaks the `~` negation below and is skipped by
# `.all()`, so rows lacking an ID would not be reported.
r2d2_pattern = metadata['StudyID'].str.match(r'^R2D2\d{5}$', na=False)
print(f"\nStudyIDs matching R2D2NNNNN pattern: {r2d2_pattern.sum()}/{len(metadata)}")
if not r2d2_pattern.all():
    print("Non-matching StudyIDs:")
    print(metadata.loc[~r2d2_pattern, 'StudyID'].tolist())

In [None]:
# Analyze TB Labels
print("TB Label Analysis:")
label_counts = metadata['Microbiologicreferencestandard'].value_counts()
print(f"\nLabel distribution:")
print(label_counts)

# Filter out Indeterminate as specified
valid_labels = metadata[metadata['Microbiologicreferencestandard'].isin(['TB Negative', 'TB Positive'])]
print(f"\nAfter filtering (excluding Indeterminate):")
print(f"Valid patients: {len(valid_labels)}/{len(metadata)}")
print(f"TB Positive: {(valid_labels['Microbiologicreferencestandard'] == 'TB Positive').sum()}")
print(f"TB Negative: {(valid_labels['Microbiologicreferencestandard'] == 'TB Negative').sum()}")
print(f"TB Prevalence: {(valid_labels['Microbiologicreferencestandard'] == 'TB Positive').mean():.3f}")

## 2. Audio File Validation

In [None]:
# Scan audio directory structure
print("Audio Directory Analysis:")
print(f"Audio data path: {audio_data_path}")
print(f"Directory exists: {os.path.exists(audio_data_path)}")

# Get all patient directories
patient_dirs = [d for d in os.listdir(audio_data_path) if os.path.isdir(os.path.join(audio_data_path, d))]
patient_dirs.sort()
print(f"\nTotal patient directories: {len(patient_dirs)}")
print(f"First 10 patient directories: {patient_dirs[:10]}")

# Check if all patient directories match R2D2 pattern
r2d2_dirs = [d for d in patient_dirs if d.startswith('R2D2')]
print(f"\nDirectories matching R2D2 pattern: {len(r2d2_dirs)}/{len(patient_dirs)}")

In [None]:
# Count audio files per patient
file_counts = {}
total_files = 0

for patient_dir in patient_dirs:
    patient_path = os.path.join(audio_data_path, patient_dir)
    wav_files = glob.glob(os.path.join(patient_path, '*.wav'))
    file_counts[patient_dir] = len(wav_files)
    total_files += len(wav_files)

print(f"Audio File Statistics:")
print(f"Total audio files: {total_files}")
print(f"Average files per patient: {np.mean(list(file_counts.values())):.2f}")
print(f"Median files per patient: {np.median(list(file_counts.values())):.2f}")
print(f"Min files per patient: {min(file_counts.values())}")
print(f"Max files per patient: {max(file_counts.values())}")

# Check if average is around 20 as expected
expected_avg = 20
actual_avg = np.mean(list(file_counts.values()))
print(f"\nExpected average: ~{expected_avg}")
print(f"Actual average: {actual_avg:.2f}")
print(f"Deviation from expected: {abs(actual_avg - expected_avg):.2f}")

In [None]:
# Distribution of files per patient
file_count_dist = Counter(file_counts.values())
print("\nFile count distribution:")
for count, freq in sorted(file_count_dist.items()):
    print(f"{count} files: {freq} patients")

# Visualize distribution
plt.figure(figsize=(10, 6))
plt.hist(list(file_counts.values()), bins=range(min(file_counts.values()), max(file_counts.values()) + 2), 
         alpha=0.7, edgecolor='black')
plt.axvline(x=expected_avg, color='red', linestyle='--', label=f'Expected (~{expected_avg})')
plt.axvline(x=actual_avg, color='green', linestyle='--', label=f'Actual ({actual_avg:.1f})')
plt.xlabel('Number of Audio Files per Patient')
plt.ylabel('Number of Patients')
plt.title('Distribution of Audio Files per Patient')
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig(f'{output_dir}/results/file_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

## 3. Data Alignment Validation

In [None]:
# Check alignment between metadata and audio files
metadata_patients = set(metadata['StudyID'])
audio_patients = set(patient_dirs)

print("Data Alignment Analysis:")
print(f"Patients in metadata: {len(metadata_patients)}")
print(f"Patients with audio files: {len(audio_patients)}")
print(f"Perfect match: {metadata_patients == audio_patients}")

# Check for mismatches
in_metadata_not_audio = metadata_patients - audio_patients
in_audio_not_metadata = audio_patients - metadata_patients

if in_metadata_not_audio:
    print(f"\nPatients in metadata but NO audio files ({len(in_metadata_not_audio)}):")
    print(sorted(list(in_metadata_not_audio))[:10])  # Show first 10
    
if in_audio_not_metadata:
    print(f"\nPatients with audio files but NOT in metadata ({len(in_audio_not_metadata)}):")
    print(sorted(list(in_audio_not_metadata))[:10])  # Show first 10

# Get aligned patients (intersection)
aligned_patients = metadata_patients.intersection(audio_patients)
print(f"\nPerfectly aligned patients: {len(aligned_patients)}")

## 4. Sample Audio File Inspection

In [None]:
# Inspect sample audio files
import librosa
import soundfile as sf

# Get first patient with audio files
sample_patient = patient_dirs[0]
sample_path = os.path.join(audio_data_path, sample_patient)
sample_files = glob.glob(os.path.join(sample_path, '*.wav'))

print(f"Sample Audio File Analysis (Patient: {sample_patient}):")
print(f"Number of files: {len(sample_files)}")
print(f"Sample file names: {[os.path.basename(f) for f in sample_files[:5]]}")

# Check first audio file properties
if sample_files:
    sample_file = sample_files[0]
    print(f"\nInspecting: {os.path.basename(sample_file)}")
    
    try:
        # Load audio file
        audio, sr = librosa.load(sample_file, sr=None)
        print(f"Sample rate: {sr} Hz")
        print(f"Duration: {len(audio) / sr:.2f} seconds")
        print(f"Audio shape: {audio.shape}")
        print(f"Audio dtype: {audio.dtype}")
        print(f"Audio range: [{audio.min():.6f}, {audio.max():.6f}]")
        
        # Check if it's mono or stereo
        if len(audio.shape) == 1:
            print("Audio format: Mono")
        else:
            print(f"Audio format: {audio.shape[1]} channels")
    except Exception as e:
        print(f"Error loading audio file: {e}")
        
        # Try with soundfile as backup
        try:
            audio, sr = sf.read(sample_file)
            print(f"\nLoaded with soundfile:")
            print(f"Sample rate: {sr} Hz")
            print(f"Duration: {len(audio) / sr:.2f} seconds")
            print(f"Audio shape: {audio.shape}")
        except Exception as e2:
            print(f"Also failed with soundfile: {e2}")

## 5. Create Validation Summary

In [None]:
# Create validation summary
validation_summary = {
    'metadata_patients': len(metadata_patients),
    'audio_patients': len(audio_patients),
    'aligned_patients': len(aligned_patients),
    'total_audio_files': total_files,
    'avg_files_per_patient': actual_avg,
    'expected_avg_files': expected_avg,
    'avg_deviation': abs(actual_avg - expected_avg),
    'tb_positive_patients': (valid_labels['Microbiologicreferencestandard'] == 'TB Positive').sum(),
    'tb_negative_patients': (valid_labels['Microbiologicreferencestandard'] == 'TB Negative').sum(),
    'indeterminate_patients': (metadata['Microbiologicreferencestandard'] == 'Indeterminate').sum(),
    'tb_prevalence': (valid_labels['Microbiologicreferencestandard'] == 'TB Positive').mean(),
    'r2d2_format_match': r2d2_pattern.sum(),
    'perfect_alignment': metadata_patients == audio_patients,
    'missing_audio': len(in_metadata_not_audio),
    'missing_metadata': len(in_audio_not_metadata)
}

print("\n" + "="*60)
print("VALIDATION SUMMARY")
print("="*60)
for key, value in validation_summary.items():
    print(f"{key}: {value}")

# Save validation summary
summary_df = pd.DataFrame([validation_summary])
summary_df.to_csv(f'{output_dir}/data/validation_summary.csv', index=False)
print(f"\nValidation summary saved to: {output_dir}/data/validation_summary.csv")

## 6. Sanity Check Assessment

In [None]:
# Sanity check assessment
print("\n" + "="*60)
print("SANITY CHECK ASSESSMENT")
print("="*60)

sanity_checks = [
    {
        'check': 'Average files per patient (~20)',
        'expected': 20,
        'actual': actual_avg,
        'tolerance': 5,
        'passed': abs(actual_avg - 20) <= 5
    },
    {
        'check': 'Perfect metadata-audio alignment',
        'expected': True,
        'actual': metadata_patients == audio_patients,
        'tolerance': 'N/A',
        'passed': metadata_patients == audio_patients
    },
    {
        'check': 'R2D2 format compliance',
        'expected': len(metadata),
        'actual': r2d2_pattern.sum(),
        'tolerance': 0,
        'passed': r2d2_pattern.all()
    },
    {
        'check': 'Valid TB labels available',
        'expected': '> 0',
        'actual': len(valid_labels),
        'tolerance': 'N/A',
        'passed': len(valid_labels) > 0
    },
    {
        'check': 'Minimum files per patient',
        'expected': '> 0',
        'actual': min(file_counts.values()),
        'tolerance': 'N/A',
        'passed': min(file_counts.values()) > 0
    }
]

all_passed = True
for check in sanity_checks:
    status = "✅ PASS" if check['passed'] else "❌ FAIL"
    print(f"{status} {check['check']}")
    print(f"    Expected: {check['expected']}, Actual: {check['actual']}, Tolerance: {check['tolerance']}")
    if not check['passed']:
        all_passed = False

print(f"\n{'='*60}")
if all_passed:
    print("🎉 ALL SANITY CHECKS PASSED! Data structure is valid.")
else:
    print("⚠️  SOME SANITY CHECKS FAILED! Review data structure.")
print(f"{'='*60}")

## 7. Prepare Small Dataset for Testing

In [None]:
# Create small dataset for initial testing (5-10 patients)
# Balance TB positive and negative patients
label_col = 'Microbiologicreferencestandard'

# Patient IDs per class. drop_duplicates guards against a StudyID that
# appears on more than one metadata row being selected twice.
tb_positive_patients = (
    valid_labels.loc[valid_labels[label_col] == 'TB Positive', 'StudyID']
    .drop_duplicates()
    .tolist()
)
tb_negative_patients = (
    valid_labels.loc[valid_labels[label_col] == 'TB Negative', 'StudyID']
    .drop_duplicates()
    .tolist()
)

# Select patients that have audio files
tb_positive_with_audio = [p for p in tb_positive_patients if p in audio_patients]
tb_negative_with_audio = [p for p in tb_negative_patients if p in audio_patients]

print(f"TB Positive patients with audio: {len(tb_positive_with_audio)}")
print(f"TB Negative patients with audio: {len(tb_negative_with_audio)}")

# Create small balanced test set: half of n_test_patients from each class,
# capped by how many patients of that class are available.
n_test_patients = 10
n_per_class = n_test_patients // 2
n_positive = min(n_per_class, len(tb_positive_with_audio))
n_negative = min(n_per_class, len(tb_negative_with_audio))

small_test_patients = (
    tb_positive_with_audio[:n_positive] +
    tb_negative_with_audio[:n_negative]
)

print(f"\nSmall test dataset created:")
print(f"Total patients: {len(small_test_patients)}")
print(f"TB Positive: {n_positive}")
print(f"TB Negative: {n_negative}")
print(f"Test patients: {small_test_patients}")

# Build the StudyID -> label lookup once (first row wins for a repeated ID)
# rather than re-filtering the whole frame for every selected patient.
label_by_patient = (
    valid_labels.drop_duplicates(subset='StudyID')
    .set_index('StudyID')[label_col]
)

# Save small test patient list.
# to_csv does not create missing directories, so make sure data/ exists.
small_test_df = pd.DataFrame({
    'StudyID': small_test_patients,
    'Label': [label_by_patient[p] for p in small_test_patients]
})
os.makedirs(f'{output_dir}/data', exist_ok=True)
small_test_df.to_csv(f'{output_dir}/data/small_test_patients.csv', index=False)
print(f"\nSmall test patient list saved to: {output_dir}/data/small_test_patients.csv")

## Summary

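This notebook validates the structure of the new UCSF R2D2 training dataset (metadata format, TB labels, audio file counts, and metadata-audio alignment) and prepares it for the next steps in the pipeline. The validation summary (`data/validation_summary.csv`) and the small balanced test patient list (`data/small_test_patients.csv`) are saved under the output directory for use in subsequent analysis steps.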