# 01 - Data Exploration: Skin Disease Detection Dataset

**Phase 1 of Skin Disease Detection ML Pipeline**

This notebook explores the available datasets for comprehensive skin disease detection:
1. **ISIC 2019** - 25,331 images, 8 skin lesion categories + UNK
2. **HAM10000** - 10,015 images, 7 pigmented lesion classes

---

## Objectives
- Understand dataset structure and class distributions
- Identify class imbalance issues
- Analyze image properties (dimensions, quality)
- Visualize sample images from each class
- Document preprocessing requirements

In [None]:
# Cell 1: Import Libraries
import os
import sys
from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
from collections import Counter
import warnings
# Silence library warnings so they do not clutter the rendered notebook output
warnings.filterwarnings('ignore')

# Set style
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*'.
# Pick whichever name this installation provides so the cell also runs on
# older Matplotlib versions; styling is cosmetic, so skip it if neither exists.
_whitegrid_style = next(
    (name for name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid')
     if name in plt.style.available),
    None,
)
if _whitegrid_style is not None:
    plt.style.use(_whitegrid_style)
sns.set_palette('husl')

# Record the environment the notebook was executed with
print(f"Python: {sys.version}")
print(f"Pandas: {pd.__version__}")
print(f"NumPy: {np.__version__}")

In [None]:
# Cell 2: Define Dataset Paths
# The project root can be overridden through the SKIN_PROJECT_ROOT environment
# variable so the notebook is not tied to a single machine; the original
# location remains the default.
PROJECT_ROOT = Path(os.environ.get("SKIN_PROJECT_ROOT", r"C:\Users\Admin\Downloads\SKIN"))

# ISIC 2019 Dataset
ISIC_2019_DIR = PROJECT_ROOT / "ISIC_2019_Training_Input"
ISIC_2019_GT = ISIC_2019_DIR / "ISIC_2019_Training_GroundTruth.csv"
ISIC_2019_META = ISIC_2019_DIR / "ISIC_2019_Training_Metadata.csv"

# HAM10000 Dataset
HAM10000_DIR = PROJECT_ROOT / "skin-cancer-mnist-ham10000"
HAM10000_META = HAM10000_DIR / "HAM10000_metadata.csv"
HAM10000_IMAGES_1 = HAM10000_DIR / "HAM10000_images_part_1"
HAM10000_IMAGES_2 = HAM10000_DIR / "HAM10000_images_part_2"

# Later cells save figures and a CSV under PROJECT_ROOT / 'notebooks';
# create that folder up front so those writes cannot fail on a missing directory.
if PROJECT_ROOT.exists():
    (PROJECT_ROOT / "notebooks").mkdir(exist_ok=True)

# Verify paths exist
datasets = {
    "ISIC 2019 Images": ISIC_2019_DIR,
    "ISIC 2019 Ground Truth": ISIC_2019_GT,
    "ISIC 2019 Metadata": ISIC_2019_META,
    "HAM10000 Metadata": HAM10000_META,
    "HAM10000 Images Part 1": HAM10000_IMAGES_1,
    "HAM10000 Images Part 2": HAM10000_IMAGES_2
}

print("Dataset Availability Check:")
print("=" * 50)
for name, path in datasets.items():
    status = "✓ Found" if path.exists() else "✗ Missing"
    print(f"{name}: {status}")

---
## 1. ISIC 2019 Dataset Exploration

In [None]:
# Cell 3: Load ISIC 2019 Ground Truth
isic_gt = pd.read_csv(ISIC_2019_GT)

# Structural overview: shape and column names, then a preview of the rows
print(
    f"ISIC 2019 Ground Truth Shape: {isic_gt.shape}",
    f"\nColumns: {isic_gt.columns.tolist()}",
    "\nFirst 5 rows:",
    sep="\n",
)
isic_gt.head()

In [None]:
# Cell 4: ISIC 2019 Class Mapping
# Abbreviation used in the ground-truth columns -> human-readable diagnosis
ISIC_CLASS_NAMES = {
    'MEL': 'Melanoma',
    'NV': 'Melanocytic Nevus',
    'BCC': 'Basal Cell Carcinoma',
    'AK': 'Actinic Keratosis',
    'BKL': 'Benign Keratosis',
    'DF': 'Dermatofibroma',
    'VASC': 'Vascular Lesion',
    'SCC': 'Squamous Cell Carcinoma',
    'UNK': 'Unknown'
}

# The one-hot label columns are the mapping keys, in the same order
class_cols = list(ISIC_CLASS_NAMES)
# Collapse the one-hot columns into a single label: the column holding the row maximum
isic_gt['class'] = isic_gt[class_cols].idxmax(axis=1)
isic_gt['class_name'] = isic_gt['class'].map(ISIC_CLASS_NAMES)

print("ISIC 2019 Class Distribution:", "=" * 50, sep="\n")
class_dist = isic_gt['class'].value_counts()
for cls, count in class_dist.items():
    pct = (count / len(isic_gt)) * 100
    print("{} ({}): {:,} ({:.2f}%)".format(cls, ISIC_CLASS_NAMES[cls], count, pct))

In [None]:
# Cell 5: Visualize ISIC 2019 Class Distribution
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Bar plot
colors = sns.color_palette('husl', len(class_dist))
bars = axes[0].bar(class_dist.index, class_dist.values, color=colors)
axes[0].set_xlabel('Class', fontsize=12)
axes[0].set_ylabel('Count', fontsize=12)
axes[0].set_title('ISIC 2019 Class Distribution', fontsize=14, fontweight='bold')
axes[0].tick_params(axis='x', rotation=45)

# Add count labels on bars (offset of 100 keeps the text clear of the bar top)
for bar, count in zip(bars, class_dist.values):
    axes[0].text(bar.get_x() + bar.get_width()/2, bar.get_height() + 100,
                 f'{count:,}', ha='center', va='bottom', fontsize=9)

# Pie chart (same colour per class as the bar plot)
axes[1].pie(class_dist.values, labels=class_dist.index, autopct='%1.1f%%',
            colors=colors, startangle=90)
axes[1].set_title('ISIC 2019 Class Proportions', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.savefig(PROJECT_ROOT / 'notebooks' / 'isic_2019_class_distribution.png', dpi=150, bbox_inches='tight')
plt.show()

# Report both extremes from the data itself rather than assuming which class dominates
print(f"\n⚠️ Class Imbalance Detected!")
print(f"  Majority class ({class_dist.idxmax()}): {class_dist.max():,} samples")
print(f"  Minority class ({class_dist.idxmin()}): {class_dist.min():,} samples")
print(f"  Imbalance ratio: {class_dist.max() / class_dist.min():.1f}:1")

In [None]:
# Cell 6: Load ISIC 2019 Metadata (if available)
# The metadata file is optional: report its absence instead of failing.
if not ISIC_2019_META.exists():
    print("ISIC 2019 Metadata file not found.")
else:
    isic_meta = pd.read_csv(ISIC_2019_META)
    print(
        f"ISIC 2019 Metadata Shape: {isic_meta.shape}",
        f"\nColumns: {isic_meta.columns.tolist()}",
        "\nFirst 5 rows:",
        sep="\n",
    )
    display(isic_meta.head())

    # Per-column summary covering numeric and non-numeric columns alike
    print("\nMetadata Statistics:")
    print(isic_meta.describe(include='all'))

---
## 2. HAM10000 Dataset Exploration

In [None]:
# Cell 7: Load HAM10000 Metadata
ham_meta = pd.read_csv(HAM10000_META)

# Structural overview: shape, column names and dtypes, then a preview of the rows
print(
    f"HAM10000 Metadata Shape: {ham_meta.shape}",
    f"\nColumns: {ham_meta.columns.tolist()}",
    "\nColumn dtypes:",
    sep="\n",
)
print(ham_meta.dtypes)
print("\nFirst 5 rows:")
ham_meta.head()

In [None]:
# Cell 8: HAM10000 Class Mapping
# Code stored in the 'dx' column -> human-readable diagnosis
HAM_CLASS_NAMES = {
    'akiec': "Actinic Keratoses / Bowen's Disease",
    'bcc': 'Basal Cell Carcinoma',
    'bkl': 'Benign Keratosis-like Lesions',
    'df': 'Dermatofibroma',
    'mel': 'Melanoma',
    'nv': 'Melanocytic Nevi',
    'vasc': 'Vascular Lesions'
}

ham_meta['class_name'] = ham_meta['dx'].map(HAM_CLASS_NAMES)

print("HAM10000 Class Distribution:", "=" * 50, sep="\n")
ham_class_dist = ham_meta['dx'].value_counts()
for cls, count in ham_class_dist.items():
    pct = (count / len(ham_meta)) * 100
    print("{} ({}): {:,} ({:.2f}%)".format(cls, HAM_CLASS_NAMES[cls], count, pct))

In [None]:
# Cell 9: Visualize HAM10000 Class Distribution
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# One colour per class, shared by the bar plot and the pie chart
colors = sns.color_palette('husl', len(ham_class_dist))

# Bar plot
bars = axes[0].bar(ham_class_dist.index, ham_class_dist.values, color=colors)
axes[0].set_xlabel('Class', fontsize=12)
axes[0].set_ylabel('Count', fontsize=12)
axes[0].set_title('HAM10000 Class Distribution', fontsize=14, fontweight='bold')
axes[0].tick_params(axis='x', rotation=45)

# Count labels on bars (offset of 50 keeps the text clear of the bar top)
for bar, count in zip(bars, ham_class_dist.values):
    axes[0].text(bar.get_x() + bar.get_width()/2, bar.get_height() + 50,
                 f'{count:,}', ha='center', va='bottom', fontsize=9)

# Pie chart
axes[1].pie(ham_class_dist.values, labels=ham_class_dist.index, autopct='%1.1f%%',
            colors=colors, startangle=90)
axes[1].set_title('HAM10000 Class Proportions', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.savefig(PROJECT_ROOT / 'notebooks' / 'ham10000_class_distribution.png', dpi=150, bbox_inches='tight')
plt.show()

# Report both extremes from the data itself rather than assuming which class dominates
print(f"\n⚠️ Class Imbalance Detected!")
print(f"  Majority class ({ham_class_dist.idxmax()}): {ham_class_dist.max():,} samples")
print(f"  Minority class ({ham_class_dist.idxmin()}): {ham_class_dist.min():,} samples")
print(f"  Imbalance ratio: {ham_class_dist.max() / ham_class_dist.min():.1f}:1")

In [None]:
# Cell 10: HAM10000 Demographic Analysis
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Age distribution (missing ages are skipped by both hist() and mean())
mean_age = ham_meta['age'].mean()
ham_meta['age'].hist(bins=30, ax=axes[0, 0], color='steelblue', edgecolor='black')
axes[0, 0].set_xlabel('Age', fontsize=12)
axes[0, 0].set_ylabel('Count', fontsize=12)
axes[0, 0].set_title('Age Distribution', fontsize=14, fontweight='bold')
axes[0, 0].axvline(mean_age, color='red', linestyle='--', label=f"Mean: {mean_age:.1f}")
axes[0, 0].legend()

# Sex distribution
sex_dist = ham_meta['sex'].value_counts()
# pie() cycles through the colour list, so with only two colours a third
# category would share a colour with the first; extend the list to match.
sex_colors = ['#3498db', '#e74c3c']
if len(sex_dist) > len(sex_colors):
    sex_colors = sex_colors + list(sns.color_palette('Set2', len(sex_dist) - len(sex_colors)))
axes[0, 1].pie(sex_dist.values, labels=sex_dist.index, autopct='%1.1f%%',
               colors=sex_colors[:len(sex_dist)], startangle=90)
axes[0, 1].set_title('Sex Distribution', fontsize=14, fontweight='bold')

# Localization distribution (ten most frequent body sites)
loc_dist = ham_meta['localization'].value_counts().head(10)
axes[1, 0].barh(loc_dist.index, loc_dist.values, color='teal')
axes[1, 0].set_xlabel('Count', fontsize=12)
axes[1, 0].set_ylabel('Body Location', fontsize=12)
axes[1, 0].set_title('Top 10 Lesion Localizations', fontsize=14, fontweight='bold')

# Diagnosis type
dx_type_dist = ham_meta['dx_type'].value_counts()
axes[1, 1].bar(dx_type_dist.index, dx_type_dist.values, color='coral')
axes[1, 1].set_xlabel('Diagnosis Type', fontsize=12)
axes[1, 1].set_ylabel('Count', fontsize=12)
axes[1, 1].set_title('Diagnosis Method', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.savefig(PROJECT_ROOT / 'notebooks' / 'ham10000_demographics.png', dpi=150, bbox_inches='tight')
plt.show()

print("\nDemographic Statistics:")
print(f"  Age range: {ham_meta['age'].min():.0f} - {ham_meta['age'].max():.0f} years")
print(f"  Mean age: {mean_age:.1f} years")
print(f"  Missing age: {ham_meta['age'].isna().sum()} records")

---
## 3. Image Properties Analysis

In [None]:
# Cell 11: Analyze ISIC 2019 Image Properties
def analyze_images(image_dir, sample_size=500):
    """Collect size statistics for the ``*.jpg`` files directly inside a directory.

    Args:
        image_dir: Directory to scan (non-recursive).
        sample_size: Maximum number of files to inspect. When the directory
            holds more, a fixed-seed random sample of this size is inspected.

    Returns:
        dict with the keys
            'total': number of files inspected (after sampling),
            'widths' / 'heights': pixel dimensions of the readable images,
            'file_sizes': sizes in KB of the readable images,
            'corrupt': names of files that could not be read,
            'unique_dimensions': number of distinct (width, height) pairs.
        The three measurement lists are index-aligned with each other.
    """
    # Sort first: glob order is filesystem-dependent, which would make the
    # seeded sample differ between machines.
    image_files = sorted(Path(image_dir).glob('*.jpg'))

    # Sample if too many images
    if len(image_files) > sample_size:
        # A private generator with the fixed seed keeps the sample repeatable
        # without reseeding NumPy's global random state as a side effect.
        rng = np.random.RandomState(42)
        chosen = rng.choice(len(image_files), sample_size, replace=False)
        image_files = [image_files[idx] for idx in chosen]

    widths, heights, file_sizes = [], [], []
    corrupt_images = []

    for img_path in image_files:
        try:
            with Image.open(img_path) as img:
                width, height = img.width, img.height
            size_kb = os.path.getsize(img_path) / 1024  # KB
        except Exception:
            # Best-effort scan: an unreadable file is reported, not fatal
            corrupt_images.append(img_path.name)
            continue
        # Append only once every value is known so the lists stay aligned
        widths.append(width)
        heights.append(height)
        file_sizes.append(size_kb)

    return {
        'total': len(image_files),
        'widths': widths,
        'heights': heights,
        'file_sizes': file_sizes,
        'corrupt': corrupt_images,
        'unique_dimensions': len(set(zip(widths, heights)))
    }

print("Analyzing ISIC 2019 images (sample of 500)...")
isic_stats = analyze_images(ISIC_2019_DIR, sample_size=500)

print(f"\nISIC 2019 Image Statistics:")
print(f"  Sampled: {isic_stats['total']} images")
if isic_stats['widths']:
    print(f"  Width range: {min(isic_stats['widths'])} - {max(isic_stats['widths'])} px")
    print(f"  Height range: {min(isic_stats['heights'])} - {max(isic_stats['heights'])} px")
    print(f"  Mean dimensions: {np.mean(isic_stats['widths']):.0f} x {np.mean(isic_stats['heights']):.0f} px")
    print(f"  Unique dimensions: {isic_stats['unique_dimensions']}")
    print(f"  File size range: {min(isic_stats['file_sizes']):.1f} - {max(isic_stats['file_sizes']):.1f} KB")
    print(f"  Mean file size: {np.mean(isic_stats['file_sizes']):.1f} KB")
else:
    # min()/max() raise on empty lists, so report the situation instead
    print(f"  No readable images found in {ISIC_2019_DIR}")
print(f"  Corrupt images: {len(isic_stats['corrupt'])}")

In [None]:
# Cell 12: Analyze HAM10000 Image Properties
def get_ham_images():
    """Return the ``*.jpg`` paths from both HAM10000 image folders.

    A folder that does not exist is skipped, so the result may be empty.
    """
    return [
        jpg_path
        for part_dir in (HAM10000_IMAGES_1, HAM10000_IMAGES_2)
        if part_dir.exists()
        for jpg_path in part_dir.glob('*.jpg')
    ]

ham_images = get_ham_images()
print(f"Total HAM10000 images found: {len(ham_images)}")

print("\nAnalyzing HAM10000 images (sample of 500)...")
if len(ham_images) > 0:
    # Analyze from combined list (fixed seed keeps the sample repeatable)
    np.random.seed(42)
    sample_images = list(np.random.choice(ham_images, min(500, len(ham_images)), replace=False))

    widths, heights, file_sizes = [], [], []
    ham_corrupt = []
    for img_path in sample_images:
        try:
            with Image.open(img_path) as img:
                width, height = img.width, img.height
            size_kb = os.path.getsize(img_path) / 1024
        except Exception:
            # Record unreadable files instead of dropping them silently
            ham_corrupt.append(img_path.name)
            continue
        # Append only once every value is known so the lists stay aligned
        widths.append(width)
        heights.append(height)
        file_sizes.append(size_kb)

    print(f"\nHAM10000 Image Statistics:")
    print(f"  Sampled: {len(widths)} images")
    if widths:
        print(f"  Width range: {min(widths)} - {max(widths)} px")
        print(f"  Height range: {min(heights)} - {max(heights)} px")
        print(f"  Mean dimensions: {np.mean(widths):.0f} x {np.mean(heights):.0f} px")
        print(f"  Unique dimensions: {len(set(zip(widths, heights)))}")
        print(f"  File size range: {min(file_sizes):.1f} - {max(file_sizes):.1f} KB")
        print(f"  Mean file size: {np.mean(file_sizes):.1f} KB")
    else:
        # min()/max() raise on empty lists, so report the situation instead
        print("  No readable images in the sample")
    print(f"  Corrupt images: {len(ham_corrupt)}")
else:
    print("No HAM10000 images found - check the HAM10000 image folder paths.")

In [None]:
# Cell 13: Visualize Image Dimension Distributions
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
dims_ax, sizes_ax = axes

# ISIC 2019: width vs. height of every sampled image
dims_ax.scatter(isic_stats['widths'], isic_stats['heights'], alpha=0.5, s=10)
dims_ax.set_xlabel('Width (px)', fontsize=12)
dims_ax.set_ylabel('Height (px)', fontsize=12)
dims_ax.set_title('ISIC 2019 Image Dimensions', fontsize=14, fontweight='bold')
# Dashed reference lines mark the 224 px target; only the horizontal one carries the legend entry
dims_ax.axhline(y=224, color='r', linestyle='--', alpha=0.7, label='Target: 224px')
dims_ax.axvline(x=224, color='r', linestyle='--', alpha=0.7)
dims_ax.legend()

# File size histogram
sizes_ax.hist(isic_stats['file_sizes'], bins=50, color='steelblue', edgecolor='black')
sizes_ax.set_xlabel('File Size (KB)', fontsize=12)
sizes_ax.set_ylabel('Count', fontsize=12)
sizes_ax.set_title('ISIC 2019 File Size Distribution', fontsize=14, fontweight='bold')

plt.tight_layout()
dimensions_png = PROJECT_ROOT / 'notebooks' / 'image_dimensions_analysis.png'
plt.savefig(dimensions_png, dpi=150, bbox_inches='tight')
plt.show()

---
## 4. Sample Image Visualization

In [None]:
# Cell 14: Display ISIC 2019 Sample Images per Class
def display_samples_per_class(image_dir, class_df, class_col, id_col, class_names, samples_per_class=3):
    """Plot a grid of example images with one row per class.

    Args:
        image_dir: Directory containing ``<image id>.jpg`` files.
        class_df: DataFrame holding one row per image.
        class_col: Name of the column with the class label.
        id_col: Name of the column with the image id (file name without extension).
        class_names: Mapping from class label to a descriptive name; labels
            missing from the mapping are shown as-is.
        samples_per_class: Number of images (grid columns) shown per class.

    Returns:
        The matplotlib Figure containing the grid.
    """
    classes = sorted(class_df[class_col].unique())
    n_classes = len(classes)

    # squeeze=False keeps `axes` two-dimensional even for a single class or a
    # single sample column, so axes[i, j] indexing always works.
    fig, axes = plt.subplots(n_classes, samples_per_class,
                             figsize=(4*samples_per_class, 4*n_classes), squeeze=False)

    for i, cls in enumerate(classes):
        class_samples = class_df[class_df[class_col] == cls][id_col].head(samples_per_class).tolist()

        # Walk every column so cells without a sample are cleaned up as well
        for j in range(samples_per_class):
            ax = axes[i, j]

            if j < len(class_samples):
                img_path = Path(image_dir) / f"{class_samples[j]}.jpg"
                if img_path.exists():
                    # imshow converts the image to an array immediately, so the
                    # file handle can be released right after the call
                    with Image.open(img_path) as img:
                        ax.imshow(img)
                else:
                    ax.text(0.5, 0.5, 'Not Found', ha='center', va='center')

            # Hide ticks and frame but keep the axis switched on:
            # axis('off') would also hide the row label set below.
            ax.set_xticks([])
            ax.set_yticks([])
            for spine in ax.spines.values():
                spine.set_visible(False)

            if j == 0:
                label = class_names.get(cls, cls)
                ax.set_ylabel(f"{cls}\n({label})", fontsize=10, rotation=0, labelpad=60, ha='right')
            if i == 0:
                ax.set_title(f'Sample {j+1}', fontsize=10)

    plt.suptitle('Sample Images per Class', fontsize=16, fontweight='bold', y=1.02)
    plt.tight_layout()
    return fig

print("Displaying ISIC 2019 sample images...")
# Three examples per ISIC class, labelled through the abbreviation -> name mapping
fig = display_samples_per_class(
    image_dir=ISIC_2019_DIR,
    class_df=isic_gt,
    class_col='class',
    id_col='image',
    class_names=ISIC_CLASS_NAMES,
    samples_per_class=3,
)
isic_samples_png = PROJECT_ROOT / 'notebooks' / 'isic_2019_samples.png'
plt.savefig(isic_samples_png, dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Cell 15: Display HAM10000 Sample Images per Class
def display_ham_samples(meta_df, images_dirs, samples_per_class=3):
    """Plot a grid of HAM10000 example images with one row per diagnosis.

    Args:
        meta_df: Metadata DataFrame with 'dx' (class code) and 'image_id' columns.
        images_dirs: Directories searched in order for ``<image_id>.jpg``.
        samples_per_class: Number of images (grid columns) shown per class.

    Returns:
        The matplotlib Figure containing the grid.
    """
    classes = sorted(meta_df['dx'].unique())
    n_classes = len(classes)

    # squeeze=False keeps `axes` two-dimensional even for a single class or a
    # single sample column, so axes[i, j] indexing always works.
    fig, axes = plt.subplots(n_classes, samples_per_class,
                             figsize=(4*samples_per_class, 4*n_classes), squeeze=False)

    for i, cls in enumerate(classes):
        class_samples = meta_df[meta_df['dx'] == cls]['image_id'].head(samples_per_class).tolist()

        # Walk every column so cells without a sample are cleaned up as well
        for j in range(samples_per_class):
            ax = axes[i, j]

            if j < len(class_samples):
                # The images are split across several folders: take the first match
                candidates = (Path(img_dir) / f"{class_samples[j]}.jpg" for img_dir in images_dirs)
                img_path = next((path for path in candidates if path.exists()), None)

                if img_path is not None:
                    # imshow converts the image to an array immediately, so the
                    # file handle can be released right after the call
                    with Image.open(img_path) as img:
                        ax.imshow(img)
                else:
                    ax.text(0.5, 0.5, 'Not Found', ha='center', va='center')

            # Hide ticks and frame but keep the axis switched on:
            # axis('off') would also hide the row label set below.
            ax.set_xticks([])
            ax.set_yticks([])
            for spine in ax.spines.values():
                spine.set_visible(False)

            if j == 0:
                # Unknown codes are shown as-is; the ellipsis marks real truncation only
                full_name = HAM_CLASS_NAMES.get(cls, str(cls))
                short_name = full_name if len(full_name) <= 20 else f"{full_name[:20]}..."
                ax.set_ylabel(f"{cls}\n({short_name})",
                              fontsize=9, rotation=0, labelpad=70, ha='right')
            if i == 0:
                ax.set_title(f'Sample {j+1}', fontsize=10)

    plt.suptitle('HAM10000 Sample Images per Class', fontsize=16, fontweight='bold', y=1.02)
    plt.tight_layout()
    return fig

print("Displaying HAM10000 sample images...")
# The HAM10000 images are split across two folders; both are searched
ham_image_dirs = [HAM10000_IMAGES_1, HAM10000_IMAGES_2]
fig = display_ham_samples(ham_meta, ham_image_dirs, samples_per_class=3)
ham_samples_png = PROJECT_ROOT / 'notebooks' / 'ham10000_samples.png'
plt.savefig(ham_samples_png, dpi=150, bbox_inches='tight')
plt.show()

---
## 5. Dataset Overlap Analysis

In [None]:
# Cell 16: Check for Overlapping Classes
print("Class Mapping Comparison:")
print("=" * 60)
print(f"\n{'ISIC 2019':<30} {'HAM10000':<30}")
print("-" * 60)

# Map similar classes (ISIC 2019 abbreviation -> HAM10000 code, None when there is no counterpart)
class_mapping = {
    'MEL': 'mel',      # Melanoma
    'NV': 'nv',        # Melanocytic Nevus
    'BCC': 'bcc',      # Basal Cell Carcinoma
    'AK': 'akiec',     # Actinic Keratosis
    'BKL': 'bkl',      # Benign Keratosis
    'DF': 'df',        # Dermatofibroma
    'VASC': 'vasc',    # Vascular Lesion
    'SCC': None,       # No HAM10000 equivalent
    'UNK': None        # No HAM10000 equivalent
}


def _shorten(text, limit=20):
    """Truncate ``text`` to ``limit`` characters, adding '...' only when something was cut."""
    return text if len(text) <= limit else f"{text[:limit]}..."


for isic_cls, ham_cls in class_mapping.items():
    isic_name = ISIC_CLASS_NAMES.get(isic_cls, isic_cls)
    ham_name = HAM_CLASS_NAMES.get(ham_cls, 'N/A') if ham_cls else 'N/A'
    print(f"{isic_cls} ({_shorten(isic_name)})  <->  {ham_cls or 'N/A'} ({_shorten(ham_name)})")

# Derive the summary from the mapping so it cannot drift out of sync with it
shared_classes = [isic_cls for isic_cls, ham_cls in class_mapping.items() if ham_cls is not None]
isic_only_classes = [isic_cls for isic_cls, ham_cls in class_mapping.items() if ham_cls is None]

print("\n" + "=" * 60)
print(f"\nOverlapping classes: {len(shared_classes)}")
print(f"ISIC-only classes: {', '.join(isic_only_classes)}")
print("\nNote: Classes can be combined for unified training.")

In [None]:
# Cell 17: Combined Dataset Statistics
print("Combined Dataset Overview:", "=" * 60, sep="\n")

isic_count, ham_count = len(isic_gt), len(ham_meta)
total = isic_count + ham_count

print(f"\nISIC 2019:  {isic_count:,} images")
print(f"HAM10000:   {ham_count:,} images")
print("=" * 30)
print(f"Total:      {total:,} images (before deduplication)")

# Note about potential duplicates
for note_line in (
    "\n⚠️ Note: HAM10000 is a subset of ISIC Archive.",
    "   Some images may overlap between datasets.",
    "   Deduplication required during preprocessing.",
):
    print(note_line)

# Check for overlapping image IDs
# The '_downsampled' marker is removed from ISIC ids before comparing them with HAM10000 ids
isic_ids = set(isic_gt['image'].str.replace('_downsampled', ''))
ham_ids = set(ham_meta['image_id'])
overlap = isic_ids & ham_ids

print(f"\nOverlapping image IDs: {len(overlap):,}")
print(f"Unique images after deduplication: ~{total - len(overlap):,}")

---
## 6. Summary & Recommendations

In [None]:
# Cell 18: Data Exploration Summary
# Static summary banner printed at the end of the exploration.
# NOTE: every figure in this text is a hard-coded literal; nothing is computed
# from the DataFrames loaded above, so update the text by hand if the data changes.
summary = """
╔══════════════════════════════════════════════════════════════════╗
║              SKIN DISEASE DETECTION - DATA SUMMARY              ║
╠══════════════════════════════════════════════════════════════════╣
║                                                                  ║
║  DATASETS ANALYZED:                                              ║
║  ──────────────────                                              ║
║  1. ISIC 2019: 25,331 dermoscopic images, 9 classes              ║
║  2. HAM10000:  10,015 dermoscopic images, 7 classes              ║
║                                                                  ║
║  KEY FINDINGS:                                                   ║
║  ─────────────                                                   ║
║  ✓ Both datasets have significant class imbalance               ║
║    - Nevus (NV/nv) dominates both datasets (~50-67%)            ║
║    - Minority classes: DF, VASC, SCC (<5% each)                 ║
║                                                                  ║
║  ✓ Image properties vary considerably                           ║
║    - ISIC 2019: Variable dimensions (many > 1000px)             ║
║    - HAM10000: Standardized 600x450 px                          ║
║                                                                  ║
║  ✓ Dataset overlap detected                                      ║
║    - HAM10000 is derived from ISIC Archive                      ║
║    - Deduplication required                                     ║
║                                                                  ║
║  PREPROCESSING REQUIREMENTS:                                     ║
║  ───────────────────────────                                     ║
║  1. Resize all images to 224x224 (EfficientNet input)           ║
║  2. Handle class imbalance:                                     ║
║     - Weighted loss function                                    ║
║     - Data augmentation for minority classes                    ║
║     - Consider SMOTE or oversampling                            ║
║  3. Deduplicate overlapping images                              ║
║  4. Stratified train/val/test split                             ║
║  5. Normalize pixel values (ImageNet stats)                     ║
║                                                                  ║
║  NEXT STEPS:                                                     ║
║  ───────────                                                     ║
║  → Phase 2: Data Preprocessing Pipeline                         ║
║  → Create unified dataset loader                                ║
║  → Implement augmentation strategies                            ║
║                                                                  ║
╚══════════════════════════════════════════════════════════════════╝
"""
print(summary)

In [None]:
# Cell 19: Save Exploration Results
# Create a summary dataframe (first entry: ISIC 2019, second entry: HAM10000).
# Majority/minority figures come from the class distributions computed earlier,
# and the class counts from the name mappings, rather than from hard-coded values.
exploration_results = {
    'Dataset': ['ISIC 2019', 'HAM10000'],
    'Total Images': [len(isic_gt), len(ham_meta)],
    'Classes': [len(ISIC_CLASS_NAMES), len(HAM_CLASS_NAMES)],
    'Majority Class': [class_dist.idxmax(), ham_class_dist.idxmax()],
    'Majority Count': [class_dist.max(), ham_class_dist.max()],
    'Minority Class': [class_dist.idxmin(), ham_class_dist.idxmin()],
    'Minority Count': [class_dist.min(), ham_class_dist.min()],
    'Imbalance Ratio': [f"{class_dist.max() / class_dist.min():.1f}:1",
                        f"{ham_class_dist.max() / ham_class_dist.min():.1f}:1"]
}

results_df = pd.DataFrame(exploration_results)
results_df.to_csv(PROJECT_ROOT / 'notebooks' / 'exploration_summary.csv', index=False)

print("Exploration Summary:")
display(results_df)

print(f"\n✓ Results saved to notebooks/exploration_summary.csv")
print(f"✓ Visualizations saved to notebooks/")

---
## Appendix: Class Imbalance Handling Strategies

Given the severe class imbalance observed above, the following strategies are recommended:

### 1. Weighted Loss Function
```python
# Calculate class weights inversely proportional to frequency
class_weights = 1 / class_counts
class_weights = class_weights / class_weights.sum() * num_classes
```

### 2. Oversampling Minority Classes
- RandomOverSampler
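- SMOTE (interpolates between samples in feature space, so apply it to extracted feature vectors or embeddings rather than to raw image pixels)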

### 3. Data Augmentation Focus
- Apply heavier augmentation to minority classes
- Use MixUp / CutMix

### 4. Focal Loss
- Down-weight easy (majority) examples
- Focus learning on hard (minority) examples

### 5. Stratified Sampling
- Ensure each batch contains samples from all classes
- Use weighted random sampling in DataLoader