# Dataset Overview - Cancer Histopathology

This notebook provides an overview of the cancer histopathology dataset.

## Setup

In [None]:
import sys
sys.path.append('..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from PIL import Image

# Plot styling: seaborn white-grid theme and a larger default figure size
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 8)})

%matplotlib inline

## Load Data Splits

In [None]:
# Read the train/val/test split CSVs from the splits directory
splits_dir = Path('../data/splits')

train_df, val_df, test_df = (
    pd.read_csv(splits_dir / csv_name)
    for csv_name in ('train.csv', 'val.csv', 'test.csv')
)

# Report the size of each split and the overall total
split_sizes = {'Train': len(train_df), 'Val': len(val_df), 'Test': len(test_df)}
for split_name, n_rows in split_sizes.items():
    print(f"{split_name}: {n_rows} samples")
print(f"Total: {sum(split_sizes.values())} samples")

## Class Distribution

In [None]:
# Bar chart of label counts, one panel per split
split_frames = {'Train': train_df, 'Val': val_df, 'Test': test_df}
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

for panel, split_name in zip(axes, split_frames):
    label_counts = split_frames[split_name]['label'].value_counts()
    panel.bar(label_counts.index, label_counts.values)
    panel.set(title=f'{split_name} Set', xlabel='Class', ylabel='Count')
    # Labels are plotted at x=0 and x=1; replace the numeric ticks with class names
    panel.set_xticks([0, 1])
    panel.set_xticklabels(['Non-Cancer', 'Cancer'])

plt.tight_layout()
plt.show()

In [None]:
# Print per-split label counts and the Non-Cancer / Cancer ratio ("Balance")
print("\nClass Distribution:")

for split_name, split_df in [('Train', train_df), ('Validation', val_df), ('Test', test_df)]:
    # Count once per split instead of recomputing value_counts() for every lookup
    label_counts = split_df['label'].value_counts()

    print(f"\n{split_name}:")
    print(label_counts)

    # .get() avoids a KeyError when a class is absent from a split
    n_non_cancer = label_counts.get(0, 0)
    n_cancer = label_counts.get(1, 0)
    if n_cancer:
        print(f"Balance: {n_non_cancer / n_cancer:.2f}")
    else:
        # The ratio is undefined without any label-1 rows; say so instead of dividing by zero
        print("Balance: undefined (no Cancer samples)")

## Sample Images

In [None]:
# Display sample training images from each class (row 0: Non-Cancer, row 1: Cancer)
n_samples = 5
class_names = {0: 'Non-Cancer', 1: 'Cancer'}

# squeeze=False keeps `axes` two-dimensional even when n_samples == 1,
# so axes[class_id, idx] is always valid
fig, axes = plt.subplots(2, n_samples, figsize=(15, 6), squeeze=False)

# Hide ticks and frames up front so a panel stays clean even if its image
# is missing or fails to load
for ax in axes.ravel():
    ax.axis('off')

for class_id, class_name in class_names.items():
    class_rows = train_df[train_df['label'] == class_id]
    # DataFrame.sample raises ValueError when n exceeds the number of rows,
    # so cap the request at the class size
    class_samples = class_rows.sample(n=min(n_samples, len(class_rows)), random_state=42)

    for idx, img_path in enumerate(class_samples['image_path']):
        ax = axes[class_id, idx]
        if idx == 0:
            # Label each row on its first panel, whether or not the image loads
            ax.set_title(f"Class {class_id}\n({class_name})", fontsize=12)
        try:
            # The context manager closes the file handle; imshow runs inside
            # the block so the pixel data is read while the file is still open
            with Image.open(img_path) as img:
                ax.imshow(img)
        except Exception as e:
            # Best effort: report the failure and leave the panel blank
            print(f"Error loading {img_path}: {e}")

plt.tight_layout()
plt.show()

## Image Statistics

In [None]:
# Analyze image sizes and properties
def get_image_stats(df, n_samples=100):
    """Collect size and intensity statistics from a random sample of images.

    Args:
        df: DataFrame with an 'image_path' column of image file paths.
        n_samples: Maximum number of rows to sample (capped at ``len(df)``).
            Sampling uses a fixed ``random_state`` so results are repeatable.

    Returns:
        Tuple ``(sizes, means, stds)`` of equal-length lists with one entry
        per image that was read successfully: the image's ``size`` attribute,
        the mean over all pixel values, and their standard deviation.
        Images that cannot be read are reported and skipped.

    Raises:
        KeyError: If ``df`` has no 'image_path' column.
    """
    sample_df = df.sample(n=min(n_samples, len(df)), random_state=42)

    sizes = []
    means = []
    stds = []

    for img_path in sample_df['image_path']:
        try:
            # Context manager closes the file handle; materialise the pixels
            # while the file is still open
            with Image.open(img_path) as img:
                img_size = img.size
                img_array = np.array(img)
        except Exception as e:
            # Best effort: report and skip unreadable images rather than
            # hiding every failure behind a bare `except: pass`
            print(f"Error loading {img_path}: {e}")
            continue

        sizes.append(img_size)
        means.append(img_array.mean())
        stds.append(img_array.std())

    return sizes, means, stds

sizes, means, stds = get_image_stats(train_df)

print(f"Sample size: {len(sizes)} images")
if sizes:
    print(f"\nImage sizes (unique): {set(sizes)}")
    print(f"\nMean pixel intensity: {np.mean(means):.2f} ± {np.std(means):.2f}")
    print(f"Std pixel intensity: {np.mean(stds):.2f} ± {np.std(stds):.2f}")
else:
    # With no readable images np.mean([]) would print "nan" and emit a
    # RuntimeWarning; give an actionable message instead
    print("\nNo images could be read; check the 'image_path' column of the training split.")

In [None]:
# Histograms of the per-image mean and standard deviation of pixel values
fig, axes = plt.subplots(1, 2, figsize=(12, 4))

panel_specs = [
    (means, 'Distribution of Mean Pixel Intensity', 'Mean Intensity'),
    (stds, 'Distribution of Pixel Std Dev', 'Std Dev'),
]

for panel, (values, panel_title, x_label) in zip(axes, panel_specs):
    panel.hist(values, bins=30, edgecolor='black')
    panel.set_title(panel_title)
    panel.set_xlabel(x_label)
    panel.set_ylabel('Count')

plt.tight_layout()
plt.show()

## Color Analysis

In [None]:
# Analyze color distribution per class
def get_color_stats_by_class(df, n_samples=50):
    """Collect per-image mean R, G and B values for a sample of each class.

    Args:
        df: DataFrame with a 'label' column (classes 0 and 1 are analysed)
            and an 'image_path' column of image file paths.
        n_samples: Maximum number of images sampled per class (capped at the
            class size). Sampling uses a fixed ``random_state``.

    Returns:
        Dict ``{class_id: {'r': [...], 'g': [...], 'b': [...]}}`` holding one
        mean channel value per successfully read image. Images that cannot be
        read are reported and skipped.

    Raises:
        KeyError: If ``df`` lacks the 'label' or 'image_path' column.
    """
    stats = {class_id: {'r': [], 'g': [], 'b': []} for class_id in (0, 1)}

    for class_id, channel_means in stats.items():
        # Filter once and reuse the result for both the size cap and the sample
        class_rows = df[df['label'] == class_id]
        class_df = class_rows.sample(n=min(n_samples, len(class_rows)), random_state=42)

        for img_path in class_df['image_path']:
            try:
                # Context manager closes the file handle. Converting to RGB
                # guarantees a 3-channel array, so grayscale/palette/RGBA
                # inputs are measured instead of failing on the channel index.
                with Image.open(img_path) as img:
                    pixels = np.asarray(img.convert('RGB'))
            except Exception as e:
                # Best effort: report and skip unreadable images rather than
                # hiding every failure behind a bare `except: pass`
                print(f"Error loading {img_path}: {e}")
                continue

            for channel_idx, channel in enumerate('rgb'):
                channel_means[channel].append(pixels[:, :, channel_idx].mean())

    return stats

color_stats = get_color_stats_by_class(train_df)

# Plot: one panel per colour channel with both classes overlaid.
# Non-Cancer is drawn as a filled histogram in the channel colour and Cancer
# as a black outline, so the two classes (and their legend entries) can be
# told apart; drawing both in the same colour makes them indistinguishable.
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

for idx, (channel, color) in enumerate([('r', 'red'), ('g', 'green'), ('b', 'blue')]):
    axes[idx].hist(color_stats[0][channel], bins=20, alpha=0.5, label='Non-Cancer', color=color)
    axes[idx].hist(color_stats[1][channel], bins=20, histtype='step', linewidth=1.5,
                   label='Cancer', color='black')
    axes[idx].set_title(f'{channel.upper()} Channel Distribution')
    axes[idx].set_xlabel('Mean Intensity')
    axes[idx].set_ylabel('Count')
    axes[idx].legend()

plt.tight_layout()
plt.show()

## Summary

- The existing train/validation/test split files were loaded successfully
- The class distribution of each split was analyzed
- Image properties and per-class color distributions were examined on random samples of the training set
- Ready for model training!