# Dataset Overview and Exploratory Data Analysis

This notebook provides an overview of the cancer histopathology dataset and performs exploratory data analysis.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Render matplotlib figures inline in the notebook's cell output.
%matplotlib inline
# Apply seaborn's 'whitegrid' style to the plots drawn below.
sns.set_style('whitegrid')

## Load Dataset Splits

In [None]:
# Read the three split CSVs in train / val / test order and bind each
# resulting DataFrame to the name the later cells refer to.
train_df, val_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/splits/train.csv',
        '../data/splits/val.csv',
        '../data/splits/test.csv',
    )
)

# Report the number of rows in each split, one line per split.
print(
    f"Train: {len(train_df)} images",
    f"Val: {len(val_df)} images",
    f"Test: {len(test_df)} images",
    sep="\n",
)

## Class Distribution

In [None]:
# One bar chart of label frequencies per split, drawn side by side.
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

for ax, (name, df) in zip(axes, [('Train', train_df), ('Val', val_df), ('Test', test_df)]):
    # value_counts() orders its result by descending frequency, so each split
    # could place the classes in a different x-position. Sorting by label keeps
    # the bar order the same in all three panels, which makes them comparable.
    class_counts = df['label'].value_counts().sort_index()
    class_counts.plot(kind='bar', ax=ax)
    ax.set_title(f'{name} Class Distribution')
    ax.set_xlabel('Class')
    ax.set_ylabel('Count')

plt.tight_layout()
plt.show()

## Sample Images

In [None]:
import cv2

# Fixed seed so Restart & Run All always shows the same sample grid.
SAMPLE_SEED = 42

fig, axes = plt.subplots(2, 5, figsize=(15, 6))
axes = axes.flatten()

# Draw every row for the grid in one call. Sampling without replacement means
# no image can appear twice, and capping n at len(train_df) keeps the cell
# working when the split holds fewer rows than there are panels.
n_samples = min(len(axes), len(train_df))
sample_rows = train_df.sample(n=n_samples, random_state=SAMPLE_SEED)

# Hide the axes decorations on every panel up front, so panels that end up
# unused (small split) stay blank instead of showing empty axes.
for ax in axes:
    ax.axis('off')

for ax, image_path, label in zip(axes, sample_rows['image_path'], sample_rows['label']):
    ax.set_title(f"Class {label}")

    # NOTE(review): image_path is passed to OpenCV as stored in the CSV; if the
    # paths are relative they resolve against the notebook's working directory.
    img = cv2.imread(str(image_path))
    if img is None:
        # cv2.imread signals a missing/unreadable file by returning None rather
        # than raising; mark the panel instead of crashing inside cvtColor.
        ax.text(0.5, 0.5, 'image not found', ha='center', va='center',
                transform=ax.transAxes)
        continue

    # OpenCV loads pixels in BGR channel order; matplotlib expects RGB.
    ax.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))

plt.tight_layout()
plt.show()