In [2]:
import numpy as np
import torch
from torch.utils.data import DataLoader
from config import CONFIG
from dataset import MNISTLaplacianDataset
from torchvision import transforms

# Load the raw image/label arrays from the .npz archive named in the project config.
data = np.load(CONFIG['data_path'])
train_images = data['train_images']
train_labels = data['train_labels']
test_images = data['test_images']
test_labels = data['test_labels']

# Transform handed to the noisy dataset below (numpy array -> torch tensor).
data_transform = transforms.Compose([transforms.ToTensor()])

# Tensor copies of the raw arrays, used for the baseline norm computation.
train_images_tensor = torch.from_numpy(train_images).float()
train_labels_tensor = torch.from_numpy(train_labels).long()
test_images_tensor = torch.from_numpy(test_images).float()
test_labels_tensor = torch.from_numpy(test_labels).long()

# Baseline: one L2 norm per image of the raw (un-noised) data.
# Everything after the batch axis is flattened, so the same expression is
# correct for [N, H, W], [N, C, H, W] and already-flat [N, D] inputs.
train_norms = torch.norm(train_images_tensor.reshape(train_images_tensor.size(0), -1), dim=1)
test_norms = torch.norm(test_images_tensor.reshape(test_images_tensor.size(0), -1), dim=1)

# Largest per-image norm across train and test. Kept under its own name so the
# loop below does not overwrite it.
original_max_l2_norm = torch.max(torch.max(train_norms), torch.max(test_norms))
print(f"Maximum L2 Norm: {original_max_l2_norm.item()}")

# NOTE(review): the baseline above is computed on the raw array values, while the
# norms below are computed on samples that went through MNISTLaplacianDataset and
# data_transform. The recorded output (~6324 vs ~24.7) suggests the two are on
# different scales -- confirm before comparing them directly.

# Test various epsilon values
epsilon_values = [0.01, 0.1, 1, 10, 100, 1000, 10000]
max_l2_norms = []  # one entry per epsilon, in the order of epsilon_values

for epsilon in epsilon_values:
    # Create dataset with Laplacian noise
    train_dataset = MNISTLaplacianDataset(train_images, train_labels, epsilon_p=epsilon, transform=data_transform)

    # batch_size == len(dataset) makes the loader yield the whole dataset as a
    # single batch, so only the first batch needs to be fetched.
    train_loader = DataLoader(train_dataset, batch_size=len(train_dataset), shuffle=False)
    images_batch, _ = next(iter(train_loader))

    # Per-image L2 norm of the noisy samples, then the maximum over the dataset.
    batch_norms = torch.norm(images_batch.reshape(images_batch.size(0), -1), dim=1)
    max_l2_norm = torch.max(batch_norms).item()
    max_l2_norms.append(max_l2_norm)
    print(f"Epsilon: {epsilon}, Max L2 Norm: {max_l2_norm}")

max_l2_norms

Maximum L2 Norm: 6324.39208984375
Epsilon: 0.01, Max L2 Norm: 24.798959732055664
Epsilon: 0.1, Max L2 Norm: 24.77741813659668
Epsilon: 1, Max L2 Norm: 24.757051467895508
Epsilon: 10, Max L2 Norm: 24.77016258239746
Epsilon: 100, Max L2 Norm: 24.74211311340332
Epsilon: 1000, Max L2 Norm: 24.745718002319336
Epsilon: 10000, Max L2 Norm: 24.74977684020996


[24.798959732055664,
 24.77741813659668,
 24.757051467895508,
 24.77016258239746,
 24.74211311340332,
 24.745718002319336,
 24.74977684020996]

In [None]:
def visualize_dataset_statistics(dataloaders, image_datasets, class_names):
    """
    Perform full EDA on dataset including:
    - Class distribution
    - Batch visualization
    - Before and after transformations
    - Pixel intensity distribution

    Args:
        dataloaders: mapping whose 'train' entry yields (images, labels) batches.
        image_datasets: mapping whose 'train' entry exposes a `.samples` list.
            Each sample is read as a (path, class_index) pair
            (assumed to follow torchvision's DatasetFolder layout -- confirm).
        class_names: sequence of class names, indexed by class index.

    Returns:
        None. Four figures are shown via `plt.show()`.
    """
    # Same per-channel statistics that `transform_after` passes to Normalize below.
    # NOTE(review): the batch preview assumes the 'train' dataloader was normalised
    # with these values as well -- confirm against the dataloader's transforms.
    mean = np.array([0.485, 0.456, 0.406])
    std = np.array([0.229, 0.224, 0.225])

    def unnormalize(tensor):
        """Convert a normalised CHW tensor into an HWC array clipped to [0, 1]."""
        arr = tensor.numpy().transpose((1, 2, 0))
        return np.clip(std * arr + mean, 0, 1)

    train_samples = image_datasets['train'].samples

    # 1. Class distribution: count the samples that carry each class index.
    sample_labels = np.asarray([label for _, label in train_samples], dtype=np.int64)
    class_counts = np.bincount(sample_labels, minlength=len(class_names))
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.bar(class_names, class_counts, color='skyblue')
    ax.set_title('Class Distribution in Training Set')
    ax.set_xlabel('Class')
    ax.set_ylabel('Number of Images')
    # Rotate the existing tick labels instead of replacing them, which avoids
    # matplotlib's fixed-formatter warning from set_xticklabels.
    ax.tick_params(axis='x', labelrotation=45)
    plt.show()

    # 2. Visualizing a batch of images, one subplot per image.
    images, batch_labels = next(iter(dataloaders['train']))
    # squeeze=False keeps `axes` a 2-D array even for a batch of one image.
    fig, axes = plt.subplots(1, len(images), figsize=(12, 5), squeeze=False)
    for img, lbl, ax in zip(images, batch_labels, axes.flat):
        # Draw on this image's own axes rather than on the current pyplot axes.
        ax.imshow(unnormalize(img))
        ax.set_title(class_names[int(lbl)])
        ax.axis('off')
    plt.show()

    # 3. Before and after transformations
    transform_before = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor()
    ])

    transform_after = transforms.Compose([
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])

    # The first training sample is used as the example image.
    img_path = train_samples[0][0]
    img = datasets.folder.default_loader(img_path)

    img_before = transform_before(img)
    img_after = transform_after(img)

    fig, axes = plt.subplots(1, 2, figsize=(8, 4))
    axes[0].imshow(img_before.permute(1, 2, 0))
    axes[0].set_title("Before Augmentation")
    axes[0].axis('off')

    # Undo Normalize with the per-channel std/mean, not a single scalar pair.
    axes[1].imshow(unnormalize(img_after))
    axes[1].set_title("After Augmentation")
    axes[1].axis('off')

    plt.show()

    # 4. Pixel intensity distribution
    fig, ax = plt.subplots(1, 2, figsize=(12, 5))

    # Before Normalization
    img_flat = img_before.numpy().flatten()
    ax[0].hist(img_flat, bins=50, color='blue', alpha=0.6)
    ax[0].set_title("Pixel Intensity Before Normalization")

    # After Normalization (values are left normalised here on purpose)
    img_flat_after = img_after.numpy().flatten()
    ax[1].hist(img_flat_after, bins=50, color='red', alpha=0.6)
    ax[1].set_title("Pixel Intensity After Normalization")

    plt.show()

# Call function to visualize dataset statistics.
# NOTE(review): `dataloaders`, `image_datasets` and `class_names` are not defined in
# the cells visible here -- they must be created by an earlier cell; confirm that
# this cell still runs after Restart Kernel -> Run All.
visualize_dataset_statistics(dataloaders, image_datasets, class_names)