# 🌟Initializing Phase

In this section, we fix the random seeds, set the hyperparameters, and load the training and test data.

### Seed Control

In [None]:
import random
import numpy as np
import torch

# One seed for every random number generator used in this notebook.
SEED = 9999
for _seed_rng in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
    _seed_rng(SEED)

# Request deterministic cuDNN kernels.
torch.backends.cudnn.deterministic = True

# Draw one value from each generator as a quick check that seeding took effect.
print("Random sample (Python):", random.random())
print("Random sample (NumPy):", np.random.rand())
print("Random sample (PyTorch):", torch.rand(1).item())

Random sample (Python): 0.8347577610922152
Random sample (NumPy): 0.8233890742543671
Random sample (PyTorch): 0.7876027822494507


### Set Hyper Parameters



In [None]:
# Training recipe (values taken from the paper).
EPOCHS, BATCH_SIZE = 200, 128

# SGD settings.
LEARNING_RATE = 0.05
MOMENTUM = 0.9
WEIGHT_DECAY = 5e-4

# Step schedule: the learning rate is multiplied by LR_GAMMA ("decayed by 0.1")
# once training reaches each epoch listed in LR_MILESTONES.
LR_GAMMA = 0.1
LR_MILESTONES = [100, 150]

### Load CIFAR-100 Data

In [None]:
# Calculate the mean and std for normalization
import torchvision.datasets as datasets
import torchvision.transforms as transforms
import torch.utils.data as data
from tqdm.notebook import tqdm

# Statistics-only pipeline: resize and convert to tensor, with no augmentation
# and no normalization, so the raw pixel statistics can be measured.
calc_transform = transforms.Compose(
    [transforms.Resize((224, 224)), transforms.ToTensor()]
)

# CIFAR-100 training split used for the mean / std computation.
calc_dataset = datasets.CIFAR100(
    root='./data',
    train=True,
    download=True,
    transform=calc_transform,
)

# Order is irrelevant for summary statistics, so no shuffling.
calc_loader = data.DataLoader(calc_dataset, shuffle=False, batch_size=BATCH_SIZE)

# This takes about 2~3 minutes
def get_mean_std(loader):
    """Compute the per-channel mean and standard deviation of all pixels in ``loader``.

    Sums and squared sums are accumulated in float64 so that precision is not
    lost as the running totals grow.

    Args:
        loader: Iterable yielding ``(images, labels)`` batches, where ``images``
            is a tensor laid out as ``[batch, channels, height, width]``.
            The labels are ignored.

    Returns:
        Tuple ``(mean, std)`` of float64 tensors with one entry per channel.

    Raises:
        ValueError: If ``loader`` yields no pixels.
    """
    channels_sum = None
    channels_sqr_sum = None
    total_pixels = 0

    for images, _ in tqdm(loader):
        # Cast the batch to float64 before any arithmetic.
        images_dbl = images.to(dtype=torch.float64)

        if channels_sum is None:
            # Size the accumulators from the first batch instead of assuming RGB.
            n_channels = images_dbl.size(1)
            channels_sum = torch.zeros(
                n_channels, dtype=torch.float64, device=images_dbl.device)
            channels_sqr_sum = torch.zeros(
                n_channels, dtype=torch.float64, device=images_dbl.device)

        # Reduce over batch (0), height (2) and width (3); keep the channel axis.
        channels_sum += torch.sum(images_dbl, dim=[0, 2, 3])
        channels_sqr_sum += torch.sum(images_dbl ** 2, dim=[0, 2, 3])

        # Pixels per channel contributed by this batch (batch * height * width).
        total_pixels += images.size(0) * images.size(2) * images.size(3)

    if total_pixels == 0:
        raise ValueError("get_mean_std() received a loader that yields no pixels")

    mean = channels_sum / total_pixels

    # Var = E[x^2] - (E[x])^2. Rounding can push this marginally below zero for
    # (near-)constant channels, so clamp before the square root to avoid NaN.
    variance = channels_sqr_sum / total_pixels - mean ** 2
    std = torch.sqrt(torch.clamp(variance, min=0.0))

    return mean, std

# Perform the calculation over the training split only; the resulting
# per-channel statistics are later passed to transforms.Normalize.
calculated_mean, calculated_std = get_mean_std(calc_loader)

# Show the values so they can be compared between runs.
print(f"\nCalculated Mean: {calculated_mean}")
print(f"Calculated Std:  {calculated_std}")

100%|██████████| 169M/169M [00:03<00:00, 43.6MB/s]


  0%|          | 0/391 [00:00<?, ?it/s]


Calculated Mean: tensor([0.5071, 0.4865, 0.4409], dtype=torch.float64)
Calculated Std:  tensor([0.2623, 0.2513, 0.2714], dtype=torch.float64)


In [None]:
# Input pipelines: training adds a random horizontal flip; evaluation does not.
# Both resize to 224x224 and normalize with the statistics computed above.
_norm_mean = calculated_mean.tolist()
_norm_std = calculated_std.tolist()

train_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.RandomHorizontalFlip(),  # augmentation to reduce overfitting
    transforms.ToTensor(),
    transforms.Normalize(mean=_norm_mean, std=_norm_std),
])

test_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=_norm_mean, std=_norm_std),
])

# Full CIFAR-100 splits (all 100 classes).
_cifar_kwargs = dict(root='./data', download=True)
trainset = datasets.CIFAR100(train=True, transform=train_transform, **_cifar_kwargs)
testset = datasets.CIFAR100(train=False, transform=test_transform, **_cifar_kwargs)

# Only the training loader shuffles.
_loader_kwargs = dict(batch_size=BATCH_SIZE, num_workers=4)
train_iterator = data.DataLoader(trainset, shuffle=True, **_loader_kwargs)
test_iterator = data.DataLoader(testset, **_loader_kwargs)

# Pull one batch to confirm the tensor shapes.
images, labels = next(iter(train_iterator))

print(f'Number of training images (100 classes): {len(trainset)}')
print(f'Number of testing images (100 classes): {len(testset)}')
print(f"Shape of one batch of images: {images.shape}")
print(f"Shape of one batch of labels: {labels.shape}")

Number of training images (100 classes): 50000
Number of testing images (100 classes): 10000
Shape of one batch of images: torch.Size([128, 3, 224, 224])
Shape of one batch of labels: torch.Size([128])


# 🌟Model Define

In this section, we define our PyTorch model. By the end of it, a model named `resnet50_fresh` is ready to train.

### Basic model definition

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.models as models

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ResNet-50 with no pretrained weights (trained from scratch).
resnet50_fresh = models.resnet50(weights=None)  # load from scratch
# Replace the classification head with a 100-way linear layer for CIFAR-100.
resnet50_fresh.fc = nn.Linear(resnet50_fresh.fc.in_features, 100)  # for CIFAR-100
resnet50_fresh = resnet50_fresh.to(device)

print("Using device:", device)

### More settings

In [None]:
import torch.optim as optim

# SGD with momentum and weight decay, configured from the hyperparameter cell.
optimizer = optim.SGD(
    resnet50_fresh.parameters(),
    lr=LEARNING_RATE,
    momentum=MOMENTUM,
    weight_decay=WEIGHT_DECAY
)

# Multiply the learning rate by LR_GAMMA at each epoch listed in LR_MILESTONES.
# train_model() calls scheduler.step() once per epoch.
scheduler = optim.lr_scheduler.MultiStepLR(
    optimizer,
    milestones=LR_MILESTONES,
    gamma=LR_GAMMA
)

# Multi-class classification loss applied to the raw model outputs.
criterion = nn.CrossEntropyLoss()

### Model Metrics (FLOPs, Params, ...)

In [None]:
from torchsummary import summary

# Run torchsummary on a single 3x224x224 input to list the layers and the
# parameter count of the model.
summary(resnet50_fresh, (3,224,224))

In [None]:
# Print the module hierarchy of the model (its layer structure).
print(resnet50_fresh)

In [None]:
!pip install thop

In [None]:
# FLOPS
def evaluate_model_complexity(model, input_size=(1, 3, 224, 224), device='cuda'):
    """Profile ``model`` with thop and print its operation and parameter counts.

    Requires the third-party ``thop`` package (``pip install thop``).

    Args:
        model: ``torch.nn.Module`` to profile. It is moved to ``device``; its
            train/eval mode is restored before returning.
        input_size: Shape of the random dummy input fed through the model.
        device: Device on which the profiling forward pass runs.

    Returns:
        The ``(flops, params)`` pair returned by ``thop.profile``, or ``None``
        when thop is not installed.
    """
    try:
        from thop import profile, clever_format
    except ImportError:
        print("Error: 'thop' library is not installed. Please run: pip install thop")
        return None

    model = model.to(device)

    # Profile in eval mode, then put the model back in whatever mode the caller
    # had it in, so this measurement has no lasting effect on training.
    was_training = model.training
    model.eval()
    try:
        dummy_input = torch.randn(input_size).to(device)

        # Count operations and parameters for one forward pass.
        # NOTE(review): thop's README names the first return value `macs`;
        # confirm whether it should be reported as MACs rather than FLOPs.
        flops, params = profile(model, inputs=(dummy_input, ), verbose=False)
    finally:
        model.train(was_training)

    # Human-readable formatting (e.g. with K/M/G suffixes).
    flops_fmt, params_fmt = clever_format([flops, params], "%.3f")

    print("="*40)
    print("FLOPs Evaluation")
    print(f"Input Shape: {input_size}")
    print(f"FLOPs: {flops_fmt}")
    print(f"Params: {params_fmt}")
    print("="*40)

    return flops, params

# Profile the network for a single 3x224x224 image on the active device.
evaluate_model_complexity(resnet50_fresh, input_size=(1, 3, 224, 224), device=device)

# 🌟Start Training!

### Define functions for training / evaluating one epoch

In [None]:
from tqdm.notebook import tqdm

# Getting Acc in a batch
def calculate_accuracy(y_pred, y):
    """Return the top-1 accuracy of one batch as a scalar float tensor.

    ``y_pred`` holds one row of class scores per sample and ``y`` the matching
    labels; the result is (number of correct arg-max predictions) / len(y).
    """
    best_class = torch.argmax(y_pred, dim=1, keepdim=True)
    hits = torch.eq(best_class, y.view_as(best_class)).sum()
    return hits.float() / y.shape[0]

# Train an epoch
def train(model, iterator, optimizer, criterion, device):
    """Train ``model`` for one pass over ``iterator``.

    Loss and accuracy are averaged per sample (each batch weighted by its size),
    so a smaller final batch does not skew the epoch metrics.

    Args:
        model: Network to optimize; switched to training mode.
        iterator: Iterable of ``(x, y)`` batches.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss function called as ``criterion(y_pred, y)``.
            Assumed to return the mean loss over the batch (the default
            reduction of ``nn.CrossEntropyLoss``).
        device: Device the batches are moved to.

    Returns:
        Tuple ``(mean_loss, mean_accuracy)`` of Python floats.

    Raises:
        ValueError: If ``iterator`` yields no samples.
    """
    epoch_loss = 0.0
    epoch_acc = 0.0
    n_samples = 0

    model.train()

    for (x, y) in tqdm(iterator, desc="Training", leave=False):
        x = x.to(device)
        y = y.to(device)
        batch_size = y.shape[0]

        # Reset gradients from the previous iteration (avoid accumulation)
        optimizer.zero_grad()

        # Forward pass: compute logits/predictions for this batch
        y_pred = model(x)
        # Scalar loss for this batch
        loss = criterion(y_pred, y)
        # Top-1 accuracy for monitoring
        acc = calculate_accuracy(y_pred, y)
        # Backward pass: gradients w.r.t. all learnable parameters
        loss.backward()

        # Update parameters using the computed gradients
        optimizer.step()

        # Turn the batch means back into sums so the epoch average is per sample.
        epoch_loss += loss.item() * batch_size
        epoch_acc += acc.item() * batch_size
        n_samples += batch_size

    if n_samples == 0:
        raise ValueError("train() received an iterator that yields no samples")

    return epoch_loss / n_samples, epoch_acc / n_samples

# Evaluate an epoch
def evaluate(model, iterator, criterion, device):
    """Evaluate ``model`` over ``iterator`` without updating any parameters.

    Loss and accuracy are averaged per sample (each batch weighted by its size),
    so a smaller final batch does not skew the reported metrics.

    Args:
        model: Network to evaluate; switched to evaluation mode.
        iterator: Iterable of ``(x, y)`` batches.
        criterion: Loss function called as ``criterion(y_pred, y)``.
            Assumed to return the mean loss over the batch (the default
            reduction of ``nn.CrossEntropyLoss``).
        device: Device the batches are moved to.

    Returns:
        Tuple ``(mean_loss, mean_accuracy)`` of Python floats.

    Raises:
        ValueError: If ``iterator`` yields no samples.
    """
    epoch_loss = 0.0
    epoch_acc = 0.0
    n_samples = 0

    model.eval()

    # No gradients are needed for evaluation.
    with torch.no_grad():

        for (x, y) in tqdm(iterator, desc="Evaluating", leave=False):

            x = x.to(device)
            y = y.to(device)
            batch_size = y.shape[0]

            y_pred = model(x)

            loss = criterion(y_pred, y)

            acc = calculate_accuracy(y_pred, y)

            # Turn the batch means back into sums so the average is per sample.
            epoch_loss += loss.item() * batch_size
            epoch_acc += acc.item() * batch_size
            n_samples += batch_size

    if n_samples == 0:
        raise ValueError("evaluate() received an iterator that yields no samples")

    return epoch_loss / n_samples, epoch_acc / n_samples

def epoch_time(start_time, end_time):
    """Split the interval ``end_time - start_time`` (seconds) into whole minutes and seconds.

    Returns:
        Tuple ``(minutes, seconds)`` of ints; fractional seconds are truncated.
    """
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    leftover_seconds = int(duration - (whole_minutes * 60))
    return whole_minutes, leftover_seconds

### Define the main training function

In [None]:
import time

def train_model(model, train_iterator, test_iterator, device,
                optimizer, scheduler, criterion,
                epochs=EPOCHS, model_path='Best-ResNet50-CIFAR.pt'):
    """Train ``model`` for ``epochs`` epochs, checkpointing the best weights.

    Each epoch runs ``train`` on ``train_iterator`` and ``evaluate`` on
    ``test_iterator`` (which therefore serves as the validation set). Whenever
    the validation loss reaches a new minimum, the model's ``state_dict`` is
    written to ``model_path``. The scheduler is stepped once per epoch.

    Returns:
        Dict with the per-epoch lists ``train_losses``, ``train_accuracies``,
        ``valid_losses``, ``valid_accuracies`` and ``learning_rates``, plus
        ``best_model_path``.
    """
    model = model.to(device)

    lowest_valid_loss = float('inf')

    # Per-epoch history; insertion order matches the returned dictionary layout.
    history = {
        'train_losses': [],
        'train_accuracies': [],
        'valid_losses': [],
        'valid_accuracies': [],
        'learning_rates': [],
    }

    for epoch in tqdm(range(epochs)):
        tick = time.monotonic()

        train_loss, train_acc = train(model, train_iterator, optimizer, criterion, device)
        valid_loss, valid_acc = evaluate(model, test_iterator, criterion, device)

        # Keep only the weights with the lowest validation loss seen so far.
        if valid_loss < lowest_valid_loss:
            lowest_valid_loss = valid_loss
            torch.save(model.state_dict(), model_path)

        epoch_mins, epoch_secs = epoch_time(tick, time.monotonic())

        # Read the learning rate used for this epoch before the scheduler changes it.
        current_lr = optimizer.param_groups[0]['lr']
        scheduler.step()

        print(f'Epoch: {epoch+1} | Epoch Time: {epoch_mins}m {epoch_secs}s | LR: {current_lr:.5f}')
        print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
        print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

        history['train_losses'].append(train_loss)
        history['train_accuracies'].append(train_acc)
        history['valid_losses'].append(valid_loss)
        history['valid_accuracies'].append(valid_acc)
        history['learning_rates'].append(current_lr)

    history['best_model_path'] = model_path
    return history

### Training Begins!!!!!

In [None]:
# `trained_model` refers to the same module object as `resnet50_fresh`
# (nn.Module.to returns the module itself), so training updates both names.
trained_model = resnet50_fresh.to(device)

# call the train method
# The test loader is passed as the validation set; the number of epochs and
# the checkpoint path fall back to train_model's defaults.
trained_stats = train_model(
    model=trained_model,
    train_iterator=train_iterator,
    test_iterator=test_iterator,
    device=device,
    optimizer=optimizer,
    scheduler=scheduler,
    criterion=criterion,
)

# Persist the per-epoch history so the curves can be re-plotted without retraining.
torch.save(trained_stats, 'trained_stats_ResNet50_CIFAR.pt')

  0%|          | 0/20 [00:00<?, ?it/s]

Training:   0%|          | 0/391 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/79 [00:00<?, ?it/s]

Epoch: 1 | Epoch Time: 10m 22s | LR: 0.05
	Train Loss: 4.530 | Train Acc: 3.65%
	 Val. Loss: 4.058 |  Val. Acc: 6.84%


Training:   0%|          | 0/391 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/79 [00:00<?, ?it/s]

Epoch: 2 | Epoch Time: 10m 21s | LR: 0.05
	Train Loss: 3.811 | Train Acc: 10.17%
	 Val. Loss: 3.646 |  Val. Acc: 13.23%


Training:   0%|          | 0/391 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/79 [00:00<?, ?it/s]

Epoch: 3 | Epoch Time: 10m 21s | LR: 0.05
	Train Loss: 3.383 | Train Acc: 17.66%
	 Val. Loss: 3.251 |  Val. Acc: 19.98%


Training:   0%|          | 0/391 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/79 [00:00<?, ?it/s]

Epoch: 4 | Epoch Time: 10m 21s | LR: 0.05
	Train Loss: 2.977 | Train Acc: 25.14%
	 Val. Loss: 3.017 |  Val. Acc: 25.70%


Training:   0%|          | 0/391 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/79 [00:00<?, ?it/s]

Epoch: 5 | Epoch Time: 10m 22s | LR: 0.05
	Train Loss: 2.591 | Train Acc: 32.93%
	 Val. Loss: 3.015 |  Val. Acc: 26.41%


Training:   0%|          | 0/391 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/79 [00:00<?, ?it/s]

Epoch: 6 | Epoch Time: 10m 22s | LR: 0.05
	Train Loss: 2.231 | Train Acc: 40.52%
	 Val. Loss: 2.486 |  Val. Acc: 36.09%


Training:   0%|          | 0/391 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/79 [00:00<?, ?it/s]

Epoch: 7 | Epoch Time: 10m 22s | LR: 0.05
	Train Loss: 1.933 | Train Acc: 46.84%
	 Val. Loss: 2.114 |  Val. Acc: 43.25%


Training:   0%|          | 0/391 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/79 [00:00<?, ?it/s]

Epoch: 8 | Epoch Time: 10m 22s | LR: 0.05
	Train Loss: 1.683 | Train Acc: 52.68%
	 Val. Loss: 2.023 |  Val. Acc: 45.13%


Training:   0%|          | 0/391 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/79 [00:00<?, ?it/s]

Epoch: 9 | Epoch Time: 10m 22s | LR: 0.05
	Train Loss: 1.493 | Train Acc: 57.21%
	 Val. Loss: 1.973 |  Val. Acc: 47.36%


Training:   0%|          | 0/391 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/79 [00:00<?, ?it/s]

Epoch: 10 | Epoch Time: 10m 21s | LR: 0.05
	Train Loss: 1.331 | Train Acc: 61.27%
	 Val. Loss: 1.727 |  Val. Acc: 52.70%


Training:   0%|          | 0/391 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/79 [00:00<?, ?it/s]

Epoch: 11 | Epoch Time: 10m 21s | LR: 0.05
	Train Loss: 1.188 | Train Acc: 64.95%
	 Val. Loss: 1.781 |  Val. Acc: 52.05%


Training:   0%|          | 0/391 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/79 [00:00<?, ?it/s]

Epoch: 12 | Epoch Time: 10m 21s | LR: 0.05
	Train Loss: 1.046 | Train Acc: 68.82%
	 Val. Loss: 1.902 |  Val. Acc: 49.73%


Training:   0%|          | 0/391 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/79 [00:00<?, ?it/s]

Epoch: 13 | Epoch Time: 10m 20s | LR: 0.05
	Train Loss: 0.935 | Train Acc: 71.73%
	 Val. Loss: 1.737 |  Val. Acc: 52.86%


Training:   0%|          | 0/391 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/79 [00:00<?, ?it/s]

Epoch: 14 | Epoch Time: 10m 19s | LR: 0.05
	Train Loss: 0.851 | Train Acc: 74.18%
	 Val. Loss: 1.834 |  Val. Acc: 51.13%


Training:   0%|          | 0/391 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/79 [00:00<?, ?it/s]

Epoch: 15 | Epoch Time: 10m 20s | LR: 0.05
	Train Loss: 0.770 | Train Acc: 76.56%
	 Val. Loss: 1.820 |  Val. Acc: 51.90%


Training:   0%|          | 0/391 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/79 [00:00<?, ?it/s]

Epoch: 16 | Epoch Time: 10m 22s | LR: 0.05
	Train Loss: 0.711 | Train Acc: 78.21%
	 Val. Loss: 2.133 |  Val. Acc: 49.16%


Training:   0%|          | 0/391 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/79 [00:00<?, ?it/s]

Epoch: 17 | Epoch Time: 10m 26s | LR: 0.05
	Train Loss: 0.666 | Train Acc: 79.58%
	 Val. Loss: 1.850 |  Val. Acc: 52.74%


Training:   0%|          | 0/391 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/79 [00:00<?, ?it/s]

Epoch: 18 | Epoch Time: 10m 22s | LR: 0.05
	Train Loss: 0.612 | Train Acc: 81.04%
	 Val. Loss: 2.057 |  Val. Acc: 50.30%


Training:   0%|          | 0/391 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/79 [00:00<?, ?it/s]

Epoch: 19 | Epoch Time: 10m 21s | LR: 0.05
	Train Loss: 0.588 | Train Acc: 81.91%
	 Val. Loss: 2.106 |  Val. Acc: 50.08%


Training:   0%|          | 0/391 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/79 [00:00<?, ?it/s]

Epoch: 20 | Epoch Time: 10m 20s | LR: 0.05
	Train Loss: 0.558 | Train Acc: 82.80%
	 Val. Loss: 2.144 |  Val. Acc: 48.86%


# 🌟Evaluation Time

In [None]:
# Restore the checkpoint with the lowest validation loss (written by
# train_model) so the evaluation below scores those weights rather than the
# weights left in memory after the final epoch.
trained_model = resnet50_fresh
trained_model.load_state_dict(torch.load('Best-ResNet50-CIFAR.pt', map_location=device))

# Optional: reload the saved history when the training cell was skipped.
# trained_stats = torch.load('trained_stats_ResNet50_CIFAR.pt', map_location=device)

<All keys matched successfully>

### The Loss and Accuracy Curve

In [None]:
import matplotlib.pyplot as plt

def plot_loss_and_accuracy(trained_stats, epochs_to_show=None):
    """Plot training / validation loss and accuracy curves.

    Args:
        trained_stats: Dict with the per-epoch lists ``train_losses``,
            ``valid_losses``, ``train_accuracies`` and ``valid_accuracies``
            (accuracies as fractions in [0, 1]).
        epochs_to_show: Maximum number of epochs to draw. Defaults to the
            global ``EPOCHS``. The value is capped at the number of epochs
            actually recorded, so a shorter run plots without a shape mismatch
            between the x and y data.
    """
    series_keys = ("train_losses", "valid_losses", "train_accuracies", "valid_accuracies")
    recorded = min(len(trained_stats[key]) for key in series_keys)
    limit = EPOCHS if epochs_to_show is None else epochs_to_show
    n_epochs = max(0, min(limit, recorded))
    epochs = range(1, n_epochs + 1)

    # Plot Loss
    plt.figure(figsize=(10, 4))
    plt.plot(epochs, trained_stats["train_losses"][:n_epochs], label='Train Loss')
    plt.plot(epochs, trained_stats["valid_losses"][:n_epochs], label='Validation Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.title('Training and Validation Loss')
    plt.legend()
    plt.grid(True)
    plt.show()

    # Plot Accuracy (converted from fractions to percentages)
    plt.figure(figsize=(10, 4))
    plt.plot(epochs, [acc * 100 for acc in trained_stats["train_accuracies"][:n_epochs]], label='Train Accuracy')
    plt.plot(epochs, [acc * 100 for acc in trained_stats["valid_accuracies"][:n_epochs]], label='Validation Accuracy')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy (%)')
    plt.title('Training and Validation Accuracy')
    plt.legend()
    plt.grid(True)
    plt.show()

# Draw the loss and accuracy curves from the history returned by train_model.
plot_loss_and_accuracy(trained_stats)

### The Most Incorrect

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import heapq

def plot_most_incorrect(trained_model, test_iterator, device, n_images=16, class_names=None,
                        mean=None, std=None):
    """Show the misclassified images the model was most confident about.

    Args:
        trained_model: Model used for prediction; switched to evaluation mode.
        test_iterator: Iterable of ``(x, y)`` batches of images and labels.
        device: Device the images are moved to for the forward pass.
        n_images: Maximum number of examples to display. Nothing is drawn when
            this is not positive.
        class_names: Optional sequence mapping label index to a readable name.
        mean: Optional per-channel mean used to normalize the inputs. When both
            ``mean`` and ``std`` are given, images are de-normalized before
            display; otherwise they are only clipped to [0, 1].
        std: Optional per-channel standard deviation (see ``mean``).
    """
    if n_images <= 0:
        return

    trained_model.eval()

    # Min-heap keyed on the confidence of the wrong prediction: the root is the
    # least confident example currently kept, so it is the one evicted when a
    # more confident mistake arrives. Only n_images examples stay in memory.
    # Entry layout: (confidence, sample_index, image, true_label, probs)
    incorrect_heap = []
    n_seen = 0  # running sample count; gives every entry a unique tie-breaker

    with torch.no_grad():
        for x, y in test_iterator:
            x = x.to(device)
            y_pred = trained_model(x)
            y_prob = F.softmax(y_pred, dim=-1)

            # Move to CPU immediately and process
            y_prob_cpu = y_prob.cpu()
            y_cpu = y.cpu()
            x_cpu = x.cpu()

            confidences, pred_labels = torch.max(y_prob_cpu, dim=1)

            for i in range(x_cpu.size(0)):
                if pred_labels[i] != y_cpu[i]:
                    confidence = confidences[i].item()

                    # The unique sample index sits before the tensors in the
                    # tuple so that heap comparisons never reach a tensor.
                    item = (confidence, n_seen + i,
                            x_cpu[i].clone(), y_cpu[i].item(), y_prob_cpu[i].clone())

                    if len(incorrect_heap) < n_images:
                        heapq.heappush(incorrect_heap, item)
                    elif confidence > incorrect_heap[0][0]:
                        heapq.heapreplace(incorrect_heap, item)

            n_seen += x_cpu.size(0)

            # Clear batch from memory
            del x, y_pred, y_prob, y_prob_cpu, y_cpu, x_cpu

    # Most confident mistakes first
    incorrect_examples = sorted(incorrect_heap, key=lambda entry: entry[0], reverse=True)

    # Near-square grid large enough for n_images panels
    rows = int(np.sqrt(n_images))
    cols = int(np.ceil(n_images / rows))

    fig = plt.figure(figsize=(cols * 1.5, rows * 1.5))
    for i in range(min(len(incorrect_examples), rows * cols)):
        ax = fig.add_subplot(rows, cols, i + 1)

        incorrect_prob, _, image, true_label, probs = incorrect_examples[i]
        true_prob = probs[true_label].item()
        incorrect_label = torch.argmax(probs, dim=0).item()

        # Prepare image for display
        if mean is not None and std is not None:
            # Undo Normalize(mean, std): x * std + mean, per channel
            mean_t = torch.as_tensor(mean, dtype=image.dtype).view(-1, 1, 1)
            std_t = torch.as_tensor(std, dtype=image.dtype).view(-1, 1, 1)
            image = image * std_t + mean_t
        img = image.permute(1, 2, 0).numpy()
        img = np.clip(img, 0, 1)
        ax.imshow(img)

        # Set title
        if class_names:
            title = f'True: {class_names[true_label]} ({true_prob:.2f})\nPred: {class_names[incorrect_label]} ({incorrect_prob:.2f})'
        else:
            title = f'True: {true_label} ({true_prob:.2f})\nPred: {incorrect_label} ({incorrect_prob:.2f})'

        ax.set_title(title, fontsize=9)
        ax.axis('off')

    plt.tight_layout()
    plt.show()

# Show 16 misclassified test images, labelled with the CIFAR-100 class names.
# NOTE(review): test_iterator yields tensors that went through
# transforms.Normalize, and the plotting code only clips them to [0, 1], so
# the images are not displayed in their original colors.
plot_most_incorrect(trained_model, test_iterator, device, 16, class_names=trainset.classes)

### The final loss and accuracy

In [None]:
# Score the weights currently held by `trained_model` on the test loader.
# NOTE(review): the same loader was used as the validation set for checkpoint
# selection in train_model, so this is not an independent held-out estimate.
test_loss, test_acc = evaluate(trained_model, test_iterator, criterion, device)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Evaluating:   0%|          | 0/79 [00:00<?, ?it/s]

Test Loss: 1.727 | Test Acc: 52.70%


### Display Feature Maps (TODO)