# 🌟 Initialization Phase

In this section, we fix the random seeds, set the hyperparameters, and load the Food-101 training and test data.

### Seed Control

In [None]:
import random
import numpy as np
import torch

# One seed shared by every random number generator the notebook relies on
SEED = 9999

# Seed Python, NumPy and PyTorch (CPU generator, then the CUDA generator) in turn
for _seed_rng in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
    _seed_rng(SEED)

# Request deterministic cuDNN kernels
torch.backends.cudnn.deterministic = True

# Draw one value from each library so a re-run can be compared against the recorded output
_samplers = (
    ("Python", random.random),
    ("NumPy", np.random.rand),
    ("PyTorch", lambda: torch.rand(1).item()),
)
for _library, _draw in _samplers:
    print(f"Random sample ({_library}):", _draw())

Random sample (Python): 0.8347577610922152
Random sample (NumPy): 0.8233890742543671
Random sample (PyTorch): 0.7876027822494507


### Set Hyperparameters



In [None]:
# Training recipe, following the paper
EPOCHS = 100
BATCH_SIZE = 64

# SGD settings
LEARNING_RATE = 0.1
MOMENTUM = 0.9
WEIGHT_DECAY = 0.0001

# Step schedule: the learning rate is multiplied by LR_GAMMA at epochs 30, 60 and 90
LR_GAMMA = 0.1
LR_MILESTONES = list(range(30, 91, 30))

### Load Food-101

In [None]:
import os
import json
import torchvision.datasets as datasets
import torchvision.transforms as transforms
import torch.utils.data as data

# =======================================================
# Cache control: set USE_PRECOMPUTED_STATS = False to force a recompute
# =======================================================
USE_PRECOMPUTED_STATS = True
STATS_FILE = "food101_stats.json"


def compute_mean_std(loader):
    """Compute the per-channel mean and standard deviation over every pixel.

    Args:
        loader: Iterable yielding ``(images, target)`` pairs where ``images``
            is a single-image batch of shape ``[1, 3, H, W]`` (the leading
            dimension is squeezed away, so the loader must use batch_size=1).

    Returns:
        ``(mean, std)``: two float32 tensors of shape ``[3]``.

    Raises:
        ValueError: If the loader yields no pixels at all.
    """
    # Accumulate in float64: the running sums grow far beyond what a float32
    # significand (24 bits) can represent exactly, so float32 accumulators
    # would silently drop the contribution of later images.
    channel_sum = torch.zeros(3, dtype=torch.float64)
    channel_sq_sum = torch.zeros(3, dtype=torch.float64)
    total_pixels = 0

    for imgs, _ in loader:
        imgs = imgs.squeeze(0).double()   # [C, H, W]
        _, h, w = imgs.shape

        channel_sum += imgs.sum(dim=[1, 2])
        channel_sq_sum += (imgs ** 2).sum(dim=[1, 2])
        total_pixels += h * w

    if total_pixels == 0:
        raise ValueError("Cannot compute mean/std: the loader yielded no pixels")

    mean = channel_sum / total_pixels
    # Var[x] = E[x^2] - E[x]^2; clamp guards against a tiny negative value
    # caused by rounding, which would turn the square root into NaN.
    variance = (channel_sq_sum / total_pixels - mean ** 2).clamp(min=0.0)
    std = torch.sqrt(variance)
    return mean.float(), std.float()


# =======================================================
# 1. Compute the Food-101 mean & std, or read them from the
#    stats JSON file when it already exists
# =======================================================

if USE_PRECOMPUTED_STATS and os.path.exists(STATS_FILE):
    print(f"[INFO] Loading precomputed stats from {STATS_FILE} ...")

    with open(STATS_FILE, "r") as f:
        stats = json.load(f)

    food101_mean = stats["mean"]
    food101_std = stats["std"]

    print("Loaded mean:", food101_mean)
    print("Loaded std :", food101_std)

else:
    print("[INFO] Computing mean/std for Food-101 (this may take a while)...")

    # Only convert to a tensor: the statistics are taken over the raw pixels
    stats_transform = transforms.Compose([
        transforms.ToTensor(),
    ])

    food101_train_for_stats = datasets.Food101(
        root='./data',
        split='train',
        download=True,
        transform=stats_transform,
    )

    # batch_size=1 avoids a collate error (images are not resized here,
    # so presumably their sizes differ)
    stats_loader = data.DataLoader(
        food101_train_for_stats,
        batch_size=1,
        shuffle=False,
        num_workers=0,
    )

    food101_mean, food101_std = compute_mean_std(stats_loader)

    # Plain lists are JSON-serialisable and accepted by transforms.Normalize
    food101_mean = food101_mean.tolist()
    food101_std = food101_std.tolist()

    print("Computed mean:", food101_mean)
    print("Computed std :", food101_std)

    # =======================================================
    # Save as JSON so the computation does not have to run again
    # =======================================================
    stats = {
        "mean": food101_mean,
        "std": food101_std
    }

    with open(STATS_FILE, "w") as f:
        json.dump(stats, f, indent=2)

    print(f"[INFO] Saved mean/std to {STATS_FILE}")

In [None]:
# =======================================================
# 2. Official training transforms (ImageNet-style augmentation)
# =======================================================

# Both pipelines finish with the same Food-101 normalisation step
_normalize = transforms.Normalize(mean=food101_mean, std=food101_std)

# Training: random scale/aspect crop to 224 px plus a random horizontal flip
_train_steps = [
    transforms.RandomResizedCrop(224),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    _normalize,
]
train_transform = transforms.Compose(_train_steps)

# Evaluation: deterministic resize to 256 px followed by a 224 px centre crop
_valid_steps = [
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    _normalize,
]
valid_transform = transforms.Compose(_valid_steps)

# =======================================================
# 3. Load Food-101 dataset
# =======================================================

# download=True keeps this cell runnable on a fresh machine: when the cached
# stats JSON exists, the stats cell skips its own download, and with
# download=False the constructors below would fail on a missing ./data.
# torchvision skips the download when the dataset is already on disk.
trainset = datasets.Food101(
    root='./data',
    split='train',
    download=True,
    transform=train_transform,
)

testset = datasets.Food101(
    root='./data',
    split='test',
    download=True,
    transform=valid_transform,
)

# Settings common to both loaders
_loader_settings = {"batch_size": BATCH_SIZE, "num_workers": 4}

# Training batches are reshuffled every epoch; evaluation keeps dataset order
train_iterator = data.DataLoader(trainset, shuffle=True, **_loader_settings)
test_iterator = data.DataLoader(testset, shuffle=False, **_loader_settings)

# =======================================================
# 4. Sanity check
# =======================================================

# Pull a single training batch and report dataset sizes and tensor shapes
images, labels = next(iter(train_iterator))
_report = (
    f"Train samples: {len(trainset)}",
    f"Test samples:  {len(testset)}",
    f"Batch images shape: {images.shape}",
    f"Batch labels shape: {labels.shape}",
)
for _line in _report:
    print(_line)

# 🌟 Model Definition

In this section, we define the PyTorch model. By the end of the section, a model named `densenet121_fresh` is ready to be trained.

### Basic model definition

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.models as models

# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# DenseNet-121 without pretrained weights, i.e. trained from scratch
densenet121_fresh = models.densenet121(weights=None)

# Replace the classification head with a 101-way linear layer for Food-101
_head_in_features = densenet121_fresh.classifier.in_features
densenet121_fresh.classifier = nn.Linear(_head_in_features, 101)

densenet121_fresh = densenet121_fresh.to(device)

print("Using device:", device)

### More settings

In [None]:
import torch.optim as optim

# SGD with momentum and weight decay, configured from the hyperparameter cell
_sgd_settings = {
    "lr": LEARNING_RATE,
    "momentum": MOMENTUM,
    "weight_decay": WEIGHT_DECAY,
}
optimizer = optim.SGD(densenet121_fresh.parameters(), **_sgd_settings)

# Multiply the learning rate by LR_GAMMA each time a milestone epoch is reached
scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=LR_MILESTONES, gamma=LR_GAMMA)

# Multi-class classification loss applied to the raw logits
criterion = nn.CrossEntropyLoss()

### Model Metrics (FLOPs, Params, ...)

In [None]:
!pip install torchinfo

In [None]:
from torchinfo import summary

# Run a single 3x224x224 input through the model to list layers and parameter counts
summary(densenet121_fresh, input_size=(1, 3, 224, 224))

In [None]:
# Print the model's module structure
print(densenet121_fresh)

In [None]:
!pip install thop

In [None]:
# FLOPs
def evaluate_model_complexity(model, input_size=(1, 3, 224, 224), device='cuda'):
    """
    Estimate the model's computational cost (FLOPs) and parameter count.

    Requires thop: pip install thop

    The measurement runs on a deep copy of ``model``, so the model passed in
    keeps its train/eval mode and gains no extra hooks or buffers.

    Args:
        model: The ``torch.nn.Module`` to profile.
        input_size: Shape of the random dummy input, batch dimension included.
        device: Device on which the copy and the dummy input are placed.

    Returns:
        ``(flops, params)`` as returned by ``thop.profile``, or ``None`` when
        thop is not installed.
    """
    import copy  # local import keeps this cell self-contained

    try:
        from thop import profile, clever_format
    except ImportError:
        print("Error: 'thop' library is not installed. Please run: pip install thop")
        return None

    # thop registers counting hooks and `total_ops` / `total_params` buffers on
    # the modules it profiles. NOTE(review): depending on the thop version some
    # of them may stay attached and then leak into state_dict() checkpoints,
    # so never profile the model that is about to be trained.
    probe = copy.deepcopy(model).to(device)
    probe.eval()

    # Created on the CPU first, as before, so the global RNG stream is consumed identically
    dummy_input = torch.randn(input_size).to(device)

    # Count operations and parameters
    # NOTE(review): thop's README names the first value `macs`; confirm whether
    # the figure should be doubled before being reported as FLOPs.
    flops, params = profile(probe, inputs=(dummy_input, ), verbose=False)

    # Human-readable formatting (e.g. "G" / "M" suffixes)
    flops_fmt, params_fmt = clever_format([flops, params], "%.3f")

    print("="*40)
    print(f"FLOPs Evaluation")
    print(f"Input Shape: {input_size}")
    print(f"FLOPs: {flops_fmt}")
    print(f"Params: {params_fmt}")
    print("="*40)

    return flops, params

# Profile the freshly built model with a single 3x224x224 input on the selected device
evaluate_model_complexity(densenet121_fresh, input_size=(1, 3, 224, 224), device=device)

# 🌟 Start Training!

### Define functions for training / evaluating a single epoch

In [None]:
from tqdm.auto import tqdm

# Top-1 accuracy of one batch
def calculate_accuracy(y_pred, y):
    """Return the fraction of samples whose highest-scoring class equals the label.

    Args:
        y_pred: Score tensor with one row per sample and one column per class.
        y: Ground-truth class indices, one per sample.

    Returns:
        A scalar float tensor: correct predictions divided by ``y.shape[0]``.
    """
    predicted = y_pred.argmax(dim=1)
    hits = (predicted == y.view_as(predicted)).sum()
    return hits.float() / y.shape[0]

# Train an epoch
def train(model, iterator, optimizer, criterion, device):
    """Run one optimisation pass over ``iterator`` and return the epoch metrics.

    Metrics are weighted by the number of samples in each batch, so a smaller
    final batch does not count as much as a full one.

    Args:
        model: Network to optimise; it is switched to training mode.
        iterator: Iterable of ``(inputs, labels)`` batches.
        optimizer: Optimiser stepped once per batch.
        criterion: Loss function; assumed to return the mean loss over the
            batch (the default reduction of ``nn.CrossEntropyLoss``).
        device: Device the batches are moved to.

    Returns:
        ``(mean_loss, mean_accuracy)`` over all samples seen, as Python floats.

    Raises:
        ValueError: If the iterator yields no samples.
    """
    loss_sum = 0.0
    acc_sum = 0.0
    sample_count = 0

    model.train()

    for (x, y) in tqdm(iterator, desc="Training", leave=False):
        x = x.to(device)
        y = y.to(device)

        # Reset gradients from the previous iteration (avoid accumulation)
        optimizer.zero_grad()

        # Forward pass: compute logits/predictions for this batch
        y_pred = model(x)
        # Compute scalar loss for this batch (e.g., CrossEntropy)
        loss = criterion(y_pred, y)
        # Compute a metric for monitoring (e.g., top-1 accuracy)
        acc = calculate_accuracy(y_pred, y)
        # Backward pass: compute gradients w.r.t. all learnable parameters
        loss.backward()

        # Optimizer step: update parameters using computed gradients
        optimizer.step()

        # Turn the per-batch means back into sums so every sample weighs the same
        batch_size = y.shape[0]
        loss_sum += loss.item() * batch_size
        acc_sum += acc.item() * batch_size
        sample_count += batch_size

    if sample_count == 0:
        raise ValueError("train() received an iterator that yielded no samples")

    return loss_sum / sample_count, acc_sum / sample_count

# Evaluate an epoch
def evaluate(model, iterator, criterion, device):
    """Measure loss and accuracy over ``iterator`` without updating the model.

    Metrics are weighted by the number of samples in each batch, so a smaller
    final batch does not count as much as a full one.

    Args:
        model: Network to evaluate; it is switched to evaluation mode.
        iterator: Iterable of ``(inputs, labels)`` batches.
        criterion: Loss function; assumed to return the mean loss over the
            batch (the default reduction of ``nn.CrossEntropyLoss``).
        device: Device the batches are moved to.

    Returns:
        ``(mean_loss, mean_accuracy)`` over all samples seen, as Python floats.

    Raises:
        ValueError: If the iterator yields no samples.
    """
    loss_sum = 0.0
    acc_sum = 0.0
    sample_count = 0

    model.eval()

    # Gradients are not needed for evaluation
    with torch.no_grad():

        for (x, y) in tqdm(iterator, desc="Evaluating", leave=False):

            x = x.to(device)
            y = y.to(device)

            y_pred = model(x)

            loss = criterion(y_pred, y)

            acc = calculate_accuracy(y_pred, y)

            # Turn the per-batch means back into sums so every sample weighs the same
            batch_size = y.shape[0]
            loss_sum += loss.item() * batch_size
            acc_sum += acc.item() * batch_size
            sample_count += batch_size

    if sample_count == 0:
        raise ValueError("evaluate() received an iterator that yielded no samples")

    return loss_sum / sample_count, acc_sum / sample_count

def epoch_time(start_time, end_time):
    """Split the interval between two timestamps (in seconds) into minutes and seconds.

    Returns:
        ``(minutes, seconds)`` as integers, both truncated toward zero.
    """
    elapsed = end_time - start_time
    whole_minutes = int(elapsed / 60)
    leftover_seconds = int(elapsed - (whole_minutes * 60))
    return whole_minutes, leftover_seconds

### Define the main training function

In [None]:
import time

def train_model(model, train_iterator, test_iterator, device,
                optimizer, scheduler, criterion,
                epochs=EPOCHS, model_path='Best-DenseNet121-Food101.pt'):
    """Train ``model`` for ``epochs`` epochs and checkpoint the best weights.

    After every epoch the model is evaluated on ``test_iterator``. Whenever
    that loss is lower than any seen so far, the state dict is saved to
    ``model_path``. The scheduler is stepped once per epoch.

    Args:
        model: Network to train; moved to ``device``.
        train_iterator: Batches used for optimisation.
        test_iterator: Batches used for the per-epoch evaluation
            (reported as "Val." in the log).
        device: Device used for training and evaluation.
        optimizer: Optimiser updating the model parameters.
        scheduler: Learning-rate scheduler, stepped after each epoch.
        criterion: Loss function.
        epochs: Number of epochs to run.
        model_path: File receiving the best state dict.

    Returns:
        A dict with the per-epoch lists ``train_losses``, ``train_accuracies``,
        ``valid_losses``, ``valid_accuracies`` and ``learning_rates``, plus
        ``best_model_path``.
    """
    model = model.to(device)

    history = {
        'train_losses': [],
        'train_accuracies': [],
        'valid_losses': [],
        'valid_accuracies': [],
        'learning_rates': [],
    }
    lowest_valid_loss = float('inf')

    for epoch in tqdm(range(epochs)):
        tick = time.monotonic()

        train_loss, train_acc = train(model, train_iterator, optimizer, criterion, device)
        valid_loss, valid_acc = evaluate(model, test_iterator, criterion, device)

        # Keep only the weights with the lowest evaluation loss so far
        if valid_loss < lowest_valid_loss:
            lowest_valid_loss = valid_loss
            torch.save(model.state_dict(), model_path)

        epoch_mins, epoch_secs = epoch_time(tick, time.monotonic())

        # Read the rate used during this epoch before the scheduler changes it
        lr_this_epoch = optimizer.param_groups[0]['lr']

        # Decaying after an Epoch
        scheduler.step()

        print(f'Epoch: {epoch+1} | Epoch Time: {epoch_mins}m {epoch_secs}s | LR: {lr_this_epoch:.5f}')
        print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
        print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

        epoch_record = (
            ('train_losses', train_loss),
            ('train_accuracies', train_acc),
            ('valid_losses', valid_loss),
            ('valid_accuracies', valid_acc),
            ('learning_rates', lr_this_epoch),
        )
        for series_name, value in epoch_record:
            history[series_name].append(value)

    history['best_model_path'] = model_path
    return history

### Training Begins!!!!!

In [None]:
# Make sure the model lives on the training device
trained_model = densenet121_fresh.to(device)

# Run the full training loop; arguments follow train_model's parameter order
trained_stats = train_model(
    trained_model, train_iterator, test_iterator, device,
    optimizer, scheduler, criterion,
)

# Persist the per-epoch history so the curves can be re-plotted without retraining
torch.save(trained_stats, 'trained_stats_DenseNet121_Food101.pt')

# 🌟 Evaluation Time

In [None]:
# # Reload the model (Optional)
# trained_model = densenet121_fresh
# trained_model.load_state_dict(torch.load('Best-DenseNet121-Food101.pt'))

# trained_stats = torch.load('trained_stats_DenseNet121_Food101.pt', map_location=device)

### The Loss and Accuracy Curve

In [None]:
import matplotlib.pyplot as plt

def plot_loss_and_accuracy(trained_stats):
    """Plot the training/validation loss and accuracy curves, one figure each.

    The number of epochs is taken from the recorded history itself, so the
    function also works for runs that were shorter or longer than the global
    ``EPOCHS`` setting (e.g. a custom ``epochs`` argument or reloaded stats).

    Args:
        trained_stats: Dict with the per-epoch lists ``train_losses``,
            ``valid_losses``, ``train_accuracies`` and ``valid_accuracies``
            (accuracies as fractions; they are shown as percentages).
    """
    series_names = ("train_losses", "valid_losses", "train_accuracies", "valid_accuracies")
    # Use the shortest series so x and y always have matching lengths
    epochs_to_show = min(len(trained_stats[name]) for name in series_names)
    epochs = range(1, epochs_to_show + 1)

    def _plot_pair(train_values, valid_values, quantity, ylabel):
        """Draw one figure with the train and validation curve of a quantity."""
        fig, ax = plt.subplots(figsize=(10, 4))
        ax.plot(epochs, train_values[:epochs_to_show], label=f'Train {quantity}')
        ax.plot(epochs, valid_values[:epochs_to_show], label=f'Validation {quantity}')
        ax.set_xlabel('Epoch')
        ax.set_ylabel(ylabel)
        ax.set_title(f'Training and Validation {quantity}')
        ax.legend()
        ax.grid(True)
        plt.show()

    # Plot Loss
    _plot_pair(trained_stats["train_losses"], trained_stats["valid_losses"], 'Loss', 'Loss')

    # Plot Accuracy (fractions converted to percentages)
    _plot_pair(
        [acc * 100 for acc in trained_stats["train_accuracies"]],
        [acc * 100 for acc in trained_stats["valid_accuracies"]],
        'Accuracy',
        'Accuracy (%)',
    )

plot_loss_and_accuracy(trained_stats)

### The Most Incorrect

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import heapq

def plot_most_incorrect(trained_model, test_iterator, device, n_images=16, class_names=None,
                        mean=None, std=None):
    """Show the misclassified images the model was most confident about.

    Args:
        trained_model: Model to evaluate; it is switched to evaluation mode.
        test_iterator: Iterable of ``(images, labels)`` batches.
        device: Device used for the forward pass.
        n_images: Maximum number of mistakes to display.
        class_names: Optional sequence mapping class index to a readable name.
        mean: Optional per-channel mean used to normalise the images. When
            given together with ``std``, the normalisation is undone before
            display so the pictures have their original colours.
        std: Optional per-channel standard deviation (see ``mean``).

    Raises:
        ValueError: If ``n_images`` is smaller than 1.
    """
    if n_images < 1:
        raise ValueError("n_images must be at least 1")

    trained_model.eval()

    # Min-heap keyed on the confidence of the wrong prediction: heap[0] is the
    # LEAST confident mistake kept so far, i.e. the one to evict when a more
    # confident mistake shows up. Only n_images entries are ever held.
    # Format: (confidence, sample_index, image, true_label, probs)
    incorrect_heap = []
    # Running sample counter: a unique tie-breaker, so tuple comparison never
    # falls through to comparing tensors (which would raise).
    samples_seen = 0

    with torch.no_grad():
        for x, y in test_iterator:
            y_prob_cpu = F.softmax(trained_model(x.to(device)), dim=-1).cpu()
            x_cpu = x.cpu()
            y_cpu = y.cpu()

            confidences, pred_labels = y_prob_cpu.max(dim=1)

            # Process each image in the batch
            for i in range(x_cpu.size(0)):
                if pred_labels[i] == y_cpu[i]:
                    continue

                confidence = confidences[i].item()
                item = (confidence, samples_seen + i,
                        x_cpu[i].clone(), y_cpu[i].item(), y_prob_cpu[i].clone())

                if len(incorrect_heap) < n_images:
                    heapq.heappush(incorrect_heap, item)
                elif confidence > incorrect_heap[0][0]:
                    heapq.heapreplace(incorrect_heap, item)

            samples_seen += x_cpu.size(0)

    if not incorrect_heap:
        print("No incorrect predictions found.")
        return

    # Sort by confidence (descending)
    incorrect_examples = sorted(incorrect_heap, key=lambda entry: entry[0], reverse=True)

    # Lay the images out on a near-square grid sized for what was actually found
    n_shown = len(incorrect_examples)
    rows = int(np.sqrt(n_shown))
    cols = int(np.ceil(n_shown / rows))

    fig = plt.figure(figsize=(cols * 1.5, rows * 1.5))
    for i, (_, _, image, true_label, probs) in enumerate(incorrect_examples):
        ax = fig.add_subplot(rows, cols, i + 1)

        true_prob = probs[true_label].item()
        incorrect_prob, incorrect_label = torch.max(probs, dim=0)
        incorrect_prob = incorrect_prob.item()
        incorrect_label = incorrect_label.item()

        # Undo the input normalisation (x * std + mean) when the stats are known
        if mean is not None and std is not None:
            mean_t = torch.as_tensor(mean, dtype=image.dtype).view(-1, 1, 1)
            std_t = torch.as_tensor(std, dtype=image.dtype).view(-1, 1, 1)
            image = image * std_t + mean_t

        # Prepare image for display: channels last, values clipped to [0, 1]
        img = image.permute(1, 2, 0).numpy()
        img = np.clip(img, 0, 1)
        ax.imshow(img)

        # Set title
        if class_names:
            title = f'True: {class_names[true_label]} ({true_prob:.2f})\nPred: {class_names[incorrect_label]} ({incorrect_prob:.2f})'
        else:
            title = f'True: {true_label} ({true_prob:.2f})\nPred: {incorrect_label} ({incorrect_prob:.2f})'

        ax.set_title(title, fontsize=9)
        ax.axis('off')

    plt.tight_layout()
    plt.show()

plot_most_incorrect(trained_model, test_iterator, device, 16, class_names=trainset.classes,
                    mean=food101_mean, std=food101_std)

### The final loss and accuracy

In [None]:
# Final pass over the test set with the model currently held in memory
test_loss, test_acc = evaluate(trained_model, test_iterator, criterion, device)

print('Test Loss: {:.3f} | Test Acc: {:.2f}%'.format(test_loss, test_acc * 100))

### Display Feature Maps (TODO)