In [None]:
# Install into the *kernel's* environment (%pip) rather than whatever `pip` is first on the shell PATH (!pip).
# -q keeps the download log out of the notebook output.
%pip install -q torch torchvision tensorboard torchmetrics

Collecting torchmetrics
  Downloading torchmetrics-1.4.3-py3-none-any.whl.metadata (19 kB)
Collecting lightning-utilities>=0.8.0 (from torchmetrics)
  Downloading lightning_utilities-0.11.7-py3-none-any.whl.metadata (5.2 kB)
Downloading torchmetrics-1.4.3-py3-none-any.whl (869 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m869.5/869.5 kB[0m [31m21.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading lightning_utilities-0.11.7-py3-none-any.whl (26 kB)


Import Libraries

In [None]:
# Import necessary libraries
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
import matplotlib.pyplot as plt
import numpy as np

Load CIFAR-100 Dataset

In [None]:
# Define data transformations: convert to tensor, then standardise each RGB channel
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5071, 0.4865, 0.4409), (0.2673, 0.2564, 0.2762))  # CIFAR-100 normalization values
])

# Load CIFAR-100 dataset
batch_size = 128
train_dataset = torchvision.datasets.CIFAR100(root='./data', train=True, download=True, transform=transform)
# Shuffle the training data every epoch so mini-batches are not seen in a fixed order
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=2)

test_dataset = torchvision.datasets.CIFAR100(root='./data', train=False, download=True, transform=transform)
# Evaluation order does not matter, so keep it deterministic
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=2)

# Define a function to visualize some samples from the dataset
def imshow(img):
    """Display a normalised (C, H, W) image tensor, e.g. a `make_grid` output.

    The per-channel mean/std used here are the same values passed to
    `transforms.Normalize` in the data-loading cell, so the image is mapped
    back to the [0, 1] range before plotting.

    Args:
        img: 3-channel image tensor laid out as (C, H, W).
    """
    mean = torch.tensor((0.5071, 0.4865, 0.4409)).view(3, 1, 1)
    std = torch.tensor((0.2673, 0.2564, 0.2762)).view(3, 1, 1)

    # Undo Normalize: x_norm = (x - mean) / std  =>  x = x_norm * std + mean
    img = (img.detach().cpu() * std + mean).clamp(0, 1)

    # matplotlib expects channels last: (H, W, C)
    plt.imshow(img.permute(1, 2, 0).numpy())
    plt.axis('off')
    plt.show()

# Preview one training batch: pull a single batch and tile its first 8 images into a grid
dataiter = iter(train_loader)
images, labels = next(dataiter)
preview_grid = torchvision.utils.make_grid(images[:8])
imshow(preview_grid)

Files already downloaded and verified
Files already downloaded and verified


Define the Model

In [None]:
# Build a randomly initialised ResNet18 whose classifier is sized for CIFAR-100
from torchvision.models import resnet18

# Define the model
model = resnet18(num_classes=100)  # 100 output logits, one per CIFAR-100 class
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)  # move the weights to the GPU when one is available, otherwise stay on CPU

Change the model

Define the Model (ImageNet-Pretrained ResNet18)

In [None]:
# Load ResNet18 with ImageNet-pretrained weights; its 1000-way head is replaced for CIFAR-100 in a later cell
from torchvision.models import resnet18, ResNet18_Weights

# Define the model
# `pretrained=True` is deprecated; IMAGENET1K_V1 is the checkpoint that flag used to select
model = resnet18(weights=ResNet18_Weights.IMAGENET1K_V1)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
100%|██████████| 44.7M/44.7M [00:00<00:00, 193MB/s]


In [None]:
def count_parameters_in_millions(model):
    """Return the number of trainable parameters of `model`, in millions.

    Only parameters with `requires_grad=True` are counted, so frozen layers
    do not contribute.
    """
    trainable = 0
    for tensor in model.parameters():
        if tensor.requires_grad:
            trainable += tensor.numel()
    return trainable / 1_000_000  # raw count -> millions

# Report the size of the current model (only weights that will receive gradients)
total_params_in_millions = count_parameters_in_millions(
    model
)
print(
    f'Total trainable parameters: {total_params_in_millions:.2f} million'
)

In [None]:
# Swap the final fully-connected layer for a fresh 100-way classifier (CIFAR-100),
# keeping the incoming feature width unchanged.
head_in_features = model.fc.in_features
model.fc = nn.Linear(head_in_features, 100)
# The new layer is created on CPU, so move the whole model to the target device again.
model = model.to(device)

Model Parameters

In [None]:
# List every named parameter tensor of the model together with its shape
print("Model Parameters:")
for param_name, tensor in model.named_parameters():
    print(f"Name: {param_name}, Shape: {tensor.shape}")


Model Parameters:


In [None]:
# Make sure the classifier head has 100 outputs (CIFAR-100).
# Guarded so that re-running this cell does not throw away an already adapted
# (and possibly already trained) head by re-initialising it with random weights.
if not isinstance(model.fc, nn.Linear) or model.fc.out_features != 100:
    model.fc = nn.Linear(model.fc.in_features, 100)
model = model.to(device)

Define Loss Function, Optimizer, and TensorBoard Writer

In [None]:
# Define loss function and optimizer
# Cross-entropy over raw logits is the standard objective for single-label multi-class classification
criterion = nn.CrossEntropyLoss()
# Fine-tune every layer (not only the new head) with plain SGD
optimizer = optim.SGD(model.parameters(), lr=0.01)

# Initialize TensorBoard writer; checkpoints are stored in this same directory
writer = SummaryWriter('runs/cifar100_resnet18')

In [None]:
# Function to save a checkpoint in the TensorBoard log directory
def save_checkpoint(model, optimizer, epoch, loss_metric, accuracy_metric, checkpoint_dir='runs/cifar100_resnet18'):
    """Write model/optimizer state and the current metric values to disk.

    The file is named ``checkpoint_epoch_{epoch+1}.pth`` (1-based in the file
    name) while the stored ``'epoch'`` entry keeps the 0-based value passed in.

    Args:
        model: Module whose ``state_dict()`` is saved.
        optimizer: Optimizer whose ``state_dict()`` is saved.
        epoch: 0-based index of the epoch that just finished.
        loss_metric: Object exposing ``compute()``; its result is stored under ``'loss'``.
        accuracy_metric: Object exposing ``compute()``; its result is stored under ``'accuracy'``.
        checkpoint_dir: Target directory; created if it does not exist yet.
    """
    import os

    # torch.save does not create missing parent directories
    os.makedirs(checkpoint_dir, exist_ok=True)
    checkpoint_path = f"{checkpoint_dir}/checkpoint_epoch_{epoch+1}.pth"

    # Use .compute() to get the values of the metrics
    checkpoint = {
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'epoch': epoch,
        'loss': loss_metric.compute(),  # Get the computed loss
        'accuracy': accuracy_metric.compute()  # Get the computed accuracy
    }
    torch.save(checkpoint, checkpoint_path)
    print(f'Checkpoint saved at {checkpoint_path}')

# Function to load a checkpoint from the TensorBoard log directory
def load_checkpoint(model, optimizer, checkpoint_dir='runs/cifar100_resnet18'):
    """Restore `model` and `optimizer` from the newest checkpoint in `checkpoint_dir`.

    "Newest" means the highest number N among files named
    ``checkpoint_epoch_N.pth``; files matching the glob but without a purely
    numeric N are ignored.

    Args:
        model: Module that receives the stored ``model_state_dict``.
        optimizer: Optimizer that receives the stored ``optimizer_state_dict``.
        checkpoint_dir: Directory that is searched for checkpoint files.

    Returns:
        Tuple ``(start_epoch, loss, accuracy)`` where ``start_epoch`` is the
        stored epoch + 1 and the other two are the stored metric values.

    Raises:
        FileNotFoundError: If no usable checkpoint file exists in `checkpoint_dir`.
    """
    import glob
    import os
    import re

    def _epoch_number(path):
        # Parse N from ".../checkpoint_epoch_N.pth"; -1 marks a file that does not follow the scheme
        match = re.fullmatch(r'checkpoint_epoch_(\d+)\.pth', os.path.basename(path))
        return int(match.group(1)) if match else -1

    # Find the latest checkpoint (e.g., based on the highest epoch number)
    checkpoint_paths = [
        path for path in glob.glob(f"{checkpoint_dir}/checkpoint_epoch_*.pth")
        if _epoch_number(path) >= 0
    ]
    if not checkpoint_paths:
        raise FileNotFoundError(f"No checkpoint_epoch_*.pth file found in '{checkpoint_dir}'")
    latest_checkpoint = max(checkpoint_paths, key=_epoch_number)

    # Load tensors straight onto the device the model lives on, so a checkpoint
    # written on a GPU machine can still be opened on a CPU-only one.
    first_param = next(model.parameters(), None)
    map_location = first_param.device if first_param is not None else 'cpu'

    checkpoint = torch.load(latest_checkpoint, map_location=map_location)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    start_epoch = checkpoint['epoch'] + 1  # Resume from the next epoch
    loss = checkpoint['loss']
    accuracy = checkpoint['accuracy']
    print(f'Checkpoint loaded from {latest_checkpoint}. Resuming training from epoch {start_epoch}')

    return start_epoch, loss, accuracy  # You can return these for reference but don't update accuracy_metric with them

In [None]:
import torchmetrics
from torch.utils.tensorboard import SummaryWriter

# Updated training function with torchmetrics and resuming capability
def train_model(model, train_loader, criterion, optimizer, num_epochs=1, start_epoch=0, resume=False, checkpoint_dir='runs/cifar100_resnet18'):
    """Train `model` for epochs ``start_epoch .. num_epochs - 1``.

    After every epoch the mean loss and accuracy are printed, written to the
    notebook-level TensorBoard `writer`, and a checkpoint is stored through
    `save_checkpoint`. Relies on the notebook globals `device` and `writer`.

    Args:
        model: Network to optimise; expected to already be on `device`.
        train_loader: Iterable yielding ``(inputs, labels)`` batches.
        criterion: Loss function called as ``criterion(outputs, labels)``.
        optimizer: Optimizer updating the model parameters.
        num_epochs: Exclusive upper bound of the epoch index (total epochs, not extra epochs).
        start_epoch: First epoch index; overwritten by the checkpoint value when `resume` is True.
        resume: If True, restore model/optimizer from the latest checkpoint in `checkpoint_dir` first.
        checkpoint_dir: Directory used for reading and writing checkpoints.
    """
    # Initialize the metrics on the same device as the data.
    # torchmetrics >= 0.11 requires an explicit `task`; 100 = number of CIFAR-100 classes.
    loss_metric = torchmetrics.MeanMetric().to(device)
    accuracy_metric = torchmetrics.Accuracy(task='multiclass', num_classes=100).to(device)

    # Load checkpoint if resuming
    if resume:
        start_epoch, _, _ = load_checkpoint(model, optimizer, checkpoint_dir)

    model.train()
    for epoch in range(start_epoch, num_epochs):
        # Metrics accumulate across update() calls, so start every epoch from zero
        loss_metric.reset()
        accuracy_metric.reset()

        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # Weight by batch size so a smaller final batch does not skew the epoch mean
            loss_metric.update(loss.detach(), weight=inputs.size(0))
            accuracy_metric.update(outputs.detach(), labels)

        epoch_loss = loss_metric.compute().item()
        epoch_accuracy = accuracy_metric.compute().item()
        writer.add_scalar('Loss/train', epoch_loss, epoch)
        writer.add_scalar('Accuracy/train', epoch_accuracy, epoch)
        print(f'Epoch [{epoch+1}/{num_epochs}] Loss: {epoch_loss:.4f}, Accuracy: {epoch_accuracy:.4f}')

        save_checkpoint(model, optimizer, epoch, loss_metric, accuracy_metric, checkpoint_dir)

    # Make sure everything logged so far is visible to TensorBoard
    writer.flush()

In [None]:
import torchmetrics

def evaluate_model(model, test_loader, criterion, optimizer, checkpoint_dir='runs/cifar100_resnet18'):
    """Evaluate the latest checkpoint on `test_loader`.

    The newest checkpoint in `checkpoint_dir` is always loaded into `model`
    (and `optimizer`) before evaluation. Relies on the notebook global `device`.

    Args:
        model: Network to evaluate; its weights are overwritten by the checkpoint.
        test_loader: Iterable yielding ``(inputs, labels)`` batches.
        criterion: Loss function called as ``criterion(outputs, labels)``.
        optimizer: Only needed because `load_checkpoint` restores its state as well.
        checkpoint_dir: Directory that holds the checkpoints.

    Returns:
        Tuple ``(test_loss, test_accuracy)`` as Python floats.
    """
    # Load the model from the latest checkpoint
    load_checkpoint(model, optimizer, checkpoint_dir)

    # Set the model to evaluation mode
    model.eval()

    # torchmetrics >= 0.11 requires an explicit `task`; 100 = number of CIFAR-100 classes
    loss_metric = torchmetrics.MeanMetric().to(device)
    accuracy_metric = torchmetrics.Accuracy(task='multiclass', num_classes=100).to(device)

    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            # Weight by batch size so the result is a per-sample mean
            loss_metric.update(criterion(outputs, labels), weight=inputs.size(0))
            accuracy_metric.update(outputs, labels)

    test_loss = loss_metric.compute().item()
    test_accuracy = accuracy_metric.compute().item()
    print(f'Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}')
    return test_loss, test_accuracy

In [None]:
def visualize_feature_maps(model, layer_name, image):
    """Plot up to 8 feature maps produced by one layer for a single image.

    Relies on the notebook global `device`.

    Args:
        model: Network to probe.
        layer_name: Key of the layer in ``dict(model.named_modules())``.
        image: Single image tensor without a batch dimension.

    Raises:
        KeyError: If `layer_name` is not a module of `model`.
        ValueError: If the layer output for one image is not 3-D (channels, height, width).
    """
    # Get the specified layer's output
    activation = {}
    def hook_fn(module, input, output):
        activation[layer_name] = output.detach()

    layer = dict(model.named_modules())[layer_name]
    handle = layer.register_forward_hook(hook_fn)

    # Run in eval mode without gradients so the probe neither updates BatchNorm
    # statistics nor builds an autograd graph; restore the previous mode afterwards.
    was_training = model.training
    model.eval()
    try:
        # Pass the image through the model
        with torch.no_grad():
            model(image.unsqueeze(0).to(device))  # Add batch dimension and move to device
    finally:
        # Without this, every call would leave one more hook attached to the layer
        handle.remove()
        model.train(was_training)

    # Extract and plot the feature maps (drop only the batch dimension)
    feature_maps = activation[layer_name][0].cpu()
    if feature_maps.dim() != 3:
        raise ValueError(
            f"Layer '{layer_name}' produced an output of shape {tuple(feature_maps.shape)} per image; "
            "expected (channels, height, width)"
        )

    num_maps = min(8, feature_maps.size(0))
    # squeeze=False keeps `axes` 2-D even when there is a single map
    fig, axes = plt.subplots(1, num_maps, figsize=(15, 15), squeeze=False)
    for i in range(num_maps):
        axes[0, i].imshow(feature_maps[i].numpy(), cmap='viridis')
        axes[0, i].axis('off')
    plt.show()

In [None]:
import numpy as np
from sklearn.manifold import TSNE
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

def visualize_tsne(model, loader):
    """Project the model outputs for every sample in `loader` to 2-D with t-SNE and plot them.

    Points are coloured by their ground-truth label. Relies on the notebook
    global `device`.

    Args:
        model: Network whose forward output is used as the feature vector.
        loader: Iterable yielding ``(inputs, targets)`` batches.
    """
    features = []
    labels = []
    model.eval()

    with torch.no_grad():
        for inputs, targets in loader:
            inputs = inputs.to(device)
            output = model(inputs)
            features.append(output.cpu().numpy())
            labels.append(targets.cpu().numpy())

    # Stack the per-batch arrays into one (n_samples, n_features) matrix
    features = np.concatenate(features, axis=0)
    labels = np.concatenate(labels, axis=0)

    # Perform t-SNE; perplexity must stay below the number of samples (default is 30)
    perplexity = min(30.0, max(1.0, len(features) - 1))
    tsne = TSNE(n_components=2, random_state=42, perplexity=perplexity)
    tsne_results = tsne.fit_transform(features)

    # Create a DataFrame for visualization
    df = pd.DataFrame({'x': tsne_results[:, 0], 'y': tsne_results[:, 1], 'label': labels})

    # 'tab10' only has 10 distinct colours; build one colour per class when there are more
    n_classes = len(np.unique(labels))
    palette = 'tab10' if n_classes <= 10 else sns.color_palette('husl', n_classes)
    # A legend with dozens of entries would cover the plot, so only draw it for few classes
    legend = 'full' if n_classes <= 20 else False

    # Plot the t-SNE results using seaborn
    plt.figure(figsize=(10, 8))
    sns.scatterplot(x='x', y='y', hue='label', palette=palette, data=df, legend=legend, alpha=0.7)
    # Name the model that was actually passed in instead of a hard-coded architecture
    plt.title(f't-SNE of {type(model).__name__} Features')
    plt.show()

In [None]:
def get_incorrect_predictions(model, loader, max_samples=10):
    """Collect up to `max_samples` misclassified samples from `loader`.

    Inputs are moved to the device of the model's first parameter (CPU if the
    model has no parameters), and the prediction is the argmax over dim 1 of
    the model output.

    Args:
        model: Classifier returning one score per class for every sample.
        loader: Iterable yielding ``(inputs, labels)`` batches.
        max_samples: Upper bound on the number of samples returned.

    Returns:
        Tuple ``(incorrect_samples, incorrect_labels, incorrect_preds)``:
        a list of CPU image tensors, and two lists of Python ints holding the
        true and the predicted class index of each sample, in loader order.
    """
    model.eval()
    incorrect_samples = []
    incorrect_labels = []
    incorrect_preds = []

    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device('cpu')

    with torch.no_grad():
        for inputs, labels in loader:
            remaining = max_samples - len(incorrect_samples)
            if remaining <= 0:
                break  # enough samples collected; skip the rest of the loader

            inputs = inputs.to(model_device)
            labels = labels.to(model_device)
            preds = model(inputs).argmax(dim=1)

            # Indices of the misclassified samples in this batch, capped to what is still needed
            wrong_idx = (preds != labels).nonzero(as_tuple=True)[0][:remaining]

            incorrect_samples.extend(inputs[wrong_idx].cpu().unbind(0))
            incorrect_labels.extend(labels[wrong_idx].tolist())
            incorrect_preds.extend(preds[wrong_idx].tolist())

    return incorrect_samples, incorrect_labels, incorrect_preds


def visualize_incorrect_predictions(model, loader, class_names, max_samples=10):
    """Show misclassified images side by side with their true and predicted class names.

    Args:
        model: Classifier handed to `get_incorrect_predictions`.
        loader: Iterable yielding ``(inputs, labels)`` batches.
        class_names: Indexable mapping from class index to display name.
        max_samples: Maximum number of images to show.
    """
    incorrect_samples, incorrect_labels, incorrect_preds = get_incorrect_predictions(model, loader, max_samples=max_samples)

    # plt.subplots cannot create a grid with zero columns
    if len(incorrect_samples) == 0:
        print('No incorrect predictions to display.')
        return

    # Per-channel statistics, identical to the values given to transforms.Normalize when loading the data
    mean = torch.tensor([0.5071, 0.4865, 0.4409])
    std = torch.tensor([0.2673, 0.2564, 0.2762])

    # Plot the images with true and predicted labels
    # squeeze=False always returns a 2-D array, so one sample needs no special case
    fig, axes = plt.subplots(1, len(incorrect_samples), figsize=(15, 5), squeeze=False)
    axes = axes[0]

    for idx, (img, true_label, pred_label) in enumerate(zip(incorrect_samples, incorrect_labels, incorrect_preds)):
        img = img.detach().cpu().permute(1, 2, 0)  # Convert from (C, H, W) to (H, W, C)
        # Unnormalize every channel with its own mean/std (broadcast over the last axis)
        img = (img * std + mean).clamp(0, 1)

        axes[idx].imshow(img.numpy())
        axes[idx].set_title(f'True: {class_names[int(true_label)]}\nPred: {class_names[int(pred_label)]}', fontsize=10)
        axes[idx].axis('off')

    plt.tight_layout()
    plt.show()

In [None]:
# Settings shared by the fresh run and the resumed run
run_config = dict(num_epochs=10, checkpoint_dir='runs/cifar100_resnet18')

# Fresh run: start at epoch 0 with the current weights
train_model(model, train_loader, criterion, optimizer, resume=False, **run_config)

# Interrupted run: pick up again from the newest checkpoint in the same directory
train_model(model, train_loader, criterion, optimizer, resume=True, **run_config)

In [None]:
# Load the TensorBoard notebook extension (this is what makes the %tensorboard magic available)
%load_ext tensorboard

# Start TensorBoard inline and point it at `runs`, the parent of the
# 'runs/cifar100_resnet18' directory that the SummaryWriter logs into
%tensorboard --logdir=runs