**Members:**
*   Gaw, Janice Ko
*   Sepe, Jean Marlo
*   Ramos, Paula Angelica
*   Flores, Arvin Christian



### A. LIBRARIES

In [None]:
pip install tensorboardX

Collecting tensorboardX
  Downloading tensorboardx-2.6.4-py3-none-any.whl.metadata (6.2 kB)
Downloading tensorboardx-2.6.4-py3-none-any.whl (87 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m87.2/87.2 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tensorboardX
Successfully installed tensorboardX-2.6.4


In [None]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils import data
import torchvision.datasets as datasets
import torchvision.transforms as transforms
from tensorboardX import SummaryWriter

from torchvision.datasets import CIFAR10
from torch.utils.data import DataLoader

In [None]:
# ---- rp16/added!
# Sanity check: report whether a CUDA device is visible and, if so, the name of device 0.
has_cuda = torch.cuda.is_available()
print("CUDA available:", has_cuda)
if has_cuda:
    gpu_label = torch.cuda.get_device_name(0)
else:
    gpu_label = "CPU"
print("GPU name:", gpu_label)

CUDA available: False
GPU name: CPU


### B. MODEL PARAMETERS

In [None]:
"""
Implementation of AlexNet, from paper
"ImageNet Classification with Deep Convolutional Neural Networks" by Alex Krizhevsky et al.

See: https://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks.pdf


Data
The CIFAR-10 dataset consists of 60000 32x32 colour images in 10 classes,
with 6000 images per class. There are 50000 training images and 10000 test images.
See: https://www.cs.toronto.edu/~kriz/cifar.html

"""

# define pytorch device - useful for device-agnostic execution
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# define model parameters
NUM_EPOCHS = 90  # original paper - 90
BATCH_SIZE = 128
MOMENTUM = 0.9
LR_DECAY = 0.0005  # weight-decay coefficient (same value the SGD optimizer is given)
LR_INIT = 0.01
IMAGE_DIM = 227  # pixels for the ImageNet dataset. CIFAR-10 dataset images will be resized during data loading
NUM_CLASSES = 10  # 10 classes for CIFAR-10 dataset

# GPUs to use: ids 0-3, restricted to the devices that actually exist on this machine.
# Listing a non-existent id makes DataParallel fail, and on a CPU-only runtime the list is empty.
DEVICE_IDS = [gpu_id for gpu_id in (0, 1, 2, 3) if gpu_id < torch.cuda.device_count()]

# modify this to point to your data directory
INPUT_ROOT_DIR = 'alexnet_data_in'
TRAIN_IMG_DIR = INPUT_ROOT_DIR + '/cifar'
OUTPUT_DIR = 'alexnet_data_out'
LOG_DIR = OUTPUT_DIR + '/tblogs'  # tensorboard logs
CHECKPOINT_DIR = OUTPUT_DIR + '/models'  # model checkpoints

# make the log and checkpoint directories (no error if they already exist)
for output_subdir in (LOG_DIR, CHECKPOINT_DIR):
    os.makedirs(output_subdir, exist_ok=True)

### C. MODEL ARCHITECTURE


1.   Feature Extractor (Convolution + Pooling Layers)
2.   Classifier (Fully Connected Layers)


In [None]:
class AlexNet(nn.Module):
    """
    Neural network model consisting of layers proposed by AlexNet paper.

    ``features`` holds the convolution / normalisation / pooling stack and
    ``classifier`` holds the fully connected head that emits one score per class.
    """

    def __init__(self, num_classes=10):
        """
        Define and allocate layers for this neural net.

        Args:
            num_classes (int): number of classes to predict with this model
        """

        # JANICE ----------------------------------------------------------------

        super().__init__()

        # To get the output size of a convolutional layer:
        # output = [ (input - kernel + padding at the start + padding at the end)/ stride ] + 1
        # reference: https://stanford.edu/~shervine/teaching/cs-230/cheatsheet-convolutional-neural-networks

        # Architecture for the ImageNet dataset.
        # Input size adjusted to (b x 3 x 227 x 227)
        # The image in the original paper states that width and height are 224 pixels,
        # but the dimensions after first convolution layer do not lead to 55 x 55.

        # CIFAR-10 images will be resized to retain the architecture as is.

        # BatchNorm2d reference:
        # https://medium.com/@benjybo7/6-pytorch-normalization-layers-used-in-all-deep-learning-models-b565853b1fbc
        # https://www.ultralytics.com/glossary/batch-normalization

        # The shape comments below assume a (b x 3 x 227 x 227) input.
        self.features = nn.Sequential(
            # Convolutional layer 1
            nn.Conv2d(in_channels=3, out_channels=96, kernel_size=11, stride=4),     # (b x 96 x 55 x 55)
            nn.ReLU(),
            nn.LocalResponseNorm(size=5, alpha=0.0001, beta=0.75, k=2),  # Can be replaced with nn.BatchNorm2d(96) if any error encountered
            nn.MaxPool2d(kernel_size=3, stride=2),  # (b x 96 x 27 x 27)
            # Convolutional layer 2
            nn.Conv2d(in_channels=96, out_channels=256, kernel_size=5, padding=2),   # (b x 256 x 27 x 27)
            nn.ReLU(),
            nn.LocalResponseNorm(size=5, alpha=0.0001, beta=0.75, k=2),  # Can be replaced with nn.BatchNorm2d(256) if any error encountered
            nn.MaxPool2d(kernel_size=3, stride=2),  # (b x 256 x 13 x 13)
            # Convolutional layer 3
            nn.Conv2d(in_channels=256, out_channels=384, kernel_size=3, padding=1),  # (b x 384 x 13 x 13)
            nn.ReLU(),
            # Convolutional layer 4
            nn.Conv2d(in_channels=384, out_channels=384, kernel_size=3, padding=1),  # (b x 384 x 13 x 13)
            nn.ReLU(),
            # Convolutional layer 5
            nn.Conv2d(in_channels=384, out_channels=256, kernel_size=3, padding=1),  # (b x 256 x 13 x 13)
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=3, stride=2),  # (b x 256 x 6 x 6)
        )

        # PAULA ----------------------------------------------------------------
        self.classifier = nn.Sequential(
            nn.Dropout(p=0.5),
            nn.Linear(in_features=(256 * 6 * 6), out_features=4096),
            nn.ReLU(),
            nn.Dropout(p=0.5),
            nn.Linear(in_features=4096, out_features=4096),
            nn.ReLU(),
            nn.Linear(in_features=4096, out_features=num_classes),
        )
        self.init_bias()  # initialize bias

    def init_bias(self):
        """
        Re-initialise the convolutional layers in ``self.features``.

        Every Conv2d weight is drawn from N(0, 0.01) and every Conv2d bias is set to 0;
        the biases of the 2nd, 4th and 5th conv layers are then overwritten with 1.
        The fully connected layers are left with their default initialisation.
        """
        for layer in self.features:
            if isinstance(layer, nn.Conv2d):
                nn.init.normal_(layer.weight, mean=0, std=0.01)
                nn.init.constant_(layer.bias, 0)
        # original paper = 1 for Conv2d layers 2nd, 4th, and 5th conv layers
        # (indices 4, 10 and 12 are the positions of those layers inside self.features)
        nn.init.constant_(self.features[4].bias, 1)
        nn.init.constant_(self.features[10].bias, 1)
        nn.init.constant_(self.features[12].bias, 1)

    def forward(self, x):
        """
        Pass the input through the net.

        Args:
            x (Tensor): input tensor

        Returns:
            output (Tensor): output tensor
        """
        x = self.features(x)
        # Flatten everything except the batch dimension for the linear layers.
        # Keeping dim 0 fixed means a wrongly sized feature map fails in the first
        # Linear layer instead of being silently regrouped across samples.
        x = x.flatten(start_dim=1)
        return self.classifier(x)

### D. DATA PIPELINE

In [None]:
# JM----------------------------------------------------------------------
# TRANSFORMATIONS, DATASETS AND DATALOADER

# Per-channel statistics used to normalise both splits.
# NOTE(review): these look like the usual ImageNet statistics rather than values
# computed on CIFAR-10 - confirm that is intended.
NORM_MEAN = [0.485, 0.456, 0.406]
NORM_STD = [0.229, 0.224, 0.225]

# Define transformation and augmentation
train_transform = transforms.Compose([
    transforms.Resize((IMAGE_DIM, IMAGE_DIM)),  # to resize CIFAR-10 images and retain the AlexNet architecture as is
    transforms.RandomHorizontalFlip(),  # data augmentation, applied to the training split only
    transforms.ToTensor(),  # Convert images to PyTorch tensors
    transforms.Normalize(mean=NORM_MEAN, std=NORM_STD),  # Normalize
])

test_transform = transforms.Compose([
    transforms.Resize((IMAGE_DIM, IMAGE_DIM)),  # to resize CIFAR-10 images and retain the AlexNet architecture as is
    transforms.ToTensor(),  # Convert images to PyTorch tensors
    transforms.Normalize(mean=NORM_MEAN, std=NORM_STD),  # Normalize
])

# Load the CIFAR-10 dataset
train_dataset = CIFAR10(root='./data', train=True, transform=train_transform, download=True)
test_dataset = CIFAR10(root='./data', train=False, transform=test_transform, download=True)
print('Datasets created')

# Options shared by both loaders.
loader_options = dict(
    batch_size=BATCH_SIZE,
    num_workers=min(8, os.cpu_count() or 1),  # never ask for more workers than there are CPUs
    pin_memory=torch.cuda.is_available(),  # pinned host memory only helps when copying to a GPU
)

# Create data loaders
# Training: reshuffle every epoch and drop the last incomplete batch.
train_loader = DataLoader(train_dataset, shuffle=True, drop_last=True, **loader_options)
# Evaluation: keep every sample (no drop_last) so accuracy covers the whole test set;
# shuffling is unnecessary because the metrics do not depend on sample order.
test_loader = DataLoader(test_dataset, shuffle=False, drop_last=False, **loader_options)
print('Dataloaders created')

100%|██████████| 170M/170M [00:04<00:00, 41.5MB/s]


Datasets created
Dataloaders created




### E. TRAINING PIPELINE

In [None]:
# MODEL

# print the seed value (it is also stored in every checkpoint)
seed = torch.initial_seed()
print('Used seed : {}'.format(seed))

tbwriter = SummaryWriter(log_dir=LOG_DIR)
print('TensorboardX summary writer created')

# create model
alexnet = AlexNet(num_classes=NUM_CLASSES).to(device)

# train on multiple GPUs. different mechanism compared to the manual model-splitting in the paper
# Only pass device ids that exist on this machine: DataParallel rejects unknown ids.
# An empty list becomes None, which lets DataParallel choose the devices itself.
usable_device_ids = [gpu_id for gpu_id in DEVICE_IDS if gpu_id < torch.cuda.device_count()]
alexnet = torch.nn.parallel.DataParallel(alexnet, device_ids=usable_device_ids or None)
print(alexnet)
print('AlexNet created')

# LOSS FUNCTION AND OPTIMIZER

# Loss function
criterion = nn.CrossEntropyLoss()

# Optimizer - hyper-parameters come from the configuration constants so there is a single place to tune them
optimizer = optim.SGD(alexnet.parameters(), lr=LR_INIT, momentum=MOMENTUM, weight_decay=LR_DECAY)

# multiply LR by 1 / 10 after every 30 epochs
lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=30, gamma=0.1)
print('LR Scheduler created')

Used seed : 8573561166426702098
TensorboardX summary writer created
DataParallel(
  (module): AlexNet(
    (features): Sequential(
      (0): Conv2d(3, 96, kernel_size=(11, 11), stride=(4, 4))
      (1): ReLU()
      (2): LocalResponseNorm(5, alpha=0.0001, beta=0.75, k=2)
      (3): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
      (4): Conv2d(96, 256, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
      (5): ReLU()
      (6): LocalResponseNorm(5, alpha=0.0001, beta=0.75, k=2)
      (7): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
      (8): Conv2d(256, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (9): ReLU()
      (10): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (11): ReLU()
      (12): Conv2d(384, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (13): ReLU()
      (14): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    )
    (clas

### TEST IMAGE

In [None]:
from PIL import Image
import numpy as np
from torchvision.datasets import ImageFolder
from torchvision import transforms
from torch.utils.data import DataLoader

# Build a throw-away ImageFolder dataset: one 2x2 RGB image per split, all filed under "class0".
# NOTE(review): train_dataset / test_dataset / train_loader / test_loader are rebound below,
# so any cell executed after this one iterates over the dummy image rather than the
# CIFAR-10 loaders created earlier - confirm this is intended.
dummy_pixels = np.array(
    [
        [[255, 0, 0], [0, 255, 0]],
        [[0, 0, 255], [255, 255, 0]],
    ],
    dtype=np.uint8,
)
for split in ("train", "test"):
    split_dir = f"tiny_data/{split}/class0"
    os.makedirs(split_dir, exist_ok=True)
    img = Image.fromarray(dummy_pixels)
    img.save(f"{split_dir}/dummy.png")

# Same preprocessing steps as the evaluation pipeline: resize, tensor conversion, normalisation.
tiny_steps = [
    transforms.Resize((227, 227)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
]
tiny_transform = transforms.Compose(tiny_steps)

train_dataset = ImageFolder("tiny_data/train", transform=tiny_transform)
test_dataset = ImageFolder("tiny_data/test", transform=tiny_transform)

# One image per batch, loaded in the main process.
train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True, num_workers=0)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False, num_workers=0)


In [None]:
#TRAINING LOOP
# For every epoch: train on train_loader, evaluate on test_loader, print the
# averaged metrics and write a checkpoint into CHECKPOINT_DIR.

total_steps = 1  # global step counter, carried across epochs (next step to run)

for epoch in range(NUM_EPOCHS):
    alexnet.train()  # set the model to training mode
    running_loss = 0.0  # track the loss for each epoch

    for images, labels in train_loader:

        # move tensors to the configured device
        images, labels = images.to(device), labels.to(device)

        # forward pass, calculate the loss
        outputs = alexnet(images)  # forward pass
        loss = criterion(outputs, labels)  # compute loss

        # backward, optimize, and update the parameters
        optimizer.zero_grad()  # zero the gradient buffers
        loss.backward()  # backward pass
        optimizer.step()  # update model weights

        running_loss += loss.item()  # Accumulate loss
        total_steps += 1

    # Advance the LR schedule only after this epoch's optimizer updates.
    # Stepping it before the first optimizer.step() skips the first value of the
    # schedule, so every decay would happen one epoch too early.
    lr_scheduler.step()

    alexnet.eval()  # Set the model to evaluation mode
    correct, total = 0, 0
    val_loss = 0.0  # track the validation loss for each epoch

    with torch.no_grad():
        for inputs, labels in test_loader:

            # move tensors to the configured device
            inputs, labels = inputs.to(device), labels.to(device)

            # forward pass, calculate the loss
            outputs = alexnet(inputs)  # forward pass
            loss = criterion(outputs, labels)

            val_loss += loss.item()

            predicted = outputs.argmax(dim=1)  # index of the highest score = predicted class
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    # per-batch averages of the losses, and accuracy as a percentage
    avg_train_loss = running_loss / len(train_loader)
    avg_val_loss = val_loss / len(test_loader)
    accuracy = 100 * correct / total

    # print train loss, test loss and accuracy for the epoch
    print(f"Epoch {epoch+1} | Train Loss: {avg_train_loss}, Test Loss: {avg_val_loss}, Accuracy: {accuracy}")

    # save checkpoints
    checkpoint_path = os.path.join(CHECKPOINT_DIR, 'alexnet_states_e{}.pkl'.format(epoch+1))
    state = {
        'epoch': epoch,
        'total_steps': total_steps,
        'optimizer': optimizer.state_dict(),
        'model': alexnet.state_dict(),
        'seed': seed,
    }
    torch.save(state, checkpoint_path)



### F. TRAINING WITH TENSORBOARD LOGGING

In [None]:
# ARVIN --------------------------------------------------------------------------------
# start training!!
# Training loop that also logs to TensorBoard: loss and correct-prediction count every
# 10 steps, per-parameter gradient / weight statistics every 100 steps, and one
# checkpoint per epoch.
# NOTE(review): this reuses the same alexnet / optimizer / lr_scheduler objects and the same
# checkpoint file names as the plain training loop, so running both cells continues training
# and overwrites those checkpoints - confirm that is intended.
print('Starting training...')
total_steps = 1
for epoch in range(NUM_EPOCHS):
    for imgs, classes in train_loader:

        # move tensors to the configured device
        imgs, classes = imgs.to(device), classes.to(device)

        # forward pass, calculate the loss
        output = alexnet(imgs)
        loss = criterion(output, classes)

        # backward, optimize, and update the parameters
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # log the information and add to tensorboard
        if total_steps % 10 == 0:
            with torch.no_grad():
                _, preds = torch.max(output, 1)
                # number of correct predictions in this batch (a count, not a ratio)
                accuracy = torch.sum(preds == classes)

                print('Epoch: {} \tStep: {} \tLoss: {:.4f} \tAcc: {}'
                    .format(epoch + 1, total_steps, loss.item(), accuracy.item()))
                tbwriter.add_scalar('loss', loss.item(), total_steps)
                tbwriter.add_scalar('accuracy', accuracy.item(), total_steps)

        # print out gradient values and parameter average values
        if total_steps % 100 == 0:
            with torch.no_grad():
                # print and save the grad of the parameters
                # also print and save parameter values
                print('*' * 10)
                for name, parameter in alexnet.named_parameters():
                    if parameter.grad is not None:
                        avg_grad = torch.mean(parameter.grad)
                        print('\t{} - grad_avg: {}'.format(name, avg_grad))
                        tbwriter.add_scalar('grad_avg/{}'.format(name), avg_grad.item(), total_steps)
                        tbwriter.add_histogram('grad/{}'.format(name),
                                parameter.grad.cpu().numpy(), total_steps)
                    # a parameter always has data, so the weight statistics need no guard
                    avg_weight = torch.mean(parameter.data)
                    print('\t{} - param_avg: {}'.format(name, avg_weight))
                    tbwriter.add_histogram('weight/{}'.format(name),
                            parameter.data.cpu().numpy(), total_steps)
                    tbwriter.add_scalar('weight_avg/{}'.format(name), avg_weight.item(), total_steps)

        total_steps += 1

    # Advance the LR schedule after the epoch's optimizer updates; stepping it first
    # would skip the initial value of the schedule.
    lr_scheduler.step()

    # save checkpoints
    checkpoint_path = os.path.join(CHECKPOINT_DIR, 'alexnet_states_e{}.pkl'.format(epoch + 1))
    state = {
        'epoch': epoch,
        'total_steps': total_steps,
        'optimizer': optimizer.state_dict(),
        'model': alexnet.state_dict(),
        'seed': seed,
    }
    torch.save(state, checkpoint_path)