In [1]:
'''
So, this is the basic explanation of this architecture:

Architecture Overview:
    - The network has 8 learned layers in total: 5 convolutional layers followed by 3 fully-connected layers.
    - Uses ReLU activation (f(x) = max(0, x))
    - Input images are 224x224x3 RGB patches cropped from images rescaled to 256x256

Key Points:
    1. Convolutional Layers:
        - Layer 1: 96 kernels of size 11x11x3, stride 4
        - Layer 2: 256 kernels of size 5x5x48
        - Layer 3: 384 kernels of size 3x3x256
        - Layer 4: 384 kernels of size 3x3x192
        - Layer 5: 256 kernels of size 3x3x192

    2. Fully Connected:
        - First two fully-connected: 4096 neurons each
        - Final layer is 1000-way softmax

    3. GPU Implementation:
        - Used 2 GPUs
        - Communicate only at certain layers
        - Each GPU handles half of the kernels

    4. Local Response Normalization:
        - Hyperparameters of the formula provided in the paper:
            - k = 2
            - n = 5
            - alpha = 10^-4
            - beta = 0.75

    5. Overlapping Pooling:
        - Stride (s) = 2
        - Filter Size (z) = 3
        - Max pooling follows both response-normalization layers and fifth layer

Training Details:
    1. Optimization:
        - Stochastic gradient descent
        - Batch size: 128
        - Momentum: 0.9
        - Weight Decay: 0.0005
        - Initial learning rate: 0.01

    2. Initialization:
        - Weights: Gaussian Distribution
        - Bias Values
            - 1 for convolutional layers 2, 4, 5 and the fully-connected hidden layers
            - 0 for rest of the layers

    3. Dropout:
        - Applied to first two fully-connected
        - Dropout probability: 0.5

    4. Data Augmentation:
        - Random crops of 224x224 from 256x256
        - Horizontal reflections
        - PCA color augmentation
        - 10-crop evaluation at test time

Training Time and Hardware:
    - 5-6 days on two NVIDIA GTX 580 3GB GPUs
    - ~90 epochs through 1.2 million training images

    - Achieved top-5 error rates of:
        - 17.0% on ILSVRC-2010
        - 15.3% on ILSVRC-2012
'''

'\nSo, this is the basic explanation of this architecture:\n\nArchitecture Overview:\n    - This one has 8 layers in total: 5 convolutionals one followed by 3 fully-connected ones.\n    - Uses ReLU activation (f(x) = max(0, x))\n    - Input images are of 224x224x3 RGB ratio images which is center-cropped of 256x256\n\nKey Points:\n    1. Convolutional Layers:\n        - Layer 1: 96 kernels of size 11x11x3, stride 4\n        - Layer 2: 256 kernels of size 5x5x48\n        - Layer 3: 384 kernels of size 3x3x256\n        - Layer 4: 384 kernels of size 3x3x192\n        - Layer 5: 256 kernels of size 3x3x192\n        \n    2. Fully Connected:\n        - First two fully-connected: 4096 neurons each\n        - Final layer is 1000-way softmax\n        \n    3. GPU Implementation:\n        - Used 2 GPUs\n        - Communicate only at certain layers\n        - Handles half of the kernels\n        \n    4. Local Response Normalization:\n        - Formula provides in paper:\n            - k = 2  - 

In [2]:
'''
Implementation:

1. Complete architecture:
    - 5 convolutional layers with specified parameters
    - 3 fully connected layers
    - Which totals the 8 layers
    - Local Response Normalization
    - ReLU activations
    - Max pooling layers
    - Dropout

2. Training Utilities:
    - SGD optimizer
    - Weight initialization
    - Random crops, Flips, Color Jittering - Data Augmentation
    - Training and validation loops

3. Key hyperparameters:
    - Batch size: 128
    - Learning rate: 0.01
    - Momentum: 0.9
    - Weight Decay: 0.0005
    - Dropout rate: 0.5


Note: I don't have access to a dedicated GPU, so this implementation has not been trained to its full extent
'''

"\nImplementation:\n\n1. Complete architecture:\n    - 5 convolutional layers with specified parameters\n    - 3 fully connected layers \n    - Which totals the 8 layers\n    - Local Response Normalization\n    - ReLU activations\n    - Max pooling layers\n    - Dropout\n\n2. Training Utilites:\n    - SGD optimizer \n    - Weight initialization\n    - Random crops, Flips, Color Jittering - Data Augmentation\n    - Training and validation loops\n\n3. Key hyperparameters:\n    - Batch size: 128\n    - Learning rate: 0.01\n    - Momentum: 0.9\n    - Weight Decay: 0.0005\n    - Dropout rate: 0.5\n\n\nNote: I don't have any specialized GPUs to perform this implementation on full extent\n"

In [5]:
import torch
import torch.nn as nn
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
import numpy as np

In [9]:
class ImageNet(nn.Module):
    """AlexNet-style CNN: five convolutional stages followed by three linear layers.

    Local response normalization follows the first two convolutions, and
    overlapping max pooling (3x3 window, stride 2) follows the first, second
    and fifth convolutions. The classifier's first linear layer expects
    256 * 6 * 6 features, which is what the convolutional stack produces for a
    3x224x224 input; the shape comments below assume that input size.

    Args:
        num_classes: Number of logits produced by the final linear layer.
    """

    def __init__(self, num_classes = 1000):
        super(ImageNet, self).__init__()

        # First convolutional stage
        # 3x224x224 -> conv -> 96x55x55 -> pool -> 96x27x27
        self.conv1 = nn.Sequential(
            nn.Conv2d(3, 96, kernel_size = 11, stride = 4, padding = 2),
            nn.ReLU(inplace = True),
            nn.LocalResponseNorm(size = 5, alpha = 0.0001, beta = 0.75, k = 2),
            nn.MaxPool2d(kernel_size = 3, stride = 2)
        )

        # Second convolutional stage
        # 96x27x27 -> conv -> 256x27x27 -> pool -> 256x13x13
        self.conv2 = nn.Sequential(
            nn.Conv2d(96, 256, kernel_size = 5, padding = 2),
            nn.ReLU(inplace = True),
            nn.LocalResponseNorm(size = 5, alpha = 0.0001, beta = 0.75, k = 2),
            nn.MaxPool2d(kernel_size = 3, stride = 2)
        )

        # Third convolutional stage (no normalization, no pooling)
        # 256x13x13 -> 384x13x13
        self.conv3 = nn.Sequential(
            nn.Conv2d(256, 384, kernel_size = 3, padding = 1),
            nn.ReLU(inplace = True),
        )

        # Fourth convolutional stage (no pooling, so the 13x13 map is preserved)
        # 384x13x13 -> 384x13x13
        self.conv4 = nn.Sequential(
            nn.Conv2d(384, 384, kernel_size = 3, padding = 1),
            nn.ReLU(inplace = True),
        )

        # Fifth convolutional stage
        # 384x13x13 -> conv -> 256x13x13 -> pool -> 256x6x6
        self.conv5 = nn.Sequential(
            nn.Conv2d(384, 256, kernel_size = 3, padding = 1),
            nn.ReLU(inplace = True),
            nn.MaxPool2d(kernel_size = 3, stride = 2)
        )

        # Fully connected layers; dropout is applied before the two hidden layers.
        self.classifier = nn.Sequential(
            nn.Dropout(p = 0.5),
            nn.Linear(256 * 6 * 6, 4096),
            nn.ReLU(inplace = True),
            nn.Dropout(p = 0.5),
            nn.Linear(4096, 4096),
            nn.ReLU(inplace = True),
            nn.Linear(4096, num_classes)
        )

        # Initialize weights and biases
        self._initialize_weights()

    def forward(self, x):
        """Run the five convolutional stages and the classifier.

        Args:
            x: Batch of images shaped (N, 3, H, W). H = W = 224 is required
                for the flattened features to match the classifier input size.

        Returns:
            Tensor of shape (N, num_classes) holding unnormalized class scores.
        """
        x = self.conv1(x)
        x = self.conv2(x)
        x = self.conv3(x)
        x = self.conv4(x)
        x = self.conv5(x)
        # Flatten (N, 256, 6, 6) into (N, 9216) for the linear layers.
        x = x.view(x.size(0), -1)
        x = self.classifier(x)
        return x

    def _initialize_weights(self):
        """Initialize weights from N(0, 0.01) and biases to 0 or 1.

        Biases are 1 for the second, fourth and fifth convolutions and for the
        two hidden fully connected layers; every other bias (first and third
        convolutions, output layer) is 0.
        """
        for m in self.modules():
            if isinstance(m, (nn.Conv2d, nn.Linear)):
                nn.init.normal_(m.weight, mean = 0, std = 0.01)
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)

        # Index 0 of each conv stage is its Conv2d; indices 1 and 4 of the
        # classifier are the two hidden Linear layers.
        unit_bias_layers = (
            self.conv2[0],
            self.conv4[0],
            self.conv5[0],
            self.classifier[1],
            self.classifier[4],
        )
        for layer in unit_bias_layers:
            if layer.bias is not None:
                nn.init.constant_(layer.bias, 1)

In [11]:
class ImageNetTrainer:
    """Bundles a model with its SGD optimizer, loss, transforms and train/validate loops.

    Args:
        model: The ``nn.Module`` to train; it is moved to CUDA when available,
            otherwise it stays on the CPU.
        num_epochs: Number of epochs the caller is expected to run.
        batch_size: Batch size the caller is expected to use for its loaders.
        learning_rate: Learning rate passed to SGD.
        momentum: Momentum passed to SGD.
        weight_decay: Weight decay (L2 penalty) passed to SGD.
    """

    def __init__(self, model, num_epochs = 90, batch_size = 128, learning_rate = 0.01,
                 momentum = 0.9, weight_decay = 0.0005):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model = model.to(self.device)
        self.num_epochs = num_epochs
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.momentum = momentum
        self.weight_decay = weight_decay
        self.criterion = nn.CrossEntropyLoss()
        self.optimizer = torch.optim.SGD(
            self.model.parameters(),
            lr = learning_rate,
            momentum = momentum,
            weight_decay = weight_decay
        )

        # Training-time augmentation: random 224x224 crop, horizontal flip and
        # mild color jitter, followed by per-channel normalization.
        self.train_transformer = transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ColorJitter(brightness = 0.1, contrast = 0.1, saturation = 0.1, hue = 0.1),
            transforms.ToTensor(),
            transforms.Normalize(
                mean = [0.485, 0.456, 0.406],
                std = [0.229, 0.224, 0.225])
        ])

        # Deterministic evaluation pipeline: resize to 256, center-crop 224.
        # The misspelled attribute name is kept so existing callers keep working.
        self.val_transfrom = transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize(
                mean = [0.485, 0.456, 0.406],
                std = [0.229, 0.224, 0.225]
            )
        ])
        # Correctly spelled alias for the same transform object.
        self.val_transform = self.val_transfrom

    def train_epoch(self, train_loader):
        """Run one optimization pass over ``train_loader``.

        Args:
            train_loader: Iterable yielding ``(images, labels)`` batches.

        Returns:
            Mean per-batch training loss as a float, or 0.0 if the loader
            yielded no batches.
        """
        self.model.train()
        running_loss = 0.0
        num_batches = 0
        for images, labels in train_loader:
            images = images.to(self.device)
            labels = labels.to(self.device)

            self.optimizer.zero_grad()
            outputs = self.model(images)
            loss = self.criterion(outputs, labels)
            loss.backward()
            self.optimizer.step()

            running_loss += loss.item()
            num_batches += 1
        # Guard against an empty loader instead of dividing by zero.
        return running_loss / num_batches if num_batches else 0.0

    def validate(self, val_loader):
        """Evaluate the model on ``val_loader`` without computing gradients.

        Args:
            val_loader: Iterable yielding ``(images, labels)`` batches.

        Returns:
            Tuple ``(mean per-batch loss, top-1 accuracy in percent)``. Both
            are 0.0 if the loader yielded no batches.
        """
        self.model.eval()
        running_loss = 0.0
        num_batches = 0
        correct = 0
        total = 0
        with torch.no_grad():
            for images, labels in val_loader:
                images = images.to(self.device)
                labels = labels.to(self.device)
                outputs = self.model(images)
                loss = self.criterion(outputs, labels)
                running_loss += loss.item()
                num_batches += 1
                # Index of the largest logit along the class dimension.
                _, predicted = outputs.max(1)
                # size(0) is the number of samples in this batch; size() alone
                # returns a torch.Size and cannot be added to an int.
                total += labels.size(0)
                correct += predicted.eq(labels).sum().item()
        accuracy = 100.0 * correct / total if total else 0.0
        mean_loss = running_loss / num_batches if num_batches else 0.0
        return mean_loss, accuracy

In [12]:
def main(train_dir = "data/train", val_dir = "data/val", num_workers = 4):
    """Build the model and trainer, load the datasets and run the training loop.

    Both directories are read with ``torchvision.datasets.ImageFolder``, so
    each is expected to contain one sub-directory per class.

    Args:
        train_dir: Root directory of the training images.
        val_dir: Root directory of the validation images.
        num_workers: Number of worker processes used by each ``DataLoader``.
    """
    # Imported locally: the notebook's import cell only brings in
    # torchvision.transforms, not torchvision.datasets.
    from torchvision.datasets import ImageFolder

    model = ImageNet(num_classes = 1000)
    trainer = ImageNetTrainer(model)

    # The trainer owns the preprocessing pipelines, so the datasets are
    # created after it.
    train_dataset = ImageFolder(train_dir, transform = trainer.train_transformer)
    val_dataset = ImageFolder(val_dir, transform = trainer.val_transfrom)

    train_loader = DataLoader(
        train_dataset,
        batch_size = trainer.batch_size,
        shuffle = True,
        num_workers = num_workers
    )
    # Validation order does not affect the metrics, so no shuffling.
    val_loader = DataLoader(
        val_dataset,
        batch_size = trainer.batch_size,
        shuffle = False,
        num_workers = num_workers
    )

    for epoch in range(trainer.num_epochs):
        train_loss = trainer.train_epoch(train_loader)
        val_loss, val_accuracy = trainer.validate(val_loader)

        print(f'Epoch [{epoch+1} / {trainer.num_epochs}]')
        print(f'Training Loss: {train_loss:.4f}')
        print(f'Validation Loss: {val_loss:.4f}')
        print(f'Validation Accuracy: {val_accuracy:.2f}%')

In [13]:
# Start training only when this code is executed as the main program.
if __name__ == "__main__":
    main()