<a href="https://colab.research.google.com/github/abdullahnaderetman/Alex_Net-vs-VGG/blob/main/vgg_vs_alex_net.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
import time
import matplotlib.pyplot as plt
import numpy as np
from tabulate import tabulate
import os
from collections import OrderedDict

# Output directory for checkpoints, figures and the comparison report.
os.makedirs('results', exist_ok=True)

# Prefer the first CUDA device when one is available, otherwise use the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# Training hyperparameters shared by every model in the comparison.
batch_size = 128        # samples per mini-batch for both data loaders
num_epochs = 10         # full passes over the training set per model
learning_rate = 0.001   # step size handed to the optimizer

# Normalisation statistics (one value per RGB channel), used for both splits.
cifar_mean = (0.4914, 0.4822, 0.4465)
cifar_std = (0.2470, 0.2435, 0.2616)

# Training pipeline: random 32x32 crop after 4-pixel padding and a random
# horizontal flip for augmentation, then tensor conversion and normalisation.
transform_train = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(cifar_mean, cifar_std),
])

# Test pipeline: no augmentation, only tensor conversion and normalisation.
transform_test = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(cifar_mean, cifar_std),
])

# CIFAR-10 training split (shuffled) and its loader.
trainset = torchvision.datasets.CIFAR10(
    root='./data', train=True, download=True, transform=transform_train,
)
trainloader = torch.utils.data.DataLoader(
    trainset, batch_size=batch_size, shuffle=True, num_workers=2,
)

# CIFAR-10 test split (fixed order) and its loader.
testset = torchvision.datasets.CIFAR10(
    root='./data', train=False, download=True, transform=transform_test,
)
testloader = torch.utils.data.DataLoader(
    testset, batch_size=batch_size, shuffle=False, num_workers=2,
)

# Human-readable class names.
classes = (
    'plane', 'car', 'bird', 'cat', 'deer',
    'dog', 'frog', 'horse', 'ship', 'truck',
)

# AlexNet-style network sized for 32x32 CIFAR-10 images
class AlexNet(nn.Module):
    """AlexNet variant built from 3x3 convolutions for 32x32 RGB inputs.

    ``features`` holds five conv+ReLU stages with three 2x2 max-pools
    (32 -> 16 -> 8 -> 4), and ``classifier`` is a three-layer MLP with
    dropout in front of the first two linear layers.

    Args:
        num_classes: Size of the final linear layer's output.
    """

    def __init__(self, num_classes=10):
        super().__init__()

        def conv_relu(in_ch, out_ch):
            # One 3x3, stride-1, same-padding convolution followed by ReLU.
            return [
                nn.Conv2d(in_ch, out_ch, kernel_size=3, stride=1, padding=1),
                nn.ReLU(inplace=True),
            ]

        def halve():
            # 2x2 max-pool that halves the spatial resolution.
            return [nn.MaxPool2d(kernel_size=2, stride=2)]

        # Layers are created in the same order as they appear in the
        # Sequential so module indices (and state_dict keys) stay stable.
        stages = []
        stages += conv_relu(3, 64) + halve()       # 32x32 -> 16x16
        stages += conv_relu(64, 192) + halve()     # 16x16 -> 8x8
        stages += conv_relu(192, 384)
        stages += conv_relu(384, 256)
        stages += conv_relu(256, 256) + halve()    # 8x8 -> 4x4
        self.features = nn.Sequential(*stages)

        # 256 channels on a 4x4 grid feed the fully connected head.
        flattened = 256 * 4 * 4
        self.classifier = nn.Sequential(
            nn.Dropout(0.5),
            nn.Linear(flattened, 4096),
            nn.ReLU(inplace=True),
            nn.Dropout(0.5),
            nn.Linear(4096, 1024),
            nn.ReLU(inplace=True),
            nn.Linear(1024, num_classes),
        )

    def forward(self, x):
        """Return class scores for a batch of images ``x``."""
        maps = self.features(x)
        return self.classifier(torch.flatten(maps, 1))

# Define VGG model with configurable depth and batch normalization
class VGG(nn.Module):
    """VGG-style CNN assembled from a layer configuration list.

    Args:
        cfg: Sequence describing the feature extractor. An int adds a 3x3
            convolution with that many output channels (followed by an
            optional BatchNorm2d and a ReLU); the string ``'M'`` adds a 2x2
            max-pool.
        batch_norm: Insert ``nn.BatchNorm2d`` after every convolution.
        num_classes: Size of the final linear layer's output.
        input_size: Side length of the (square) input images. Used only to
            size the first linear layer.
        in_channels: Number of channels of the input images.

    Raises:
        ValueError: If ``cfg`` contains no convolution, or if it pools the
            input down to an empty feature map.
    """

    def __init__(self, cfg, batch_norm=False, num_classes=10,
                 input_size=32, in_channels=3):
        super().__init__()

        conv_widths = [v for v in cfg if v != 'M']
        if not conv_widths:
            raise ValueError("cfg must contain at least one convolutional layer")

        # Every 'M' halves the resolution (floor division), so the final map
        # side is input_size // 2**num_pools.
        num_pools = len(cfg) - len(conv_widths)
        output_size = input_size // (2 ** num_pools)
        if output_size < 1:
            raise ValueError(
                f"cfg has {num_pools} pooling layers, too many for "
                f"input_size={input_size}"
            )

        self.batch_norm = batch_norm
        self.features = self._make_layers(cfg, batch_norm, in_channels)

        # The channel count entering the classifier is the width of the LAST
        # convolution wherever it sits in cfg (not necessarily cfg[-2]).
        flat_features = conv_widths[-1] * output_size * output_size

        self.classifier = nn.Sequential(
            nn.Dropout(0.5),
            nn.Linear(flat_features, 512),
            nn.ReLU(inplace=True),
            nn.Dropout(0.5),
            nn.Linear(512, num_classes)
        )
        self._initialize_weights()

    def forward(self, x):
        """Return class scores for a batch of images ``x``."""
        x = self.features(x)
        # flatten (rather than view) also copes with non-contiguous tensors
        # and matches AlexNet.forward.
        x = torch.flatten(x, 1)
        x = self.classifier(x)
        return x

    def _make_layers(self, cfg, batch_norm=False, in_channels=3):
        """Translate ``cfg`` into an ``nn.Sequential`` feature extractor."""
        layers = []
        for v in cfg:
            if v == 'M':
                layers.append(nn.MaxPool2d(kernel_size=2, stride=2))
                continue
            layers.append(nn.Conv2d(in_channels, v, kernel_size=3, padding=1))
            if batch_norm:
                layers.append(nn.BatchNorm2d(v))
            layers.append(nn.ReLU(inplace=True))
            in_channels = v
        return nn.Sequential(*layers)

    def _initialize_weights(self):
        """Initialise conv (Kaiming normal), batch-norm (1/0) and linear (N(0, 0.01)) layers."""
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.Linear):
                nn.init.normal_(m.weight, 0, 0.01)
                nn.init.constant_(m.bias, 0)

# Layer layouts for the VGG variants: an int is the number of output channels
# of a convolution, 'M' marks a max-pooling step.
vgg_configs = {
    'VGG16': [
        64, 64, 'M',
        128, 128, 'M',
        256, 256, 256, 'M',
        512, 512, 512, 'M',
        512, 512, 512, 'M',
    ],
    'VGG11': [
        64, 'M',
        128, 'M',
        256, 256, 'M',
        512, 512, 'M',
        512, 512, 'M',
    ],
    'VGG8': [
        64, 'M',
        128, 'M',
        256, 'M',
        512, 'M',
        512, 'M',
    ],
}

# Function to visualize filters
def visualize_filters(model, layer_idx, title, save_path):
    """Save a grid image of the kernels of one conv layer in ``model.features``.

    Args:
        model: Module exposing a ``features`` container; only its direct
            children are searched for ``nn.Conv2d`` layers.
        layer_idx: Zero-based index among those convolution layers.
        title: Prefix for the figure title.
        save_path: Destination passed to ``plt.savefig``.

    Returns:
        None. Prints a message and returns early when no convolution exists
        at ``layer_idx``.
    """
    conv_layers = [
        layer for layer in model.features.children()
        if isinstance(layer, nn.Conv2d)
    ]
    if not 0 <= layer_idx < len(conv_layers):
        print("Convolutional layer not found")
        return
    conv_layer = conv_layers[layer_idx]

    # Get the weights
    weights = conv_layer.weight.detach().cpu().numpy()

    # Rescale all weights jointly to [0, 1] for display. A constant tensor
    # has zero range; map it to zeros instead of dividing by zero (NaNs).
    min_val = weights.min()
    value_range = weights.max() - min_val
    if value_range > 0:
        weights = (weights - min_val) / value_range
    else:
        weights = np.zeros_like(weights)

    # Plot the first 64 filters (or all if fewer than 64)
    n_filters = min(64, weights.shape[0])
    n_cols = 8
    n_rows = (n_filters + n_cols - 1) // n_cols  # ceiling division

    fig = plt.figure(figsize=(15, 15))
    try:
        for i in range(n_filters):
            plt.subplot(n_rows, n_cols, i + 1)

            if weights.shape[1] == 3:
                # Three input channels: render the kernel as an RGB image (H, W, C).
                plt.imshow(np.transpose(weights[i], (1, 2, 0)))
            else:
                # Any other channel count: show only the first input channel's kernel.
                plt.imshow(weights[i, 0], cmap='viridis')

            plt.axis('off')

        # Only index 0 is really the "first layer"; label other layers by number.
        layer_label = 'First Layer' if layer_idx == 0 else f'Conv Layer {layer_idx + 1}'
        plt.suptitle(f'{title} - {layer_label} Filters', fontsize=16)
        plt.tight_layout(rect=[0, 0, 1, 0.95])
        plt.savefig(save_path)
    finally:
        # Release the figure even if plotting or saving fails.
        plt.close(fig)

# Function to visualize feature maps
def visualize_feature_maps(model, layer_idx, image, title, save_path):
    """Save a grid of the activations one conv layer produces for ``image``.

    Args:
        model: Network to probe; all of its sub-modules are searched for
            ``nn.Conv2d`` layers.
        layer_idx: Zero-based index among those convolution layers.
        image: A single, un-batched image tensor; a batch dimension is added
            before the forward pass.
        title: Prefix for the figure title.
        save_path: Destination passed to ``plt.savefig``.

    Returns:
        None. Prints a message and returns early when the layer does not
        exist or is never executed by ``model``'s forward pass.

    The forward pass runs in eval mode under ``torch.no_grad()``; the model's
    previous train/eval mode is restored afterwards.
    """
    conv_layers = [m for m in model.modules() if isinstance(m, nn.Conv2d)]
    if not 0 <= layer_idx < len(conv_layers):
        print(f"Could not find convolutional layer at index {layer_idx}")
        return
    target_layer = conv_layers[layer_idx]

    captured = []

    def hook_fn(module, inputs, output):
        captured.append(output.detach().cpu().numpy())

    was_training = model.training
    hook = target_layer.register_forward_hook(hook_fn)
    try:
        model.eval()
        with torch.no_grad():
            # Send the image to wherever the probed layer lives instead of
            # relying on a module-level device variable.
            model(image.unsqueeze(0).to(target_layer.weight.device))
    finally:
        # Always detach the hook and restore the mode, even if forward fails.
        hook.remove()
        model.train(was_training)

    if not captured:
        print(f"Convolutional layer at index {layer_idx} was not executed during the forward pass")
        return

    # First (only) hook call, first (only) sample in the batch.
    feature_maps = captured[0][0]

    # Plot the first 64 feature maps (or all if fewer than 64)
    n_features = min(64, feature_maps.shape[0])
    n_cols = 8
    n_rows = (n_features + n_cols - 1) // n_cols  # ceiling division

    fig = plt.figure(figsize=(15, 15))
    try:
        for i in range(n_features):
            plt.subplot(n_rows, n_cols, i + 1)
            plt.imshow(feature_maps[i], cmap='viridis')
            plt.axis('off')

        plt.suptitle(f'{title} - Feature Maps of Layer {layer_idx+1}', fontsize=16)
        plt.tight_layout(rect=[0, 0, 1, 0.95])
        plt.savefig(save_path)
    finally:
        # Release the figure even if plotting or saving fails.
        plt.close(fig)

# Function to train models
def train_and_evaluate(model, model_name, optimizer, criterion, epochs):
    """Train ``model`` and evaluate it on the test loader after every epoch.

    Reads the module-level ``trainloader``, ``testloader`` and ``device``.

    Side effects (all under ``results/``):
        * ``{model_name}_best.pth``: weights of the epoch with the highest
          test accuracy so far.
        * ``{model_name}_final.pth``: weights after the last epoch.
        * ``{model_name}_feature_maps_epoch{k}.png``: first-conv feature maps
          halfway through training and after the last epoch.
        * ``{model_name}_filters.png``: first-conv filters after training.

    Args:
        model: Network to train (already moved to ``device``).
        model_name: Label used in log lines and output file names.
        optimizer: Optimizer bound to ``model``'s parameters.
        criterion: Loss taking ``(outputs, labels)``.
        epochs: Number of passes over the training loader; must be >= 1.

    Returns:
        dict with per-epoch lists ``train_losses``, ``train_accuracies``,
        ``test_accuracies``, ``test_losses``, plus ``avg_time_per_epoch``
        (seconds, training phase only) and ``final_test_accuracy``.

    Raises:
        ValueError: If ``epochs`` is less than 1.
    """
    if epochs < 1:
        raise ValueError("epochs must be at least 1")

    train_losses = []
    train_accuracies = []
    test_losses = []
    test_accuracies = []
    times_per_epoch = []

    best_accuracy = 0.0

    for epoch in range(epochs):
        # ---- training phase ----
        model.train()
        running_loss = 0.0     # reset every 100 batches, for progress logging
        epoch_loss_sum = 0.0   # accumulated over the whole epoch
        num_batches = 0
        correct = 0
        total = 0
        start_time = time.time()

        for i, (inputs, labels) in enumerate(trainloader):
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            batch_loss = loss.item()
            running_loss += batch_loss
            epoch_loss_sum += batch_loss
            num_batches += 1

            predicted = outputs.argmax(dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

            if i % 100 == 99:
                print(f'[{model_name} - Epoch {epoch+1}, Batch {i+1}] Loss: {running_loss/100:.3f}, Acc: {100*correct/total:.2f}%')
                running_loss = 0.0

        epoch_time = time.time() - start_time
        times_per_epoch.append(epoch_time)

        # Training metrics are measured in train mode (dropout active) while
        # the weights are still being updated.
        train_accuracy = 100 * correct / total
        train_accuracies.append(train_accuracy)
        train_losses.append(epoch_loss_sum / num_batches)

        # ---- evaluation phase ----
        model.eval()
        test_loss = 0.0
        correct = 0
        total = 0

        with torch.no_grad():
            for images, labels in testloader:
                images, labels = images.to(device), labels.to(device)
                outputs = model(images)
                test_loss += criterion(outputs, labels).item()
                predicted = outputs.argmax(dim=1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

        test_accuracy = 100 * correct / total
        test_accuracies.append(test_accuracy)
        test_losses.append(test_loss / len(testloader))

        print(f'{model_name} Epoch {epoch+1} - '
              f'Train Acc: {train_accuracy:.2f}%, Test Acc: {test_accuracy:.2f}%, '
              f'Time: {epoch_time:.2f}s')

        # Save the best model.
        # NOTE(review): "best" is selected on the test set, so the reported
        # test accuracy of this checkpoint is optimistically biased.
        if test_accuracy > best_accuracy:
            best_accuracy = test_accuracy
            torch.save(model.state_dict(), f'results/{model_name}_best.pth')

        # Visualize feature maps after half training and at the end
        if epoch == epochs // 2 - 1 or epoch == epochs - 1:
            # First image of the first test batch (test loader is not shuffled)
            sample_data = next(iter(testloader))
            sample_image = sample_data[0][0].to(device)

            visualize_feature_maps(
                model, 0, sample_image,
                f'{model_name} - Epoch {epoch+1}',
                f'results/{model_name}_feature_maps_epoch{epoch+1}.png'
            )

    # Calculate average time per epoch
    avg_time_per_epoch = sum(times_per_epoch) / len(times_per_epoch)

    # Save the final model
    torch.save(model.state_dict(), f'results/{model_name}_final.pth')

    # Visualize filters of the first layer
    visualize_filters(model, 0, model_name, f'results/{model_name}_filters.png')

    return {
        'train_losses': train_losses,
        'train_accuracies': train_accuracies,
        'test_accuracies': test_accuracies,
        'test_losses': test_losses,
        'avg_time_per_epoch': avg_time_per_epoch,
        'final_test_accuracy': test_accuracies[-1]
    }

# Plot training curves
def plot_training_curves(results_dict):
    """Draw a 2x2 comparison figure and save it under ``results/``.

    Panels 1-3 are per-epoch line plots (training accuracy, test accuracy,
    test loss) with one line per model; panel 4 is a bar chart of the average
    time per epoch.

    Args:
        results_dict: Maps model name to a dict with the keys
            ``train_accuracies``, ``test_accuracies``, ``test_losses`` and
            ``avg_time_per_epoch``.
    """
    plt.figure(figsize=(20, 10))

    # (subplot position, results key, panel title, y-axis label)
    line_panels = (
        (1, 'train_accuracies', 'Training Accuracy', 'Accuracy (%)'),
        (2, 'test_accuracies', 'Test Accuracy', 'Accuracy (%)'),
        (3, 'test_losses', 'Test Loss', 'Loss'),
    )
    for position, key, heading, y_label in line_panels:
        plt.subplot(2, 2, position)
        for name, metrics in results_dict.items():
            series = metrics[key]
            epochs_axis = range(1, len(series) + 1)
            plt.plot(epochs_axis, series, marker='o', label=name)
        plt.title(heading, fontsize=16)
        plt.xlabel('Epochs', fontsize=14)
        plt.ylabel(y_label, fontsize=14)
        plt.grid(True)
        plt.legend()

    # Bar chart of average epoch duration, each bar annotated with its value.
    plt.subplot(2, 2, 4)
    names = list(results_dict.keys())
    mean_times = [results_dict[name]['avg_time_per_epoch'] for name in names]
    plt.bar(names, mean_times)
    plt.title('Average Time per Epoch', fontsize=16)
    plt.xlabel('Model', fontsize=14)
    plt.ylabel('Time (seconds)', fontsize=14)
    for bar_pos, seconds in enumerate(mean_times):
        plt.text(bar_pos, seconds + 0.05, f'{seconds:.2f}s', ha='center')

    plt.tight_layout()
    plt.savefig('results/training_curves_comparison.png')
    plt.close()

# Compare final results in a table
def compare_final_results(results_dict):
    """Print a comparison table and write it, with a short analysis, to disk.

    The table is sorted by final test accuracy (descending) and is written to
    ``results/model_comparison_results.txt`` together with the most accurate
    model, the model with the smallest train-minus-test accuracy gap, and the
    model with the lowest average epoch time.

    Args:
        results_dict: Maps model name to a dict with the keys
            ``train_accuracies``, ``test_accuracies`` (non-empty lists) and
            ``avg_time_per_epoch``.

    If ``results_dict`` is empty, only the (empty) table is printed and
    written; the analysis section is skipped.
    """
    headers = ['Model', 'Final Test Accuracy', 'Avg Time/Epoch', 'Overfitting (Train-Test Acc)']

    # Keep the raw numbers next to each name so ranking uses full precision
    # instead of re-parsing the rounded, formatted strings.
    records = []
    for model_name, results in results_dict.items():
        final_test_acc = results['test_accuracies'][-1]
        gap = results['train_accuracies'][-1] - final_test_acc
        records.append((model_name, final_test_acc, results['avg_time_per_epoch'], gap))

    # Sort by test accuracy (descending); the sort is stable for ties.
    records.sort(key=lambda rec: rec[1], reverse=True)

    def fmt_acc(value):
        return f"{value:.2f}%"

    def fmt_time(value):
        return f"{value:.2f}s"

    table = tabulate(
        [[name, fmt_acc(acc), fmt_time(secs), fmt_acc(gap)]
         for name, acc, secs, gap in records],
        headers,
        tablefmt="grid",
    )

    # Print table
    print("\nModel Comparison Results:\n")
    print(table)

    # Save to file
    with open('results/model_comparison_results.txt', 'w', encoding='utf-8') as f:
        f.write("Model Comparison Results:\n\n")
        f.write(table)

        if not records:
            # Nothing to analyse.
            return

        f.write("\n\nAnalysis:\n")

        # Best model = first row after the descending sort.
        best_model, best_acc = records[0][0], records[0][1]
        f.write(f"\n1. {best_model} achieved the best performance with {fmt_acc(best_acc)} test accuracy.")

        # Smallest train-test gap (first one wins on ties).
        least_gap = min(records, key=lambda rec: rec[3])
        f.write(f"\n2. {least_gap[0]} showed the least overfitting ({fmt_acc(least_gap[3])}).")

        # Lowest average epoch time (first one wins on ties).
        fastest = min(records, key=lambda rec: rec[2])
        f.write(f"\n3. {fastest[0]} was the most efficient, taking only {fmt_time(fastest[2])} per epoch.")

        # Concluding remark
        f.write("\n\nConclusion: ")
        if best_model == least_gap[0] and best_model == fastest[0]:
            f.write(f"{best_model} offers the best balance of accuracy, generalization, and efficiency.")
        else:
            f.write(f"Different models excel in different aspects. Consider the trade-offs between accuracy ({best_model}), "
                    f"generalization ({least_gap[0]}), and efficiency ({fastest[0]}).")

# Main execution
def main():
    """Train every architecture in turn, then plot and tabulate the comparison.

    Each model is built, trained with its own Adam optimizer (module-level
    ``learning_rate``) for ``num_epochs`` epochs, and released before the
    next one is created. Results are collected per model name and passed to
    ``plot_training_curves`` and ``compare_final_results``.
    """
    print("CIFAR-10 CNN Architecture Experiment")
    print("===================================")
    print(f"Training on device: {device}")

    # (result key / file prefix, banner, model factory). Factories are lazy
    # so only one model exists at a time.
    experiments = [
        ("AlexNet", "\n[AlexNet Training]",
         lambda: AlexNet()),
        ("VGG16", "\n[VGG16 Training]",
         lambda: VGG(vgg_configs['VGG16'], batch_norm=False)),
        ("VGG16_BN", "\n[VGG16 with BatchNorm Training]",
         lambda: VGG(vgg_configs['VGG16'], batch_norm=True)),
        # Reduced-depth VGG, trained with batch normalization
        ("VGG8", "\n[VGG8 Training]",
         lambda: VGG(vgg_configs['VGG8'], batch_norm=True)),
    ]

    criterion = nn.CrossEntropyLoss()
    all_results = {}

    for name, banner, build_model in experiments:
        print(banner)
        model = build_model().to(device)
        optimizer = optim.Adam(model.parameters(), lr=learning_rate)
        all_results[name] = train_and_evaluate(model, name, optimizer, criterion, num_epochs)

        # Drop the finished model and optimizer state before building the
        # next model, and hand cached GPU blocks back to the driver.
        del model, optimizer
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    # Plot all training curves
    plot_training_curves(all_results)

    # Compare final results
    compare_final_results(all_results)

    print("\nExperiment completed. Results saved in the 'results' directory.")

if __name__ == "__main__":
    main()

Using device: cuda:0


100%|██████████| 170M/170M [00:03<00:00, 44.2MB/s]


CIFAR-10 CNN Architecture Experiment
Training on device: cuda:0

[AlexNet Training]
[AlexNet - Epoch 1, Batch 100] Loss: 2.063, Acc: 21.12%
[AlexNet - Epoch 1, Batch 200] Loss: 1.721, Acc: 27.94%
[AlexNet - Epoch 1, Batch 300] Loss: 1.576, Acc: 32.14%
AlexNet Epoch 1 - Train Acc: 35.31%, Test Acc: 50.10%, Time: 20.90s
[AlexNet - Epoch 2, Batch 100] Loss: 1.382, Acc: 49.61%
[AlexNet - Epoch 2, Batch 200] Loss: 1.340, Acc: 50.35%
[AlexNet - Epoch 2, Batch 300] Loss: 1.272, Acc: 51.65%
AlexNet Epoch 2 - Train Acc: 52.43%, Test Acc: 60.78%, Time: 20.01s
[AlexNet - Epoch 3, Batch 100] Loss: 1.169, Acc: 57.80%
[AlexNet - Epoch 3, Batch 200] Loss: 1.161, Acc: 58.26%
[AlexNet - Epoch 3, Batch 300] Loss: 1.141, Acc: 58.43%
AlexNet Epoch 3 - Train Acc: 59.14%, Test Acc: 64.83%, Time: 18.63s
[AlexNet - Epoch 4, Batch 100] Loss: 1.056, Acc: 62.20%
[AlexNet - Epoch 4, Batch 200] Loss: 1.047, Acc: 62.43%
[AlexNet - Epoch 4, Batch 300] Loss: 1.018, Acc: 63.08%
AlexNet Epoch 4 - Train Acc: 63.47%, Tes