In [1]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from contextlib import nullcontext
import uuid


class VGG(nn.Module):
    """
    A custom neural network architecture implemented using PyTorch.
    This class inherits from nn.Module, which is the base class for all neural network modules in PyTorch.
    """
    
    def __init__(self, input_channels, num_classes):
        """
        Initialize the network architecture.
        """
        super().__init__()

        self.block1 = nn.Sequential(
            nn.Conv2d(input_channels, 32, kernel_size=3, padding=1, stride=1),
            nn.ReLU(),
            nn.BatchNorm2d(32),
            nn.Conv2d(32, 32, kernel_size = 3, padding = 1, stride = 1),
            nn.ReLU(),
            nn.BatchNorm2d(32),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Dropout(p=0.25)
        )

        self.block2 = nn.Sequential(
            nn.Conv2d(32, 64, kernel_size=3, padding=1, stride=1),
            nn.ReLU(),
            nn.BatchNorm2d(64),
            nn.Conv2d(64, 64, kernel_size=3, padding=1, stride=1),
            nn.ReLU(),
            nn.BatchNorm2d(64),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Dropout(p=0.25)
        )

        self.classifier = nn.Sequential(
            nn.AdaptiveAvgPool2d((1, 1)),  # Global Average Pooling
            nn.Flatten(),
            nn.Dropout(p=0.5),
            nn.Linear(64, num_classes)  # Direct mapping from channels to classes
        )


    def forward(self, x):
        """
        Define the forward pass of the network. 
        """
        x = self.block1(x)
        x = self.block2(x)
        x = self.classifier(x)
        return x



class ImageDataset(Dataset):
    """
    Custom Dataset for loading image data from CSV file.
    The CSV file should contain image pixel values and labels.
    """
    def __init__(self, csv_file, transform=None, test=False):
        """
        Args:
            csv_file (str): Path to the csv file with image data
            transform (callable, optional): Optional transform to be applied on a sample
            test (bool): Whether this is test data (no labels)
        """
        # Read the CSV file
        self.data = pd.read_csv(csv_file)
        self.transform = transform
        self.test = test
        
        # Separate features (images) and labels
        if not test:
            self.images = self.data.iloc[:, 2:].values  # Skip id and label columns
            self.labels = self.data.iloc[:, 1].values   # Labels are in second column
        else:
            self.images = self.data.iloc[:, 1:].values  # Skip id column
        
        # Dictionary for class names
        self.class_map = {
            0: 'Airplane', 1: 'Automobile', 2: 'Bird', 3: 'Cat',
            4: 'Deer', 5: 'Dog', 6: 'Frog', 7: 'Horse',
            8: 'Ship', 9: 'Truck'
        }

    def __len__(self):
        """Returns the size of the dataset"""
        return len(self.data)

    def __getitem__(self, idx):
        """
        Returns a tuple (image, label) for the given index
        For test data, returns (image, 0) as a dummy label
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()

        # First reshape to (32, 32, 3) for proper image layout
        image = self.images[idx].reshape(32, 32, 3).astype(np.uint8)
        
        # Get label if not test data, otherwise use dummy label
        label = self.labels[idx] if not self.test else 0

        # Convert to tensor and transpose to (3, 32, 32) for PyTorch
        image = torch.from_numpy(image).float() / 255.0
        image = image.permute(2, 0, 1)  # Change from (H,W,C) to (C,H,W)
        
        # Apply transforms if any
        if self.transform:
            image = self.transform(image)

        return image, label

    def save_sample(self, idx):
        """
        Display an image from the dataset
        Args:
            idx (int): Index of the image to display
        """
        image, label = self.__getitem__(idx)
        
        # Convert from (3, 32, 32) back to (32, 32, 3) for plotting
        image_np = (image.permute(1, 2, 0).numpy() * 255).astype(np.uint8)
        
        plt.figure(figsize=(4, 4))
        plt.imshow(image_np)
        plt.title(f'Class: {self.class_map[label]}')
        plt.axis('off')
        # plt.savefig(f'sample_{idx}.png')
        plt.imsave(f'sample_{idx}.png', image_np)


# Implement the datasets for train and test

class ModelManager:
    def __init__(self):
        print("Initializing Model")
        self.model = VGG(input_channels=3, num_classes=10)
        print("Initializing Train Dataset")
        self.train_dataset = ImageDataset('./competition_data/train.csv')
        self.batch_size = 32
        self.lr = 0.0001
        self.epochs = 100
        self.loss_fn = nn.CrossEntropyLoss()
        
        # Device selection logic
        if torch.cuda.is_available():
            self.device = torch.device('cuda')
            print("Using CUDA GPU")
        elif torch.backends.mps.is_available():
            self.device = torch.device('mps')
            print("Using Apple Metal (MPS)")
        else:
            self.device = torch.device('cpu')
            print("Using CPU")
            
        # Move model to the selected device
        self.model = self.model.to(self.device)
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=self.lr)
        
        self.train_dataloader = torch.utils.data.DataLoader(
            self.train_dataset,
            batch_size=self.batch_size,
            shuffle=True
        )
        self.val_dataset = ImageDataset('./competition_data/val.csv')
        self.val_dataloader = torch.utils.data.DataLoader(
            self.val_dataset,
            batch_size=self.batch_size,
            shuffle=True
        )
        os.makedirs('./models', exist_ok=True)
        self.model_save_path = './models'

    def train_model(self):
        train_lossses = []
        train_accuracies = []
        val_losses = []
        val_accuracies = []

        train_total_steps = len(self.train_dataloader)
        val_total_steps = len(self.val_dataloader)

        for epoch in range(self.epochs):
            self.model.train(True)

            train_loss, train_acc = self.train_validate(epoch, self.epochs, train_total_steps)
            val_loss, val_acc = self.train_validate(epoch, self.epochs, val_total_steps, validation=True)
            print("--------------------------------")
            print(f"End of Epoch {epoch + 1} / {self.epochs}")
            print (f"Epoch {epoch + 1} / {self.epochs} --> Val Loss: {val_loss} Val Acc: {val_acc}")
            print("--------------------------------")
            self.save_best_model(val_acc, val_accuracies, epoch)
            train_lossses.append(train_loss)
            train_accuracies.append(train_acc)
            val_losses.append(val_loss)
            val_accuracies.append(val_acc)


        return train_lossses, train_accuracies, val_losses, val_accuracies

    def train_validate(self, epoch: int, total_epochs: int, total_steps: int, validation : bool = False):
        
        running_acc = 0.0
        running_loss = 0.0
        
        # Set model to eval mode during validation
        self.model.train(not validation)
        
        context = torch.no_grad() if validation else nullcontext()
        with context:
            for i, (images, labels) in enumerate(self.val_dataloader if validation else self.train_dataloader):
                images = images.to(self.device)
                labels = labels.to(self.device)

                # Only zero gradients and do backward pass during training
                if not validation:
                    self.optimizer.zero_grad()
                
                outputs = self.model(images)
                loss = self.loss_fn(outputs, labels)
                accuracy = self.compute_accuracy(outputs, labels)

                # Only do backward pass and optimization during training
                if not validation:
                    loss.backward()
                    self.optimizer.step()

                running_loss += loss.item()
                running_acc += accuracy

                if (i + 1) % 100 == 0:
                    print(
                        f"{'Validation' if validation else 'Training'} --> " +
                        f"Epoch {epoch + 1} / {total_epochs} " +
                        f"Step {i + 1} / {total_steps} " +
                        f"Loss: {running_loss / (i+1):.4f} " +
                        f"Accuracy: {running_acc / (i+1):.4f}"
                    )
            
            running_loss = running_loss / total_steps
            running_acc = running_acc / total_steps
            return running_loss, running_acc


    def save_best_model(self, val_acc, val_accuracies, epoch):
        
        model_name = f'Model_MiniVGG_{epoch + 1}_{str(uuid.uuid4())[:4]}.pt'
        if len(val_accuracies) == 0:
            pass
        elif val_acc > max(val_accuracies):
            torch.save(self.model.state_dict(), os.path.join(self.model_save_path, model_name)) 
            with open('summary.txt', 'a') as f:
                f.write(f'{model_name},{val_acc:.4f}\n')



    def compute_accuracy(self, outputs, labels):
        predictions = torch.argmax(outputs, dim=1)

        num_predictions = len(predictions)
        num_incorrect = torch.count_nonzero(predictions - labels)

        accuracy = (num_predictions - num_incorrect) / num_predictions
        return accuracy


import torch
import time
import pandas as pd
from MiniVGG import ImageDataset, VGG
import os
import torch.quantization
import torch.nn.utils.prune as prune

def apply_pruning(model, amount=0.3):
    """Apply PyTorch's built-in pruning"""
    for name, module in model.named_modules():
        # Prune 30% of connections in all Conv2d layers
        if isinstance(module, torch.nn.Conv2d):
            prune.l1_unstructured(module, name='weight', amount=amount)
            # Make pruning permanent
            prune.remove(module, 'weight')
        # Prune 30% of connections in Linear layers
        elif isinstance(module, torch.nn.Linear):
            prune.l1_unstructured(module, name='weight', amount=amount)
            # Make pruning permanent
            prune.remove(module, 'weight')
    return model

def load_best_model():
    # Read summary.txt to find the best model
    with open('./summary.txt', 'r') as f:
        lines = f.readlines()
    
    # Parse model names and accuracies
    models = [(line.split(',')[0], float(line.split(',')[1])) for line in lines]
    # Get the model with highest accuracy
    best_model_name, best_accuracy = max(models, key=lambda x: x[1])
    print(f"Loading best model: {best_model_name} (accuracy: {best_accuracy:.4f})")
    expected_acc = best_accuracy
    
    # Load the model
    model = VGG(input_channels=3, num_classes=10)
    model.load_state_dict(torch.load(os.path.join('models', best_model_name), weights_only=True))
    model.eval()
    
    # Apply pruning
    # model = apply_pruning(model, amount=0.3)  # Prune 30% of weights
    print("Model pruned using PyTorch pruning!")
    
    # Convert to FP16
    model = model.half()
    print("Converted to FP16!")
    
    # Set device
    if torch.cuda.is_available():
        device = torch.device('cuda')
    elif torch.backends.mps.is_available():
        device = torch.device('mps')
        # Optimize model for inference using TorchScript
        try:
            print("Optimizing model for MPS...")
            model = model.to(device)
            # Create example input for tracing
            example_input = torch.randn(1, 3, 32, 32, dtype=torch.float16).to(device)
            # Script and optimize the model
            model = torch.jit.optimize_for_inference(
                torch.jit.script(model)
            )
            torch.backends.mps.graph_executor_enabled = True
            print("Model optimized successfully!")
        except Exception as e:
            print(f"Optimization failed, using standard model: {str(e)}")
    else:
        device = torch.device('cpu')
        model = model.to(device)
    # model = torch.compile(model, mode="reduce-overhead")

    return model, device, expected_acc

def run_benchmark():
    # Load model
    print("Loading best model...")
    model, device, expected_acc = load_best_model()
    
    # Load test dataset
    test_dataset = ImageDataset('./competition_data/test.csv', test=True)
    test_dataloader = torch.utils.data.DataLoader(
        test_dataset,
        batch_size=1,  # Process one sample at a time to measure latency
        shuffle=False
    )
    
    # Warm-up phase
    print("\nWarming up...")
    dummy_input = torch.randn(1, 3, 32, 32, dtype=torch.float16).to(device)
    with torch.no_grad():
            for _ in range(5):  # Run 100 warm-up inferences
                _ = model(dummy_input)
    print("Warm-up complete!")
    
    results = []
    print("\nRunning inference on test dataset...")
    # with torch.no_grad():  # Disable gradient computation for inference
    with torch.inference_mode(mode = True):
        for i, (image, _) in enumerate(test_dataloader):
            image = image.half().to(device)  # Convert to FP16
            
            # Measure inference time
            start_time = time.time()
            output = model(image)
            end_time = time.time()
            
            # Get prediction and latency
            prediction = torch.argmax(output, dim=1).item()
            latency = (end_time - start_time) * 1000  # Convert to milliseconds
            
            results.append({
                'id': i,
                'label': prediction,
                'latency': latency
            })
            
            if (i + 1) % 100 == 0:
                print(f"Processed {i + 1} samples...")
    
    # Save results
    df = pd.DataFrame(results)
    output_file = 'submission.csv'
    df.to_csv(output_file, index=False)
    print(f"Results saved to {output_file}")
    
    # Print summary statistics
    print("\nBenchmark Summary:")
    print(f"Average latency: {df['latency'].mean():.2f} ms")
    print(f"Min latency: {df['latency'].min():.2f} ms")
    print(f"Max latency: {df['latency'].max():.2f} ms")
    print(f"Expected Score: {expected_acc / df['latency'].mean()}")

# if __name__ == "__main__":
#     manager = ModelManager()
#     train_lossses, train_accuracies, val_losses, val_accuracies = manager.train_model()
#     run_benchmark()

### Initialize Model

In [2]:
manager = ModelManager()
train_lossses, train_accuracies, val_losses, val_accuracies = manager.train_model()

Initializing Model
Initializing Train Dataset
Using Apple Metal (MPS)
Training --> Epoch 1 / 100 Step 100 / 1313 Loss: 2.2738 Accuracy: 0.1781
Training --> Epoch 1 / 100 Step 200 / 1313 Loss: 2.1800 Accuracy: 0.2041
Training --> Epoch 1 / 100 Step 300 / 1313 Loss: 2.1214 Accuracy: 0.2242
Training --> Epoch 1 / 100 Step 400 / 1313 Loss: 2.0713 Accuracy: 0.2416
Training --> Epoch 1 / 100 Step 500 / 1313 Loss: 2.0356 Accuracy: 0.2522
Training --> Epoch 1 / 100 Step 600 / 1313 Loss: 2.0024 Accuracy: 0.2629
Training --> Epoch 1 / 100 Step 700 / 1313 Loss: 1.9733 Accuracy: 0.2729
Training --> Epoch 1 / 100 Step 800 / 1313 Loss: 1.9456 Accuracy: 0.2832
Training --> Epoch 1 / 100 Step 900 / 1313 Loss: 1.9239 Accuracy: 0.2911
Training --> Epoch 1 / 100 Step 1000 / 1313 Loss: 1.9019 Accuracy: 0.3003
Training --> Epoch 1 / 100 Step 1100 / 1313 Loss: 1.8788 Accuracy: 0.3091
Training --> Epoch 1 / 100 Step 1200 / 1313 Loss: 1.8602 Accuracy: 0.3166
Training --> Epoch 1 / 100 Step 1300 / 1313 Loss: 1

### Bench Marking

In [3]:
run_benchmark()

Loading best model...
Loading best model: Model_MiniVGG_97_d12d.pt (accuracy: 0.7944)
Model pruned using PyTorch pruning!
Converted to FP16!
Optimizing model for MPS...
Model optimized successfully!

Warming up...
Warm-up complete!

Running inference on test dataset...
Processed 100 samples...
Processed 200 samples...
Processed 300 samples...
Processed 400 samples...
Processed 500 samples...
Processed 600 samples...
Processed 700 samples...
Processed 800 samples...
Processed 900 samples...
Processed 1000 samples...
Processed 1100 samples...
Processed 1200 samples...
Processed 1300 samples...
Processed 1400 samples...
Processed 1500 samples...
Results saved to submission.csv

Benchmark Summary:
Average latency: 0.52 ms
Min latency: 0.47 ms
Max latency: 1.22 ms
Expected Score: 1.5259507159516512


#### I started by making a VGG11 model and got accuracy of about 65%. Then I made VGG19 that resulted in better accuracy of about 85% but the latency was too high. So I tried to make the MiniVGG that gave me the accuracy of 80% and a better latency. I levereged my macbook air GPU by converting the model into the coreML model which resulted in a 3x speed up. I tried various optimization techniques. I started by the built in optimization flags and functions that pytorch provides. I also applied pruning that resulted in a very bad accuracy drop. I changed from FP32 to FP16 that made my model faster without a drop in accuracy. I also tried to apply quantization but it wasn't supported for mac devices.