<a href="https://colab.research.google.com/github/andreunifi/Deployment-of-Quantized-Neural-Networks-on-FPGA/blob/main/cnn_quant_qat.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install brevitas

Collecting brevitas
  Downloading brevitas-0.11.0-py3-none-any.whl.metadata (8.3 kB)
Collecting dependencies==2.0.1 (from brevitas)
  Downloading dependencies-2.0.1-py2.py3-none-any.whl.metadata (3.9 kB)
Collecting numpy<=1.26.4 (from brevitas)
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
Collecting setuptools<70.0 (from brevitas)
  Downloading setuptools-69.5.1-py3-none-any.whl.metadata (6.2 kB)
Collecting unfoldNd (from brevitas)
  Downloading unfoldNd-0.2.3-py3-none-any.whl.metadata (1.5 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.9.1->brevitas)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.9.1->brevitas)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86

In [None]:
import os

# Default directory for training checkpoints; created up front so it exists
# before the first save.
checkpoint_dir = './checkpoints'
os.makedirs(checkpoint_dir, exist_ok=True)


def save_checkpoint(epoch, model, optimizer, loss, checkpoint_dir, filename="checkpoint.pth.tar"):
    """Write the current training state to ``<checkpoint_dir>/<filename>``.

    The file is a dict saved with ``torch.save`` containing the keys
    ``'epoch'``, ``'model_state_dict'``, ``'optimizer_state_dict'`` and
    ``'loss'``.

    Args:
        epoch: Epoch number to record in the checkpoint.
        model: Object exposing ``state_dict()`` (e.g. a ``torch.nn.Module``).
        optimizer: Object exposing ``state_dict()`` (e.g. a torch optimizer).
        loss: Loss value to record alongside the weights.
        checkpoint_dir: Directory to write into; created if it does not exist.
        filename: Name of the checkpoint file inside ``checkpoint_dir``.

    Note:
        ``torch`` is looked up when the function is called, not when it is
        defined, so the notebook's import cell must have run before the
        first call.
    """
    # Create the target directory on demand so callers may pass any path, not
    # only the one prepared at module level. An empty string means "current
    # directory" for os.path.join, and os.makedirs('') would raise, so skip it.
    if checkpoint_dir:
        os.makedirs(checkpoint_dir, exist_ok=True)

    checkpoint_path = os.path.join(checkpoint_dir, filename)
    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'loss': loss,
    }, checkpoint_path)
    print(f"Checkpoint saved at {checkpoint_path}")

In [None]:
import torch
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
import numpy as np

# Brevitas imports
from brevitas.nn import QuantConv2d, QuantLinear, QuantReLU, QuantIdentity
from brevitas.quant import Int8ActPerTensorFixedPoint, Int8WeightPerTensorFixedPoint


In [None]:
# Device configuration: use the GPU when one is visible, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Reproducibility: seed PyTorch's global RNG before any dataset, loader or model
# is created, so that weight initialisation, shuffling order and dropout masks
# repeat between runs. (Some GPU kernels may still be non-deterministic.)
seed = 42
torch.manual_seed(seed)

# Hyper-parameters
num_epochs = 1
batch_size = 4
learning_rate = 0.001

# ToTensor yields values in [0, 1]; Normalize with mean 0.5 / std 0.5 per channel
# then rescales each RGB channel to [-1, 1].
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])

# CIFAR-10 train/test splits, downloaded into ./data when not already present.
train_dataset = torchvision.datasets.CIFAR10(root='./data', train=True,
                                             download=True, transform=transform)
test_dataset = torchvision.datasets.CIFAR10(root='./data', train=False,
                                            download=True, transform=transform)

# Only the training loader is shuffled; the test loader keeps a fixed order.
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                           batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
                                          batch_size=batch_size, shuffle=False)

# Human-readable class names, indexed by integer label.
classes = ('plane', 'car', 'bird', 'cat',
           'deer', 'dog', 'frog', 'horse', 'ship', 'truck')


In [None]:
# Quantized CNN
class QuantCNN(torch.nn.Module):
    """Convolutional classifier assembled from Brevitas quantized layers.

    Layout: input quantizer -> 4 x (3x3 QuantConv2d, padding 1 -> QuantReLU ->
    2x2 max-pool) -> flatten -> QuantLinear(…, 256) -> QuantLinear(256, 128)
    -> QuantLinear(128, 10), with dropout (p=0.5) after the first two linear
    layers. ``forward`` returns 10 raw scores per sample.

    Weights use ``Int8WeightPerTensorFixedPoint``; activations use
    ``Int8ActPerTensorFixedPoint`` with ``bit_width=8``.
    """

    def __init__(self):
        super().__init__()
        bit_width = 8

        # Activation quantizers. Both return a QuantTensor so the following
        # layer receives the quantization metadata together with the values.
        self.quant_input = QuantIdentity(act_quant=Int8ActPerTensorFixedPoint, bit_width=bit_width, return_quant_tensor=True)
        # NOTE(review): this single QuantReLU instance is reused after every
        # conv and linear layer, so all activations presumably share one
        # quantizer state — confirm this is intended for the target deployment.
        self.relu = QuantReLU(act_quant=Int8ActPerTensorFixedPoint, bit_width=bit_width, return_quant_tensor=True)

        # Feature extractor: channels 3 -> 32 -> 64 -> 128 -> 256; padding=1
        # with a 3x3 kernel keeps the spatial size, the pooling halves it.
        self.conv1 = QuantConv2d(3, 32, 3, padding=1, weight_quant=Int8WeightPerTensorFixedPoint)
        self.conv2 = QuantConv2d(32, 64, 3, padding=1, weight_quant=Int8WeightPerTensorFixedPoint)
        self.conv3 = QuantConv2d(64, 128, 3, padding=1, weight_quant=Int8WeightPerTensorFixedPoint)
        self.conv4 = QuantConv2d(128, 256, 3, padding=1, weight_quant=Int8WeightPerTensorFixedPoint)

        self.pool = torch.nn.MaxPool2d(2, 2)
        self.dropout = torch.nn.Dropout(0.5)

        # Must run after the conv/pool layers exist and before fc1 is built,
        # because fc1's input width comes from it.
        self._get_flatten_size()

        # Classifier head
        self.fc1 = QuantLinear(self.flatten_size, 256, weight_quant=Int8WeightPerTensorFixedPoint)
        self.fc2 = QuantLinear(256, 128, weight_quant=Int8WeightPerTensorFixedPoint)
        self.fc3 = QuantLinear(128, 10, weight_quant=Int8WeightPerTensorFixedPoint)

    def _get_flatten_size(self):
        """Set ``self.flatten_size`` to the feature count entering ``fc1``.

        A 1x3x32x32 zero tensor is pushed through the conv + pool stack and
        the flattened length of the result is stored.

        The activation module is deliberately left out of this probe. It does
        not change tensor shapes, and the module is still in training mode
        during construction, so running ``self.relu`` here would feed an
        all-zero-input sample to the activation quantizer before any real data
        is seen (presumably contaminating the statistics it gathers for its
        scale — see the Brevitas runtime-statistics scaling documentation).
        """
        with torch.no_grad():
            probe = torch.zeros(1, 3, 32, 32)
            for conv in (self.conv1, self.conv2, self.conv3, self.conv4):
                probe = self.pool(conv(probe))
            self.flatten_size = probe.view(1, -1).size(1)

    def forward(self, x):
        """Return the 10 class scores for a batch of images ``x``."""
        # Quantize the incoming images first.
        x = self.quant_input(x)

        # Four conv -> activation -> pool stages.
        x = self.pool(self.relu(self.conv1(x)))
        x = self.pool(self.relu(self.conv2(x)))
        x = self.pool(self.relu(self.conv3(x)))
        x = self.pool(self.relu(self.conv4(x)))

        # Flatten everything except the batch dimension.
        x = x.view(x.size(0), -1)

        # Dropout follows the first two linear layers; the last layer's output
        # is returned without an activation.
        x = self.dropout(self.relu(self.fc1(x)))
        x = self.dropout(self.relu(self.fc2(x)))
        x = self.fc3(x)
        return x


# Create and train the model
model = QuantCNN().to(device)

# Cross-entropy loss, optimised with plain SGD over every model parameter.
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

n_total_steps = len(train_loader)
for epoch in range(num_epochs):
    epoch_number = epoch + 1  # 1-based, used for logging and the checkpoint name

    for step, (images, labels) in enumerate(train_loader, start=1):
        images, labels = images.to(device), labels.to(device)

        # Forward pass and loss for this mini-batch.
        outputs = model(images)
        loss = criterion(outputs, labels)

        # Clear stale gradients, back-propagate, update the parameters.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Progress report every 2000 mini-batches.
        if step % 2000 == 0:
            print(f'Epoch [{epoch_number}/{num_epochs}], Step [{step}/{n_total_steps}], Loss: {loss.item():.4f}')

    # One checkpoint per epoch; the recorded loss is that of the epoch's final mini-batch.
    save_checkpoint(epoch_number, model, optimizer, loss.item(), checkpoint_dir,
                    filename=f'checkpoint_epoch_{epoch_number}.pth.tar')

print('Finished Training')

# Evaluation
# Switch to inference mode first: without this the Dropout(0.5) layers stay
# active, so predictions are randomly perturbed and accuracy is under-reported.
model.eval()

n_classes = len(classes)
with torch.no_grad():  # no gradients are needed for scoring
    n_correct = 0
    n_samples = 0
    n_class_correct = [0 for _ in range(n_classes)]
    n_class_samples = [0 for _ in range(n_classes)]
    for images, labels in test_loader:
        images = images.to(device)
        labels = labels.to(device)
        outputs = model(images)

        # Predicted class is the index of the highest score in each row.
        predicted = outputs.argmax(dim=1)
        n_samples += labels.size(0)
        n_correct += (predicted == labels).sum().item()

        # Convert once to plain Python ints so the tallies are indexed with
        # ints rather than one zero-dim (possibly GPU) tensor per sample.
        for label, pred in zip(labels.tolist(), predicted.tolist()):
            if label == pred:
                n_class_correct[label] += 1
            n_class_samples[label] += 1

    acc = 100.0 * n_correct / n_samples
    print(f'Accuracy of the network: {acc:.2f} %')

    for i in range(n_classes):
        # A class absent from the evaluated data has no defined accuracy.
        if n_class_samples[i] == 0:
            print(f'Accuracy of {classes[i]}: n/a (no samples)')
            continue
        acc = 100.0 * n_class_correct[i] / n_class_samples[i]
        print(f'Accuracy of {classes[i]}: {acc:.2f} %')