In [None]:
pip install einops

In [None]:
#@title enable the notebook widgets extension
!jupyter nbextension enable --py widgetsnbextension

Enabling notebook extension jupyter-js-widgets/extension...
Paths used for configuration of notebook: 
    	/root/.jupyter/nbconfig/notebook.json
Paths used for configuration of notebook: 
    	
      - Validating: [32mOK[0m
Paths used for configuration of notebook: 
    	/root/.jupyter/nbconfig/notebook.json


In [None]:
#@title necessary imports
import einops
from tqdm.notebook import tqdm

from torchsummary import summary

import time
import torch
import torchvision
import matplotlib.pyplot as plt
import numpy as np
import torch
from torch import nn
import torchvision
import torch.optim as optim
from torchvision.transforms import Compose, Resize, ToTensor, Normalize, RandomHorizontalFlip, RandomCrop

In [None]:
#@title configuration
# Use the first CUDA device when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda:0'
else:
    device = 'cpu'
print(device)

# --- Input geometry ---
size = 224              # Side length of the images used for training
patch_size = 16         # Side length P of each square patch
n_channels = 3          # Channels per input image

# --- Model (ViT-Base values) ---
latent_size = 768       # Width D of the latent vectors
num_heads = 12          # Attention heads per encoder layer
num_encoders = 12       # Number of stacked encoder layers
dropout = 0.1           # Dropout rate used with ViT-Base & ImageNet-21k
num_classes = 10        # CIFAR10 has ten classes

# --- Optimisation ---
epochs = 50             # Passes over the training set
batch_size = 4          # Images per batch
base_lr = 9e-3          # Base learning rate
weight_decay = 0.03     # Weight decay for ViT-Base (on ImageNet-21k)


In [None]:
from transformers import ViTConfig, ViTModel

# Build a ViTConfig with the library's default hyper-parameters
# (described by the original author as "vit-base-patch16-224 style").
configuration = ViTConfig()

# Instantiate a ViTModel from that configuration (random weights, nothing is loaded).
# NOTE(review): the name `model` is rebound to a VitTransformer further down in this
# notebook, so this instance is only used to read `.config` back on the next line.
# Confirm it is still needed: it stays in memory until the name is rebound.
model = ViTModel(configuration)

# Read the configuration back from the instantiated model.
configuration = model.config

# Load the pretrained checkpoint identified by the name below.
# Presumably this fetches the weights over the network on first use — confirm that
# the runtime has network access (or a warm cache) before relying on this cell.
model_pretrained = ViTModel.from_pretrained("google/vit-base-patch16-224-in21k")

In [None]:
# Training-time preprocessing, applied every time an item is fetched:
# pad-and-random-crop at 32 px, resize so the shorter side is 224 px,
# convert to a tensor, then normalise each channel with fixed statistics.
transform_training_data = Compose([
    RandomCrop(32, padding=4),
    Resize(224),
    ToTensor(),
    Normalize(mean=(0.4914, 0.4822, 0.4465), std=(0.2023, 0.1994, 0.2010)),
])

# CIFAR10 training split, downloaded into ./data when it is not there yet.
trainset = torchvision.datasets.CIFAR10(
    root='./data',
    train=True,
    download=True,
    transform=transform_training_data,
)

# Batches are served in dataset order (no shuffling) by two worker processes.
trainloader = torch.utils.data.DataLoader(
    trainset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=2,
)

# Short names for the ten classes.
classes = (
    'plane', 'car', 'bird', 'cat', 'deer',
    'dog', 'frog', 'horse', 'ship', 'truck',
)


def image_entropy(image, bins=256):
    """Return the Shannon entropy, in bits, of an image's value histogram.

    All values of ``image`` are pooled together (channels are not treated
    separately), counted into ``bins`` equal-width bins spanning the observed
    minimum..maximum, and the entropy of the resulting distribution is
    returned.

    Args:
        image: Array-like of numeric values, of any shape.
        bins: Number of histogram bins. Defaults to 256.

    Returns:
        float: The entropy in bits. ``0.0`` for an empty or constant image.
    """
    flat_image = np.asarray(image).ravel()

    # An empty image has no distribution; without this guard the normalisation
    # below would be 0 / 0 and the function would return NaN.
    if flat_image.size == 0:
        return 0.0

    hist, _ = np.histogram(flat_image, bins=bins)

    # Keep only occupied bins: empty bins contribute exactly 0 to the entropy,
    # so dropping them avoids log2(0) without adding an epsilon that would
    # bias every term.
    occupied = hist[hist > 0]
    probabilities = occupied / hist.sum()

    entropy = -np.sum(probabilities * np.log2(probabilities))

    # max() turns the -0.0 produced for a single occupied bin into 0.0.
    return max(0.0, float(entropy))


# Rank every training image by the Shannon entropy of its pixel values.
#
# The entropy is measured on a deterministic, un-augmented view of the same
# images. Indexing `trainset` directly would run RandomCrop / Resize / Normalize
# on each item, which makes the ranking change from run to run and upsamples
# every image to 224 px only to histogram it.
entropy_view = torchvision.datasets.CIFAR10(root='./data', train=True,
                                            download=False, transform=ToTensor())
assert len(entropy_view) == len(trainset), "entropy view must index the same images as trainset"

entropies = []
for i in range(len(entropy_view)):
    image, label = entropy_view[i]
    entropy = image_entropy(image.numpy())  # Convert to NumPy array for entropy calculation
    entropies.append((entropy, i))  # Store entropy and image index

# The un-augmented copy is only needed for the ranking.
del entropy_view

# Sort the images in decreasing order of entropy
entropies.sort(reverse=True)

# Create a new dataset with the sorted images. Subset fetches items from
# `trainset`, so the training transform is still applied; only the order changes.
sorted_trainset = torch.utils.data.Subset(trainset, [index for entropy, index in entropies])

# Create a dataloader for the sorted dataset (shuffle stays off to keep the ranking).
sorted_trainloader = torch.utils.data.DataLoader(sorted_trainset, batch_size=batch_size, shuffle=False)

Files already downloaded and verified


In [None]:
class InputEmbedding(nn.Module):
    """Turn a batch of images into the token sequence fed to the encoder stack.

    Each image is split into non-overlapping ``patch_size`` x ``patch_size``
    patches; every patch is flattened and linearly projected to
    ``latent_size``; one learnable [class] token is prepended; and a learnable
    positional embedding (one vector per sequence position) is added.

    Args:
        patch_size: Side length P of the square patches.
        n_channels: Number of channels C of the input images.
        device: Stored as ``self.device``. Kept for backward compatibility;
            inputs are moved to wherever this module's parameters live
            (e.g. after ``module.to(device)``).
        latent_size: Width D of the token embeddings.
        batch_size: Stored as ``self.batch_size`` but no longer used to shape
            any parameter, so the module accepts any batch size. Kept for
            backward compatibility.
        image_size: Side length of the (square) input images. It fixes the
            number of patches and therefore the number of positional
            embeddings. Defaults to the notebook-level ``size``.

    Raises:
        ValueError: If ``image_size`` is not a positive multiple of ``patch_size``.
    """

    def __init__(self, patch_size=patch_size, n_channels=n_channels, device=device,
                 latent_size=latent_size, batch_size=batch_size, image_size=size):
        super(InputEmbedding, self).__init__()

        if image_size <= 0 or image_size % patch_size != 0:
            raise ValueError(
                "image_size ({}) must be a positive multiple of patch_size ({})".format(
                    image_size, patch_size))

        self.latent_size = latent_size
        self.patch_size = patch_size
        self.n_channels = n_channels
        self.device = device
        self.batch_size = batch_size
        self.image_size = image_size
        self.input_size = self.patch_size * self.patch_size * self.n_channels
        self.num_patches = (self.image_size // self.patch_size) ** 2

        self.linearProjection = nn.Linear(self.input_size, self.latent_size)

        # Learnable [class] token, shared by every image in the batch.
        # It is stored as a bare nn.Parameter on purpose: calling .to(device) on a
        # Parameter returns a plain tensor whenever a copy is made, and such a
        # tensor is neither registered on the module nor updated by the optimizer.
        # Module.to() moves registered parameters for us.
        self.class_token = nn.Parameter(torch.randn(1, 1, self.latent_size))

        # Learnable positional embedding: one vector per sequence position
        # ([class] token + every patch), broadcast over the batch dimension.
        self.pos_embedding = nn.Parameter(
            torch.randn(1, self.num_patches + 1, self.latent_size))

    def forward(self, input_data):
        """Embed a batch of images.

        Args:
            input_data: Tensor of shape (batch, channels, height, width) whose
                height and width are multiples of ``patch_size``.

        Returns:
            Tensor of shape (batch, num_patches + 1, latent_size).

        Raises:
            ValueError: If the images do not yield ``num_patches`` patches.
        """
        # Follow the module's own parameters rather than a device string fixed
        # at construction time.
        input_data = input_data.to(self.class_token.device)

        # Re-arrange image into flattened patches: (b, h*w, P*P*C).
        patches = einops.rearrange(
            input_data, 'b c (h h1) (w w1) -> b (h w) (h1 w1 c)',
            h1=self.patch_size, w1=self.patch_size)

        linear_projection = self.linearProjection(patches)
        b, n, _ = linear_projection.shape
        if n != self.num_patches:
            raise ValueError(
                "expected {} patches per image (image_size={}), got {}".format(
                    self.num_patches, self.image_size, n))

        # Prepend the same [class] token to every sequence in the batch.
        class_tokens = self.class_token.expand(b, -1, -1)
        linear_projection = torch.cat((class_tokens, linear_projection), dim=1)

        # Add the per-position embedding (broadcast over the batch).
        return linear_projection + self.pos_embedding



class EncoderBlock(nn.Module):
    """One pre-norm Transformer encoder layer.

    Two sub-layers, each wrapped in a residual connection:
    LayerNorm -> multi-head self-attention, then LayerNorm -> MLP.

    Args:
        latent_size: Width D of the token embeddings.
        num_heads: Number of attention heads.
        device: Stored as ``self.device``; the block itself is moved with
            ``Module.to()``.
        dropout: Dropout rate used inside attention and the MLP.
    """

    def __init__(self, latent_size=latent_size, num_heads=num_heads, device=device, dropout=dropout):
        super(EncoderBlock, self).__init__()

        self.latent_size = latent_size
        self.num_heads = num_heads
        self.device = device
        self.dropout = dropout

        # Each sub-layer gets its own LayerNorm so their scale/shift parameters
        # are learned independently. `norm` precedes attention, `norm2` the MLP.
        self.norm = nn.LayerNorm(self.latent_size)
        self.norm2 = nn.LayerNorm(self.latent_size)

        # Multi-Head Attention layer. batch_first=True is required because the
        # tokens arrive as (batch, sequence, latent); with the default layout
        # the module would treat the batch axis as the sequence and attend
        # across different images instead of across patches.
        self.multihead = nn.MultiheadAttention(
            self.latent_size, self.num_heads, dropout=self.dropout, batch_first=True)

        # Feed-forward MLP of the encoder: hidden width is latent_size * 4
        # (3072 when latent_size is 768).
        self.enc_MLP = nn.Sequential(
            nn.Linear(self.latent_size, self.latent_size*4),
            nn.GELU(),
            nn.Dropout(self.dropout),
            nn.Linear(self.latent_size*4, self.latent_size),
            nn.Dropout(self.dropout)
        )

    def forward(self, embedded_patches):
        """Apply the encoder layer.

        Args:
            embedded_patches: Tensor of shape (batch, sequence, latent_size).

        Returns:
            Tensor of the same shape as ``embedded_patches``.
        """
        # First sublayer: Norm + Multi-Head self-attention.
        # nn.MultiheadAttention returns (attention_output, attention_weights);
        # only the output is used, so the weights are not computed.
        firstNorm_out = self.norm(embedded_patches)
        attention_output = self.multihead(
            firstNorm_out, firstNorm_out, firstNorm_out, need_weights=False)[0]

        # First residual connection
        first_added_output = attention_output + embedded_patches

        # Second sublayer: Norm + enc_MLP (feed forward)
        secondNorm_out = self.norm2(first_added_output)
        ff_output = self.enc_MLP(secondNorm_out)

        # Second residual connection
        return ff_output + first_added_output




class VitTransformer(nn.Module):
    """Vision Transformer classifier: input embedding, encoder stack, MLP head.

    Args:
        num_encoders: Number of stacked encoder layers.
        latent_size: Width D of the token embeddings.
        device: Device identifier handed to the sub-modules; the model itself
            is moved with ``Module.to()``.
        num_classes: Number of output classes.
        dropout: Dropout rate handed to every encoder layer.
    """

    def __init__(self, num_encoders=num_encoders, latent_size=latent_size, device=device, num_classes=num_classes, dropout=dropout):
        super(VitTransformer, self).__init__()
        self.num_encoders = num_encoders
        self.latent_size = latent_size
        self.device = device
        self.num_classes = num_classes
        self.dropout = dropout

        # Forward the constructor arguments so a non-default latent_size /
        # device / dropout reaches the sub-modules instead of being ignored.
        self.embedding = InputEmbedding(device=self.device, latent_size=self.latent_size)

        # Create a stack of encoder layers
        self.encStack = nn.ModuleList([
            EncoderBlock(latent_size=self.latent_size, device=self.device, dropout=self.dropout)
            for _ in range(self.num_encoders)
        ])

        # Classification head: one hidden layer plus one output layer.
        # - The Tanh keeps the hidden layer from collapsing into the output
        #   layer (two stacked Linear layers are a single linear map).
        # - No Softmax here: the head returns raw logits, which is what
        #   nn.CrossEntropyLoss expects (it applies log-softmax itself).
        self.MLP_head = nn.Sequential(
            nn.LayerNorm(self.latent_size),
            nn.Linear(self.latent_size, self.latent_size),
            nn.Tanh(),
            nn.Linear(self.latent_size, self.num_classes)
        )

    def forward(self, test_input):
        """Classify a batch of images.

        Args:
            test_input: Image batch accepted by ``InputEmbedding``.

        Returns:
            Tensor of shape (batch, num_classes) holding unnormalised class
            scores (logits). Apply ``softmax(dim=1)`` to obtain probabilities;
            the arg-max over dim 1 is the predicted class either way.
        """
        # Patchify + linear projection + position embedding
        enc_output = self.embedding(test_input)

        # Run the tokens through every encoder layer. Calling the module
        # (rather than .forward) keeps registered hooks working.
        for enc_layer in self.encStack:
            enc_output = enc_layer(enc_output)

        # Output embedding of the [class] token (sequence position 0)
        cls_token_embedding = enc_output[:, 0]

        # Class scores for every image in the batch
        return self.MLP_head(cls_token_embedding)




# Build the classifier from the notebook configuration and place it on `device`.
model = VitTransformer(
    num_encoders=num_encoders,
    latent_size=latent_size,
    device=device,
    num_classes=num_classes,
).to(device)

# Classification loss computed from the model outputs and integer class targets.
criterion = nn.CrossEntropyLoss()

# Adam with PyTorch's default betas (0.9, 0.999), the values used in the paper.
optimizer = optim.Adam(
    model.parameters(),
    lr=base_lr,
    weight_decay=weight_decay,
)

# Linear learning-rate schedule with PyTorch's default settings.
scheduler = optim.lr_scheduler.LinearLR(optimizer)

OutOfMemoryError: ignored

In [None]:
#@title image_entropy
 #import numpy as np

# def image_entropy(image):

#   # Flatten the image into a 1D array
#   flat_image = image.flatten()

#   # Calculate the probability of each pixel intensity
#   hist, _ = np.histogram(flat_image, bins=256)
#   probabilities = hist / np.sum(hist)

#   # Calculate the entropy using the Shannon entropy formula
#   entropy = -np.sum(probabilities * np.log2(probabilities + 1e-10))

#   return entropy



# import torch
# import torchvision
# import matplotlib.pyplot as plt
# import numpy as np

# entropies = []
# for i in range(len(trainset)):
#     image, label = trainset[i]
#     entropy = image_entropy(image.numpy())  # Convert to NumPy array for entropy calculation
#     entropies.append((entropy, i))  # Store entropy and image index

# # Sort the images in decreasing order of entropy
# entropies.sort(reverse=True)

# # Create a new dataset with the sorted images
# sorted_trainset = torch.utils.data.Subset(trainset, [index for entropy, index in entropies])

# # Create a dataloader for the sorted dataset
# sorted_trainloader = torch.utils.data.DataLoader(sorted_trainset, batch_size=batch_size, shuffle=False)  # Don't shuffle

# # Now you can iterate through sorted_trainloader to access the images in decreasing order of entropy

In [None]:
def main2(log_every=20):
    """Train the notebook-level ``model`` and print loss / accuracy as it goes.

    The function reads these module-level names: ``model``, ``device``,
    ``epochs``, ``trainloader``, ``optimizer``, ``criterion`` and
    ``scheduler``. The model is updated in place; nothing is returned.

    Args:
        log_every: Number of batches covered by each running-loss report.
            Defaults to 20.

    Raises:
        ValueError: If ``log_every`` is smaller than 1.
    """
    if log_every < 1:
        raise ValueError("log_every must be a positive integer")

    model.train().to(device)

    for epoch in tqdm(range(epochs), total=epochs):
        running_loss = 0.0
        correct = 0
        total = 0

        for batch_idx, (inputs, targets) in enumerate(tqdm(trainloader)):

            inputs, targets = inputs.to(device), targets.to(device)

            optimizer.zero_grad()

            outputs = model(inputs)

            loss = criterion(outputs, targets)

            loss.backward()
            optimizer.step()

            running_loss += loss.item()

            # Accuracy bookkeeping; detach() keeps it out of the autograd graph.
            predicted = outputs.detach().argmax(dim=1)
            total += targets.size(0)
            correct += (predicted == targets).sum().item()

            # Report once a full window of `log_every` batches has accumulated.
            # Testing `batch_idx % log_every == 0` would also fire on batch 0,
            # where a single loss value would be divided by the window size.
            if (batch_idx + 1) % log_every == 0:
                print('Batch {} epoch {} has loss = {}'.format(batch_idx, epoch, running_loss/log_every))
                running_loss = 0.0

        # The learning-rate schedule advances once per epoch.
        scheduler.step()

        # Print epoch accuracy (0.0 when the loader yielded no samples).
        epoch_accuracy = 100 * correct / total if total else 0.0
        print('Epoch {} accuracy = {}%'.format(epoch, epoch_accuracy))


In [None]:
# Run the training loop defined in the previous cell; it updates the
# notebook-level `model` in place and prints progress as it goes.
main2()

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

OutOfMemoryError: ignored

In [None]:
# Show the pretrained model's repr as this cell's output.
model_pretrained

In [None]:
# Summarise the parameters instead of printing every value: dumping the raw
# tensors of every layer floods the cell output and bloats the saved notebook.
n_params = 0
for name, param in model.named_parameters():
    print('{:<60} {}'.format(name, tuple(param.shape)))
    n_params += param.numel()
print('Total parameters: {:,}'.format(n_params))

In [None]:
# Print the module tree of whatever is currently bound to `model`.
print(model)

In [None]:
# Print the module tree of the pretrained model.
print(model_pretrained)

In [None]:

# Save only the pretrained model's state_dict (not the model object itself)
# to 'model_weights.pth', relative to the current working directory.
torch.save(model_pretrained.state_dict(), 'model_weights.pth')

In [None]:
# Load the state_dict stored in 'model_weights.pth' into `model`, then switch
# the model to evaluation mode.
# NOTE(review): that file is written from `model_pretrained` (a transformers
# ViTModel), whereas `model` is the VitTransformer defined in this notebook, whose
# parameters live under `embedding`, `encStack` and `MLP_head`. The two state_dicts
# presumably do not share key names, in which case this load raises — confirm,
# then either remap the keys or load the file into `model_pretrained` instead.
model.load_state_dict(torch.load('model_weights.pth'))
model.eval()