In [None]:
!pip install transformers



Image processing: from a URL to a tensor of shape (1, 3, 224, 224)

In [None]:
from io import BytesIO
import numpy as np
from PIL import Image
import requests
import torch

# URL of the image
url = 'http://images.cocodataset.org/val2017/000000039769.jpg'

# Download the image. The timeout prevents the cell from hanging forever on a
# stalled connection, and raise_for_status() surfaces HTTP errors here instead
# of handing an error page to PIL.
response = requests.get(url, timeout=30)
response.raise_for_status()

# Decode the bytes and force 3-channel RGB so that the per-channel
# normalization and the channels-first transpose below also work for
# grayscale, RGBA or palette images.
image = Image.open(BytesIO(response.content)).convert('RGB')

# Resize the image with bilinear resampling (named constant instead of the
# bare integer 2).
new_size = (224, 224)
image = image.resize(new_size, resample=Image.BILINEAR)

# Convert the image to a NumPy array of shape (224, 224, 3) and rescale the
# 8-bit pixel values to [0, 1]
image = np.array(image)
image = image / 255

# Standardize each channel: (x - mean) / std, which maps [0, 1] to [-1, 1]
# for mean = std = 0.5
image_mean = [0.5, 0.5, 0.5]
image_std = [0.5, 0.5, 0.5]
image = (image - image_mean) / image_std

# Rearrange dimensions to (1, 3, 224, 224) to match the model input shape
image = np.transpose(image, (2, 0, 1))  # Channels-first format
image = image[np.newaxis, ...]  # Add a batch dimension

# Convert the NumPy array (float64 after the division) to a float32 PyTorch tensor
image_tensor = torch.tensor(image, dtype=torch.float32)

# Fail loudly if the preprocessing ever produces an unexpected shape
assert image_tensor.shape == (1, 3, 224, 224), image_tensor.shape

In [None]:
# Display the tensor built by the preprocessing cell (the notebook renders the
# value of the cell's last expression).
image_tensor

tensor([[[[ 0.1137,  0.1686,  0.1843,  ..., -0.1922, -0.1843, -0.1843],
          [ 0.1373,  0.1686,  0.1843,  ..., -0.1922, -0.1922, -0.2078],
          [ 0.1137,  0.1529,  0.1608,  ..., -0.2314, -0.2235, -0.2157],
          ...,
          [ 0.8353,  0.7882,  0.7333,  ...,  0.7020,  0.6471,  0.6157],
          [ 0.8275,  0.7961,  0.7725,  ...,  0.5843,  0.4667,  0.3961],
          [ 0.8196,  0.7569,  0.7569,  ...,  0.0745, -0.0510, -0.1922]],

         [[-0.8039, -0.8118, -0.8118,  ..., -0.8902, -0.8902, -0.8980],
          [-0.7882, -0.7882, -0.7882,  ..., -0.8745, -0.8745, -0.8824],
          [-0.8118, -0.8039, -0.7882,  ..., -0.8902, -0.8902, -0.8902],
          ...,
          [-0.2706, -0.3176, -0.3647,  ..., -0.4275, -0.4588, -0.4824],
          [-0.2706, -0.2941, -0.3412,  ..., -0.4824, -0.5451, -0.5765],
          [-0.2784, -0.3412, -0.3490,  ..., -0.7333, -0.7804, -0.8353]],

         [[-0.5451, -0.4667, -0.4824,  ..., -0.7412, -0.6941, -0.7176],
          [-0.5529, -0.5137, -

Defining custom model

In [None]:
import torch
import torch.nn as nn
from transformers import ViTForImageClassification

class CustomViTModel(nn.Module):
    """Thin wrapper around a pretrained ``ViTForImageClassification``.

    The forward pass calls the wrapped model's sub-modules one by one
    (embeddings, encoder, layernorm, classifier) and returns the classifier
    output directly.
    """

    def __init__(self, model_name):
        """Load the pretrained checkpoint identified by ``model_name``."""
        super().__init__()

        # Wrapped Hugging Face model; its sub-modules are used in forward()
        self.model = ViTForImageClassification.from_pretrained(model_name)

    def forward(self, images):
        """Run ``images`` through the wrapped model and return the classifier output.

        Args:
            images: input passed straight to the embeddings module
                (the notebook feeds a (1, 3, 224, 224) float tensor).

        Returns:
            The result of the classifier applied to index 0 along dim 1 of the
            layer-normalized encoder output.
        """
        vit = self.model.vit

        # embeddings -> encoder -> final hidden states
        hidden_states = vit.encoder(vit.embeddings(images)).last_hidden_state
        normalized = vit.layernorm(hidden_states)

        # Only the first position along dim 1 is fed to the classification head
        first_token = normalized[:, 0]
        return self.model.classifier(first_token)

# Usage example
model_name = 'google/vit-base-patch16-224'
custom_model = CustomViTModel(model_name)

# Make inference mode explicit rather than relying on the state the checkpoint
# loader happens to leave the module in.
custom_model.eval()

# Assuming 'image_tensor' is a torch tensor with shape (1, 3, 224, 224)
# No gradients are needed for a forward-only check, so skip building the
# autograd graph.
with torch.no_grad():
    outputs = custom_model(image_tensor)


(…)ase-patch16-224/resolve/main/config.json:   0%|          | 0.00/69.7k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

Converting custom_model to custom_vit.ptl

In [None]:
from torch.utils.mobile_optimizer import optimize_for_mobile

# Tracing records the operations executed for one example input, so put the
# model in eval mode first and trace with autograd disabled: the exported
# graph should describe inference, not training.
custom_model.eval()

# Example input with the shape the model is fed in this notebook
dummy_input = torch.zeros(1, 3, 224, 224)
with torch.no_grad():
    ts_model = torch.jit.trace(custom_model, dummy_input)

# Apply the mobile optimization passes and save in the lite-interpreter format
optimized_torchscript_model = optimize_for_mobile(ts_model)
optimized_torchscript_model._save_for_lite_interpreter("custom_vit.ptl")

  if num_channels != self.num_channels:
  if height != self.image_size[0] or width != self.image_size[1]:


Loading the model from custom_vit.ptl and verifying the results on the cats image

In [None]:
# Load the TorchScript Lite model from the same relative path the export cell
# wrote it to, instead of a hard-coded absolute Colab path.
model = torch.jit.load("custom_vit.ptl")

# Forward pass only: no gradients needed
with torch.no_grad():
    logits = model(image_tensor)

In [None]:
# Print every logit of the first batch entry, one line per index.
first_entry_logits = logits[0]
for index, value in enumerate(first_entry_logits):
    print(f"Index {index}: Logit Value {value}")