In [None]:
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import requests
import torch
import numpy as np
import time
import cv2
# add the following to avoid ssl issues from the server
import ssl
ssl._create_default_https_context = ssl._create_unverified_context

In [None]:
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

In [3]:
device = "cuda:1" if torch.cuda.is_available() else "cpu"
model = model.to(device)
model.eval()

CLIPModel(
  (text_model): CLIPTextTransformer(
    (embeddings): CLIPTextEmbeddings(
      (token_embedding): Embedding(49408, 512)
      (position_embedding): Embedding(77, 512)
    )
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0-11): 12 x CLIPEncoderLayer(
          (self_attn): CLIPSdpaAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (layer_norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=512, out_features=2048, bias=True)
            (fc2): Linear(in_features=2048, out_features=512, bias=True)
          )
          (layer_norm2): LayerNorm((512,), eps=1e

### Random image for internet of size 224x224

In [4]:
# url = "http://images.cocodataset.org/val2017/000000039769.jpg"
# image = Image.open(requests.get(url, stream=True).raw)
image = np.random.randint(0, 256, (7, 7, 3), dtype=np.uint8) # Generate a random 7x7x3 image

### Single image

In [5]:
# preprocess
start_time_processing = time.time()
inputs_single = processor(images=image, return_tensors="pt")
inputs_single = {k: v.to(device) for k, v in inputs_single.items()}
elapsed_time_processing = time.time() - start_time_processing
# forward pass
start_time_single = time.time()
with torch.no_grad():
    image_features_single = model.get_image_features(**inputs_single)
elapsed_time_single = time.time() - start_time_single

print(f"Forward pass time for single image: {elapsed_time_single:.4f} seconds")
print(f"Processing time for single image: {elapsed_time_processing:.4f} seconds")

print()
print('Input single image shape:', np.array(image).shape)
print('Preprocessed single image shape:', inputs_single['pixel_values'].shape)
print('Output single image features:', image_features_single.shape)

Forward pass time for single image: 1.3584 seconds
Processing time for single image: 0.0666 seconds

Input single image shape: (7, 7, 3)
Preprocessed single image shape: torch.Size([1, 3, 224, 224])
Output single image features: torch.Size([1, 512])


### Batched image

In [6]:
# preprocess
batch_size = 500
images_batch = [image for _ in range(batch_size)]
start_time_batch_processing = time.time()
inputs_batch = processor(images=images_batch, return_tensors="pt")
inputs_batch = {k: v.to(device) for k, v in inputs_batch.items()}
elapsed_time_batch_processing = time.time() - start_time_batch_processing
# forward pass
start_time_batch = time.time()
with torch.no_grad():
    image_features_batch = model.get_image_features(**inputs_batch)
elapsed_time_batch = time.time() - start_time_batch

print(f"Forward pass time for batch of {batch_size} images: {elapsed_time_batch:.4f} seconds")
print(f"Processing time for batch image: {elapsed_time_batch_processing:.4f} seconds")

print()
print('Input single image shape:', np.array(image).shape)
print('Preprocessed batch image shape:', inputs_batch['pixel_values'].shape)
print('Output batch image features:', image_features_batch.shape)

Forward pass time for batch of 500 images: 0.3913 seconds
Processing time for batch image: 2.7464 seconds

Input single image shape: (7, 7, 3)
Preprocessed batch image shape: torch.Size([500, 3, 224, 224])
Output batch image features: torch.Size([500, 512])


# With hand-crafted processing

In [7]:
def processor_manual(images_np, device):
    """
    Process a batch of images to be suitable for CLIP model input:
    - Resize each image using nearest neighbor interpolation.
    - Transpose each image from HWC to CHW.
    - Convert the images to a PyTorch tensor.
    - Move the tensor to the specified device.
    - Prepare the dictionary expected by CLIP.
    
    Parameters:
    - images_np (numpy.ndarray): The batch of images in BHWC format.
    - device (str): The device to which the tensor will be moved ('cuda:0', 'cuda:1', 'cpu', etc.).
    
    Returns:
    - dict: A dictionary with the tensor ready to be input into CLIP.
    """
    # Initialize an empty list to hold processed images
    processed_images = []

    # Iterate over each image in the batch
    for image_np in images_np:
        # Resize the image
        resized_image = cv2.resize(image_np, (224, 224), interpolation=cv2.INTER_NEAREST)
        
        # Transpose the image from HWC to CHW
        transposed_image = np.transpose(resized_image, (2, 0, 1))
        
        # Append the processed image to the list
        processed_images.append(transposed_image)

    stacked_images = np.stack(processed_images, axis=0)  # Stack along a new first dimension
    # Convert the numpy array to a torch tensor
    tensor = torch.from_numpy(stacked_images).float()

    # Move the tensor to the specified device
    tensor = tensor.to(device)
    
    # Prepare the dictionary for CLIP
    inputs_batch = {'pixel_values': tensor}
    
    return inputs_batch

### Single

In [8]:
# preprocess
start_time_processing = time.time()
inputs_single = processor_manual([np.array(image)], device)
elapsed_time_processing = time.time() - start_time_processing
# forward pass
start_time_single = time.time()
with torch.no_grad():
    image_features_single = model.get_image_features(**inputs_single)
elapsed_time_single = time.time() - start_time_single

print(f"Forward pass time for single image: {elapsed_time_single:.4f} seconds")
print(f"Processing time for single image: {elapsed_time_processing:.4f} seconds")

print()
print('Input single image shape:', np.array(image).shape)
print('Preprocessed single image shape:', inputs_single['pixel_values'].shape)
print('Output single image features:', image_features_single.shape)

Forward pass time for single image: 0.2541 seconds
Processing time for single image: 0.0218 seconds

Input single image shape: (7, 7, 3)
Preprocessed single image shape: torch.Size([1, 3, 224, 224])
Output single image features: torch.Size([1, 512])


### Batch

In [17]:
# preprocess
batch_size = 3000
# images_batch = [np.array(image) for _ in range(batch_size)]
images_batch = np.random.randint(0, 256, (batch_size, 7, 7, 3), dtype=np.uint8) # Generate a random 7x7x3 image
start_time_batch_processing = time.time()
inputs_batch = processor_manual(images_batch, device)
elapsed_time_batch_processing = time.time() - start_time_batch_processing
# forward pass
start_time_batch = time.time()
with torch.no_grad():
    image_features_batch = model.get_image_features(**inputs_batch)
elapsed_time_batch = time.time() - start_time_batch

print(f"Forward pass time for batch of {batch_size} images: {elapsed_time_batch:.4f} seconds")
print(f"Processing time for batch image: {elapsed_time_batch_processing:.4f} seconds")

print()
print('Input single image shape:', np.array(image).shape)
print('Preprocessed batch image shape:', inputs_batch['pixel_values'].shape)
print('Output batch image features:', image_features_batch.shape)

Forward pass time for batch of 3000 images: 0.0117 seconds
Processing time for batch image: 1.4068 seconds

Input single image shape: (4000, 7, 7, 3)
Preprocessed batch image shape: torch.Size([3000, 3, 224, 224])
Output batch image features: torch.Size([3000, 512])
