In [1]:
import torch
import os
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
import torchvision.transforms as T
from torch.amp import autocast

# Set seeds for reproducibility
torch.manual_seed(0)
np.random.seed(0)
torch.cuda.manual_seed_all(0)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Set environment variables for memory management
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
torch.cuda.empty_cache()

# Load CLIP model and processor
model_name = "openai/clip-vit-base-patch32"
processor = CLIPProcessor.from_pretrained(model_name)
model = CLIPModel.from_pretrained(model_name).to("cuda")
model.eval()

CLIPModel(
  (text_model): CLIPTextTransformer(
    (embeddings): CLIPTextEmbeddings(
      (token_embedding): Embedding(49408, 512)
      (position_embedding): Embedding(77, 512)
    )
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0-11): 12 x CLIPEncoderLayer(
          (self_attn): CLIPSdpaAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (layer_norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=512, out_features=2048, bias=True)
            (fc2): Linear(in_features=2048, out_features=512, bias=True)
          )
          (layer_norm2): LayerNorm((512,), eps=1e

In [2]:
def compute_gradcam_clip(image_path, texts, target_label, model, processor):
    # Load and preprocess the image
    image = Image.open(image_path).convert("RGB")
    inputs = processor(text=texts, images=image, return_tensors="pt", padding=True).to("cuda")

    # Get the target label index
    try:
        target_label_idx = texts.index(target_label)
    except ValueError:
        print(f"Target label '{target_label}' not found in texts")
        return

    # Get all Vision Transformer layers
    num_layers = len(model.vision_model.encoder.layers)
    print(f"Total layers in CLIP Vision Transformer: {num_layers}")

    # Dictionary to store feature maps for each layer
    feature_maps_dict = {}
    hooks = []

    # Register hooks for all layers
    for layer_idx in range(num_layers):
        layer = model.vision_model.encoder.layers[layer_idx]
        def hook_fn(module, input, output, idx=layer_idx):
            # Output is a tuple; take the first element (hidden states)
            feature_maps = output[0]
            feature_maps_dict[idx] = feature_maps
            feature_maps_dict[idx].retain_grad()
            print(f"Layer {idx} - Hooked shape: {feature_maps.shape}, requires_grad: {feature_maps.requires_grad}")
        hook = layer.register_forward_hook(hook_fn)
        hooks.append(hook)

    # Forward and backward pass
    with torch.enable_grad(), autocast('cuda', dtype=torch.float32):
        try:
            outputs = model(**inputs)
            logits_per_image = outputs.logits_per_image
            target_logit = logits_per_image[0, target_label_idx].float()
            model.zero_grad()
            target_logit.backward()
            print(f"VRAM after backward: {torch.cuda.memory_allocated() / 1024**3:.2f} GiB")
        except Exception as e:
            print(f"Forward/backward error: {e}")
            for hook in hooks:
                hook.remove()
            return

    # Compute and save heatmap for each layer
    for layer_idx in range(num_layers):
        feature_maps = feature_maps_dict.get(layer_idx)
        if feature_maps is None:
            print(f"Layer {layer_idx} - Feature maps not captured")
            continue
        if feature_maps.grad is None:
            print(f"Layer {layer_idx} - Gradients not captured")
            continue

        gradients = feature_maps.grad
        print(f"Layer {layer_idx} - Gradients shape: {gradients.shape}")
        # Shape: (batch_size, num_patches + 1, hidden_size), ignore CLS token
        weights = gradients[:, 1:, :].mean(dim=1, keepdim=True)  # Average over patches
        heatmap = torch.relu((feature_maps[:, 1:, :] * weights).sum(dim=2))
        # Reshape to 7x7 grid (49 patches = 7x7 for 224x224 image with patch size 32)
        heatmap = heatmap.view(7, 7)
        heatmap = heatmap / (heatmap.max() + 1e-6)

        # Upsample to image size
        upsample = T.Resize((224, 224), interpolation=T.InterpolationMode.BICUBIC)
        heatmap = upsample(heatmap.to(torch.float32).unsqueeze(0)).squeeze(0).detach().cpu().numpy()

        # Visualize
        heatmap = np.uint8(255 * heatmap)
        heatmap_colored = plt.get_cmap('jet')(heatmap / 255.0)[:, :, :3]
        image_np = np.array(image.resize((224, 224))) / 255.0
        superimposed_img = heatmap_colored * 0.4 + image_np * 0.6
        plt.figure(figsize=(8, 6))
        plt.imshow(superimposed_img)
        plt.axis('off')
        plt.title(f"Grad-CAM for CLIP Layer {layer_idx}")
        plt.colorbar(plt.cm.ScalarMappable(cmap='jet'), label='Attention Intensity')
        #os.makedirs('/home/bboulbarss/gradcam/clip/clip_gradcam_', exist_ok=True)
        plt.savefig(f"/home/bboulbarss/gradcam_results/image2/clip-rel/thesis_clip_gradcam_rel_2_predicted_label/gradcam_clip_layer_{layer_idx}.png")
        plt.close()

        torch.cuda.empty_cache()
        print(f"Layer {layer_idx} - VRAM after heatmap: {torch.cuda.memory_allocated() / 1024**3:.2f} GiB")

    # Clean up
    for hook in hooks:
        hook.remove()
    torch.cuda.empty_cache()
    print(f"VRAM after cleanup: {torch.cuda.memory_allocated() / 1024**3:.2f} GiB")


##############################################################################################################


###### IMAGE 1 #####
### RELATIONAL ##
#image_path = "/home/bboulbarss/large_dataset/relational/train/cube_left_sphere/CLEVR_rel_000065.png"
#texts = [
#    "A photo of a cube left of a sphere",
#    "A photo of a cube right of a sphere",
#    "A photo of a sphere left of a cube",
#    "A photo of a cube left of a cone",
#    "A photo of a cylinder right of a cube"
#]
#target_label = "A photo of a cube left of a sphere"
#
### TWO OBJECT ##
#texts = [
#    "A photo of a gray cube",
#    "A photo of a yellow cube",
#    "A photo of a gray sphere",
#    "A photo of a purple cube",
#    "A photo of a red cone"
#]
#target_label = "A photo of a gray cube"
#
#
###### IMAGE 2 #####
### RELATIONAL ##
#image_path = "/home/bboulbarss/large_dataset/relational/train/cylinder_left_cone/CLEVR_rel_000085.png"
#texts = [
#    "A photo of a cylinder left of a cone",
#    "A photo of a cone left of a cylinder",
#    "A photo of a cylinder right of a cone",
#    "A photo of a cube left of a cone",
#    "A photo of a cylinder right of a cube"
#]
#target_label = "A photo of a cylinder left of a cone"
#
### TWO OBJECT ##
#texts = [
#    "A photo of a blue cone",
#    "A photo of a purple cone",
#    "A photo of a blue cylinder",
#    "A photo of a gray cube",
#    "A photo of a red sphere"
#]
#target_label = "A photo of a blue cone"
#
#
###### IMAGE 3 #####
### RELATIONAL ##
#image_path = "/home/bboulbarss/large_dataset/relational/train/cube_right_cylinder/CLEVR_rel_000057.png"
#texts = [
#    "A photo of a cube right of a cylinder",
#    "A photo of a cube left of a cylinder",
#    "A photo of a cone right of a cylinder",
#    "A photo of a cube left of a cone",
#    "A photo of a cylinder right of a cone"
#]
#target_label = "A photo of a cube right of a cylinder"
#
### TWO OBJECT ##
#texts = [
#    "A photo of a blue cylinder",
#    "A photo of a cyan cylinder",
#    "A photo of a blue cube",
#    "A photo of a brown sphere",
#    "A photo of a yellow cube"
#]
#target_label = "A photo of a blue cylinder"
#
#
###### IMAGE 4 #####
### RELATIONAL ##
#image_path = "/home/bboulbarss/large_dataset/relational/train/sphere_left_cone/CLEVR_rel_000007.png"
#texts = [
#    "A photo of a sphere left of a cone",
#    "A photo of a sphere right of a cone",
#    "A photo of a cone left of a sphere",
#    "A photo of a cube left of a cone",
#    "A photo of a cylinder right of a cone"
#]
#target_label = "A photo of a sphere left of a cone"
#
### TWO OBJECT ##
#texts = [
#    "A photo of a purple cone",
#    "A photo of a blue cone",
#    "A photo of a purple sphere",
#    "A photo of a green cylinder",
#    "A photo of a yellow cube"
#]
#target_label = "A photo of a purple cone"


########################################
############# FINAL IMAGES #############
########################################

################################################################################################
## Image 1 in final_gradcam
## Relational
#texts = [
#    "A photo of a cylinder left of a cone",
#    "A photo of a cylinder right of a cone",
#    "A photo of a cone left of a cylinder",
#    "A photo of a cube right a cylinder",
#    "A photo of a sphere right of a cone",]
#target_label = "A photo of a cylinder left of a cone"
#image_path = "/home/bboulbarss/large_dataset/relational/train/cylinder_left_cone/CLEVR_rel_000020.png"
#
## Two object
#texts = [
#    "A photo of a purple cylinder",
#    "A photo of a green cylinder",
#    "A photo of a purple cone",
#    "A photo of a red sphere",
#    "A photo of a blue cube"
#]
#target_label = "A photo of a purple cylinder"
#image_path = "/home/bboulbarss/large_dataset/relational/train/cylinder_left_cone/CLEVR_rel_000020.png"

################################################################################################
# Image 2 in final_gradcam
# Relational, correct label
texts = [
    "A photo of a cylinder left of a sphere",
    "A photo of a cylinder right of a sphere",
    "A photo of a sphere left of a cylinder",
    "A photo of a cube right a cone",
    "A photo of a sphere right of a cone",]
target_label = "A photo of a cylinder left of a sphere"
image_path = "/home/bboulbarss/large_dataset/relational/train/cylinder_left_sphere/CLEVR_rel_000031.png"


# Relational, predicted label
texts = [
    "A photo of a cylinder left of a sphere",
    "A photo of a cylinder right of a sphere",
    "A photo of a sphere left of a cylinder",
    "A photo of a cube right a cone",
    "A photo of a sphere right of a cone",]
target_label = "A photo of a cylinder right of a sphere"
image_path = "/home/bboulbarss/large_dataset/relational/train/cylinder_left_sphere/CLEVR_rel_000031.png"





## Two object
#texts = [
#    "A photo of a green sphere",
#    "A photo of a blue sphere",
#    "A photo of a green cylinder",
#    "A photo of a red cone",
#    "A photo of a purple cube",
#]
#target_label = "A photo of a green sphere"
#image_path = "/home/bboulbarss/large_dataset/relational/train/cylinder_left_sphere/CLEVR_rel_000031.png"

#################################################################################################

compute_gradcam_clip(image_path, texts, target_label, model, processor)


Total layers in CLIP Vision Transformer: 12
Layer 0 - Hooked shape: torch.Size([1, 50, 768]), requires_grad: True
Layer 1 - Hooked shape: torch.Size([1, 50, 768]), requires_grad: True
Layer 2 - Hooked shape: torch.Size([1, 50, 768]), requires_grad: True
Layer 3 - Hooked shape: torch.Size([1, 50, 768]), requires_grad: True
Layer 4 - Hooked shape: torch.Size([1, 50, 768]), requires_grad: True
Layer 5 - Hooked shape: torch.Size([1, 50, 768]), requires_grad: True
Layer 6 - Hooked shape: torch.Size([1, 50, 768]), requires_grad: True
Layer 7 - Hooked shape: torch.Size([1, 50, 768]), requires_grad: True
Layer 8 - Hooked shape: torch.Size([1, 50, 768]), requires_grad: True
Layer 9 - Hooked shape: torch.Size([1, 50, 768]), requires_grad: True
Layer 10 - Hooked shape: torch.Size([1, 50, 768]), requires_grad: True
Layer 11 - Hooked shape: torch.Size([1, 50, 768]), requires_grad: True
VRAM after backward: 1.19 GiB
Layer 0 - Gradients shape: torch.Size([1, 50, 768])


  plt.colorbar(plt.cm.ScalarMappable(cmap='jet'), label='Attention Intensity')


Layer 0 - VRAM after heatmap: 1.19 GiB
Layer 1 - Gradients shape: torch.Size([1, 50, 768])
Layer 1 - VRAM after heatmap: 1.19 GiB
Layer 2 - Gradients shape: torch.Size([1, 50, 768])
Layer 2 - VRAM after heatmap: 1.19 GiB
Layer 3 - Gradients shape: torch.Size([1, 50, 768])
Layer 3 - VRAM after heatmap: 1.19 GiB
Layer 4 - Gradients shape: torch.Size([1, 50, 768])
Layer 4 - VRAM after heatmap: 1.19 GiB
Layer 5 - Gradients shape: torch.Size([1, 50, 768])
Layer 5 - VRAM after heatmap: 1.19 GiB
Layer 6 - Gradients shape: torch.Size([1, 50, 768])
Layer 6 - VRAM after heatmap: 1.19 GiB
Layer 7 - Gradients shape: torch.Size([1, 50, 768])
Layer 7 - VRAM after heatmap: 1.19 GiB
Layer 8 - Gradients shape: torch.Size([1, 50, 768])
Layer 8 - VRAM after heatmap: 1.19 GiB
Layer 9 - Gradients shape: torch.Size([1, 50, 768])
Layer 9 - VRAM after heatmap: 1.19 GiB
Layer 10 - Gradients shape: torch.Size([1, 50, 768])
Layer 10 - VRAM after heatmap: 1.19 GiB
Layer 11 - Gradients shape: torch.Size([1, 50, 76