In [None]:
import torch
import os
from transformers import AutoModelForCausalLM, AutoConfig
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import random
import numpy as np
import matplotlib.pyplot as plt
import torchvision.transforms as T
from torch.amp import autocast

torch.manual_seed(0)
np.random.seed(0)
torch.cuda.manual_seed_all(0) 
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
torch.cuda.empty_cache()

print(f"Initial VRAM: {torch.cuda.memory_allocated() / 1024**3:.2f} GiB")

config = AutoConfig.from_pretrained("AIDC-AI/Ovis2-8B", trust_remote_code=True)
config.llm_attn_implementation = "eager"
model = AutoModelForCausalLM.from_pretrained(
    "AIDC-AI/Ovis2-8B",
    config=config,
    torch_dtype=torch.bfloat16,
    multimodal_max_length=32768,
    trust_remote_code=True,
    attn_implementation="eager",
    device_map="auto",
    low_cpu_mem_usage=True
).cuda()
model.gradient_checkpointing_enable()

text_tokenizer = model.get_text_tokenizer()
visual_tokenizer = model.get_visual_tokenizer()
processor = (text_tokenizer, visual_tokenizer)
print(f"VRAM after model: {torch.cuda.memory_allocated() / 1024**3:.2f} GiB")

Initial VRAM: 0.00 GiB


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

VRAM after model: 16.64 GiB


In [2]:
def compute_gradcam_ovis(image_path, texts, target_answer, model, processor):
    try:
        image = Image.open(image_path).convert("RGB")
        image = image.resize((224, 224))
    except Exception as e:
        print(f"Image error: {e}")
        return
    images = [image]

    shuffled_texts = texts.copy()
    random.shuffle(shuffled_texts)
    question_string = "Task: Identify the correct label for this image from the following choices:\n" + "\n".join(
        [f"{chr(65+i)}. {text}" for i, text in enumerate(shuffled_texts)]
    ) + "\nAnswer with the letter of the correct choice."
    query = f'<image>\n{question_string}'

    try:
        prompt, input_ids, pixel_values = model.preprocess_inputs(query, images, max_partition=12)
    except Exception as e:
        print(f"Preprocess error: {e}")
        return
    attention_mask = torch.ne(input_ids, text_tokenizer.pad_token_id)
    input_ids = input_ids.unsqueeze(0).to(device=model.device)
    attention_mask = attention_mask.unsqueeze(0).to(device=model.device)
    pixel_values = pixel_values.to(dtype=torch.bfloat16, device=model.device)
    pixel_values = [pixel_values]

    print(f"VRAM after input: {torch.cuda.memory_allocated() / 1024**3:.2f} GiB")

    target_token_id = text_tokenizer.convert_tokens_to_ids(target_answer)
    if target_token_id is None:
        print(f"Invalid answer: {target_answer}")
        return

    # Get all layers and set up hooks
    blocks = model.visual_tokenizer.backbone.trunk.blocks
    num_layers = len(blocks)
    print(f"Total layers to process: {num_layers}")

    # Dictionary to store feature maps for each layer
    feature_maps_dict = {}
    hooks = []
    for layer_idx, block in enumerate(blocks):
        def hook_fn(module, input, output, layer_idx=layer_idx):
            feature_maps_dict[layer_idx] = output
            feature_maps_dict[layer_idx].retain_grad()
            print(f"Layer {layer_idx} - Hooked shape: {feature_maps_dict[layer_idx].shape}, requires_grad: {feature_maps_dict[layer_idx].requires_grad}")
        hook = block.register_forward_hook(hook_fn)
        hooks.append(hook)

    # Forward and backward pass
    model.eval()
    with torch.enable_grad(), autocast('cuda', dtype=torch.bfloat16):
        try:
            output = model(
                input_ids=input_ids,
                pixel_values=pixel_values,
                attention_mask=attention_mask,
                labels=None
            )
            print(f"VRAM after forward: {torch.cuda.memory_allocated() / 1024**3:.2f} GiB")
            logits = output.logits[:, -1, :]
            target_logit = logits[0, target_token_id].float()
            model.zero_grad()
            target_logit.backward()
            print(f"VRAM after backward: {torch.cuda.memory_allocated() / 1024**3:.2f} GiB")
        except Exception as e:
            print(f"Forward/backward error: {e}")
            for hook in hooks:
                hook.remove()
            return

    # Compute and save heatmap for each layer
    for layer_idx in range(num_layers):
        feature_maps = feature_maps_dict.get(layer_idx)
        if feature_maps is None:
            print(f"Layer {layer_idx} - Feature maps not captured")
            continue
        if feature_maps.grad is None:
            print(f"Layer {layer_idx} - Gradients not captured")
            continue

        gradients = feature_maps.grad
        print(f"Layer {layer_idx} - Gradients shape: {gradients.shape}")
        weights = gradients.mean(dim=1, keepdim=True)
        heatmap = torch.relu((feature_maps * weights).sum(dim=2))
        heatmap = heatmap.view(32, 32)
        heatmap = heatmap / (heatmap.max() + 1e-6)

        upsample = T.Resize((224, 224), interpolation=T.InterpolationMode.BICUBIC)
        heatmap = upsample(heatmap.to(torch.float32).unsqueeze(0)).squeeze(0).detach().cpu().numpy()

        heatmap = np.uint8(255 * heatmap)
        heatmap_colored = plt.get_cmap('jet')(heatmap / 255.0)[:, :, :3]
        image_np = np.array(image) / 255.0
        superimposed_img = heatmap_colored * 0.4 + image_np * 0.6
        plt.figure(figsize=(8, 6))
        plt.imshow(superimposed_img)
        plt.axis('off')
        plt.title(f"Grad-CAM for Ovis Layer {layer_idx}")
        plt.colorbar(plt.cm.ScalarMappable(cmap='jet'), label='Attention Intensity')
        os.makedirs('/home/bboulbarss/gradcam_results/gradcam', exist_ok=True)
        plt.savefig(f"/home/bboulbarss/gradcam_results/gradcam/gradcam_layer_{layer_idx}.png")
        plt.close()

        torch.cuda.empty_cache()
        print(f"Layer {layer_idx} - VRAM after heatmap: {torch.cuda.memory_allocated() / 1024**3:.2f} GiB")

    for hook in hooks:
        hook.remove()
    torch.cuda.empty_cache()
    print(f"VRAM after cleanup: {torch.cuda.memory_allocated() / 1024**3:.2f} GiB")

In [3]:
##############################################################################################################

########################################
############# FINAL IMAGES #############
########################################

################################################################################################
## Image 1 in final_gradcam
## Relational
#texts = [
#    "A photo of a cylinder left of a cone",
#    "A photo of a cylinder right of a cone",
#    "A photo of a cone left of a cylinder",
#    "A photo of a cube right a cylinder",
#image_path = "/home/bboulbarss/large_dataset/relational/train/cylinder_left_cone/CLEVR_rel_000020.png"
#
# Two object
#texts = [
#    "A photo of a purple cylinder",
#    "A photo of a green cylinder",
#    "A photo of a purple cone",
#    "A photo of a red sphere",
#    "A photo of a blue cube"
#]
#image_path = "/home/bboulbarss/large_dataset/relational/train/cylinder_left_cone/CLEVR_rel_000020.png"

################################################################################################
# Image 2 in final_gradcam
# Relational
#texts = [
#    "A photo of a cylinder left of a sphere",
#    "A photo of a cylinder right of a sphere",
#    "A photo of a sphere left of a cylinder",
#    "A photo of a cube right a cone",
#    "A photo of a sphere right of a cone",]
#image_path = "/home/bboulbarss/large_dataset/relational/train/cylinder_left_sphere/CLEVR_rel_000031.png"
#
# Two object
#texts = [
#    "A photo of a green sphere",
#    "A photo of a blue sphere",
#    "A photo of a green cylinder",
#    "A photo of a red cone",
#    "A photo of a purple cube",
#]
#image_path = "/home/bboulbarss/large_dataset/relational/train/cylinder_left_sphere/CLEVR_rel_000031.png"

################################################################################################
# Image 3 in final_gradcam
# Relational
#texts = [
#    "A photo of a cone left of a cylinder",
#    "A photo of a cylinder left of a cone",
#    "A photo of a cone right of a cylinder",
#    "A photo of a cone right of a sphere",
#    "A photo of a cube left of a cylinder"
#]
#image_path = "/home/bboulbarss/large_dataset/relational/ood_test/cone_left_cylinder/cone left cylinder/CLEVR_rel_000634.png"

## Two object
#texts = [
#    "A photo of a yellow cone",
#    "A photo of a blue cone",
#    "A photo of a yellow cylinder",
#    "A photo of a red sphere",
#    "A photo of a brown cube",
#]
#image_path = "/home/bboulbarss/large_dataset/relational/ood_test/cone_left_cylinder/cone left cylinder/CLEVR_rel_000634.png"

################################################################################################
# Image 5 in final_gradcam
# Relational
#texts = [
#    "A photo of a cone left of a cube",
#    "A photo of a cone right of a cube",
#    "A photo of a cube left of a cone",
#    "A photo of a sphere right of a cylinder",
#    "A photo of a cube left of a cylinder",
#]
#i#mage_path = "/home/bboulbarss/large_dataset/relational/ood_test/cone_left_cube/cone left cube/CLEVR_rel_000466.png"

# Two object
texts=[
    "A photo of a purple cube",
    "A photo of a cyan cube",
    "A photo of a purple cone",
    "A photo of a gray sphere",
    "A photo of a brown cylinder"
]
image_path = "/home/bboulbarss/large_dataset/relational/ood_test/cone_left_cube/cone left cube/CLEVR_rel_000466.png"



target_answer = "A"
compute_gradcam_ovis(image_path, texts, target_answer, model, processor)

VRAM after input: 16.64 GiB
Total layers to process: 24
Layer 0 - Hooked shape: torch.Size([1, 1024, 1536]), requires_grad: True
Layer 1 - Hooked shape: torch.Size([1, 1024, 1536]), requires_grad: True
Layer 2 - Hooked shape: torch.Size([1, 1024, 1536]), requires_grad: True
Layer 3 - Hooked shape: torch.Size([1, 1024, 1536]), requires_grad: True
Layer 4 - Hooked shape: torch.Size([1, 1024, 1536]), requires_grad: True
Layer 5 - Hooked shape: torch.Size([1, 1024, 1536]), requires_grad: True
Layer 6 - Hooked shape: torch.Size([1, 1024, 1536]), requires_grad: True
Layer 7 - Hooked shape: torch.Size([1, 1024, 1536]), requires_grad: True
Layer 8 - Hooked shape: torch.Size([1, 1024, 1536]), requires_grad: True
Layer 9 - Hooked shape: torch.Size([1, 1024, 1536]), requires_grad: True
Layer 10 - Hooked shape: torch.Size([1, 1024, 1536]), requires_grad: True
Layer 11 - Hooked shape: torch.Size([1, 1024, 1536]), requires_grad: True
Layer 12 - Hooked shape: torch.Size([1, 1024, 1536]), requires_gra

  plt.colorbar(plt.cm.ScalarMappable(cmap='jet'), label='Attention Intensity')


Layer 1 - VRAM after heatmap: 33.80 GiB
Layer 2 - Gradients shape: torch.Size([1, 1024, 1536])
Layer 2 - VRAM after heatmap: 33.80 GiB
Layer 3 - Gradients shape: torch.Size([1, 1024, 1536])
Layer 3 - VRAM after heatmap: 33.80 GiB
Layer 4 - Gradients shape: torch.Size([1, 1024, 1536])
Layer 4 - VRAM after heatmap: 33.80 GiB
Layer 5 - Gradients shape: torch.Size([1, 1024, 1536])
Layer 5 - VRAM after heatmap: 33.80 GiB
Layer 6 - Gradients shape: torch.Size([1, 1024, 1536])
Layer 6 - VRAM after heatmap: 33.80 GiB
Layer 7 - Gradients shape: torch.Size([1, 1024, 1536])
Layer 7 - VRAM after heatmap: 33.80 GiB
Layer 8 - Gradients shape: torch.Size([1, 1024, 1536])
Layer 8 - VRAM after heatmap: 33.80 GiB
Layer 9 - Gradients shape: torch.Size([1, 1024, 1536])
Layer 9 - VRAM after heatmap: 33.80 GiB
Layer 10 - Gradients shape: torch.Size([1, 1024, 1536])
Layer 10 - VRAM after heatmap: 33.80 GiB
Layer 11 - Gradients shape: torch.Size([1, 1024, 1536])
Layer 11 - VRAM after heatmap: 33.80 GiB
Layer 

In [4]:
##### IMAGE 1 #####
## RELATIONAL ##
image_path = "/home/bboulbarss/large_dataset/relational/train/cube_left_sphere/CLEVR_rel_000065.png"
texts = [
    "A photo of a cube left of a sphere",
    "A photo of a cube right of a sphere",
    "A photo of a sphere left of a cube",
    "A photo of a cube left of a cone",
    "A photo of a cylinder right of a cube"
]
target_label = "A photo of a cube left of a sphere"

## TWO OBJECT ##
texts = [
    "A photo of a gray cube",
    "A photo of a yellow cube",
    "A photo of a gray sphere",
    "A photo of a purple cube",
    "A photo of a red cone"
]
target_label = "A photo of a grey cube"


##### IMAGE 2 #####
## RELATIONAL ##
image_path = "/home/bboulbarss/large_dataset/relational/train/cylinder_left_cone/CLEVR_rel_000085.png"
texts = [
    "A photo of a cylinder left of a cone",
    "A photo of a cone left of a cylinder",
    "A photo of a cylinder right of a cone",
    "A photo of a cube left of a cone",
    "A photo of a cylinder right of a cube"
]
target_label = "A photo of a cylinder left of a cone"

## TWO OBJECT ##
texts = [
    "A photo of a blue cone",
    "A photo of a purple cone",
    "A photo of a blue cylinder",
    "A photo of a gray cube",
    "A photo of a red sphere"
]
target_label = "A photo of a blue cone"


##### IMAGE 3 #####
## RELATIONAL ##
image_path = "/home/bboulbarss/large_dataset/relational/train/cube_right_cylinder/CLEVR_rel_000057.png"
texts = [
    "A photo of a cube right of a cylinder",
    "A photo of a cube left of a cylinder",
    "A photo of a cone right of a cylinder",
    "A photo of a cube left of a cone",
    "A photo of a cylinder right of a cone"
]
target_label = "A photo of a cube right of a cylinder"

## TWO OBJECT ##
texts = [
    "A photo of a blue cylinder",
    "A photo of a cyan cylinder",
    "A photo of a blue cube",
    "A photo of a brown sphere",
    "A photo of a yellow cube"
]
target_label = "A photo of a blue cylinder"


##### IMAGE 4 #####
## RELATIONAL ##
image_path = "/home/bboulbarss/large_dataset/relational/train/sphere_left_cone/CLEVR_rel_000007.png"
texts = [
    "A photo of a sphere left of a cone",
    "A photo of a sphere right of a cone",
    "A photo of a cone left of a sphere",
    "A photo of a cube left of a cone",
    "A photo of a cylinder right of a cone"
]
target_label = "A photo of a sphere left of a cone"

## TWO OBJECT ##
texts = [
    "A photo of a purple cone",
    "A photo of a blue cone",
    "A photo of a purple sphere",
    "A photo of a green cylinder",
    "A photo of a yellow cube"
]
target_label = "A photo of a purple cone"