# VLM Failure Modes — Self-Contained Colab Execution

This notebook runs the VLM failure modes experiments entirely within the Colab runtime.
No Google Drive mounting is required. All necessary code is written to the environment.

In [None]:
# 1. Setup Environment & Install LLaVA
!git clone https://github.com/haotian-liu/LLaVA.git
!pip install --upgrade pip
!pip install -e LLaVA

In [None]:
# 2. Install Project Dependencies
%%writefile requirements.txt
# Core ML
torch
torchvision
torchaudio

# HuggingFace / VLM
transformers>=4.36.0
accelerate
sentencepiece

# LLaVA deps
einops
timm
opencv-python
pillow
scipy
numpy
tqdm
pyyaml

# Optional (GPU efficiency on Colab)
bitsandbytes

In [None]:
!pip install -r requirements.txt

In [None]:
# 3. Create Project Structure
import os

# Folders that the later %%writefile cells and the experiment scripts write into.
for folder in ("probes", "attacks", "experiments", "results"):
    os.makedirs(folder, exist_ok=True)

In [None]:
# 4. Write Code: Probes (Entropy)
%%writefile probes/entropy.py
import torch
import torch.nn.functional as F

def token_entropy(logits):
    """Return the mean Shannon entropy (in nats) of the softmax over the last axis.

    The logits are upcast to float32 before the softmax, the entropy is
    computed independently for every position along the leading axes, and the
    average over all positions is returned as a plain Python float.
    """
    # Half-precision logits are promoted first to keep softmax/log stable.
    p = F.softmax(logits.to(torch.float32), dim=-1)
    # The small constant keeps log() finite where a probability is exactly 0.
    per_position = -(p * torch.log(p + 1e-12)).sum(dim=-1)
    return per_position.mean().item()

In [None]:
# 5. Write Code: Attacks (PGD Visual)
%%writefile attacks/pgd_visual.py
import torch
import torch.nn.functional as F

def pgd_attack(model, images, input_ids, epsilon=0.1, alpha=0.01, num_iter=20,
               random_start=True):
    """
    Projected Gradient Descent (PGD) attack on visual inputs.
    Objective: Diverge from the original model predictions (Maximize KL Divergence).

    Args:
        model: VLM model, called as ``model(input_ids, images=...)``; the
            returned object must expose ``.logits``.
        images: Preprocessed image tensor [B, C, H, W]
        input_ids: Tokenized input prompt
        epsilon: Maximum perturbation magnitude (L-inf radius around ``images``)
        alpha: Step size
        num_iter: Number of iterations
        random_start: If True (default), start from a uniformly random point
            inside the epsilon ball instead of from ``images`` itself. At the
            clean image the KL divergence to the clean prediction is at its
            minimum, so its gradient is analytically zero and the first step
            would otherwise follow rounding noise only. Seed torch's RNG for
            reproducible attacks.

    Returns:
        perturbed_images: Adversarial image tensor, detached from the graph.
            The input tensor is not modified. With ``num_iter <= 0`` a
            detached copy of ``images`` is returned.
    """
    images = images.detach()
    if num_iter <= 0:
        return images.clone()

    # Valid value range. The images are already normalized, so the min/max of
    # the clean input are used as approximate bounds rather than un-normalizing,
    # clamping to [0, 1] and re-normalizing. Loop-invariant, so computed once.
    lower, upper = images.min(), images.max()

    # Clean prediction: the distribution the attack tries to move away from.
    # Upcast to float32 so softmax/KL stay stable for half-precision models.
    with torch.no_grad():
        clean_logits = model(input_ids, images=images).logits
        clean_probs = F.softmax(clean_logits.float(), dim=-1)

    perturbed_images = images.clone()
    if random_start:
        # Sample the noise in float32 and cast, so half-precision inputs work.
        noise = (torch.rand_like(images, dtype=torch.float32) * 2 - 1) * epsilon
        perturbed_images = torch.clamp(images + noise.to(images.dtype), lower, upper)

    # Optimization loop
    for _ in range(num_iter):
        perturbed_images = perturbed_images.detach().requires_grad_(True)

        # Forward pass
        logits = model(input_ids, images=perturbed_images).logits

        # F.kl_div(input, target) expects `input` to be log-probabilities and
        # `target` to be probabilities.
        loss = F.kl_div(
            F.log_softmax(logits.float(), dim=-1),
            clean_probs,
            reduction='batchmean'
        )

        # Gradient ascent: maximize the KL divergence w.r.t. the image only.
        grad = torch.autograd.grad(loss, perturbed_images)[0]
        stepped = perturbed_images.detach() + alpha * grad.sign()

        # Project back to the epsilon ball (L-inf), then to the value range.
        delta = torch.clamp(stepped - images, -epsilon, epsilon)
        perturbed_images = torch.clamp(images + delta, lower, upper).detach()

    return perturbed_images

In [None]:
# 6. Write Code: Sanity Check
%%writefile experiments/sanity_check.py
from llava.model.builder import load_pretrained_model
from llava.mm_utils import get_model_name_from_path

model_path = "liuhaotian/llava-v1.5-7b"
model_name = get_model_name_from_path(model_path)

print("Starting model load (this may take time)...")
tokenizer, model, image_processor, context_len = load_pretrained_model(
    model_path, 
    None, 
    model_name,
    device_map="auto",
    offload_folder="offload"  # Handle offloading for memory-constrained environments
)

print("Model loaded successfully")

In [None]:
# 7. Write Code: Entropy Analysis
%%writefile experiments/entropy_analysis.py
import torch
from PIL import Image
import os
from llava.model.builder import load_pretrained_model
from llava.mm_utils import get_model_name_from_path, process_images, tokenizer_image_token
from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
from probes.entropy import token_entropy
from attacks.pgd_visual import pgd_attack

# Configuration
MODEL_PATH = "liuhaotian/llava-v1.5-7b"
IMAGE_PATH = "LLaVA/images/llava_logo.png"
PROMPT = "Describe this image in detail."

def main():
    """Compare output entropy on a clean image and on a PGD-perturbed image.

    Loads the model named by MODEL_PATH, builds the image prompt for
    IMAGE_PATH / PROMPT, measures the mean token entropy of the logits for the
    clean image, runs the visual PGD attack, measures the entropy again, and
    writes the three numbers (clean, adversarial, delta) to
    ``results/entropy_report.txt``.
    """
    # Single source of truth for the attack length: used both in the progress
    # message and in the pgd_attack call below.
    pgd_iterations = 1

    print("Loading model...")
    model_name = get_model_name_from_path(MODEL_PATH)
    tokenizer, model, image_processor, context_len = load_pretrained_model(
        MODEL_PATH,
        None,
        model_name,
        device_map="auto",
        offload_folder="offload"
    )
    # Ensure model is in eval mode
    model.eval()

    # Load & Preprocess Image
    print(f"Loading image from {IMAGE_PATH}")
    # convert() returns a new image, so the file handle can be released as
    # soon as the conversion is done.
    with Image.open(IMAGE_PATH) as raw_image:
        image = raw_image.convert('RGB')
    image_tensor = process_images([image], image_processor, model.config)
    image_tensor = image_tensor.to(model.device, dtype=torch.float16)

    # Prepare Prompt: the image placeholder goes first, optionally wrapped in
    # start/end markers when the model config asks for them.
    if model.config.mm_use_im_start_end:
        qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + PROMPT
    else:
        qs = DEFAULT_IMAGE_TOKEN + '\n' + PROMPT

    # unsqueeze(0) adds the leading batch axis before moving to the model device.
    input_ids = tokenizer_image_token(qs, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).to(model.device)

    # 1. Clean Baseline
    print("Running Clean Forward Pass...")
    with torch.no_grad():
        clean_outputs = model(input_ids, images=image_tensor)
        clean_entropy = token_entropy(clean_outputs.logits)
    print(f"Clean Entropy: {clean_entropy:.4f}")

    # 2. Adversarial Attack
    print("Running PGD Attack (Visual)...")
    plural = "" if pgd_iterations == 1 else "s"
    print(f"  Starting PGD ({pgd_iterations} iteration{plural})...")
    adv_image_tensor = pgd_attack(
        model, image_tensor, input_ids,
        epsilon=0.1, alpha=0.01, num_iter=pgd_iterations,
    )

    # 3. Adversarial Pass
    print("Running Adversarial Forward Pass...")
    with torch.no_grad():
        adv_outputs = model(input_ids, images=adv_image_tensor)
        adv_entropy = token_entropy(adv_outputs.logits)
    print(f"Adversarial Entropy: {adv_entropy:.4f}")

    # Results
    delta = adv_entropy - clean_entropy
    print(f"Entropy Delta: {delta:.4f}")

    # Save simple report
    os.makedirs("results", exist_ok=True)
    with open("results/entropy_report.txt", "w", encoding="utf-8") as f:
        f.write(f"Clean Entropy: {clean_entropy:.4f}\n")
        f.write(f"Adversarial Entropy: {adv_entropy:.4f}\n")
        f.write(f"Delta: {delta:.4f}\n")
if __name__ == "__main__":
    main()

In [None]:
# 8. Run Sanity Check
!python experiments/sanity_check.py

In [None]:
# 9. Run Entropy Analysis
!python experiments/entropy_analysis.py

In [None]:
# 10. Verify Results
!cat results/entropy_report.txt