In [None]:
import torch
from torch.nn import functional as F
import numpy as np
from PIL import Image
from transformers import AutoProcessor, AutoModelForVision2Seq

In [None]:
# Checkpoint identifier shared by the processor and the model weights.
model_id = "llava-hf/llava-1.5-7b-hf"
processor = AutoProcessor.from_pretrained(model_id)
model = AutoModelForVision2Seq.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    device_map="auto",
)

# Smoke test: caption a single image before attempting any attack.
image = Image.open("coco/images/val2014/COCO_val2014_000000000042.jpg").convert("RGB")
prompt = "<image>\nDescribe this image in detail."

# Text and image are passed by keyword, with the image wrapped in a list;
# the encoded batch is then moved to the device the model lives on.
inputs = processor(text=prompt, images=[image], return_tensors="pt")
inputs = inputs.to(model.device)

generated_ids = model.generate(max_new_tokens=100, **inputs)
decoded_batch = processor.batch_decode(generated_ids, skip_special_tokens=True)
output = decoded_batch[0]

print("LLaVA output:", output)

In [None]:
def _generate_caption(model, processor, image, prompt, device, max_new_tokens=100):
    """Caption `image` and return `(caption, prompt_token_len)`.

    Only the tokens produced after the encoded prompt are decoded, so the
    returned caption never contains an echo of the prompt.
    `prompt_token_len` is the length of the encoded image+text prompt.
    """
    with torch.no_grad():
        encoded = processor(text=prompt, images=[image], return_tensors="pt").to(device)
        output_ids = model.generate(**encoded, max_new_tokens=max_new_tokens)
    prompt_token_len = encoded.input_ids.shape[1]
    # Assumes generate() returns prompt tokens followed by the new tokens
    # (the original code's prompt-stripping relied on the same echo).
    new_token_ids = output_ids[:, prompt_token_len:]
    caption = processor.batch_decode(new_token_ids, skip_special_tokens=True)[0].strip()
    return caption, prompt_token_len


def pgd_attack_image(
    model,
    processor,
    image_pil,
    prompt="<image>\nDescribe this image in detail.",
    epsilon=8/255,
    alpha=2/255,
    num_steps=20,
    device="cuda"
):
    """
    Untargeted L∞ PGD attack on a single image for LLaVA-1.5-7B (HF).

    The clean caption is used as the label sequence and the image is
    perturbed by signed-gradient *ascent* on the language-model loss, i.e.
    the attack pushes the model away from its own clean caption.

    Args:
        model: HuggingFace LLaVA model
        processor: Corresponding AutoProcessor
        image_pil: Original PIL image
        prompt: Prompt string including <image> token
        epsilon: Max L∞ perturbation (in [0,1] pixel space)
        alpha: PGD step size (in [0,1] pixel space)
        num_steps: Number of PGD iterations
        device: PyTorch device string

    Returns:
        adv_pil: Adversarial PIL image, at the resolution of the processor's
            `pixel_values` (it is not resized back to `image_pil.size`,
            because resampling would smear and misalign the perturbation).
        adv_caption: Model's output on adversarial image (generated text
            only, without the prompt echo)
        clean_caption: Original caption (generated text only)

    Raises:
        ValueError: if the clean caption yields no label tokens, in which
            case there is no loss to attack.
    """
    model.eval()
    # A model loaded with a `device_map` exposes `hf_device_map` and is already
    # placed on its device(s); presumably moving it again is unnecessary and
    # may be rejected by accelerate — TODO confirm for the installed version.
    if getattr(model, "hf_device_map", None) is None:
        model.to(device)

    # Step 1: caption the clean image. The prompt length is taken from the
    # same image+text encoding that the attack uses, so any expansion of the
    # <image> placeholder done by the processor is counted as well.
    clean_caption, prompt_token_len = _generate_caption(
        model, processor, image_pil, prompt, device
    )

    attack_text = prompt + " " + clean_caption
    inputs = processor(text=attack_text, images=[image_pil], return_tensors="pt").to(device)

    # Supervise only the caption tokens; prompt positions are ignored (-100).
    labels = inputs.input_ids.clone()
    labels[:, :prompt_token_len] = -100
    if not bool((labels != -100).any()):
        raise ValueError(
            "Clean caption produced no label tokens; cannot compute an attack loss."
        )

    original_pixel_values = inputs["pixel_values"].detach().clone()

    # Step 2: express the pixel-space budget in the normalized space the model
    # sees. pixel_values = (x - mean) / std, so a pixel-space distance d
    # corresponds to d / std per channel.
    image_processor = processor.image_processor
    tensor_kwargs = {
        "device": original_pixel_values.device,
        "dtype": original_pixel_values.dtype,
    }
    mean = torch.tensor(image_processor.image_mean, **tensor_kwargs).view(1, 3, 1, 1)
    std = torch.tensor(image_processor.image_std, **tensor_kwargs).view(1, 3, 1, 1)

    step_norm = alpha / std
    eps_norm = epsilon / std
    # Feasible set: the epsilon-ball around the clean image, intersected with
    # the normalized images of valid pixel values [0, 1].
    lower = torch.maximum(original_pixel_values - eps_norm, (0.0 - mean) / std)
    upper = torch.minimum(original_pixel_values + eps_norm, (1.0 - mean) / std)

    adv_pixel_values = original_pixel_values.clone().requires_grad_(True)

    # PGD Attack Loop
    for _ in range(num_steps):
        outputs = model(
            pixel_values=adv_pixel_values,
            input_ids=inputs.input_ids,
            attention_mask=inputs.attention_mask,
            labels=labels,
        )
        # Differentiate w.r.t. the image only: no `.grad` buffers are
        # allocated or accumulated on the model parameters.
        (grad,) = torch.autograd.grad(outputs.loss, adv_pixel_values)

        with torch.no_grad():
            # Ascent step (maximize the loss of the clean caption), then
            # project back onto the feasible set.
            adv_pixel_values.add_(step_norm * grad.sign())
            adv_pixel_values.copy_(
                torch.minimum(torch.maximum(adv_pixel_values, lower), upper)
            )

    # Step 3: Create final adversarial image and get its caption
    with torch.no_grad():
        unnormalized_adv_pixels = torch.clamp(adv_pixel_values * std + mean, 0, 1)

    adv_np = unnormalized_adv_pixels[0].detach().float().cpu().permute(1, 2, 0).numpy()
    # Round (not truncate) when quantizing to 8 bits: truncation alone can
    # shift a pixel by almost 1/255, a large share of a small epsilon.
    adv_pil = Image.fromarray(np.round(adv_np * 255).astype(np.uint8))

    # The image is deliberately kept at this resolution. Re-processing an image
    # that is already at the processor's output size should leave it nearly
    # untouched — TODO confirm for the processor configuration in use.
    adv_caption, _ = _generate_caption(model, processor, adv_pil, prompt, device)

    return adv_pil, adv_caption, clean_caption

In [None]:
# Image to attack.
image = Image.open("coco/images/val2014/COCO_val2014_000000552842.jpg").convert("RGB")

# Run PGD with a smaller budget and step size, and more iterations, than the
# function's defaults.
attack_result = pgd_attack_image(
    model,
    processor,
    image,
    prompt="<image>\nDescribe this image in detail.",
    epsilon=4/255,
    alpha=1/255,
    num_steps=50,
    device="cuda",
)
# NOTE: despite its name, the first element is the PIL image returned by the attack.
adv_img_tensor, adv_caption, clean_caption = attack_result

print("🟢 Clean Caption:", clean_caption)
print("🔴 Adversarial Caption:", adv_caption)