In [12]:
# Core Hugging Face library
!pip install transformers

# LLaVA checkpoints are hosted on the hub, so you need this too
!pip install huggingface_hub

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [7]:
import os
# Send every Hugging Face cache location to one shared directory. This runs
# before the torch / transformers imports that follow in this cell.
os.environ.update(
    {
        "HF_HOME": "/workspace/hf_cache",
        "TRANSFORMERS_CACHE": "/workspace/hf_cache",
        "HF_DATASETS_CACHE": "/workspace/hf_cache",
    }
)

import torch
from torch.nn import functional as F
from PIL import Image
import numpy as np
from transformers import AutoProcessor, AutoModelForVision2Seq

# Device string passed to `.to(...)` for the model and tensors in later cells.
# NOTE(review): hard-coded to "cuda"; assumes a CUDA GPU is available — confirm.
device = "cuda"

In [8]:
def _find_subseq_start(big_ids, small_ids):
    B, S = big_ids.tolist(), small_ids.tolist()
    for i in range(0, len(B) - len(S) + 1):
        if B[i:i+len(S)] == S:
            return i
    return -1

@torch.no_grad()
def _generate_caption(model, processor, image_pil, prompt, device):
    inputs = processor(text=prompt, images=[image_pil], return_tensors="pt")
    inputs = {k: v.to(device) for k, v in inputs.items()}
    out_ids = model.generate(**inputs, max_new_tokens=120)
    txt = processor.batch_decode(out_ids, skip_special_tokens=True)[0].strip()
    # Best-effort prompt echo strip (optional)
    return txt

In [9]:
# --- PGD attack (with sanity checks + .to fix) ---
def pgd_attack_image_llava(
    model,
    processor,
    image_pil,
    prompt = "<image>\nDescribe this image in detail.",
    epsilon = 8/255,       # L∞ pixel-space
    alpha   = 1/255,       # step size in pixel-space
    num_steps = 40,
    device = "cuda",
    random_start = True,
    verbose = True
):
    """Run an untargeted L-infinity PGD attack on the image input of a captioning model.

    The model's own caption of the clean image is used as the target text, and
    the pixels are moved by signed-gradient ascent to increase the language
    modelling loss on that caption.

    Args:
        model: Vision-language model; must expose ``config``, ``dtype``,
            ``generate`` and a forward pass accepting ``pixel_values``,
            ``input_ids``, ``attention_mask`` and ``labels``.
        processor: Matching processor; must expose ``image_processor`` with
            ``image_mean`` / ``image_std``.
        image_pil: Clean input image.
        prompt: Text prompt used for captioning and as the masked prefix.
        epsilon: Maximum per-pixel perturbation, in [0, 1] pixel units.
        alpha: Step size per iteration, in [0, 1] pixel units.
        num_steps: Number of PGD iterations.
        device: Device on which tensors are placed.
        random_start: If true, start from a uniform random point in the
            epsilon-ball around the clean pixels.
        verbose: If true, print loss / gradient norm / max perturbation every
            five steps and on the last step.

    Returns:
        Tuple ``(adv_pil, adv_caption, clean_caption)``: the adversarial image
        resized to ``image_pil.size``, the caption generated for it, and the
        caption generated for the clean image.

    Raises:
        ValueError: If no answer token is left to supervise after masking the
            prompt (the loss would be undefined).
    """
    model.eval()

    # 1) Clean caption to derive the answer tokens we'll attack
    clean_caption = _generate_caption(model, processor, image_pil, prompt, device)

    # 2) Pack prompt + answer as the training text
    attack_text = prompt + " " + clean_caption
    packed = processor(text=attack_text, images=[image_pil], return_tensors="pt")

    # Normalization stats, built in float32 and then cast to the model dtype
    image_processor = processor.image_processor
    mean = torch.tensor(image_processor.image_mean, device=device, dtype=torch.float32).view(1,3,1,1)
    std  = torch.tensor(image_processor.image_std,  device=device, dtype=torch.float32).view(1,3,1,1)
    mean_ = mean.to(device=device, dtype=model.dtype)
    std_  = std.to(device=device, dtype=model.dtype)

    # Convert normalized pixel_values -> pixel space [0,1]
    with torch.no_grad():
        norm_px = packed["pixel_values"].to(device=device, dtype=model.dtype)
        orig_px = (norm_px * std_ + mean_).clamp(0, 1)

    # Text tensors -> device
    inputs = {
        "input_ids": packed["input_ids"].to(device),
        "attention_mask": packed["attention_mask"].to(device),
    }

    # Build labels: mask everything before the answer starts
    ans_ids = processor(text=clean_caption, return_tensors="pt", add_special_tokens=False).input_ids[0].to(device)
    full_ids = inputs["input_ids"][0]
    start = _find_subseq_start(full_ids, ans_ids)
    if start < 0:
        # Fallback: compute prompt length using the same call shape (with image)
        tmp = processor(text=prompt, images=[image_pil], return_tensors="pt")
        start = tmp.input_ids.shape[1]
    labels = inputs["input_ids"].clone()
    labels[:, :start] = -100  # ignore prompt tokens
    if not (labels != -100).any():
        # With every label ignored the loss is undefined and the gradient
        # would silently corrupt the image; fail loudly instead.
        raise ValueError("No answer tokens left to attack after masking the prompt.")

    # 3) Initialize adversarial pixels in pixel space
    adv_px = orig_px.clone()
    if random_start:
        adv_px = (adv_px + (2*torch.rand_like(adv_px)-1) * epsilon).clamp(0, 1)
    adv_px.requires_grad_(True)

    # Disable the KV cache only for the optimisation loop (reduces memory
    # during loss/backprop) and always restore the caller's setting afterwards.
    config = model.config
    has_use_cache = hasattr(config, "use_cache")
    previous_use_cache = config.use_cache if has_use_cache else None
    if has_use_cache:
        config.use_cache = False
    try:
        for step in range(num_steps):
            # Normalize on-the-fly for forward pass
            norm_adv = (adv_px - mean_) / std_
            outputs = model(pixel_values=norm_adv, **inputs, labels=labels)
            loss = outputs.loss

            grad = torch.autograd.grad(loss, adv_px, retain_graph=False, create_graph=False)[0]

            with torch.no_grad():
                stepped = adv_px + alpha * grad.sign()               # L∞ step in pixel space
                delta = (stepped - orig_px).clamp(-epsilon, epsilon)  # project to ε-ball
                adv_px = (orig_px + delta).clamp(0, 1)                # clip to valid pixel range

            adv_px.requires_grad_(True)

            if verbose and (step % 5 == 0 or step == num_steps-1):
                # Sanity prints; the norm is taken in float32 so a half-precision
                # gradient cannot overflow to inf in the log line.
                print(f"[Step {step}] loss={loss.item():.4f} "
                      f"grad_norm={grad.float().norm().item():.4f} "
                      f"delta_max={delta.abs().max().item():.5f}")
    finally:
        if has_use_cache:
            config.use_cache = previous_use_cache

    # 4) Convert to PIL and get adversarial caption
    # NOTE(review): the perturbation lives in the processor's output
    # resolution; quantising to uint8 and resizing back to image_pil.size (and
    # being re-processed below) may weaken it — confirm this round trip is intended.
    adv_np = adv_px[0].detach().to(dtype=torch.float32).cpu().permute(1,2,0).numpy()
    adv_pil = Image.fromarray((adv_np * 255).clip(0,255).astype(np.uint8))
    adv_pil = adv_pil.resize(image_pil.size)

    # Same captioning path as the clean image so the two captions are comparable.
    adv_caption = _generate_caption(model, processor, adv_pil, prompt, device)

    return adv_pil, adv_caption, clean_caption

In [18]:
# --- Load model/processor with /workspace cache ---
model_id = "llava-hf/llava-1.5-7b-hf"
processor = AutoProcessor.from_pretrained(model_id, cache_dir="/workspace/hf_cache")

model = AutoModelForVision2Seq.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    device_map=None,
    cache_dir="/workspace/hf_cache"
).to(device).eval()

# --- Point to an image (path is relative to the kernel's working directory) ---
image_path = "val2014/COCO_val2014_000000290734.jpg"
image = Image.open(image_path).convert("RGB")

# Seed torch so the random start of the attack is reproducible across runs.
SEED = 0
torch.manual_seed(SEED)

# --- Run attack ---
adv_img, adv_cap, clean_cap = pgd_attack_image_llava(
    model, processor, image,
    prompt="<image>\nDescribe this image in detail.",
    epsilon=16/255,
    alpha=1/255,
    num_steps=100,
    device=device,
    verbose=True
)

print("Clean:", clean_cap)
print("Adversarial:", adv_cap)

# Save the adversarial image to /workspace
adv_out = "/workspace/adv_example.jpg"
adv_img.save(adv_out)
print(f"Saved adversarial image to: {adv_out}")

# JPEG is lossy and alters the small adversarial perturbation, so also keep a
# lossless PNG copy that preserves the exact pixel values of `adv_img`.
adv_out_png = "/workspace/adv_example.png"
adv_img.save(adv_out_png)
print(f"Saved lossless adversarial image to: {adv_out_png}")

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

[Step 0] loss=0.8054 grad_norm=43.6250 delta_max=0.06274
[Step 5] loss=1.1848 grad_norm=5.5273 delta_max=0.06274
[Step 10] loss=1.3706 grad_norm=15.0234 delta_max=0.06274
[Step 15] loss=1.4939 grad_norm=7.0391 delta_max=0.06274
[Step 20] loss=1.5823 grad_norm=11.4375 delta_max=0.06274
[Step 25] loss=1.6233 grad_norm=12.1172 delta_max=0.06274
[Step 30] loss=1.6535 grad_norm=6.3594 delta_max=0.06274
[Step 35] loss=1.6742 grad_norm=6.4609 delta_max=0.06274
[Step 40] loss=1.6339 grad_norm=11.8594 delta_max=0.06274
[Step 45] loss=1.6535 grad_norm=9.2656 delta_max=0.06274
[Step 50] loss=1.6817 grad_norm=8.7344 delta_max=0.06274
[Step 55] loss=1.7111 grad_norm=6.3516 delta_max=0.06274
[Step 60] loss=1.7314 grad_norm=3.9590 delta_max=0.06274
[Step 65] loss=1.7222 grad_norm=3.8691 delta_max=0.06274
[Step 70] loss=1.7595 grad_norm=7.2539 delta_max=0.06274
[Step 75] loss=1.7800 grad_norm=7.3359 delta_max=0.06274
[Step 80] loss=1.7896 grad_norm=4.2852 delta_max=0.06274
[Step 85] loss=1.7825 grad_n

In [19]:
# Write the clean input image (`image`, opened in the attack cell) to disk.
# The path is relative, so the file lands in the kernel's working directory.
orig_out = "clean_example.jpg"
image.save(orig_out)