In [None]:
%pip install openai-clip
import torch
from torchvision import transforms
from torch.optim import Adam
from PIL import Image
import clip
import torch.nn.functional as F
import os

Collecting openai-clip
  Downloading openai-clip-1.0.1.tar.gz (1.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m27.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting ftfy (from openai-clip)
  Downloading ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Downloading ftfy-6.3.1-py3-none-any.whl (44 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.8/44.8 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: openai-clip
  Building wheel for openai-clip (setup.py) ... [?25l[?25hdone
  Created wheel for openai-clip: filename=openai_clip-1.0.1-py3-none-any.whl size=1368605 sha256=f38393007f5ae67660eb82bf98dc0aa52825d6a035df86a4960e844b2585b394
  Stored in directory: /root/.cache/pip/wheels/ab/49/bc/c2342e8e14878210ba4825cf314a53f2570f6fb18b91fce3cf
Successfully built openai-clip
Installing collected packages: ftfy, openai-clip


## Inputs and loading models for the system

In [None]:
# === Configuration ===
# Text prompts to optimise towards; one image is generated per prompt.
TEXT_PROMPTS = [
    "A cute golden retriever",
    "A snow-covered mountain peak",
    "A bright red tomato"
]
LEARNING_RATE = 0.05  # Adam learning rate used when optimising the image pixels
STEPS = 10000         # number of optimisation steps per prompt
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

print(f"Running on {DEVICE}...")

# Load the CLIP model and its matching PIL preprocessing pipeline
model, preprocess = clip.load("ViT-B/32", device=DEVICE)
# Only the image pixels are optimised in this notebook, never the CLIP weights.
# Freezing them stops every backward() call from computing and accumulating
# parameter gradients that nothing reads or zeroes.
model.requires_grad_(False)

# Normalization for CLIP (applied to image tensors before encode_image)
normalize = transforms.Normalize(
    mean=(0.48145466, 0.4578275, 0.40821073),
    std=(0.26862954, 0.26130258, 0.27577711)
)

Running on cuda...


100%|███████████████████████████████████████| 338M/338M [00:04<00:00, 78.9MiB/s]


tensor([[[[0.5080, 0.6720, 0.3642,  ..., 0.6221, 0.4477, 0.5291],
          [0.4432, 0.5744, 0.5169,  ..., 0.6379, 0.4051, 0.6163],
          [0.2848, 0.5548, 0.5117,  ..., 0.4377, 0.5577, 0.4499],
          ...,
          [0.6345, 0.4259, 0.4236,  ..., 0.5127, 0.5038, 0.5690],
          [0.5159, 0.5919, 0.4680,  ..., 0.6645, 0.6231, 0.5335],
          [0.5288, 0.4143, 0.5266,  ..., 0.4188, 0.7073, 0.3801]],

         [[0.3594, 0.4635, 0.5053,  ..., 0.5248, 0.6038, 0.5113],
          [0.4089, 0.6099, 0.5139,  ..., 0.4381, 0.4819, 0.4761],
          [0.6078, 0.4535, 0.3812,  ..., 0.4665, 0.6172, 0.5109],
          ...,
          [0.4356, 0.5410, 0.6412,  ..., 0.5657, 0.6107, 0.5048],
          [0.5197, 0.3948, 0.6146,  ..., 0.5294, 0.6776, 0.5017],
          [0.4241, 0.5099, 0.5190,  ..., 0.6632, 0.6902, 0.3966]],

         [[0.5244, 0.3351, 0.5709,  ..., 0.4261, 0.4291, 0.6065],
          [0.7589, 0.3488, 0.5787,  ..., 0.4562, 0.5262, 0.4155],
          [0.4720, 0.5831, 0.4367,  ..., 0

### Image generation loop

In [None]:
def generate_unconstrained_image(prompt, model_name="ViT-B"):
    """Optimise raw pixels so that CLIP rates the image as similar to ``prompt``.

    Starts from a mid-grey 1x3x224x224 image with added Gaussian noise and runs
    ``STEPS`` Adam updates directly on the pixels, minimising
    ``1 - cosine_similarity`` between the CLIP image embedding and the prompt's
    text embedding. After each update the pixels are clamped to [0, 1].
    The final image is written as a PNG under ``results/ViT-B/`` and then
    re-loaded from disk and scored again through ``preprocess``.

    Relies on the notebook-level globals ``model``, ``preprocess``,
    ``normalize``, ``DEVICE``, ``STEPS`` and ``LEARNING_RATE``.

    Args:
        prompt: Text description the generated image should match.
        model_name: Label used only in log messages and in the output file
            name; it does not select which CLIP model is used.

    Returns:
        None. Progress and the final similarity are printed; the image is
        saved to disk.
    """
    print(f"\n=== Generating (Unconstrained) for: '{prompt}' with model: {model_name} ===")

    # The text target is fixed for the whole run, so embed it once without grad.
    text_token = clip.tokenize([prompt]).to(DEVICE)
    with torch.no_grad():
        text_emb = model.encode_text(text_token)
        text_emb = text_emb / text_emb.norm(dim=-1, keepdim=True)

    # Start from grey + small noise; the image itself is the optimised variable.
    image = torch.full((1, 3, 224, 224), 0.5, device=DEVICE)
    image = image + (torch.randn_like(image) * 0.1)
    image.requires_grad_(True)

    optimizer = Adam([image], lr=LEARNING_RATE)

    print(f"Optimizing noise for: '{prompt}' with {model_name}")

    for step in range(STEPS):
        optimizer.zero_grad()

        # No augmentation is applied to the image, so a single forward pass
        # yields the same loss as averaging several identical passes would.
        img_emb = model.encode_image(normalize(image))
        img_emb = img_emb / img_emb.norm(dim=-1, keepdim=True)
        final_loss = 1 - torch.cosine_similarity(img_emb, text_emb).mean()

        final_loss.backward()
        optimizer.step()

        # Keep pixels in the valid [0, 1] range so the result can be saved as an image.
        with torch.no_grad():
            image.clamp_(0, 1)

        if step % 1000 == 0:
            # Score the clamped image, i.e. what would actually be saved right now.
            with torch.no_grad():
                raw_emb = model.encode_image(normalize(image))
                raw_emb /= raw_emb.norm(dim=-1, keepdim=True)
                raw_sim = torch.cosine_similarity(raw_emb, text_emb).item()
            print(f"Step {step} | Loss: {final_loss.item():.4f} | Current Raw Score: {raw_sim:.4f}")

    filename = os.path.join("results", "ViT-B", f"unconstrained_{model_name}_{prompt.replace(' ', '_')}.png")
    # The output directory does not exist on a fresh kernel; create it before saving.
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    out_img = image.detach().cpu().squeeze(0)
    out_pil = transforms.ToPILImage()(out_img)
    out_pil.save(filename)
    print(f"Saved {filename}")

    # Re-score the image as loaded from disk through the standard CLIP
    # preprocessing, to check the similarity survives the save/load round trip.
    with Image.open(filename) as saved_img:
        verify_img = preprocess(saved_img).unsqueeze(0).to(DEVICE)
    with torch.no_grad():
        v_img_feat = model.encode_image(verify_img)
        v_img_feat /= v_img_feat.norm(dim=-1, keepdim=True)
        final_sim = torch.cosine_similarity(v_img_feat, text_emb).item()
    print(f"Final Similarity: {final_sim:.4f}")

Optimizing noise for: 'A cute golden retriever'
Step 0 | Loss: 0.8066 | Current Raw Score: 0.2280
Step 100 | Loss: 0.2593 | Current Raw Score: 0.7803
Step 200 | Loss: 0.1187 | Current Raw Score: 0.8394
Step 300 | Loss: 0.0508 | Current Raw Score: 0.9600
Step 400 | Loss: 0.0479 | Current Raw Score: 0.9595
Step 500 | Loss: 0.0298 | Current Raw Score: 0.9819
Step 600 | Loss: 0.0156 | Current Raw Score: 0.9814
Step 700 | Loss: 0.0146 | Current Raw Score: 0.9844
Step 800 | Loss: 0.0181 | Current Raw Score: 0.9824
Step 900 | Loss: 0.0171 | Current Raw Score: 0.9834
Step 1000 | Loss: 0.0107 | Current Raw Score: 0.9907
Step 1100 | Loss: 0.0132 | Current Raw Score: 0.9883
Step 1200 | Loss: 0.0098 | Current Raw Score: 0.9854
Step 1300 | Loss: 0.0063 | Current Raw Score: 0.9941
Step 1400 | Loss: 0.0161 | Current Raw Score: 0.9883
Step 1500 | Loss: 0.0122 | Current Raw Score: 0.9888
Step 1600 | Loss: 0.0098 | Current Raw Score: 0.9912
Step 1700 | Loss: 0.0049 | Current Raw Score: 0.9946
Step 1800 

In [None]:
# Generate, save and verify one optimised image for each configured prompt.
for prompt in TEXT_PROMPTS:
    generate_unconstrained_image(prompt=prompt)

## Saving the image