
# Multimodal Transformers Demo (CLIP)
**Course:** INFO I418 — HONORS PROJECT


**Author:** Arash Mousavi

**Mentor:** Prof. Bryan Stephens

This is a mini demo that lets you type one or more text prompts (captions) and then upload an image.  
It uses a CLIP model from Hugging Face to compute text–image cosine similarities and reports which caption matches the image best.


In [None]:

# Imports and model load
import torch
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
from transformers import CLIPProcessor, CLIPModel
import os
import io
from typing import List

# Run on the GPU when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Using device:", device)

# Checkpoint to load from the Hugging Face Hub. Other CLIP checkpoints, e.g.
# laion/CLIP-ViT-B-32-laion2B-s34B-b79K, can be substituted here.
MODEL_NAME = "openai/clip-vit-base-patch32"

# The processor prepares text and images for the model; both are loaded from
# the same checkpoint name.
processor = CLIPProcessor.from_pretrained(MODEL_NAME)
model = CLIPModel.from_pretrained(MODEL_NAME)
model = model.to(device)
# Switch to inference mode; kept as the last expression of the cell.
model.eval()


In [None]:


def compute_similarities(image: Image.Image, texts: List[str]) -> np.ndarray:
    """Score each caption against one image with the module-level CLIP model.

    Args:
        image: The PIL image to compare the captions against.
        texts: Captions to score; must contain at least one entry.

    Returns:
        A NumPy array holding one cosine similarity per caption, in the same
        order as ``texts``.

    Raises:
        ValueError: If ``texts`` is empty.
    """
    if not texts:
        raise ValueError("texts must contain at least one caption")

    # truncation=True keeps over-long captions within the tokenizer's maximum
    # length instead of letting them fail inside the text encoder.
    inputs = processor(
        text=texts,
        images=image,
        return_tensors="pt",
        padding=True,
        truncation=True,
    ).to(device)

    with torch.no_grad():
        outputs = model(**inputs)
        # L2-normalise both sides so the dot product below is a cosine similarity.
        image_embeds = outputs.image_embeds / outputs.image_embeds.norm(p=2, dim=-1, keepdim=True)
        text_embeds = outputs.text_embeds / outputs.text_embeds.norm(p=2, dim=-1, keepdim=True)
        # (num_texts x 1) for a single image; drop the trailing image axis.
        sims = (text_embeds @ image_embeds.T).squeeze(-1).detach().cpu().numpy()
    return sims

def plot_bar(similarities, labels, title="CLIP Text–Image Similarity"):
    """Draw a bar chart of similarity scores, highest score first.

    Args:
        similarities: One score per label; any 1-D sequence or NumPy array.
        labels: Tick labels, in the same order as ``similarities``.
        title: Title shown above the chart.
    """
    # Accept plain lists as well as arrays; fancy indexing below needs an array.
    scores = np.asarray(similarities)

    idx = np.argsort(scores)[::-1]  # indices from highest to lowest score
    sims_sorted = scores[idx]
    labels_sorted = [labels[i] for i in idx]
    positions = range(len(sims_sorted))

    # Cosine similarity can be negative; extend the axis downwards in that case
    # so those bars are not clipped. Non-negative input keeps the 0..1 range.
    lower = min(0.0, float(scores.min())) if scores.size else 0.0

    plt.figure(figsize=(8, 4))
    plt.bar(positions, sims_sorted)
    plt.xticks(positions, labels_sorted, rotation=30, ha='right')
    plt.ylim(lower, 1)
    plt.ylabel("Cosine similarity")
    plt.title(title)
    plt.tight_layout()
    plt.show()


In [None]:
from io import BytesIO

# `google.colab` is only importable inside Colab; remember whether the upload
# widget is available so the cell can fall back to a local file elsewhere.
try:
    from google.colab import files as colab_files
    colab_env = True
except ImportError:
    colab_env = False

uploaded_img = None

if colab_env:
    up = colab_files.upload()
    if not up:
        # The upload dialog was dismissed without choosing a file.
        raise ValueError("No image was uploaded.")
    name = next(iter(up))  # use the first uploaded file
    uploaded_img = Image.open(BytesIO(up[name]))
else:
    # Outside Colab there is no upload widget, so ask for a file path instead.
    image_path = input("Path to an image file: ").strip()
    uploaded_img = Image.open(image_path)


# Edit the captions here (separated by ';'), then run the cell and supply the picture.
caption_input = "a dog; a shark; a guy riding a bike; a bowl of fruit; Key"
captions = [c.strip() for c in caption_input.split(";") if c.strip()]
if not captions:
    raise ValueError("caption_input must contain at least one non-empty caption.")

# Preview at most 512x512. thumbnail() keeps the aspect ratio, unlike clamping
# width and height independently. `display` is supplied by the notebook runtime.
preview = uploaded_img.copy()
preview.thumbnail((512, 512))
display(preview)

sims = compute_similarities(uploaded_img.convert("RGB"), captions)
plot_bar(sims, captions, title="CLIP Text–Image similarity")

best_idx = int(np.argmax(sims))
print(f" the most potential match: '{captions[best_idx]}' (similarity={sims[best_idx]:.3f})")
