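# Image caption selection with CLIP

**Implement a basic multimodal system for image captioning using CLIP:** given an image and a text file of candidate captions, pick the candidate whose CLIP text embedding is most similar to the image embedding.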

In [None]:
!pip install torch torchvision
!pip install Pillow
!pip install ftfy regex tqdm
!pip install git+https://github.com/openai/CLIP.git

Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-r6lwct30
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /tmp/pip-req-build-r6lwct30
  Resolved https://github.com/openai/CLIP.git to commit dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1
  Preparing metadata (setup.py) ... done


In [None]:
import torch
import clip
from PIL import Image
import requests
from io import BytesIO

In [None]:
# Loaded (model, preprocess) pairs keyed by (model_name, device), so repeated
# calls reuse the weights instead of re-loading CLIP for every image.
_CLIP_CACHE = {}


def _load_clip(model_name, device):
    """Return the cached ``(model, preprocess)`` pair for ``model_name`` on ``device``."""
    key = (model_name, device)
    if key not in _CLIP_CACHE:
        _CLIP_CACHE[key] = clip.load(model_name, device=device)
    return _CLIP_CACHE[key]


def get_caption_from_file(image_path, candidate_file, model_name="ViT-B/32"):
    """Pick the candidate caption that best matches an image according to CLIP.

    Args:
        image_path: Anything ``PIL.Image.open`` accepts; the driver cell in this
            notebook passes an in-memory ``BytesIO`` of downloaded bytes.
        candidate_file: Path to a UTF-8 text file with one caption per line.
            Blank lines are ignored and surrounding whitespace is stripped.
        model_name: CLIP model identifier passed to ``clip.load``.

    Returns:
        The caption (str) whose normalized text embedding has the highest dot
        product (cosine similarity) with the normalized image embedding.

    Raises:
        ValueError: If ``candidate_file`` contains no non-blank lines.
        Errors raised by ``open``, ``Image.open``, ``clip.load`` or
        ``clip.tokenize`` are not caught here and propagate to the caller.
    """
    # Read the captions first: it is cheap, and it lets an empty file fail with
    # a clear message before any model loading or image decoding happens.
    with open(candidate_file, "r", encoding="utf-8") as f:
        candidate_captions = [line.strip() for line in f if line.strip()]
    if not candidate_captions:
        raise ValueError(
            f"No candidate captions found in {candidate_file!r}; "
            "expected one caption per line."
        )

    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Load the CLIP model and its preprocessing function (cached across calls).
    model, preprocess = _load_clip(model_name, device)

    # Load and preprocess the image; the context manager releases the
    # underlying file handle once the pixel data has been turned into a tensor.
    with Image.open(image_path) as image:
        image_input = preprocess(image).unsqueeze(0).to(device)

    # Tokenize the candidate captions.
    text_inputs = clip.tokenize(candidate_captions).to(device)

    with torch.no_grad():
        # Compute the image and text features.
        image_features = model.encode_image(image_input)
        text_features = model.encode_text(text_inputs)

        # Normalize the features to get cosine similarity as the dot product.
        image_features = image_features / image_features.norm(dim=-1, keepdim=True)
        text_features = text_features / text_features.norm(dim=-1, keepdim=True)

        # One similarity score per candidate caption.
        similarities = (image_features @ text_features.T).squeeze(0)

    # Select the caption with the highest similarity.
    best_caption_index = torch.argmax(similarities).item()
    return candidate_captions[best_caption_index]

In [None]:
def chat(timeout=10.0):
    """Prompt for an image URL, download it, and return the bytes in memory.

    Args:
        timeout: Seconds passed to ``requests.get`` as its ``timeout`` so an
            unresponsive server cannot hang the cell indefinitely.

    Returns:
        A ``BytesIO`` wrapping the body of the HTTP response.

    Raises:
        SystemExit: With code 1 when the request fails or the server answers
            with an error status; the failure is printed first.
    """
    image_url = input("Please enter the image URL: ").strip()
    try:
        response = requests.get(image_url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        print("Failed to retrieve the image. Error:", e)
        # Raise SystemExit directly instead of calling exit(): inside a
        # notebook `exit` is the IPython exit helper rather than a plain
        # "stop this cell" signal.
        raise SystemExit(1) from e
    return BytesIO(response.content)

In [None]:
# Interactive driver: fetch an image, print the best-matching caption from the
# candidate file, and repeat until the user answers anything other than "y".
candidate_file = "candidate_captions.txt"

while True:
    image_source = chat()
    caption = get_caption_from_file(image_source, candidate_file)
    print("Selected Caption:", caption)

    print("Would you like to continue? (y/n)")
    choice = input().strip().lower()
    if choice == 'y':
        continue

    print("Bye 👋")
    break

Please enter the image URL: https://media.istockphoto.com/id/517188688/photo/mountain-landscape.jpg?s=1024x1024&w=0&k=20&c=z8_rWaI8x4zApNEEG9DnWlGXyDIXe-OmsAyQ5fGPVV8=
Selected Caption: A dense forest with rays of sunlight filtering through.
Would you like to continue? (y/n)
n
Bye 👋
