### Importing Libraries

In [1]:
import torch
from PIL import Image
from transformers import CLIPProcessor, CLIPModel, AutoProcessor, AutoModelForCausalLM
import pandas as pd

### Image Loading and Preprocessing

In [2]:
def load_and_preprocess_image(image_path):
    """Load an image from disk and convert it into CLIP processor inputs.

    Args:
        image_path: Location of the image file, passed straight to
            ``PIL.Image.open``.

    Returns:
        tuple: ``(inputs, processor)`` where ``inputs`` is the processor output
        for the RGB image (requested with ``return_tensors="pt"``) and
        ``processor`` is the ``CLIPProcessor`` instance that produced it, so
        the caller can reuse it for text.

    Note:
        Errors from ``Image.open`` (for example a missing file) are not caught
        here and propagate to the caller.
    """
    # Image.open is lazy and keeps the file handle open. The context manager
    # closes it deterministically (multi-frame formats are otherwise left
    # open), while convert() hands back an independent in-memory RGB copy.
    with Image.open(image_path) as source_image:
        image = source_image.convert("RGB")

    processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
    inputs = processor(images=image, return_tensors="pt")
    return inputs, processor

### Generating Image Embeddings

In [3]:
def generate_image_embeddings(inputs):
    """Compute CLIP image features for already-preprocessed inputs.

    Args:
        inputs: Mapping unpacked as keyword arguments into
            ``CLIPModel.get_image_features`` (presumably the output of the
            CLIP processor -- verify against the caller).

    Returns:
        tuple: ``(image_features, model)`` -- the features returned by the
        model and the ``CLIPModel`` instance that was loaded to produce them.
    """
    clip = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")

    # Pure inference: no gradients need to be tracked for this forward pass.
    with torch.no_grad():
        embeddings = clip.get_image_features(**inputs)

    return embeddings, clip

### Matching Image with Text Embeddings

In [4]:
def match_captions(image_features, captions, clip_model, processor):
    """Rank candidate captions by cosine similarity to an image embedding.

    Args:
        image_features: Tensor holding the image embedding. Only the first row
            is ranked; a 1-D tensor is treated as a single row.
        captions: Sequence of caption strings to score.
        clip_model: Object exposing ``get_text_features(**text_inputs)``.
        processor: Callable that tokenizes ``text=captions`` into the keyword
            arguments expected by ``clip_model.get_text_features``.

    Returns:
        tuple: ``(best_captions, similarities)`` where ``best_captions`` is
        ``captions`` reordered from most to least similar and ``similarities``
        is the matching list of float cosine similarities. Both are empty when
        ``captions`` is empty.
    """
    captions = list(captions)
    if not captions:
        # Nothing to tokenize or rank.
        return [], []

    # 1. get text embeddings for the captions. truncation=True clips captions
    #    that exceed the tokenizer's maximum length instead of letting the
    #    text encoder fail on them.
    text_inputs = processor(
        text=captions, return_tensors="pt", padding=True, truncation=True
    )
    with torch.no_grad():
        text_features = clip_model.get_text_features(**text_inputs)

    # 2. cosine similarity == dot product of L2-normalised vectors. Using
    #    torch (imported at the top of the notebook) keeps this function
    #    independent of imports made in later cells.
    image_vectors = image_features.detach().cpu().float()
    if image_vectors.dim() == 1:
        image_vectors = image_vectors.unsqueeze(0)
    text_vectors = text_features.detach().cpu().float()

    image_unit = torch.nn.functional.normalize(image_vectors, dim=-1)
    text_unit = torch.nn.functional.normalize(text_vectors, dim=-1)
    similarities = (image_unit @ text_unit.T)[0]

    # 3. order captions from best to worst match.
    best_indices = torch.argsort(similarities, descending=True).tolist()
    best_captions = [captions[i] for i in best_indices]

    return best_captions, similarities[best_indices].tolist()

### Main Function

In [5]:
def image_captioning(image_path, candidate_captions):
    """Rank ``candidate_captions`` against the image stored at ``image_path``.

    Runs the three pipeline steps in order: preprocess the image, embed it,
    then score the captions against that embedding.

    Args:
        image_path: Forwarded to ``load_and_preprocess_image``.
        candidate_captions: Forwarded to ``match_captions``.

    Returns:
        tuple: ``(best_captions, similarities)`` exactly as produced by
        ``match_captions``.
    """
    pixel_inputs, clip_processor = load_and_preprocess_image(image_path)
    embedding, model = generate_image_embeddings(pixel_inputs)

    ranked, scores = match_captions(
        embedding, candidate_captions, model, clip_processor
    )
    return ranked, scores

### Candidate Captions

In [6]:
# header=None: the sheet has no header row, so the first row is read as data.
captions = pd.read_excel("social_media_captions.xlsx", header=None)

# Use the first column only. Cells are coerced to str and stripped so that
# numeric or whitespace-only cells never reach the text tokenizer, which
# expects non-empty strings.
candidate_captions = [
    text
    for text in captions.iloc[:, 0].dropna().astype(str).str.strip()
    if text
]

### Testing

In [7]:
from sklearn.metrics.pairwise import cosine_similarity 

# NOTE(review): machine-specific absolute path -- adjust before running elsewhere.
best_captions, similarities = image_captioning("C:/Users/dhevi/OneDrive/Pictures/saree.jpg", candidate_captions)

# Keep at most five results; fewer when the candidate list is shorter.
top_n = min(5, len(best_captions))
top_best_captions = best_captions[:top_n]
top_similarities = similarities[:top_n]

# Report the number of captions actually shown rather than a hard-coded 5.
print(f"Top {top_n} Best Captions:")
for rank, (caption, similarity) in enumerate(zip(top_best_captions, top_similarities), start=1):
    print(f"{rank}. {caption} (Similarity: {similarity:.4f})")

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


Top 5 Best Captions:
1. Peace begins with me. (Similarity: 0.2287)
2. Office vibes. (Similarity: 0.2278)
3. Sometimes you have to create your own sunshine. (Similarity: 0.2274)
4. Meeting mode activated. (Similarity: 0.2261)
5. The journey is the destination. (Similarity: 0.2256)


In [8]:
from sklearn.metrics.pairwise import cosine_similarity 

# NOTE(review): machine-specific absolute path -- adjust before running elsewhere.
best_captions, similarities = image_captioning("D:/pictures/yashu phone/2023-05-16 yashu phone/yashu phone 164.jpg", candidate_captions)

# Keep at most five results; fewer when the candidate list is shorter.
top_n = min(5, len(best_captions))
top_best_captions = best_captions[:top_n]
top_similarities = similarities[:top_n]

# Report the number of captions actually shown rather than a hard-coded 5.
print(f"Top {top_n} Best Captions:")
for rank, (caption, similarity) in enumerate(zip(top_best_captions, top_similarities), start=1):
    print(f"{rank}. {caption} (Similarity: {similarity:.4f})")

Top 5 Best Captions:
1. Friendship isn’t a big thing—it’s a million little things. (Similarity: 0.2519)
2. A friend is what the heart needs all the time. (Similarity: 0.2516)
3. Friends are the siblings God never gave us. (Similarity: 0.2482)
4. Friends make life so much better. (Similarity: 0.2481)
5. Memories take you back, friendship brings you forward. (Similarity: 0.2469)
