# 🖼️ Image Captioning in Google Colab (Simple & Mobile Friendly)
Upload an image and generate a caption for it using the pre-trained `nlpconnect/vit-gpt2-image-captioning` model.

In [None]:
!pip install transformers==4.21.2 pillow matplotlib

In [None]:

from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
import torch
from PIL import Image
import requests
import matplotlib.pyplot as plt
from google.colab import files


In [None]:

# Load the captioning model together with its image processor and tokenizer.
# All three are loaded from the same checkpoint, so its ID is defined once.
MODEL_NAME = "nlpconnect/vit-gpt2-image-captioning"

model = VisionEncoderDecoderModel.from_pretrained(MODEL_NAME)
feature_extractor = ViTImageProcessor.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Use the GPU when the runtime has one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# The trailing ';' stops the notebook from echoing the whole module tree
# that `.to()` returns as the cell's last expression.
model.to(device);


In [None]:

# Upload an image
uploaded = files.upload()
if not uploaded:
    # Nothing was uploaded (e.g. the dialog was cancelled): fail with a readable
    # message instead of the bare StopIteration that next(iter(...)) would raise.
    raise ValueError("No file was uploaded - re-run this cell and choose an image.")

# Only the first uploaded entry is captioned; any additional files are ignored.
image_path = next(iter(uploaded))
# Normalise to 3-channel RGB so grayscale / RGBA / palette images are handled uniformly.
image = Image.open(image_path).convert("RGB")

# Show the image
fig, ax = plt.subplots()
ax.imshow(image)
ax.axis("off")
plt.show()


In [None]:

# Generate caption
def generate_caption(image, max_length=16, num_beams=4):
    """Generate a text caption for a single image.

    Relies on the notebook-level ``feature_extractor``, ``model``,
    ``tokenizer`` and ``device`` objects created in the model-loading cell.

    Args:
        image: The image to caption (the notebook passes an RGB ``PIL.Image``).
        max_length: ``max_length`` argument forwarded to ``model.generate``.
        num_beams: ``num_beams`` argument forwarded to ``model.generate``.

    Returns:
        The decoded caption string with special tokens removed and
        leading/trailing whitespace stripped.
    """
    # Preprocess the image into model inputs and move them to the model's device.
    pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values
    pixel_values = pixel_values.to(device)

    output_ids = model.generate(
        pixel_values,
        max_length=max_length,
        num_beams=num_beams,
    )

    # Only one image was passed in, so only the first generated sequence is decoded.
    caption_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return caption_text.strip()

# Caption the image loaded in the upload cell and print the result.
caption = generate_caption(image)
print("📝 Caption:", caption)
