In [None]:
# Step 1: Install the necessary tools (only needs to be run once per runtime session)
!pip install transformers
!pip install pillow

# Step 2: Load the tools we need
from transformers import VisionEncoderDecoderModel, ViTFeatureExtractor, AutoTokenizer
from transformers import ViTImageProcessor
from PIL import Image
import requests

# Step 3: Set up the pre-trained image captioning model
# The model, image preprocessor and tokenizer are all loaded from the same
# checkpoint name, so it is defined once here.
MODEL_CHECKPOINT = "nlpconnect/vit-gpt2-image-captioning"

model = VisionEncoderDecoderModel.from_pretrained(MODEL_CHECKPOINT)
# ViTImageProcessor replaces the deprecated ViTFeatureExtractor. The variable
# keeps the name `feature_extractor` because generate_caption() reads it.
feature_extractor = ViTImageProcessor.from_pretrained(MODEL_CHECKPOINT)
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

# Step 4: Define a function to generate captions
def generate_caption(image_path: str) -> str:
    """Generate a caption for the image stored at *image_path*.

    Uses the module-level ``feature_extractor``, ``model`` and ``tokenizer``.

    Args:
        image_path: Path of an image file that Pillow can open.

    Returns:
        The decoded text of the first generated sequence, with special
        tokens removed.
    """
    # Open inside a context manager so the underlying file handle is always
    # released; convert() returns a separate image that stays usable after
    # the source file has been closed.
    with Image.open(image_path) as source:
        image = source.convert("RGB")

    # Prepare the image for the model
    pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values

    # Generate the caption
    output_ids = model.generate(pixel_values)
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)

# Step 5: Upload your image and generate a caption
from google.colab import files
uploaded = files.upload()  # This opens a window to upload your image

# Step 6: Get the caption for the uploaded image
if not uploaded:
    # Nothing was selected (e.g. the upload dialog was cancelled).
    print("No image was uploaded.")

for filename in uploaded:
    try:
        caption = generate_caption(filename)
    except OSError as err:
        # Pillow reports missing or unreadable image files with OSError
        # subclasses; report the problem and continue with the other files
        # instead of aborting the whole loop.
        print(f"Image: {filename}")
        print(f"Could not generate a caption: {err}")
        continue
    print(f"Image: {filename}")
    print(f"Caption: {caption}")