# Automated Image Captioning

In [None]:
import requests
from PIL import Image
import matplotlib.pyplot as plt
from transformers import BlipProcessor, BlipForConditionalGeneration

# Image URL
image_url = "https://picsum.photos/id/237/800/600.jpg"

# Download the image.
# - timeout: without it, requests waits indefinitely on a stalled connection.
# - raise_for_status(): surface HTTP errors (404, 5xx) here instead of letting
#   PIL fail later with an opaque "cannot identify image file" error.
# - decode_content: ask the raw stream to undo any gzip/deflate
#   content-encoding so PIL receives the actual image bytes.
# The `with` block releases the connection once the pixels are loaded;
# convert("RGB") forces PIL to read the full image before the stream closes.
with requests.get(image_url, stream=True, timeout=30) as response:
    response.raise_for_status()
    response.raw.decode_content = True
    image = Image.open(response.raw).convert("RGB")

# Initialize BLIP processor and model
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

# Prepare the image as PyTorch tensors ("pt") for the model
inputs = processor(image, return_tensors="pt")

# Generate caption token ids and decode the first (only) sequence to text
outputs = model.generate(**inputs)
caption = processor.decode(outputs[0], skip_special_tokens=True)

# Show image and caption (inline)
plt.figure()
plt.imshow(image)
plt.axis("off")
plt.title(f"Generated Caption: {caption}")
plt.show()

print("Generated Caption:", caption)


## Image Captioning with Gradio Interface

In [None]:
# install
!pip install gradio

In [None]:
import gradio as gr
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image

# =============================
# 1) Configuration
# =============================
# Checkpoint identifier handed to both `from_pretrained` calls below.
MODEL_NAME = "Salesforce/blip-image-captioning-base"

# Remote images offered as ready-made samples in the UI.
EXAMPLE_IMAGES = [
    "https://images.unsplash.com/photo-1501785888041-af3ef285b470",
    "https://images.unsplash.com/photo-1500530855697-b586d89ba3ee",
]

# =============================
# 2) Model + processor, loaded a single time at module level
# =============================
model = BlipForConditionalGeneration.from_pretrained(MODEL_NAME)
processor = BlipProcessor.from_pretrained(MODEL_NAME)

# -----------------------------
# 3) Captioning logic
# -----------------------------
def generate_caption(image: Image.Image, max_new_tokens: int = 30, num_beams: int = 5) -> str:
    """Generate a caption for ``image`` using the module-level BLIP model.

    Args:
        image: Picture to describe. It is converted to RGB before being
            handed to the processor.
        max_new_tokens: Forwarded to ``model.generate`` as ``max_new_tokens``.
        num_beams: Forwarded to ``model.generate`` as ``num_beams``.

    Returns:
        The decoded caption for the first generated sequence, with special
        tokens stripped.
    """
    # UI sliders may deliver their values as floats (e.g. 30.0) while
    # generation expects integer counts, so normalise them up front.
    max_new_tokens = int(max_new_tokens)
    num_beams = int(num_beams)

    rgb_image = image.convert("RGB")
    inputs = processor(images=rgb_image, return_tensors="pt")
    output_ids = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        num_beams=num_beams,
    )
    return processor.decode(output_ids[0], skip_special_tokens=True)

def caption_image(image: Image.Image, max_new_tokens: int, num_beams: int) -> str:
    """Gradio callback: caption ``image`` or return a user-facing message.

    When no image is supplied, a prompt asking for one is returned. Any
    exception raised while captioning is reported as text instead of being
    propagated to the UI.
    """
    if image is not None:
        try:
            result = generate_caption(
                image,
                max_new_tokens=max_new_tokens,
                num_beams=num_beams,
            )
        except Exception as e:
            # UI boundary: show the failure in the caption box rather than crash.
            result = f"An error occurred: {str(e)}"
    else:
        result = "Please upload an image or pick an example."
    return result

# -----------------------------
# 4) Gradio UI
# -----------------------------
# Assemble the UI: a header, an image input next to the caption output,
# collapsible generation settings, a trigger button and clickable examples.
with gr.Blocks(title="Automated Image Captioning (BLIP)") as demo:
    gr.Markdown(
        """
# Automated Image Captioning (BLIP)
Upload an image (or select an example) to generate a caption using **Salesforce BLIP**.
        """.strip()
    )

    with gr.Row():
        image_in = gr.Image(type="pil", label="Input Image")
        caption_out = gr.Textbox(label="Generated Caption", lines=3)

    with gr.Accordion("Advanced settings", open=False):
        max_new_tokens = gr.Slider(5, 80, value=30, step=1, label="Max new tokens")
        num_beams = gr.Slider(1, 10, value=5, step=1, label="Beam search (num_beams)")

    btn = gr.Button("Generate Caption")

    # One example row per configured URL, so editing EXAMPLE_IMAGES is enough
    # to add or remove samples (no hard-coded indices to keep in sync).
    gr.Examples(
        examples=[[url] for url in EXAMPLE_IMAGES],
        inputs=[image_in],
        label="Try examples",
    )

    # The button and the image-change event run the same callback with the
    # same inputs/outputs; define the wiring once so the two cannot drift.
    caption_event = dict(
        fn=caption_image,
        inputs=[image_in, max_new_tokens, num_beams],
        outputs=[caption_out],
    )

    # Explicit trigger.
    btn.click(**caption_event)

    # Optional: auto-update when image changes (similar to live=True but controlled)
    image_in.change(**caption_event)

demo.launch()