In [1]:
!pip install transformers gradio torch torchvision pillow

Collecting gradio
  Downloading gradio-5.0.2-py3-none-any.whl.metadata (15 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0 (from gradio)
  Downloading fastapi-0.115.2-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.4.0-py3-none-any.whl.metadata (2.9 kB)
Collecting gradio-client==1.4.0 (from gradio)
  Downloading gradio_client-1.4.0-py3-none-any.whl.metadata (7.1 kB)
Collecting httpx>=0.24.1 (from gradio)
  Downloading httpx-0.27.2-py3-none-any.whl.metadata (7.1 kB)
Collecting huggingface-hub<1.0,>=0.23.2 (from transformers)
  Downloading huggingface_hub-0.25.2-py3-none-any.whl.metadata (13 kB)
Collecting orjson~=3.0 (from gradio)
  Downloading orjson-3.10.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (50 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.4/50.4 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
Collectin

In [2]:
import torch
from torchvision import transforms
from PIL import Image
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
import gradio as gr
from transformers import pipeline

# Sentiment analysis and text classification for actions
sentiment_pipeline = pipeline("sentiment-analysis")
action_pipeline = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

# Pretrained image captioning model
caption_model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
image_processor = ViTImageProcessor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
caption_model.to(device)

# Preprocess the image
def preprocess_image(image):
    return image_processor(image, return_tensors="pt").pixel_values.to(device)

# Generate caption
def generate_caption(image):
    pixel_values = preprocess_image(image)
    output_ids = caption_model.generate(pixel_values, max_length=16, num_beams=4, return_dict_in_generate=True).sequences
    caption = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return caption

# Predict sentiment from the generated caption
def predict_sentiment(caption):
    sentiment_result = sentiment_pipeline(caption)
    return sentiment_result[0]['label'], sentiment_result[0]['score']

# Predict possible action from the generated caption
def predict_action(caption):
    actions = ["running", "jumping", "walking", "eating", "playing", "talking"]
    action_result = action_pipeline(caption, candidate_labels=actions)
    return action_result['labels'][0], action_result['scores'][0]

# Gradio interface function
def analyze_image(image):
    caption = generate_caption(image)
    sentiment, sentiment_score = predict_sentiment(caption)
    action, action_score = predict_action(caption)

    result = {
        "Caption": caption,
        "Sentiment": f"{sentiment} ({sentiment_score:.2f})",
        "Possible Action": f"{action} ({action_score:.2f})"
    }
    return result

# Gradio UI interface
# Changed gr.inputs.Image to gr.Image
interface = gr.Interface(
    fn=analyze_image,
    inputs=gr.Image(type="pil"),  # This line is changed
    outputs="json",
    title="Image Captioning with Contextual Insights",
    description="Upload an image and receive a descriptive caption with sentiment analysis and possible actions.",
)

interface.launch()

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.61k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/982M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/228 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/241 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/120 [00:00<?, ?B/s]

Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://459f5dd9a3ceeaf969.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


