<a href="https://colab.research.google.com/github/anamv4/Chatbot-Project/blob/main/Image_CaptioningChatbot_V.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install necessary libraries in Google Colab (Run this cell in your Colab environment)
!pip install gradio
!pip install gTTS
!pip install transformers
!pip install requests
!pip install Pillow
!pip install gradio gTTS transformers requests Pillow
!pip install yolov5
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
!pip install ultralytics

Collecting huggingface-hub<0.25.0,>=0.12.0 (from yolov5)
  Using cached huggingface_hub-0.24.7-py3-none-any.whl.metadata (13 kB)
Using cached huggingface_hub-0.24.7-py3-none-any.whl (417 kB)
Installing collected packages: huggingface-hub
  Attempting uninstall: huggingface-hub
    Found existing installation: huggingface-hub 0.26.5
    Uninstalling huggingface-hub-0.26.5:
      Successfully uninstalled huggingface-hub-0.26.5
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gradio 5.9.0 requires huggingface-hub>=0.25.1, but you have huggingface-hub 0.24.7 which is incompatible.[0m[31m
[0mSuccessfully installed huggingface-hub-0.24.7
Looking in indexes: https://download.pytorch.org/whl/cu118


In [2]:
import gradio as gr
from transformers import BlipProcessor, BlipForConditionalGeneration
from gtts import gTTS
import requests
from PIL import Image
from io import BytesIO  # Ensure this is included
import os
from pydub import AudioSegment
import torch
from yolov5 import load  # YOLOv5 wrapper for object detection

In [3]:
# Image captioning: the BLIP processor and model are loaded from the same checkpoint id
BLIP_CHECKPOINT = "Salesforce/blip-image-captioning-base"
blip_processor = BlipProcessor.from_pretrained(BLIP_CHECKPOINT)
blip_model = BlipForConditionalGeneration.from_pretrained(BLIP_CHECKPOINT)

# Object detection: pre-trained small YOLOv5 model, loaded through torch.hub
yolo_model = torch.hub.load("ultralytics/yolov5", "yolov5s")


Using cache found in /root/.cache/torch/hub/ultralytics_yolov5_master
YOLOv5 🚀 2024-12-14 Python-3.10.12 torch-2.5.1+cu121 CPU

Fusing layers... 
YOLOv5s summary: 213 layers, 7225885 parameters, 0 gradients, 16.4 GFLOPs
Adding AutoShape... 


In [4]:
# Google TTS Function
def synthesize_speech_gtts(text):
    """Generate speech from text using gTTS and save it as a WAV audio file.

    Args:
        text: The text to speak.

    Returns:
        The path of the generated WAV file, or None when there is nothing to
        speak or when synthesis/conversion fails. None is returned instead of
        the error text so callers never mistake an error message for a file path.
    """
    try:
        # Nothing to speak: skip the TTS call entirely.
        if not text or not str(text).strip():
            return None

        audio_dir = "output_audio"
        os.makedirs(audio_dir, exist_ok=True)
        audio_path = os.path.join(audio_dir, "response.mp3")
        gTTS(text=text, lang='en').save(audio_path)

        # Convert MP3 to WAV for compatibility
        wav_path = os.path.splitext(audio_path)[0] + ".wav"
        AudioSegment.from_mp3(audio_path).export(wav_path, format="wav")

        return wav_path
    except Exception as e:
        # Audio is best-effort: report the failure and let the caller continue without it.
        print(f"Speech synthesis failed: {e}")
        return None

In [5]:
# Image Captioning Function
def recognize_image_with_caption(image_path=None, image_url=None):
    """Generate a caption for an uploaded image or image URL.

    Args:
        image_path: Path of a local image file. Takes precedence over image_url.
        image_url: URL of an image to download when no image_path is given.

    Returns:
        A tuple (image, caption). On success, image is the RGB PIL image and
        caption is the generated text. On failure, image is None and the
        second element is an error message.
    """
    try:
        if image_path:
            # Use a context manager so the underlying file handle is closed promptly;
            # convert() returns a fully loaded copy.
            with Image.open(image_path) as source_image:
                processed_image = source_image.convert("RGB")
        elif image_url:
            # NOTE(review): image_url is user-supplied and fetched from this machine;
            # consider restricting allowed schemes/hosts before exposing this publicly.
            response = requests.get(
                image_url,
                headers={"User-Agent": "Mozilla/5.0"},
                timeout=10,  # do not hang forever on an unresponsive host
            )
            if response.status_code != 200:
                return None, f"Error fetching image: HTTP {response.status_code}"
            # A missing Content-Type header is treated the same as a non-image response.
            content_type = response.headers.get("Content-Type", "")
            if "image" not in content_type.lower():
                return None, "Error: The provided URL does not point to an image."
            processed_image = Image.open(BytesIO(response.content)).convert("RGB")
        else:
            return None, "No image or URL provided."

        # Generate caption
        inputs = blip_processor(processed_image, return_tensors="pt")
        outputs = blip_model.generate(**inputs)
        caption = blip_processor.decode(outputs[0], skip_special_tokens=True)
        return processed_image, caption
    except Exception as e:
        return None, f"Error processing image: {str(e)}"


In [6]:
# Object Detection Function
def detect_objects(image_path):
    """Detect objects in an image using YOLOv5.

    Args:
        image_path: The image to pass to the YOLOv5 model.

    Returns:
        A list with the name of each detected object. An empty list is returned
        when detection fails, so the result can always be iterated or joined as
        a list of names (an error string would be joined character by character).
    """
    try:
        results = yolo_model(image_path)
        # Extract detected object names from the first (only) image's results
        return results.pandas().xyxy[0]["name"].tolist()
    except Exception as e:
        # Report the failure but keep the return type consistent.
        print(f"Error in object detection: {str(e)}")
        return []

# Chatbot Function
def _last_detected_objects(chat_history):
    """Return the object list text from the most recent image reply in the history, or ''."""
    marker = "Detected objects: "
    for turn in reversed(chat_history or []):
        # Each turn is expected to be a (user, bot) pair; anything else is skipped.
        bot_message = turn[1] if isinstance(turn, (list, tuple)) and len(turn) == 2 else None
        if isinstance(bot_message, str) and marker in bot_message:
            return bot_message.split(marker, 1)[1].strip()
    return ""


def _speech_or_none(text):
    """Synthesize speech for text; return the audio file path, or None if no file was produced."""
    try:
        audio_path = synthesize_speech_gtts(text)
    except Exception:
        # Audio is best-effort; the text reply is still returned without it.
        return None
    # Only hand back something that really is a file (never an error message).
    if audio_path and os.path.isfile(audio_path):
        return audio_path
    return None


def conversational_image_chatbot(user_input, uploaded_image=None, image_url=None, chat_history=None):
    """Handle chatbot interaction and image captioning with object detection.

    Input priority is: uploaded image, then image URL, then text message.

    Args:
        user_input: Text typed by the user (may be empty).
        uploaded_image: Path of an uploaded image file, if any.
        image_url: URL of an image to caption, if any.
        chat_history: List of (user, bot) message pairs; a new list is created
            when None. The new turn is appended to it.

    Returns:
        A tuple (chat_history, image, audio_path): the updated history, the
        processed image (None for text-only turns or on error), and the path
        of the spoken reply (None when no audio could be produced).
    """
    if chat_history is None:
        chat_history = []
    user_label = f"User: {user_input or '[Image/URL provided]'}"

    try:
        image = None

        if uploaded_image:
            # Process uploaded image
            image, caption = recognize_image_with_caption(image_path=uploaded_image)
            if image is None:
                response = caption  # Error message
            else:
                # Only run detection once the image is known to be readable.
                detected_objects = detect_objects(uploaded_image)
                if isinstance(detected_objects, str):
                    # A string result is an error message; show it as-is instead of
                    # joining it character by character.
                    objects_text = detected_objects
                else:
                    objects_text = ", ".join(map(str, detected_objects)) or "none"
                response = f"Image caption: {caption}\nDetected objects: {objects_text}"
        elif image_url:
            # Process image URL
            image, caption = recognize_image_with_caption(image_url=image_url)
            response = caption if image is None else f"Image caption: {caption}"
        elif user_input:
            # Handle text input
            if "objects" in user_input.lower():
                # Nothing is detected during a text-only turn, so recall the objects
                # reported for the most recent image in the conversation.
                previous_objects = _last_detected_objects(chat_history)
                if previous_objects:
                    response = f"The detected objects are: {previous_objects}"
                else:
                    response = "No objects have been detected yet. Please upload an image first."
            else:
                response = f"You said: {user_input}"
        else:
            response = "Please provide an input (text, image, or image URL)."

        # Add to chat history
        chat_history.append((user_label, f"Chatbot: {response}"))

        # Generate audio response (error replies are spoken like any other reply)
        return chat_history, image, _speech_or_none(response)

    except Exception as e:
        # Keep the existing conversation and record the error as a well-formed
        # (user, bot) pair so the chat display can render it.
        error_message = f"Error occurred: {str(e)}"
        chat_history.append((user_label, f"Chatbot: {error_message}"))
        return chat_history, None, _speech_or_none(error_message)


In [7]:
# Gradio UI
def gradio_interface():
    """Build the Gradio Blocks UI for the chatbot and launch it."""
    with gr.Blocks() as demo:
        gr.Markdown("# Conversational Image Recognition Chatbot with Object Detection and Voice Response")

        # First row: everything the user can send, plus the send button
        with gr.Row():
            message_box = gr.Textbox(label="Chat Message", placeholder="Type your message here...")
            image_upload = gr.Image(label="Upload an Image (Optional)", type="filepath")
            url_box = gr.Textbox(label="Image URL (Optional)", placeholder="Paste an image URL here...")
            send_button = gr.Button("Send")

        # Second row: conversation, image preview, and the spoken reply
        with gr.Row():
            history_view = gr.Chatbot(label="Chat History")
            image_preview = gr.Image(label="Image Preview", type="numpy", interactive=False)
            voice_player = gr.Audio(label="Voice Response", type="filepath")

        # Wire the button to the chatbot handler; the history component is both
        # an input (current conversation) and an output (updated conversation).
        handler_inputs = [message_box, image_upload, url_box, history_view]
        handler_outputs = [history_view, image_preview, voice_player]
        send_button.click(
            fn=conversational_image_chatbot,
            inputs=handler_inputs,
            outputs=handler_outputs,
        )

    demo.launch()

# Run the interface
gradio_interface()

Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://465dd1644a27d54f77.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
