In [None]:
# Install necessary packages
!pip install gtts transformers torch pillow opencv-python googletrans==4.0.0-rc1

import torch
import os
import cv2
import IPython.display as display
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
from gtts import gTTS
from google.colab import files
from google.colab import output
import time
import base64
import numpy as np
import io
from googletrans import Translator

# Choose the compute device, then load the BLIP captioning processor and
# model from the "Salesforce/blip-image-captioning-base" checkpoint and place
# the model on that device.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained(
    "Salesforce/blip-image-captioning-base"
).to(device)

# Function to capture image using Colab's webcam method
def capture_image_colab():
    """Take one webcam photo through the Colab front end and save it to disk.

    A JavaScript helper is injected into the notebook output. It shows a video
    element fed by the webcam together with a "Capture Image" button; when the
    button is clicked the current frame is drawn to a canvas, the camera
    stream is stopped, the temporary elements are removed, and the frame is
    handed back as a JPEG data URL. That payload is decoded and written to
    ``captured_image.jpg`` in the current working directory.

    Returns:
        The path of the saved image, ``"captured_image.jpg"``.
    """
    from IPython.display import Javascript
    from google.colab.output import eval_js
    from base64 import b64decode

    # Browser-side helper that defines takePhoto(); see the docstring for the
    # steps it performs.
    take_photo_script = '''
        async function takePhoto() {
            const div = document.createElement('div');
            const video = document.createElement('video');
            const capture = document.createElement('button');
            capture.textContent = 'Capture Image';
            div.appendChild(video);
            div.appendChild(capture);
            document.body.appendChild(div);

            const stream = await navigator.mediaDevices.getUserMedia({video: true});
            video.srcObject = stream;
            await new Promise((resolve) => video.onloadedmetadata = resolve);
            video.play();

            await new Promise((resolve) => capture.onclick = resolve);
            const canvas = document.createElement('canvas');
            canvas.width = video.videoWidth;
            canvas.height = video.videoHeight;
            canvas.getContext('2d').drawImage(video, 0, 0);
            stream.getTracks().forEach(track => track.stop());
            div.remove();

            return canvas.toDataURL('image/jpeg');
        }
    '''
    display.display(Javascript(take_photo_script))

    # Run the helper and collect the data URL it returns.
    data_url = eval_js('takePhoto()')

    # Everything after the first comma is the base64-encoded JPEG payload.
    encoded_frame = data_url.split(',')[1]
    frame = Image.open(io.BytesIO(b64decode(encoded_frame)))

    saved_path = "captured_image.jpg"
    frame.save(saved_path)

    print("✅ Image Captured Successfully!")
    return saved_path

# Function to generate caption
def generate_caption(image_path):
    """Display the image at *image_path* and return a BLIP caption for it.

    Uses the module-level ``processor``, ``model`` and ``device``.

    Args:
        image_path: Path of the image file to caption. A falsy value
            (``None`` or ``""``) is treated as "no image selected".

    Returns:
        The generated caption string, or ``None`` when no image was selected
        or the file could not be opened as an image.
    """
    if not image_path:
        print("❌ No image selected.")
        return None

    # Image.open() is lazy and keeps the underlying file open; the context
    # manager releases the handle once the RGB copy has been made.
    # Missing files and unrecognised image data both surface as OSError
    # (PIL.UnidentifiedImageError is a subclass of it).
    try:
        with Image.open(image_path) as source_image:
            image = source_image.convert("RGB")
    except OSError as e:
        print("❌ Error opening image:", str(e))
        return None

    display.display(image)  # Show image

    # Process image
    inputs = processor(images=image, return_tensors="pt").to(device)

    # Generate caption (inference only, so gradient tracking is disabled)
    with torch.no_grad():
        caption_ids = model.generate(**inputs)

    caption = processor.batch_decode(caption_ids, skip_special_tokens=True)[0]
    print(f"📝 Generated Caption: {caption}")
    return caption

# Function to translate text and play audio
def play_audio(text, lang):
    """Speak *text* in language *lang*, translating it first when required.

    Text is translated with ``Translator`` only when *lang* is ``"ta"`` or
    ``"hi"``; for any other language code the original text is spoken as-is.
    The speech is saved to ``caption_<lang>.mp3`` and shown in the notebook
    output as an audio widget with autoplay enabled.

    Any exception raised along the way is caught and reported with a printed
    message rather than propagated.

    Args:
        text: The caption to speak.
        lang: Language code handed to gTTS, and used as the translation
            target for ``"ta"`` and ``"hi"``.
    """
    try:
        translator = Translator()

        # Translate only for Tamil and Hindi; everything else is spoken from
        # the untranslated text.
        if lang in ("ta", "hi"):
            spoken_text = translator.translate(text, dest=lang).text
        else:
            spoken_text = text

        print(f"📝 Translated Caption: {spoken_text}")

        # Synthesize the speech and write it next to the notebook.
        output_path = f"caption_{lang}.mp3"
        gTTS(text=spoken_text, lang=lang, slow=False).save(output_path)

        # Embed an autoplaying audio widget in the cell output.
        display.display(display.Audio(output_path, autoplay=True))
        print(f"🔊 Playing caption audio in {lang}...")

    except Exception as err:
        print("❌ Error generating audio:", str(err))

# Main program execution
#
# Interactive flow: choose an image source (upload or webcam), caption the
# image, then choose a language and play the caption as speech.
print("Choose an option:")
print("1️⃣ Upload an image")
print("2️⃣ Capture real-time image from webcam")

# Strip the answer so stray whitespace around "1"/"2" is not rejected.
choice = input("Enter 1 or 2: ").strip()

image_path = None
valid_choice = True

if choice == "1":
    print("📤 Please upload an image...")
    uploaded = files.upload()
    if uploaded:
        image_path = next(iter(uploaded))  # Name of the first uploaded file
        print("✅ Image Uploaded Successfully!")
    else:
        print("❌ No image uploaded.")

elif choice == "2":
    image_path = capture_image_colab()

else:
    # Do not call exit() here: it is an interactive-shell helper, and inside
    # a notebook kernel it requests kernel shutdown, which would discard the
    # loaded model. Skipping the remaining steps is enough.
    print("❌ Invalid choice. Exiting program.")
    valid_choice = False

# Process the selected image (skipped entirely after an invalid choice)
caption = generate_caption(image_path) if valid_choice else None

if caption:
    # Choose language for audio output
    print("\n🔊 Choose an audio language:")
    print("1️⃣ English")
    print("2️⃣ Tamil")
    print("3️⃣ Hindi")

    lang_choice = input("Enter 1, 2, or 3: ").strip()

    lang_map = {"1": "en", "2": "ta", "3": "hi"}
    lang = lang_map.get(lang_choice, "en")  # Default to English if invalid

    play_audio(caption, lang)