In [None]:
!pip install transformers  gradio diffusers torch gtts Pillow deep_translator

Collecting gradio
  Downloading gradio-4.44.1-py3-none-any.whl.metadata (15 kB)
Collecting diffusers
  Downloading diffusers-0.30.3-py3-none-any.whl.metadata (18 kB)
Collecting gtts
  Downloading gTTS-2.5.3-py3-none-any.whl.metadata (4.1 kB)
Collecting deep_translator
  Downloading deep_translator-1.11.4-py3-none-any.whl.metadata (30 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0 (from gradio)
  Downloading fastapi-0.115.0-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.4.0-py3-none-any.whl.metadata (2.9 kB)
Collecting gradio-client==1.3.0 (from gradio)
  Downloading gradio_client-1.3.0-py3-none-any.whl.metadata (7.1 kB)
Collecting httpx>=0.24.1 (from gradio)
  Downloading httpx-0.27.2-py3-none-any.whl.metadata (7.1 kB)
Collecting orjson~=3.0 (from gradio)
  Downloading orjson-3.10.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (50 kB)

In [None]:
# Import necessary libraries
import os  # For file handling
import tempfile  # For per-request temporary audio files

import gradio as gr  # Gradio for creating web interface
import torch  # To enable GPU acceleration
from deep_translator import GoogleTranslator  # For automatic translation
from diffusers import StableDiffusionPipeline  # For generating images from text
from gtts import gTTS  # Google Text-to-Speech for generating audio from text
from PIL import Image  # To handle image processing
from transformers import BlipProcessor, BlipForConditionalGeneration  # For image captioning

# Function to translate text to a target language
def translate_text(text, target_language):
    """Translate ``text`` into ``target_language`` with Google Translate.

    The source language is auto-detected by the translator.

    Args:
        text: The string to translate. ``None``, empty and whitespace-only
            values are treated as "nothing to translate".
        target_language: Language code understood by ``GoogleTranslator``
            (this notebook passes codes such as ``"en"`` and ``"ar"``).

    Returns:
        The translated string, or ``""`` when there was nothing to translate.
    """
    # Short-circuit blank input: it avoids a pointless network round-trip and
    # keeps the behaviour independent of how the installed deep_translator
    # release validates empty payloads.
    if text is None or (isinstance(text, str) and not text.strip()):
        return ""
    return GoogleTranslator(source='auto', target=target_language).translate(text)

# Decide once where inference runs so the device and the dtype always agree
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the BLIP model and processor for image captioning
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

# Load the Stable Diffusion pipeline for text-to-image generation.
# Half precision is only requested on GPU: a float16 pipeline left on the CPU
# is not usable for inference, so fall back to float32 there.
pipe = StableDiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-2-1",
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
)

# Move both models to the selected device (a no-op when it is the CPU)
model.to(device)
pipe.to(device)

# Function for generating captions from images, with text-to-speech
def generate_caption(image, target_language):
    """Caption an image, translate the caption and synthesise it as speech.

    Args:
        image: The image to describe, as handed over by the Gradio image
            input (configured with ``type="pil"`` in this notebook).
        target_language: Language code used both for the translation and for
            the gTTS voice.

    Returns:
        A ``(translated_caption, audio_path)`` tuple, where ``audio_path`` is
        the path of an MP3 file containing the spoken caption.

    Raises:
        gr.Error: If no image was provided.
    """
    if image is None:
        # Surface a readable message in the UI instead of an opaque processor failure
        raise gr.Error("Please upload an image to caption.")

    device = "cuda" if torch.cuda.is_available() else "cpu"
    inputs = processor(image, return_tensors="pt").to(device)
    out = model.generate(**inputs)

    caption = processor.decode(out[0], skip_special_tokens=True)

    translated_caption = translate_text(caption, target_language)

    # Each request gets its own file: a fixed "output.mp3" would be overwritten
    # by concurrent users of the shared app. The handle is closed before gTTS
    # writes to the path so this also works where open files are locked.
    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as audio_file:
        audio_path = audio_file.name
    gTTS(text=translated_caption, lang=target_language).save(audio_path)

    return translated_caption, audio_path

# Function for generating images from text descriptions
def generate_image_from_text(description, image_input=None):
    """Generate an image from a text description and/or a reference image.

    The description (in any language) is translated to English. When an image
    is supplied, its BLIP caption is appended to the prompt. The resulting
    prompt is fed to the Stable Diffusion pipeline.

    Args:
        description: Free-text description; may be blank when an image is given.
        image_input: Optional reference image whose caption enriches the prompt.

    Returns:
        The first image produced by the pipeline.

    Raises:
        gr.Error: If neither a description nor an image yields any prompt text.
    """
    prompt_parts = []

    # Only translate when there is actual text; a blank box is valid if an image is given
    if description and description.strip():
        prompt_parts.append(translate_text(description, "en"))

    if image_input is not None:
        device = "cuda" if torch.cuda.is_available() else "cpu"
        inputs = processor(image_input, return_tensors="pt").to(device)
        out = model.generate(**inputs)
        # Append the image caption to the text
        prompt_parts.append(processor.decode(out[0], skip_special_tokens=True))

    prompt = " ".join(part for part in prompt_parts if part).strip()
    if not prompt:
        # Avoid spending a full diffusion run on an empty prompt
        raise gr.Error("Please enter a description or upload an image.")

    image = pipe(prompt, num_inference_steps=70, guidance_scale=6.5).images[0]

    return image

# Captioning tab: an image and a language go in, a translated caption and its audio come out.

# (label shown to the user, language code passed to the function)
caption_languages = [
    ("English", "en"),
    ("Arabic", "ar"),
    ("French", "fr"),
    ("Spanish", "es"),
    ("German", "de"),
]

caption_image_input = gr.Image(type="pil")
caption_language_input = gr.Dropdown(
    choices=caption_languages,
    label="Select Output Language",
    value="ar",  # Arabic is preselected
)
caption_audio_output = gr.Audio(type="filepath")

# Clickable samples: [image URL, language code]
caption_examples = [
    ["https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQoQVWVsG_u0dE0IDFQTszJRcSz1kl3PlXb_g&s", "ar"],
    ["https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcSwf8ck2f453p_Nt3jJjo6Xfl5Iu4IpprnLJw&s", "en"],
]

iface_image_to_text = gr.Interface(
    fn=generate_caption,
    inputs=[caption_image_input, caption_language_input],
    outputs=["text", caption_audio_output],
    title="Image Captioning with Speech Output",
    description="Upload an image and get a caption in the selected language with audio output.",
    examples=caption_examples,
)


# Sample prompts (English and Arabic) offered below the text-to-image form
examples = [[prompt] for prompt in (
    "A beautiful sunset over the mountains.",
    "A futuristic cityscape with flying cars.",
    "منظر للبحر مع قارب صغير يطفو على سطح الماء",
    "رجل يقرأ كتابًا تحت شجرة في يوم مشمس",
    "A spaceship landing on Mars.",
    "سفينة تبحر في محيط هادئ تحت سماء زرقاء صافية",
)]

# Text-to-image tab: a description box plus an optional reference image
description_box = gr.Textbox(
    label="Enter your description",
    placeholder="Type your description here...",
)
reference_image = gr.Image(type="pil", label="Optional: Upload an image")

iface_text_to_image = gr.Interface(
    fn=generate_image_from_text,
    inputs=[description_box, reference_image],
    outputs="image",
    title="Text-to-Image Generation",
    description="Enter a description or upload an image to generate an image.",
    examples=examples,
)

# One app with a tab per feature; insertion order of the mapping is the tab order
tabs = {
    "Image Captioning with Speech": iface_image_to_text,
    "Text-to-Image": iface_text_to_image,
}
iface_combined = gr.TabbedInterface(list(tabs.values()), tab_names=list(tabs.keys()))

# Start the web app
iface_combined.launch()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/287 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/506 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/4.56k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

model_index.json:   0%|          | 0.00/537 [00:00<?, ?B/s]

Fetching 13 files:   0%|          | 0/13 [00:00<?, ?it/s]

scheduler/scheduler_config.json:   0%|          | 0.00/345 [00:00<?, ?B/s]

tokenizer/merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

(…)ature_extractor/preprocessor_config.json:   0%|          | 0.00/342 [00:00<?, ?B/s]

tokenizer/special_tokens_map.json:   0%|          | 0.00/460 [00:00<?, ?B/s]

text_encoder/config.json:   0%|          | 0.00/633 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.36G [00:00<?, ?B/s]

unet/config.json:   0%|          | 0.00/939 [00:00<?, ?B/s]

tokenizer/vocab.json:   0%|          | 0.00/1.06M [00:00<?, ?B/s]

diffusion_pytorch_model.safetensors:   0%|          | 0.00/3.46G [00:00<?, ?B/s]

vae/config.json:   0%|          | 0.00/611 [00:00<?, ?B/s]

diffusion_pytorch_model.safetensors:   0%|          | 0.00/335M [00:00<?, ?B/s]

tokenizer/tokenizer_config.json:   0%|          | 0.00/824 [00:00<?, ?B/s]

Loading pipeline components...:   0%|          | 0/6 [00:00<?, ?it/s]

Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://b317399c3162fba189.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


