In [None]:
# Install the required libraries
!pip install -q transformers==4.41.0
!pip install bitsandbytes==0.41.3
!pip install -q git+https://github.com/openai/whisper.git
!pip install -q gradio
!pip install -q gTTS

[0m  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[0m

In [None]:
import torch
from transformers import BitsAndBytesConfig, pipeline

In [None]:
# Define configuration for 4-bit quantization to optimize memory usage and processing speed
quant_config = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_compute_dtype= torch.float16
)

In [None]:
# Load the model using Hugging Face’s pipeline with image-to-text task
model_id = "llava-hf/llava-1.5-7b-hf"

In [5]:
pipe = pipeline(
    "image-to-text",
    model=model_id,
    model_kwargs={"quantization_config": quant_config}
)

Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


config.json:   0%|          | 0.00/950 [00:00<?, ?B/s]

`low_cpu_mem_usage` was None, now set to True since model is quantized.


model.safetensors.index.json:   0%|          | 0.00/70.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.18G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/141 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.36k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/41.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/552 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


preprocessor_config.json:   0%|          | 0.00/505 [00:00<?, ?B/s]

In [11]:
pipe

<transformers.pipelines.image_to_text.ImageToTextPipeline at 0x7c47001af850>

In [12]:
import whisper
import gradio as gr
import time
import warnings
import os
from gtts import gTTS
from PIL import Image

Loading and Displaying an Example Image

In [13]:
image_path = "/content/1.jpg"

In [14]:
image = Image.open((image_path))

In [15]:
# Download the 'punkt' tokenizer from NLTK, which is used to split text into sentences for easy readability.
import nltk
nltk.download("punkt")
from nltk import sent_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [16]:
max_new_tokens = 250

In [17]:
# Define a detailed prompt to instruct the model on how to describe the image
prompt_instructions = """
Describe the image using as much detail as possible,
You are a helpful AI assistant who is able to answer question about the images.
What is the image all about?
Now generate the helpful answer
"""

In [18]:
prompt = "USER: <image>n" + prompt_instructions + "\nAssistant:"

In [19]:
# Generate a description for the image using the prompt and the pipeline model
outputs = pipe(image, prompt=prompt, generate_kwargs={"max_new_tokens": max_new_tokens})

In [20]:
outputs

[{'generated_text': "USER:  n\nDescribe the image using as much detail as possible,\nYou are a helpful AI assistant who is able to answer question about the images.\nWhat is the image all about?\nNow generate the helpful answer\n\nAssistant: The image features a close-up of a person's face, showing their skin with a red, itchy rash. The rash appears to be on the forehead and around the cheek area. The person's nose is also visible in the foreground. The image captures the discomfort and irritation that the individual is experiencing due to the skin condition."}]

In [21]:
# Print the generated sentences for easy reading
for sent in sent_tokenize(outputs[0]["generated_text"]):
  print(sent)

USER:  n
Describe the image using as much detail as possible,
You are a helpful AI assistant who is able to answer question about the images.
What is the image all about?
Now generate the helpful answer

Assistant: The image features a close-up of a person's face, showing their skin with a red, itchy rash.
The rash appears to be on the forehead and around the cheek area.
The person's nose is also visible in the foreground.
The image captures the discomfort and irritation that the individual is experiencing due to the skin condition.


In [22]:
warnings.filterwarnings("ignore")

Setting up Environment and Torch Device

In [23]:
import numpy as np

In [24]:
# Check if a GPU is available and set the device to CUDA if possible, otherwise use CPU
torch.cuda.is_available()

True

In [25]:
DEVICE = "cude" if torch.cuda.is_available() else "cpu"

In [26]:
print(f"Using torch {torch.__version__} ({DEVICE})")

Using torch 2.5.0+cu121 (cude)


Loading and Testing Whisper Model for Transcription

In [27]:
import whisper

In [28]:
model = whisper.load_model("small", device="cpu")

100%|███████████████████████████████████████| 461M/461M [00:05<00:00, 93.7MiB/s]


In [29]:
print(
    f"Model is {'multilingual' if model.is_multilingual else 'English-only'} "
    f"and has {sum(np.prod(p.shape) for p in model.parameters()):,} parameters."
)

Model is multilingual and has 240,582,912 parameters.


Logging Function for Saving Conversations

In [30]:
import re
import datetime

In [31]:
# Logger setup to save session data with timestamps
tstamp =datetime.datetime.now()
tstamp = str(tstamp).replace(" ", "_")
logfile = f"log{tstamp}.txt"

In [32]:
def writehistory(text):
  with open(logfile, "a", encoding='utf-8') as f:
    f.write(text)
    f.write("\n")
    f.close()

In [33]:
import requests

Image Description Function (img2txt)

In [35]:
from PIL import Image
import re

def img2txt(input_text, input_image):

    # Load the image
    image = Image.open(input_image)

    # Determine the appropriate prompt instructions
    if isinstance(input_text, tuple):
        prompt_instructions = """
        Describe the image using as much detail as possible. Is it a painting, a photograph? What colors are predominant, and what is the image about?
        """
    else:
        prompt_instructions = """
        Act as an expert in imagery descriptive analysis. Using as much detail as possible from the image, respond to the following prompt:
        """ + input_text

    # Construct the prompt
    prompt = "USER: <image>\n" + prompt_instructions + "\nASSISTANT:"

    # Generate response using the model pipeline
    outputs = pipe(image, prompt=prompt, generate_kwargs={"max_new_tokens": 200})

    # Extract the response text properly
    if outputs and "generated_text" in outputs[0] and len(outputs[0]["generated_text"]) > 0:
        match = re.search(r'ASSISTANT:\s*(.*)', outputs[0]["generated_text"])
        if match:
            # Extract the text after "ASSISTANT:"
            reply = match.group(1)
        else:
            reply = "No response found."
    else:
        reply = "No response generated."

    return reply


Audio-to-Text Transcription Function (transcribe)

In [36]:
def transcribe(audio):

    # Check if the audio input is None or empty
    if audio is None or audio == '':
        return ('','',None)  # Return empty strings and None audio file

    # language = 'en'

    audio = whisper.load_audio(audio)
    audio = whisper.pad_or_trim(audio)

    mel = whisper.log_mel_spectrogram(audio).to(model.device)

    _, probs = model.detect_language(mel)

    options = whisper.DecodingOptions()
    result = whisper.decode(model, mel, options)
    result_text = result.text

    return result_text

Text-to-Speech (TTS) Conversion

In [37]:
def text_to_speech(text, file_path):
    language = 'en'

    audioobj = gTTS(text = text,
                    lang = language,
                    slow = False)

    audioobj.save(file_path)

    return file_path

In [38]:
import locale
locale.getpreferredencoding = lambda: "UTF-8"

In [39]:
!ffmpeg -f lavfi -i anullsrc=r=44100:cl=mono -t 10 -q:a 9 -acodec libmp3lame Temp.mp3

ffmpeg version 4.4.2-0ubuntu0.22.04.1 Copyright (c) 2000-2021 the FFmpeg developers
  built with gcc 11 (Ubuntu 11.2.0-19ubuntu1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.22.04.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libsrt --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --enable-libvpx --enab

Gradio Interface



In [40]:
import gradio as gr
import base64
import os

# A function to handle audio and image inputs
def process_inputs(audio_path, image_path):
    # Process the audio file (assuming this is handled by a function called 'transcribe')
    speech_to_text_output = transcribe(audio_path)

    # Handle the image input
    if image_path:
        chatgpt_output = img2txt(speech_to_text_output, image_path)
    else:
        chatgpt_output = "No image provided."

    # Assuming 'transcribe' also returns the path to a processed audio file
    processed_audio_path = text_to_speech(chatgpt_output, "Temp3.mp3")  # Replace with actual path if different

    return speech_to_text_output, chatgpt_output, processed_audio_path

# Create the interface
iface = gr.Interface(
    fn=process_inputs,
    inputs=[
        gr.Audio(sources=["microphone"], type="filepath"),
        gr.Image(type="filepath")
    ],
    outputs=[
        gr.Textbox(label="Speech to Text"),
        gr.Textbox(label="AI Output"),
        gr.Audio("Temp.mp3")
    ],
    title="LLM powered Voice Assistant for Multimodal Data",
    description="Upload an image and interact via voice input and audio response."
)

# Launch the interface
iface.launch(debug=True)

Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://ec5774b98c666d25c1.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://ec5774b98c666d25c1.gradio.live


