In [1]:
# Install necessary module and dependencies
%pip install -q "git+https://github.com/openai/whisper.git" "gradio" "openai" "gTTS" "torch"

Note: you may need to restart the kernel to use updated packages.


In [2]:
import whisper
import torch
import gradio as gr
import openai
from gtts import gTTS
from voicevox_client import run
from constants import KEYS

  from .autonotebook import tqdm as notebook_tqdm
Matplotlib created a temporary config/cache directory at /tmp/matplotlib-rwxrdzqf because the default path (/home/grammonde/.cache/matplotlib) is not a writable directory; it is highly recommended to set the MPLCONFIGDIR environment variable to a writable directory, in particular to speed up the import of Matplotlib and to better support multiprocessing.


In [3]:
# Generate an mp3 file
!ffmpeg -f lavfi -i anullsrc=r=44100:cl=mono -t 10 -q:a 9 -acodec libmp3lame audio_response.mp3

ffmpeg version 4.4.2-0ubuntu0.22.04.1 Copyright (c) 2000-2021 the FFmpeg developers
  built with gcc 11 (Ubuntu 11.2.0-19ubuntu1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.22.04.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libsrt --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --enable-libvpx --enab

In [4]:
# Set up device agnostic code
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [5]:
# Load openai api key
openai.api_key = KEYS['OPEN_AI']

In [6]:
# Load base model of the whisper

model = whisper.load_model('base')
model.device

device(type='cuda', index=0)

In [7]:
# Define a function to pass the input to chatGPT api
def chat_completion_gpt(text: str):
  messages = [{'role': 'user', 'content': text }] if text else [{ 'role': 'system', 'content': 'Goodbye have a nice day' }]

  response = openai.ChatCompletion.create(
    model = 'gpt-3.5-turbo',
    messages = messages,
  )

  return response.choices[0].message.content

In [8]:
def text_voice(input_text, language = 'en'):
  generated_audio = gTTS(
    text = input_text,
    lang = language,
    slow = False,
  )

  generated_audio.save('audio_response.mp3')

  return 'audio_response.mp3'

In [9]:
# Define a function to transcribe the 
def transcribe_audio(audio):

  # Load the audio and trim it to fit 30s
  audio = whisper.load_audio(audio)
  audio = whisper.pad_or_trim(audio)

  # make log-Mel spectrogram and move to the same device as the model
  mel = whisper.log_mel_spectrogram(audio).to(model.device)

  # Detect the language
  _, probs = model.detect_language(mel)
  print(f'Detected Language: {max(probs, key = probs.get)}')
  language = max(probs, key = probs.get)

  # Decode the the audio
  options = whisper.DecodingOptions()
  decode_obj = whisper.decode(model, mel, options)
  text = decode_obj.text
  
  # Pass the result to the chatGPT
  response = chat_completion_gpt(text)

  # Call the text_to_voice function
  result = run(input_text = response) if language == 'ja' else text_voice(input_text = response, language = language)

  return [text, response, result]

In [10]:
# Run with gradio interface

speech_text = gr.Textbox(label = 'Speech to text')
response_text = gr.Textbox(label = 'Response in text')
text_speech = gr.Audio('audio_response.mp3')

gr.Interface(
  title = 'Voice assistant with ChatGPT Whisper and Google Voice Generation',
  fn = transcribe_audio,
  inputs = [gr.inputs.Audio(source = 'microphone', type = 'filepath')],
  outputs = [
    speech_text,
    response_text,
    text_speech,
  ],
  live = True,
).launch(share = True)

  super().__init__(source=source, type=type, label=label, optional=optional)


Running on local URL:  http://127.0.0.1:7860
Running on public URL: https://61fd85ba69731c0c48.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades (NEW!), check out Spaces: https://huggingface.co/spaces






Detected Language: en
