In [None]:
from dotenv import load_dotenv
# load_dotenv() reads variables from a .env file (expected at the root of your
# project) into the process environment, so later cells can read them with
# os.getenv() instead of hardcoding secrets in the notebook.
load_dotenv()

In [None]:
# Standard library
import os
import subprocess

# Third-party
import gradio as gr
import openai
import whisper
from gtts import gTTS
from IPython.display import Audio

# The API key comes from the environment; os.getenv returns None when the
# variable is not set.
openai.api_key = os.getenv('OPENAI_API_KEY')

In [None]:
# This implementation runs Whisper locally, on this machine.
# If you have trouble running it on-device, consider switching to the OpenAI API for Whisper speech-to-text.
model = whisper.load_model("small")


def speech_to_text(audio):
    """Transcribe an audio file to text with the locally loaded Whisper model.

    Args:
        audio: Path to the audio file to transcribe; it is passed straight to
            ``whisper.load_audio``.

    Returns:
        str: The decoded text.
    """
    # pad_or_trim fits the waveform to Whisper's fixed-length input window
    # (30 seconds per the Whisper docs), so longer recordings are truncated.
    waveform = whisper.pad_or_trim(whisper.load_audio(audio))
    mel = whisper.log_mel_spectrogram(waveform).to(model.device)

    # No explicit model.detect_language(mel) call here: its result was never
    # used, and whisper.decode detects the language on its own when
    # DecodingOptions.language is left unset.
    # fp16=False requests float32 decoding (Whisper does not support half
    # precision on CPU).
    options = whisper.DecodingOptions(fp16=False)
    return whisper.decode(model, mel, options).text

In [None]:
# Persona instructions for the chat model. The literal is sent verbatim
# (leading newline and indentation included) as the system message.
system_prompt = """
                I want you to act as a comedian.
                Your comedy is inspired by Chris Rock, Dave Chappelle, Jerry Seinfeld, and Louis C.K.
                Your joke structure is more like Dr. Seuss.
                You dont tell any jokes over 50 words.
                """

# Shared, module-level conversation history: chat_with_gpt appends to it and
# format_text_output reads it. It always starts with the system message.
messages = [dict(role="system", content=system_prompt)]
def chat_with_gpt(user_input_text):
    """Send the user's text to the chat model and return the assistant's reply.

    The module-level ``messages`` history is sent along with the new user
    turn. The history is only updated after the request has succeeded, so a
    failed call (network error, bad key, rate limit, malformed response) does
    not leave a dangling user message that would skew every later request.

    Args:
        user_input_text: The text of the user's turn.

    Returns:
        The ``content`` of the assistant message in the first choice.

    Raises:
        Whatever ``openai.ChatCompletion.create`` raises; ``messages`` is left
        unchanged in that case.
    """
    user_message = {"role": "user", "content": user_input_text}

    # Build the request from a copy so the shared history stays untouched
    # until we know the call worked.
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=messages + [user_message],
    )
    assistant_message = response["choices"][0]["message"]
    agent_response = assistant_message["content"]

    # Commit both turns together.
    # NOTE: the history is never trimmed, so it grows with every exchange.
    messages.extend([user_message, assistant_message])
    return agent_response

In [None]:
def play_audio(audio_path="agent_response.mp3"):
    """Play an audio file through the external ``play`` command.

    Playback is best-effort: a missing ``play`` executable or a non-zero exit
    status is reported on stdout but never raised, so the caller can still
    return its result.

    Args:
        audio_path: File to play. Defaults to ``"agent_response.mp3"``.

    Returns:
        None
    """
    try:
        # An argument list (no shell) means the path is never interpreted by a
        # shell; the player's stdout is discarded rather than buffered.
        completed = subprocess.run(
            ["play", audio_path],
            stdout=subprocess.DEVNULL,
        )
    except FileNotFoundError:
        print("play_audio: `play` command not found; skipping playback")
        return None

    if completed.returncode != 0:
        print(f"play_audio: `play` exited with status {completed.returncode}")
    return None

In [None]:
# Language code handed to gTTS for speech synthesis.
language = "en"


def text_to_audio(agent_response):
    """Synthesize ``agent_response`` with gTTS and save it as agent_response.mp3.

    Args:
        agent_response: The text to convert to speech.
    """
    speech = gTTS(text=agent_response, lang=language, slow=False)
    speech.save("agent_response.mp3")

In [None]:
def format_text_output(history=None):
    """Render a conversation as a plain-text transcript.

    Each non-system message becomes ``"<role>: <content>"`` followed by a
    blank line; system messages are omitted.

    Args:
        history: Optional list of message mappings with ``"role"`` and
            ``"content"`` keys. When omitted (``None``), the module-level
            ``messages`` history is used.

    Returns:
        str: The transcript, or ``""`` when there is nothing to show.
    """
    if history is None:
        history = messages

    # A single join avoids rebuilding the string on every message.
    return "".join(
        f"{message['role']}: {message['content']}\n\n"
        for message in history
        if message['role'] != 'system'
    )

In [None]:
def inference(audio):
    """Run one voice turn: transcribe, get a reply, speak it, return the transcript.

    Args:
        audio: Path to the recorded audio file, or ``None`` when there is no
            recording.

    Returns:
        str: The conversation transcript produced by ``format_text_output``.
    """
    # The interface is launched with live=True, so this callback can fire
    # without a recording (e.g. when the input is cleared). There is nothing
    # to transcribe then; just show the transcript so far.
    if audio is None:
        return format_text_output()

    user_input = speech_to_text(audio)

    # An empty transcription would only add a blank user turn and spend an
    # API call and a TTS request on it.
    if not user_input.strip():
        return format_text_output()

    agent_response = chat_with_gpt(user_input)
    text_to_audio(agent_response)
    play_audio()

    return format_text_output()

In [None]:
# Microphone input; Gradio passes the recording to `inference` as a file path.
audio_input = gr.Audio(type="filepath", source="microphone")

# live=True runs `inference` on input changes instead of waiting for a submit.
demo = gr.Interface(inference, audio_input, "text", live=True)
demo.launch()