In [None]:
import gradio as gr
import numpy as np
import io
import soundfile as sf
import pydub
import random
from openai import OpenAI
from dotenv import load_dotenv
# Load settings from a local .env file before the API client is created.
# NOTE(review): presumably this is where OPENAI_API_KEY comes from - confirm.
load_dotenv()


# Shared OpenAI client used by generate_audio_binary(); constructed with no
# explicit arguments, so any credentials must come from the environment.
client = OpenAI()

def generate_audio(sample_rate=44100, duration=2, frequency=440, amplitude=0.5):
    """Synthesize a pure sine tone in the format a Gradio ``Audio`` output accepts.

    Args:
        sample_rate: Samples per second. Defaults to 44100 (44.1 kHz).
        duration: Length of the tone in seconds. Defaults to 2.
        frequency: Tone frequency in Hz. Defaults to 440 (the note A4).
        amplitude: Peak amplitude of the sine wave. Defaults to 0.5.

    Returns:
        A ``(sample_rate, samples)`` tuple where ``samples`` is a 1-D
        ``float32`` NumPy array holding ``int(sample_rate * duration)`` samples.
    """
    num_samples = int(sample_rate * duration)
    # endpoint=False keeps the spacing between samples at exactly 1 / sample_rate.
    t = np.linspace(0, duration, num_samples, endpoint=False)
    samples = amplitude * np.sin(2 * np.pi * frequency * t)

    # The samples are returned as-is: encoding them to WAV/MP3 and decoding
    # them again would only add 16-bit quantization noise and an ffmpeg call,
    # because the caller consumes raw samples, not an encoded file.
    return sample_rate, samples.astype(np.float32)

def generate_audio_binary(text):
    """Turn *text* into speech with the OpenAI TTS API and decode it for Gradio.

    A voice is picked at random for every call. The MP3 response is streamed
    into memory, decoded with pydub, and converted to normalized float samples.

    Args:
        text: The text to synthesize. Must contain non-whitespace characters.

    Returns:
        A ``(sample_rate, samples)`` tuple. ``sample_rate`` is the frame rate
        of the decoded clip; ``samples`` is a ``float32`` NumPy array scaled by
        the clip's full-scale integer value, 1-D for mono audio and shaped
        ``(frames, channels)`` otherwise.

    Raises:
        ValueError: If *text* is empty or only whitespace.
    """
    # Reject blank input up front instead of paying for an API round trip
    # that cannot produce audio.
    if not text or not text.strip():
        raise ValueError("Text for TTS must not be empty.")

    voices = ["alloy", "echo", "fable", "onyx", "nova", "shimmer"]
    selected_voice = random.choice(voices)

    mp3_buffer = io.BytesIO()
    with client.audio.speech.with_streaming_response.create(
        model="tts-1", voice=selected_voice, input=text, response_format="mp3"
    ) as response:
        for chunk in response.iter_bytes():
            mp3_buffer.write(chunk)
    mp3_buffer.seek(0)

    audio = pydub.AudioSegment.from_file(mp3_buffer, format="mp3")

    # Scale by the clip's own bit depth rather than assuming 16-bit samples.
    full_scale = float(1 << (8 * audio.sample_width - 1))
    audio_array = (
        np.array(audio.get_array_of_samples(), dtype=np.float32) / full_scale
    )

    # pydub interleaves channels in a flat array; split them into columns.
    if audio.channels > 1:
        audio_array = audio_array.reshape(-1, audio.channels)

    # Report the decoded clip's real frame rate; a hard-coded rate that does
    # not match the stream makes playback run at the wrong speed and pitch.
    return audio.frame_rate, audio_array

# Build the demo page: a heading, a text box, one shared audio player and two
# buttons that each fill the player from a different generator.
with gr.Blocks() as demo:
    gr.Markdown(value="## Simple Audio Generator and Player")
    text_input = gr.Textbox(label="Enter Text for TTS")
    audio_output = gr.Audio()

    # Takes no inputs; its result goes to the shared player.
    generate_button = gr.Button(value="Generate and Play Audio")
    generate_button.click(
        fn=generate_audio,
        inputs=None,
        outputs=audio_output,
    )

    # Sends the text box contents to the TTS generator.
    tts_button = gr.Button(value="Generate TTS Audio")
    tts_button.click(
        fn=generate_audio_binary,
        inputs=text_input,
        outputs=audio_output,
    )

demo.launch()
