In [1]:
import os
import re
from dotenv import load_dotenv
from elevenlabs import ElevenLabs, save
from pydub import AudioSegment

In [2]:
# Pull settings from a .env file (if one exists) into the environment, then
# read the ElevenLabs key from it.
load_dotenv()
api_key = os.getenv('ELEVEN_API_KEY')
if not api_key:
    # Fail here with a clear message instead of on the first API request.
    raise RuntimeError(
        "ELEVEN_API_KEY is not set; add it to your environment or .env file."
    )

# Shared client used by synthesize_speech().
client = ElevenLabs(api_key=api_key)

# Conversation script, one entry per spoken line, formatted as
# "<speaker>: <text>". "Maestro" entries are English narration that
# summarises the preceding Spanish exchange.
dialogue = [
    "Juan Pablo: Hola, ¿puedes darme alguna recomendación sobre lo que ordenar?",
    "Ana: ¡Claro, qué deseas probar hoy? Tenemos un burrito con carne de res, un arepa con queso y una torta de guayaba.",
    "Maestro: Juan Pablo is asking for a recommendation on what to order. Ana is offering three options: a beef burrito, a cheese arepa, and a guava tart.",
    "Juan Pablo: Eso sí, ¿cómo es el burrito con carne?",
    "Ana: Es uno de nuestros platos más populares. Tiene carne de res, arroz, frijoles negros y salsa verde.",
    "Maestro: Juan Pablo wants to know about the beef burrito. Ana explains that it's one of their most popular dishes, containing beef, rice, black beans, and green sauce.",
    "Juan Pablo: Perfecto, lo tomaré. ¿Y si me lo traes con un refresco de piña?",
    "Ana: Claro, nosotros también tenemos un refresco de maracuyá. ¿Te gustaría probarlo?",
    "Juan Pablo: ¡Claro! Lo pediré, entonces.",
    "Ana: Muy bien. ¡Gracias por elegir nuestro café!",
    "Maestro: Juan Pablo orders the beef burrito with a pineapple drink and also asks for a taste of the maracuyá drink. Ana confirms the order and thanks him for choosing their café.",
    "Ana: Aquí está tu pedido, señor Juan Pablo. Disfruta!",
    "Juan Pablo: Muchas gracias, Ana. ¡Espero volver pronto!",
    "Ana: Es un placer tenerte aquí, ¡vuelve siempre!",
    "Maestro: Juan Pablo now starts eating and enjoying his order. Once he finishes, he gets up and leaves the place.",
]

In [3]:
# Speaker label (the part of a dialogue line before ": ") -> ElevenLabs
# voice id passed to the API as ``voice_id``.
speaker_voices = dict(
    [
        ("Juan Pablo", "Ux2YbCNfurnKHnzlBHGX"),
        ("Ana", "86V9x9hrQds83qf7zaGn"),
        ("Maestro", "JBFqnCBsd6RMkjVDRZzb"),
    ]
)

def synthesize_speech(text, speaker_id, output_path, model_id='eleven_flash_v2_5'):
    """Synthesize ``text`` through the ElevenLabs client and write it to disk.

    Args:
        text: The text to be spoken.
        speaker_id: Voice identifier, forwarded to the API as ``voice_id``.
        output_path: Destination file for the generated audio.
        model_id: Model identifier forwarded to the API.

    The audio is requested in the ``mp3_44100_128`` output format.
    """
    request = dict(
        text=text,
        voice_id=speaker_id,
        model_id=model_id,
        output_format="mp3_44100_128",
    )
    speech = client.text_to_speech.convert(**request)
    save(speech, output_path)


def convert_mp3_to_wav(mp3_path, wav_path):
    """Convert an MP3 file to WAV using pydub.

    Args:
        mp3_path: Path of the MP3 file to read.
        wav_path: Path of the WAV file to write.
    """
    audio = AudioSegment.from_mp3(mp3_path)
    # export() hands back the open output file object; close it explicitly
    # instead of leaving it for the garbage collector.
    audio.export(wav_path, format="wav").close()

def combine_audio_with_pauses(input_dir, output_file, pause_duration_ms=800):
    """Combine the numbered WAV files of a directory into a single WAV file.

    Files in ``input_dir`` that end in ``.wav`` and whose name contains
    ``line_<number>`` are joined in ascending numeric order, with silence
    inserted between consecutive files. Other ``.wav`` files are reported
    and skipped.

    Args:
        input_dir: Directory to scan for WAV files.
        output_file: Path of the combined WAV file to write.
        pause_duration_ms: Length in milliseconds of the silence placed
            between consecutive files. No silence follows the last file.
    """
    # Collect (line number, path) pairs for every WAV matching the pattern
    audio_files = []
    for f in os.listdir(input_dir):
        if not f.endswith(".wav"):
            continue
        match = re.search(r"line_(\d+)", f)
        if match:
            audio_files.append((int(match.group(1)), os.path.join(input_dir, f)))
        else:
            print(f"Skipping unexpected file: {f}")

    # Sort numerically so line_10 comes after line_9, not after line_1
    audio_files.sort()

    # Create a silent segment for pauses
    pause = AudioSegment.silent(duration=pause_duration_ms)

    # Initialize an empty audio segment
    combined_audio = AudioSegment.empty()

    # Place the pause *between* segments rather than appending one after every
    # segment and slicing the last one off: ``[:-pause_duration_ms]`` turns
    # into ``[:0]`` for a 0 ms pause and would throw away all of the audio.
    for index, (_, audio_file) in enumerate(audio_files):
        print(f"Adding {audio_file} to the combined audio...")
        if index > 0:
            combined_audio += pause
        combined_audio += AudioSegment.from_wav(audio_file)

    # Export the combined audio; export() returns the open output file
    # object, so close it once writing is done.
    combined_audio.export(output_file, format="wav").close()
    print(f"Combined audio saved to {output_file}")

In [4]:
output_dir = "./data/elevenlabs/"
os.makedirs(output_dir, exist_ok=True)

# Generate audio files line by line and convert them to WAV.
# Files are numbered from 1 in dialogue order; combine_audio_with_pauses()
# uses that number to put them back in sequence.
for line_number, line in enumerate(dialogue, start=1):
    speaker, text = line.split(": ", 1)

    speaker_id = speaker_voices.get(speaker)
    if not speaker_id:
        # Report the gap instead of silently dropping the line.
        print(f"No voice configured for speaker {speaker!r}; skipping line {line_number}")
        continue

    mp3_path = os.path.join(output_dir, f"line_{line_number}.mp3")
    wav_path = os.path.join(output_dir, f"line_{line_number}.wav")

    # Generate the audio in MP3 format, then convert it for combining
    synthesize_speech(text, speaker_id=speaker_id, output_path=mp3_path)
    convert_mp3_to_wav(mp3_path, wav_path)

# The combined file is written next to the per-line files; its name has no
# "line_<n>" part, so a later run skips it when collecting inputs.
combine_audio_with_pauses(
    input_dir=output_dir,
    output_file=os.path.join(output_dir, "final_conversation.wav"),
)

Skipping unexpected file: final_conversation.wav
Adding ./data/elevenlabs/line_1.wav to the combined audio...
Adding ./data/elevenlabs/line_2.wav to the combined audio...
Adding ./data/elevenlabs/line_3.wav to the combined audio...
Adding ./data/elevenlabs/line_4.wav to the combined audio...
Adding ./data/elevenlabs/line_5.wav to the combined audio...
Adding ./data/elevenlabs/line_6.wav to the combined audio...
Adding ./data/elevenlabs/line_7.wav to the combined audio...
Adding ./data/elevenlabs/line_8.wav to the combined audio...
Adding ./data/elevenlabs/line_9.wav to the combined audio...
Adding ./data/elevenlabs/line_10.wav to the combined audio...
Adding ./data/elevenlabs/line_11.wav to the combined audio...
Adding ./data/elevenlabs/line_12.wav to the combined audio...
Adding ./data/elevenlabs/line_13.wav to the combined audio...
Adding ./data/elevenlabs/line_14.wav to the combined audio...
Adding ./data/elevenlabs/line_15.wav to the combined audio...
Combined audio saved to ./data