In [1]:
# Check if Fish Speech is available, if not, clone and setup
import os
import sys
import subprocess
from pathlib import Path

# Check if we have the model files
model_path = Path("../openaudio-s1-mini")
if model_path.exists():
    print(f"‚úÖ Model found at: {model_path.resolve()}")
    print(f"üìÅ Model files: {list(model_path.glob('*.pth'))}")
else:
    print("‚ùå Model not found. Please download it first.")

# Check for Fish Speech repository
fish_speech_path = Path("../fish-speech")
if not fish_speech_path.exists():
    print("üì• Cloning Fish Speech repository...")
    subprocess.run([
        "git", "clone", "https://github.com/fishaudio/fish-speech.git", 
        str(fish_speech_path)
    ], check=True)
    print("‚úÖ Fish Speech cloned successfully")
else:
    print(f"‚úÖ Fish Speech found at: {fish_speech_path.resolve()}")

# Add fish-speech to Python path if not already there
if str(fish_speech_path.resolve()) not in sys.path:
    sys.path.insert(0, str(fish_speech_path.resolve()))
    print("üìå Added Fish Speech to Python path")

print("\nüîß Setup complete!")


‚úÖ Model found at: /home/alec/git/talker/tts_api/openaudio-s1-mini
üìÅ Model files: [PosixPath('../openaudio-s1-mini/codec.pth'), PosixPath('../openaudio-s1-mini/model.pth')]
‚úÖ Fish Speech found at: /home/alec/git/talker/tts_api/fish-speech
üìå Added Fish Speech to Python path

üîß Setup complete!


In [2]:
# Start the Fish Speech API Server in the background
import subprocess
import time
import requests
import threading
from pathlib import Path

def start_api_server():
    """Start the Fish Speech API server"""
    fish_speech_path = Path("../fish-speech")
    model_path = Path("../openaudio-s1-mini")
    
    # Change to fish-speech directory
    os.chdir(fish_speech_path)
    
    # Start the API server
    cmd = [
        "python", "-m", "tools.api_server",
        "--listen", "127.0.0.1:8080",
        "--llama-checkpoint-path", str(model_path.resolve()),
        "--decoder-checkpoint-path", str(model_path.resolve() / "codec.pth"),
        "--decoder-config-name", "modded_dac_vq"
    ]
    
    print("üöÄ Starting Fish Speech API Server...")
    print(f"üìã Command: {' '.join(cmd)}")
    
    # Start server in background
    process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
    
    # Wait for server to start
    print("‚è≥ Waiting for server to start...")
    for i in range(30):  # Wait up to 30 seconds
        try:
            response = requests.get("http://127.0.0.1:8080/", timeout=2)
            if response.status_code == 200:
                print("‚úÖ API Server is running!")
                print("üåê Access the WebUI at: http://127.0.0.1:8080/")
                return process
        except:
            time.sleep(1)
            print(f"   Waiting... ({i+1}/30)")
    
    print("‚ùå Server failed to start within 30 seconds")
    return None

# Change back to original directory after
original_dir = os.getcwd()

# Uncomment the line below to start the server
server_process = start_api_server()

print("üí° Uncomment the line above to start the API server")
print("üí° Or run it manually in terminal with the command shown above")


üöÄ Starting Fish Speech API Server...
üìã Command: python -m tools.api_server --listen 127.0.0.1:8080 --llama-checkpoint-path /home/alec/git/talker/tts_api/openaudio-s1-mini --decoder-checkpoint-path /home/alec/git/talker/tts_api/openaudio-s1-mini/codec.pth --decoder-config-name modded_dac_vq
‚è≥ Waiting for server to start...
‚úÖ API Server is running!
üåê Access the WebUI at: http://127.0.0.1:8080/
üí° Uncomment the line above to start the API server
üí° Or run it manually in terminal with the command shown above


In [3]:
# Make TTS API calls to the running server
import requests
import json
import base64
from IPython.display import Audio, display

def generate_speech(text, save_path="output.wav", emotion=None):
    """
    Generate speech using the Fish Speech API
    
    Args:
        text (str): Text to convert to speech
        save_path (str): Path to save the audio file
        emotion (str): Optional emotion marker like "(excited)" or "(whispering)"
    
    Returns:
        Audio: IPython Audio object for playback
    """
    url = "http://127.0.0.1:8080/api/tts"
    
    # Add emotion markers if specified
    if emotion:
        text = f"{emotion} {text}"
    
    # API request payload
    payload = {
        "text": text,
        "format": "wav",
        "reference_id": None,  # Let model choose voice randomly
        "reference_audio": None,
        "reference_text": None,
        "max_new_tokens": 1024,
        "chunk_length": 200,
        "top_p": 0.7,
        "repetition_penalty": 1.2,
        "temperature": 0.7,
        "streaming": False
    }
    
    print(f"üé§ Generating speech for: '{text}'")
    
    try:
        response = requests.post(url, json=payload, timeout=60)
        response.raise_for_status()
        
        # Save the audio file
        with open(save_path, "wb") as f:
            f.write(response.content)
        
        print(f"‚úÖ Audio saved to: {save_path}")
        
        # Return Audio object for Jupyter playback
        return Audio(save_path)
        
    except requests.exceptions.RequestException as e:
        print(f"‚ùå Error generating speech: {e}")
        return None

# Example usage (uncomment when server is running)
audio = generate_speech("Hello world! This is OpenAudio S1 Mini speaking.")
display(audio)

print("üí° Uncomment the lines above to generate speech once the server is running")
print("üí° Make sure the API server is started first!")


üé§ Generating speech for: 'Hello world! This is OpenAudio S1 Mini speaking.'
‚ùå Error generating speech: 500 Server Error: Internal Server Error for url: http://127.0.0.1:8080/api/tts


None

üí° Uncomment the lines above to generate speech once the server is running
üí° Make sure the API server is started first!


In [14]:
# Demo different emotions and tones
emotions_examples = [
    ("(excited) Hello everyone! I'm so thrilled to be here today!", "excited_speech.wav"),
    ("(whispering) This is a secret message just between us.", "whisper_speech.wav"),
    ("(angry) I can't believe this happened again!", "angry_speech.wav"),
    ("(sad) I'm feeling quite melancholy today.", "sad_speech.wav"),
    ("(laughing) Ha ha ha, that's absolutely hilarious!", "laughing_speech.wav"),
    ("(shouting) Can everyone hear me in the back?", "shouting_speech.wav"),
]

# Multilingual examples
multilingual_examples = [
    ("Hello, how are you today?", "english.wav"),
    ("Bonjour, comment allez-vous?", "french.wav"),
    ("Hola, ¬øc√≥mo est√°s hoy?", "spanish.wav"),
    ("Guten Tag, wie geht es Ihnen?", "german.wav"),
    ("„Åì„Çì„Å´„Å°„ÅØ„ÄÅ‰ªäÊó•„ÅØ„ÅÑ„Åã„Åå„Åß„Åô„ÅãÔºü", "japanese.wav"),
    ("‰Ω†Â•ΩÔºå‰Ω†‰ªäÂ§©ÊÄé‰πàÊ†∑Ôºü", "chinese.wav"),
]

def demo_emotions():
    """Generate speech samples for different emotions"""
    print("üé≠ Generating emotion examples...")
    
    for text, filename in emotions_examples:
        print(f"\nüéØ {filename}")
        # audio = generate_speech(text, filename)
        # if audio: display(audio)
        print(f"   Text: {text}")

def demo_multilingual():
    """Generate speech samples for different languages"""
    print("üåç Generating multilingual examples...")
    
    for text, filename in multilingual_examples:
        print(f"\nüó£Ô∏è {filename}")
        # audio = generate_speech(text, filename)
        # if audio: display(audio)
        print(f"   Text: {text}")

# Run demos (uncomment when server is running)
demo_emotions()
demo_multilingual()

print("üí° Uncomment the function calls above to run the demos")
print("üí° Make sure to start the API server first!")


üé≠ Generating emotion examples...

üéØ excited_speech.wav
   Text: (excited) Hello everyone! I'm so thrilled to be here today!

üéØ whisper_speech.wav
   Text: (whispering) This is a secret message just between us.

üéØ angry_speech.wav
   Text: (angry) I can't believe this happened again!

üéØ sad_speech.wav
   Text: (sad) I'm feeling quite melancholy today.

üéØ laughing_speech.wav
   Text: (laughing) Ha ha ha, that's absolutely hilarious!

üéØ shouting_speech.wav
   Text: (shouting) Can everyone hear me in the back?
üåç Generating multilingual examples...

üó£Ô∏è english.wav
   Text: Hello, how are you today?

üó£Ô∏è french.wav
   Text: Bonjour, comment allez-vous?

üó£Ô∏è spanish.wav
   Text: Hola, ¬øc√≥mo est√°s hoy?

üó£Ô∏è german.wav
   Text: Guten Tag, wie geht es Ihnen?

üó£Ô∏è japanese.wav
   Text: „Åì„Çì„Å´„Å°„ÅØ„ÄÅ‰ªäÊó•„ÅØ„ÅÑ„Åã„Åå„Åß„Åô„ÅãÔºü

üó£Ô∏è chinese.wav
   Text: ‰Ω†Â•ΩÔºå‰Ω†‰ªäÂ§©ÊÄé‰πàÊ†∑Ôºü
üí° Uncomment the function calls above to run the demos

In [15]:
# Voice cloning with reference audio using the API
import base64
from pathlib import Path

def generate_speech_with_reference(text, reference_audio_path, reference_text, save_path="cloned_output.wav"):
    """
    Generate speech using reference audio for voice cloning via API
    
    Args:
        text (str): Text to convert to speech
        reference_audio_path (str): Path to reference audio file
        reference_text (str): Text that corresponds to the reference audio
        save_path (str): Path to save the generated audio
    
    Returns:
        Audio: IPython Audio object for playback
    """
    url = "http://127.0.0.1:8080/api/tts"
    
    # Read and encode reference audio
    try:
        with open(reference_audio_path, "rb") as f:
            reference_audio_data = base64.b64encode(f.read()).decode('utf-8')
    except FileNotFoundError:
        print(f"‚ùå Reference audio file not found: {reference_audio_path}")
        return None
    
    # API request payload with reference audio
    payload = {
        "text": text,
        "format": "wav",
        "reference_id": None,
        "reference_audio": reference_audio_data,
        "reference_text": reference_text,
        "max_new_tokens": 1024,
        "chunk_length": 200,
        "top_p": 0.7,
        "repetition_penalty": 1.2,
        "temperature": 0.7,
        "streaming": False
    }
    
    print(f"üé§ Generating speech with voice cloning...")
    print(f"üìù Text: '{text}'")
    print(f"üéµ Reference: {reference_audio_path}")
    
    try:
        response = requests.post(url, json=payload, timeout=120)  # Longer timeout for voice cloning
        response.raise_for_status()
        
        # Save the audio file
        with open(save_path, "wb") as f:
            f.write(response.content)
        
        print(f"‚úÖ Cloned audio saved to: {save_path}")
        
        # Return Audio object for Jupyter playback
        from IPython.display import Audio
        return Audio(save_path)
        
    except requests.exceptions.RequestException as e:
        print(f"‚ùå Error generating cloned speech: {e}")
        return None

def list_available_voices():
    """List available reference voices in the data/voices directory"""
    voices_dir = Path("../data/voices")
    if not voices_dir.exists():
        print("‚ùå No voices directory found")
        return []
    
    voices = []
    for voice_dir in voices_dir.iterdir():
        if voice_dir.is_dir():
            audio_files = list(voice_dir.glob("*.mp3")) + list(voice_dir.glob("*.wav"))
            if audio_files:
                voices.append({
                    "name": voice_dir.name,
                    "path": str(voice_dir),
                    "audio_files": [str(f) for f in audio_files]
                })
    
    print("üé≠ Available voice references:")
    for voice in voices:
        print(f"  üìÅ {voice['name']}: {len(voice['audio_files'])} files")
        for audio_file in voice['audio_files'][:2]:  # Show first 2 files
            print(f"    üéµ {Path(audio_file).name}")
        if len(voice['audio_files']) > 2:
            print(f"    ... and {len(voice['audio_files']) - 2} more")
    
    return voices

# List available voices
available_voices = list_available_voices()

# Example voice cloning (uncomment when server is running and you have reference audio)
# if available_voices:
#     first_voice = available_voices[0]
#     reference_path = first_voice['audio_files'][0]
#     cloned_audio = generate_speech_with_reference(
#         text="This is a test of voice cloning using the reference audio.",
#         reference_audio_path=reference_path,
#         reference_text="This should be the text that matches the reference audio.",
#         save_path="voice_cloned_output.wav"
#     )
#     if cloned_audio:
#         display(cloned_audio)

print("üí° Uncomment the example above to test voice cloning")
print("üí° Make sure you have reference audio files in ../data/voices/")


üé≠ Available voice references:
  üìÅ dsp: 10 files
    üéµ dsp_05.wav
    üéµ dsp_02.wav
    ... and 8 more
  üìÅ batman: 4 files
    üéµ batman_02.wav
    üéµ batman_01.wav
    ... and 2 more
  üìÅ demo_1: 1 files
    üéµ demo_speaker0.mp3
  üìÅ biden: 5 files
    üéµ biden_04.wav
    üéµ biden_02.wav
    ... and 3 more
  üìÅ trump_cp: 11 files
    üéµ 2.wav
    üéµ 5.wav
    ... and 9 more
  üìÅ major: 21 files
    üéµ major_12.wav
    üéµ major_14.wav
    ... and 19 more
  üìÅ demo_3: 1 files
    üéµ demo_speaker2.mp3
  üìÅ demo_2: 1 files
    üéµ demo_speaker1.mp3
  üìÅ trump: 29 files
    üéµ trump_29.wav
    üéµ trump_19.wav
    ... and 27 more
  üìÅ loli: 6 files
    üéµ loli_02.wav
    üéµ loli_02_02.wav
    ... and 4 more
üí° Uncomment the example above to test voice cloning
üí° Make sure you have reference audio files in ../data/voices/


In [16]:
# Integration examples for using OpenAudio in applications
import asyncio
import time

class OpenAudioTTSService:
    """A class to wrap OpenAudio TTS functionality for easy integration"""
    
    def __init__(self, api_url="http://127.0.0.1:8080"):
        self.api_url = api_url
        self.tts_endpoint = f"{api_url}/api/tts"
    
    def is_server_running(self):
        """Check if the OpenAudio server is running"""
        try:
            response = requests.get(self.api_url, timeout=5)
            return response.status_code == 200
        except:
            return False
    
    def generate_speech(self, text, emotion=None, language=None, save_path=None, **kwargs):
        """
        Generate speech with optional emotion and language hints
        
        Args:
            text (str): Text to convert to speech
            emotion (str): Optional emotion marker like "(excited)" or "(whispering)"
            language (str): Language hint (though OpenAudio auto-detects)
            save_path (str): Optional path to save audio
            **kwargs: Additional API parameters
        
        Returns:
            bytes: Audio data or None if failed
        """
        if not self.is_server_running():
            print("‚ùå OpenAudio server is not running")
            return None
        
        # Add emotion markers if specified
        if emotion:
            text = f"{emotion} {text}"
        
        # Default payload
        payload = {
            "text": text,
            "format": kwargs.get("format", "wav"),
            "reference_id": kwargs.get("reference_id", None),
            "reference_audio": kwargs.get("reference_audio", None),
            "reference_text": kwargs.get("reference_text", None),
            "max_new_tokens": kwargs.get("max_new_tokens", 1024),
            "chunk_length": kwargs.get("chunk_length", 200),
            "top_p": kwargs.get("top_p", 0.7),
            "repetition_penalty": kwargs.get("repetition_penalty", 1.2),
            "temperature": kwargs.get("temperature", 0.7),
            "streaming": kwargs.get("streaming", False)
        }
        
        try:
            response = requests.post(self.tts_endpoint, json=payload, timeout=60)
            response.raise_for_status()
            
            audio_data = response.content
            
            # Save if path provided
            if save_path:
                with open(save_path, "wb") as f:
                    f.write(audio_data)
                print(f"‚úÖ Audio saved to: {save_path}")
            
            return audio_data
            
        except requests.exceptions.RequestException as e:
            print(f"‚ùå TTS generation failed: {e}")
            return None
    
    def batch_generate(self, texts, emotion=None, output_dir="batch_output"):
        """Generate multiple TTS files in batch"""
        Path(output_dir).mkdir(exist_ok=True)
        results = []
        
        for i, text in enumerate(texts):
            print(f"üé§ Generating {i+1}/{len(texts)}: {text[:50]}...")
            output_path = Path(output_dir) / f"speech_{i:03d}.wav"
            
            audio_data = self.generate_speech(
                text=text,
                emotion=emotion,
                save_path=str(output_path)
            )
            
            results.append({
                "text": text,
                "path": str(output_path) if audio_data else None,
                "success": audio_data is not None
            })
            
            # Small delay to not overwhelm the server
            time.sleep(0.5)
        
        return results

# Example usage
tts_service = OpenAudioTTSService()

# Test if server is running
if tts_service.is_server_running():
    print("‚úÖ OpenAudio server is running and ready!")
    
    # Simple generation example
    # audio_data = tts_service.generate_speech(
    #     text="Hello, this is a test of the TTS service wrapper.",
    #     emotion="(confident)",
    #     save_path="service_test.wav"
    # )
    
    # Batch generation example
    # sample_texts = [
    #     "Welcome to our application!",
    #     "Please select an option from the menu.",
    #     "Thank you for using our service.",
    #     "Have a great day!"
    # ]
    # results = tts_service.batch_generate(sample_texts, emotion="(friendly)")
    # print(f"‚úÖ Generated {sum(1 for r in results if r['success'])}/{len(results)} files")
    
else:
    print("‚ùå OpenAudio server is not running. Please start it first.")

print("üí° Uncomment the examples above to test the TTS service wrapper")


‚úÖ OpenAudio server is running and ready!
üí° Uncomment the examples above to test the TTS service wrapper


In [17]:
# Clean up - stop the server if needed
def cleanup_server():
    """Stop the OpenAudio server process"""
    try:
        if 'server_process' in globals() and server_process:
            print("üõë Stopping OpenAudio server...")
            server_process.terminate()
            server_process.wait(timeout=10)
            print("‚úÖ Server stopped successfully")
    except Exception as e:
        print(f"‚ö†Ô∏è Error stopping server: {e}")

# Summary and next steps
print("üéâ OpenAudio S1 Mini Notebook Complete!")
print("\nüìã Summary of what we covered:")
print("  ‚úÖ Model download and setup")
print("  ‚úÖ API server startup")
print("  ‚úÖ Basic text-to-speech generation")
print("  ‚úÖ Emotion and tone control")
print("  ‚úÖ Multilingual support")
print("  ‚úÖ Voice cloning with reference audio")
print("  ‚úÖ Service wrapper for easy integration")

print("\nüöÄ Next steps:")
print("  1. Integrate the OpenAudioTTSService class into your applications")
print("  2. Experiment with different emotions and tones")
print("  3. Try voice cloning with your own reference audio")
print("  4. Optimize parameters (temperature, top_p) for your use case")
print("  5. Consider using streaming for real-time applications")

print("\nüîó Useful resources:")
print("  üìñ Fish Audio Documentation: https://speech.fish.audio/")
print("  ü§ó Model on Hugging Face: https://huggingface.co/fishaudio/openaudio-s1-mini")
print("  üí¨ Discord Community: https://discord.gg/fishaudio")

# Uncomment to stop the server when done
# cleanup_server()


üéâ OpenAudio S1 Mini Notebook Complete!

üìã Summary of what we covered:
  ‚úÖ Model download and setup
  ‚úÖ API server startup
  ‚úÖ Basic text-to-speech generation
  ‚úÖ Emotion and tone control
  ‚úÖ Multilingual support
  ‚úÖ Voice cloning with reference audio
  ‚úÖ Service wrapper for easy integration

üöÄ Next steps:
  1. Integrate the OpenAudioTTSService class into your applications
  2. Experiment with different emotions and tones
  3. Try voice cloning with your own reference audio
  4. Optimize parameters (temperature, top_p) for your use case
  5. Consider using streaming for real-time applications

üîó Useful resources:
  üìñ Fish Audio Documentation: https://speech.fish.audio/
  ü§ó Model on Hugging Face: https://huggingface.co/fishaudio/openaudio-s1-mini
  üí¨ Discord Community: https://discord.gg/fishaudio
