<a href="https://colab.research.google.com/github/ap0plexik/notebooks/blob/trunk/files/text_to_speech.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This is a text-to-audio generator using SpeechT5 with a gradio interface.

It saves generated audio automatically to an `/output` folder. To send audio files to your Google Drive, connect to your Drive in the first code block, then when you are done, run the last code block. Note that this notebook's `/output` directory will be wiped when you restart this runtime.

In [None]:
## If desired, save to your Google Drive.

# All audio is saved to the /output directory by default.
# If, later, you'd like to move or copy your audio files into your Google Drive,
# run this code block.

# Connect to your Google Drive.
from google.colab import drive
drive.mount('/content/drive/')

# Audio Output directory.  Make sure this folser exists before you proceed.
# '/content/drive/MyDrive/tt5'

In [37]:
!pip install -q git+https://github.com/huggingface/transformers.git
!pip install -q torch
!pip install -q torchaudio
!pip install -q soundfile
!pip install -q librosa
!pip install -q samplerate
!pip install -q resampy
!pip install -q sentencepiece
!pip install -q gradio
!git lfs install
!git clone https://huggingface.co/spaces/Matthijs/speecht5-tts-demo
!mkdir output

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Error: Failed to call git rev-parse --git-dir: exit status 128 
Git LFS initialized.
fatal: destination path 'speecht5-tts-demo' already exists and is not an empty directory.
mkdir: cannot create directory ‘output’: File exists


In [47]:
import gradio as gr
import librosa
import numpy as np
import torch
import io
import os
import re
import string
import time
from pydub import AudioSegment
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan

checkpoint = "microsoft/speecht5_tts"
processor = SpeechT5Processor.from_pretrained(checkpoint)
model = SpeechT5ForTextToSpeech.from_pretrained(checkpoint)
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")


speaker_embeddings = {
    "BDL": "speecht5-tts-demo/spkemb/cmu_us_bdl_arctic-wav-arctic_a0009.npy",
    "CLB": "speecht5-tts-demo/spkemb/cmu_us_clb_arctic-wav-arctic_a0144.npy",
    "KSP": "speecht5-tts-demo/spkemb/cmu_us_ksp_arctic-wav-arctic_b0087.npy",
    "RMS": "speecht5-tts-demo/spkemb/cmu_us_rms_arctic-wav-arctic_b0353.npy",
    "SLT": "speecht5-tts-demo/spkemb/cmu_us_slt_arctic-wav-arctic_a0508.npy",
}

def predict(text, speaker):
    if len(text.strip()) == 0:
        return (16000, np.zeros(0).astype(np.int16))

    inputs = processor(text=text, return_tensors="pt")

    # limit input length
    input_ids = inputs["input_ids"]
    input_ids = input_ids[..., :model.config.max_text_positions]

    if speaker == "Surprise Me!":
        # load one of the provided speaker embeddings at random
        idx = np.random.randint(len(speaker_embeddings))
        key = list(speaker_embeddings.keys())[idx]
        speaker_embedding = np.load(speaker_embeddings[key])

        # randomly shuffle the elements
        np.random.shuffle(speaker_embedding)

        # randomly flip half the values
        x = (np.random.rand(512) >= 0.5) * 1.0
        x[x == 0] = -1.0
        speaker_embedding *= x

        #speaker_embedding = np.random.rand(512).astype(np.float32) * 0.3 - 0.15
    else:
        speaker_embedding = np.load(speaker_embeddings[speaker[:3]])

    speaker_embedding = torch.tensor(speaker_embedding).unsqueeze(0)

    speech = model.generate_speech(input_ids, speaker_embedding, vocoder=vocoder)

    speech = (speech.numpy() * 32767).astype(np.int16)
    
    save_as_mp3(speech,text)

    return (16000, speech)

def save_as_mp3(audio,text):
    audio = AudioSegment(audio.tobytes(), frame_rate=16000, sample_width=2, channels=1)
    print('here')
    mp3 = io.BytesIO()
    audio.export(mp3, format='mp3')
    mp3.seek(0)

    filename = text_to_filename(text)
    filepath = os.path.join('/content/output', filename)
    print(filepath)
    with open(filepath, "wb") as f:
        f.write(mp3.read())

    print(filepath)

def text_to_filename(text):
    # Remove punctuation and apostrophes, replace spaces with hyphens
    s = re.sub(r'[^\w\s-]', '', text).strip().replace(' ', '-')
    # Convert to lowercase and remove any remaining hyphens
    s = s.lower().strip('-')
    # Use first 10 chars of text
    s = s[:10] + ".mp3"
    timestamp = str(int(time.time()))
    file_name = f"{timestamp}_{s}"
    return file_name


title = "SpeechT5: Speech Synthesis"

examples = [
    ["It is not in the stars to hold our destiny but in ourselves.", "BDL (male)"],
    ["The octopus and Oliver went to the opera in October.", "CLB (female)"],
    ["She sells seashells by the seashore. I saw a kitten eating chicken in the kitchen.", "RMS (male)"],
    ["Brisk brave brigadiers brandished broad bright blades, blunderbusses, and bludgeons—balancing them badly.", "SLT (female)"],
    ["A synonym for cinnamon is a cinnamon synonym.", "BDL (male)"],
    ["How much wood would a woodchuck chuck if a woodchuck could chuck wood? He would chuck, he would, as much as he could, and chuck as much wood as a woodchuck would if a woodchuck could chuck wood.", "CLB (female)"],
]

description = "This can create up to ~30 seconds of audio, which is about 100 words or about 500 characters. Even then, it can take quite a while to render."


In [None]:
# Start/Stop this block to re-start the interface.
gr.Interface(
    fn=predict,
    inputs=[
        gr.Text(label="Input Text",lines=5),
        gr.Radio(label="Speaker", choices=[
            "BDL (male)",
            "CLB (female)",
            "KSP (male)",
            "RMS (male)",
            "SLT (female)",
            "Surprise Me!"
        ],
        value="BDL (male)"),
    ],
    outputs=[
        gr.Audio(label="Generated Speech", type="numpy"),
    ],
    title=title,
    description=description,
    examples=examples,
).launch(share=True,debug=True)


Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
Running on public URL: https://af05974209b44306f4.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades (NEW!), check out Spaces: https://huggingface.co/spaces


here
/content/output/1677762762_lets-try-t.mp3
/content/output/1677762762_lets-try-t.mp3
here
/content/output/1677762777_lets-try-t.mp3
/content/output/1677762777_lets-try-t.mp3
here
/content/output/1677762792_lets-try-t.mp3
/content/output/1677762792_lets-try-t.mp3
here
/content/output/1677762802_lets-try-t.mp3
/content/output/1677762802_lets-try-t.mp3
here
/content/output/1677762810_lets-try-t.mp3
/content/output/1677762810_lets-try-t.mp3
here
/content/output/1677762835_lets-try-t.mp3
/content/output/1677762835_lets-try-t.mp3


In [44]:
# Send it to Drive.
# Make sure you mounted Drive back at the beginning of this notebook.
# Be sure to make sure you have a folder in Google Drive called 'tt5'.
# Don't uncomment both commands and run this. You're not gonna have a good time.

# Uncomment to Move all files to Drive and remove them from this notebook.
# !mv -v /content/output/* /content/drive/MyDrive/tt5/

# This just sends a Copy to Google Drive. 
# Doesn't delete anything inside our notebook's /output directory.
!cp -v /content/output/* /content/drive/MyDrive/tt5/

'/content/output/1677762128_its-pretty.mp3' -> '/content/drive/MyDrive/tt5/1677762128_its-pretty.mp3'
