# Text To Speech (TTS)


In [33]:
import os
from getpass import getpass

from huggingface_hub import login

# SECURITY: an access token must never be committed inside a notebook. The token
# that used to be hardcoded here has to be considered leaked and should be
# revoked in the Hugging Face account settings.
# The token is read from the HF_TOKEN environment variable; when that variable
# is missing or empty, it is requested interactively without echoing it.
token = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")
print("Hugging Face logging")
login(token)

Hugging Face logging


In [34]:
import torch

# Pick the compute device once: CUDA when torch reports it as available,
# otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Usando dispositivo: {device}")

Usando dispositivo: cpu


## Generación de audio con speaker embeddings default

In [35]:
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
import torchaudio
import torch

# Hub identifier of the SpeechT5 text-to-speech checkpoint.
model_name = "microsoft/speecht5_tts"

# Load the three pieces used for synthesis: the processor (text -> input ids),
# the TTS model and the HiFi-GAN vocoder. The model and vocoder weights are
# cached under ./models.
processor = SpeechT5Processor.from_pretrained(model_name)
model = SpeechT5ForTextToSpeech.from_pretrained(
    model_name,
    cache_dir="./models/speecht5_tts",
)
vocoder = SpeechT5HifiGan.from_pretrained(
    "microsoft/speecht5_hifigan",
    cache_dir="./models/speecht5_hifigan",
)

def text_to_speech(text, processor, model, vocoder, speaker_embeddings=None, output_file="./output/output.wav", sample_rate=16000):
    """Synthesize `text` and optionally write the result to a WAV file.

    Args:
        text: Text to synthesize.
        processor: Callable invoked as ``processor(text=..., return_tensors="pt")``;
            its ``"input_ids"`` entry is fed to the model.
        model: Object exposing ``generate_speech(input_ids, speaker_embeddings=...)``.
        vocoder: Callable that turns the output of ``generate_speech`` into audio.
        speaker_embeddings: Speaker embedding tensor. When ``None`` a fresh
            all-zero tensor of shape ``(1, 512)`` is used.
        output_file: Destination WAV path, or ``None`` to skip writing to disk.
            Missing parent directories are created.
        sample_rate: Sample rate passed to ``torchaudio.save``.

    Returns:
        The audio tensor with a leading dimension added, as passed to
        ``torchaudio.save``.
    """
    import os

    # Build the default per call: a tensor used as a default argument is created
    # once at definition time and shared (and mutable) across every call.
    if speaker_embeddings is None:
        speaker_embeddings = torch.zeros((1, 512))

    # Tokenize the text.
    input_ids = processor(text=text, return_tensors="pt")["input_ids"]

    # Keep the inputs on the same device as the model when the model exposes
    # one; otherwise a model living on the GPU fails with CPU inputs (and the
    # other way round).
    model_device = getattr(model, "device", None)
    if model_device is not None:
        input_ids = input_ids.to(model_device)
        speaker_embeddings = speaker_embeddings.to(model_device)

    with torch.no_grad():
        # The model output is handed to the vocoder, which produces the
        # final audio.
        speech = model.generate_speech(input_ids, speaker_embeddings=speaker_embeddings)
        audio = vocoder(speech)

    # torchaudio expects a 2D tensor, so normalise the leading dimension.
    audio = audio.squeeze(0).unsqueeze(0)
    if output_file is not None:
        # Create the destination directory so saving does not fail on a fresh checkout.
        output_dir = os.path.dirname(output_file)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        # Hand a CPU copy to torchaudio for saving.
        torchaudio.save(output_file, audio.cpu(), sample_rate=sample_rate)
        print(f"Audio guardado en {output_file}")
    return audio

In [36]:
# Hub identifier of the SpeechT5 text-to-speech checkpoint.
model_name = "microsoft/speecht5_tts"  # Modelo SpeechT5 para TTS

# (Re)load the processor, the model and the vocoder. The same cache_dir values
# as in the previous loading cell are used so the weights already stored under
# ./models are reused instead of being looked up (and downloaded again) in the
# default cache.
processor = SpeechT5Processor.from_pretrained(model_name)
model = SpeechT5ForTextToSpeech.from_pretrained(model_name, cache_dir="./models/speecht5_tts")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan", cache_dir="./models/speecht5_hifigan")

In [37]:
# Synthesize a short sentence with the default speaker embedding and store it
# as a WAV file; the returned tensor is the cell's output.
texto = "this is a generated text"
text_to_speech(
    text=texto,
    processor=processor,
    model=model,
    vocoder=vocoder,
    output_file="./output/audio_generado-es100.wav",
)

Audio guardado en ./output/audio_generado-es100.wav


tensor([[-0.0003, -0.0012, -0.0002,  ...,  0.0048,  0.0015, -0.0007]])

## Generación de audio con speaker embeddings personalizados

In [38]:
from datasets import load_dataset

# Load the speaker embeddings dataset (validation split).
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")

# Pick one entry (change the index to try another one) and add a leading
# dimension to its "xvector" field.
xvector = embeddings_dataset[500]["xvector"]
speaker_embeddings = torch.tensor(xvector).unsqueeze(0)

texto = "this is a generated text"
text_to_speech(
    text=texto,
    processor=processor,
    model=model,
    vocoder=vocoder,
    speaker_embeddings=speaker_embeddings,
    output_file="./output/audio_generado.wav",
)

Audio guardado en ./output/audio_generado.wav


tensor([[-0.0005, -0.0009, -0.0005,  ..., -0.0068, -0.0066, -0.0051]])

#### Tarea TTSB1

Generar distintos audios usando diferentes embeddings del dataset

In [39]:
# TODO: cambiar el número del embedding del dataset para ver distintas generaciones; para ello puede usar random_embedding = random.randint(1, 7500)


### Reproducción del audio en lugar de almacenarlo

In [40]:
import IPython.display as ipd


# Use another entry of the embeddings dataset as the speaker.
xvector = embeddings_dataset[7930]["xvector"]
speaker_embeddings = torch.tensor(xvector).unsqueeze(0)

texto = "this is a generated text"
# output_file=None: nothing is written to disk, only the tensor is returned.
audio = text_to_speech(
    text=texto,
    processor=processor,
    model=model,
    vocoder=vocoder,
    speaker_embeddings=speaker_embeddings,
    output_file=None,
)

# Convert to a numpy array without singleton dimensions and play it inline.
audio_np = audio.cpu().numpy().squeeze()
player = ipd.Audio(audio_np, rate=16000)  # 16000 is the sample rate
ipd.display(player)

## Generación de audio con speaker embeddings personalizados a partir de fichero wav

In [41]:
from datasets import Audio
# Load the Spanish test split of Common Voice 11 (cached under ./data) and cast
# the "audio" column to a 16 kHz sampling rate.
dataset = load_dataset(
    "mozilla-foundation/common_voice_11_0",
    "es",
    split="test",
    cache_dir="./data/common_voice",
).cast_column("audio", Audio(sampling_rate=16000))


In [42]:
def filter_dataset(batch):
    """Return True when the entry's "client_id" equals the selected client id."""
    target_client_id = "0a041928afd5c85b769b7aaafe7202013aa48b0a995bbb55c8ff5e6c76c9886a5c739f97ee85325f5ffa678576c81f0f9f038dd36428c69726934739e466cad8"
    return batch["client_id"] == target_client_id

# Keep only the rows accepted by filter_dataset. Note that this rebinds
# `dataset` to the filtered result, so re-running the cell filters the already
# filtered dataset.
dataset = dataset.filter(filter_dataset)

In [43]:
# Print the dataset summary to check how many rows are left after filtering.
print(dataset)

Dataset({
    features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'],
    num_rows: 1
})


In [44]:
from speechbrain.pretrained import SpeakerRecognition

# Speaker-embedding model, loaded on the selected device.
embedding_model = SpeakerRecognition.from_hparams(source="speechbrain/spkrec-xvect-voxceleb", run_opts={"device": device})

audio_entry = dataset[0]["audio"]

if "array" in audio_entry and "sampling_rate" in audio_entry:
    # The entry already carries the decoded samples.
    waveform = torch.tensor(audio_entry["array"], dtype=torch.float32).unsqueeze(0)
    sampling_rate = audio_entry["sampling_rate"]
else:
    # Fallback: read the file from disk.
    import torchaudio
    audio_path = audio_entry["path"]
    waveform, sampling_rate = torchaudio.load(audio_path)
    # torchaudio returns (channels, frames); downmix so a multi-channel file is
    # not interpreted as a batch of several utterances.
    if waveform.size(0) > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

# The sampling rate used to be read and then ignored. Resample to 16 kHz (the
# rate the "audio" column is cast to earlier in the notebook) when the audio
# comes at another rate.
if sampling_rate != 16000:
    import torchaudio
    waveform = torchaudio.functional.resample(waveform, sampling_rate, 16000)
waveform = waveform.to(device)

# Generate the speaker embedding and synthesize with it.
# NOTE(review): the Hugging Face SpeechT5 guide L2-normalises the x-vector
# before using it - confirm whether that should be applied here too.
with torch.no_grad():
    # The embedding is moved to the device of the TTS model, which is not
    # necessarily `device`: the TTS model is never moved after loading.
    speaker_embeddings = embedding_model.encode_batch(waveform).squeeze(0).to(model.device)
    texto = "Esto es un texto generado"
    text_to_speech(text=texto,processor=processor, model=model, vocoder=vocoder, speaker_embeddings=speaker_embeddings, output_file="./output/audio_generado-mozilla.wav")

Audio guardado en ./output/audio_generado-mozilla.wav


#### Incrementando el número de muestras

In [45]:
from datasets import Audio
# Reload the full Spanish test split (the previous `dataset` was narrowed down
# to a single client) and cast the "audio" column to 16 kHz again.
dataset = load_dataset(
    "mozilla-foundation/common_voice_11_0",
    "es",
    split="test",
    cache_dir="./data/common_voice",
).cast_column("audio", Audio(sampling_rate=16000))


In [46]:
def _is_target_speaker(example):
    """Accept entries whose accent and gender fields contain the wanted values."""
    accent_matches = "España: Centro-Sur peninsular" in example["accent"]
    return accent_matches and "female" in example["gender"]


# Keep only the matching entries (this rebinds `dataset`).
dataset = dataset.filter(_is_target_speaker)


In [47]:
import numpy as np

def get_average_embedding(dataset, embedding_model, device="cpu", target_sampling_rate=16000):
    """Average the speaker embeddings of every entry in `dataset`.

    Args:
        dataset: Iterable of entries whose ``"audio"`` field holds either the
            decoded samples (``"array"`` and ``"sampling_rate"``) or a ``"path"``
            to an audio file.
        embedding_model: Object exposing ``encode_batch(waveform)``.
        device: Device the waveforms are sent to and the result is returned on.
        target_sampling_rate: Audio at any other rate is resampled to this rate
            before being encoded.

    Returns:
        The element-wise mean of the per-entry embeddings with one extra
        leading dimension, on `device`.

    Raises:
        ValueError: If `dataset` yields no entries (the mean would be NaN).
    """
    embeddings = []
    for entry in dataset:
        audio_entry = entry["audio"]
        if "array" in audio_entry and "sampling_rate" in audio_entry:
            waveform = torch.tensor(audio_entry["array"], dtype=torch.float32).unsqueeze(0)
            source_rate = audio_entry["sampling_rate"]
        else:
            waveform, source_rate = torchaudio.load(audio_entry["path"])
            # torchaudio returns (channels, frames); downmix so a multi-channel
            # file is not interpreted as a batch of several utterances.
            if waveform.size(0) > 1:
                waveform = waveform.mean(dim=0, keepdim=True)
        # Bring every utterance to the same rate before encoding it.
        if source_rate != target_sampling_rate:
            waveform = torchaudio.functional.resample(waveform, source_rate, target_sampling_rate)
        with torch.no_grad():
            embedding = embedding_model.encode_batch(waveform.to(device)).squeeze(0)
        # Accumulate on the CPU so a large dataset does not pile up on the GPU.
        embeddings.append(embedding.cpu())

    if not embeddings:
        raise ValueError("Cannot average speaker embeddings: the dataset is empty")

    # Mean over the entries, then add the leading dimension the callers expect.
    return torch.stack(embeddings).mean(dim=0).unsqueeze(0).to(device)


In [48]:
# Load the speaker-recognition model used to compute the embeddings.
embedding_model = SpeakerRecognition.from_hparams(source="speechbrain/spkrec-xvect-voxceleb", run_opts={"device": device})

# Average embedding over the filtered dataset.
speaker_embeddings = get_average_embedding(dataset, embedding_model, device=device)
# Normalise the shape of speaker_embeddings: drop extra singleton dimensions and
# make sure there is a leading batch dimension.
if speaker_embeddings.dim() > 2:
    speaker_embeddings = speaker_embeddings.squeeze()
if speaker_embeddings.dim() == 1:
    speaker_embeddings = speaker_embeddings.unsqueeze(0)
# The averaged embedding is returned on `device`, but the TTS model is never
# moved after loading; place the embedding on the model's device so both sit on
# the same device when CUDA is available.
speaker_embeddings = speaker_embeddings.to(model.device)

# Generate the audio.
texto = "Esto es un texto generado usando embeddings promedio."
text_to_speech(text=texto, processor=processor, model=model, vocoder=vocoder, speaker_embeddings=speaker_embeddings, output_file="./output/audio_generado_mozilla-promedio.wav")


Audio guardado en ./output/audio_generado_mozilla-promedio.wav


tensor([[-2.5113e-05,  2.6233e-05, -1.8190e-06,  ..., -1.3156e-04,
         -5.8535e-05, -7.9762e-05]])

#### Tarea TTSB2

Con lo visto en el notebook, ¿cómo podríamos generar un sistema TTS que suene como otra persona?