<a href="https://colab.research.google.com/github/aakashgoel/tts/blob/main/TextToVoice.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook follows the torchaudio Tacotron2 text-to-speech pipeline tutorial: https://pytorch.org/audio/stable/tutorials/tacotron2_pipeline_tutorial.html

In [None]:
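# The phoneme-based text processor (TACOTRON2_WAVERNN_PHONE_LJSPEECH) needs DeepPhonemizer.
# Uncomment on a fresh runtime:
# %pip install deep-phonemizer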

In [13]:
import time

import IPython
import matplotlib
import matplotlib.pyplot as plt
import torch
import torchaudio

# Default figure size used by every plot in this notebook.
matplotlib.rcParams.update({"figure.figsize": [16.0, 4.8]})

# Seed PyTorch's global random number generator.
torch.manual_seed(0)

# Run on the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

# Report library versions and the selected device, one value per line.
print(torch.__version__, torchaudio.__version__, device, sep="\n")

2.0.1+cu118
2.0.2+cu118
cpu


In [None]:
# Pretrained text-to-speech bundle: text processor -> Tacotron2 -> WaveRNN vocoder.
bundle = torchaudio.pipelines.TACOTRON2_WAVERNN_PHONE_LJSPEECH

processor = bundle.get_text_processor()
tacotron2 = bundle.get_tacotron2().to(device)
vocoder = bundle.get_vocoder().to(device)

text = """
Friends have been urging me to write to you for the sake of humanity.
But I have resisted their request, because of the feeling that any letter from me would be an impertinence.
Something tells me that I must not calculate and that I must make my appeal for whatever it may be worth.
It is quite clear that you are today the one person in the world who can prevent a war which may reduce humanity to the savage state.
Must you pay that price for an object however worthy it may appear to you to be?
Will you listen to the appeal of one who has deliberately shunned the method of war not without considerable success?
Any way I anticipate your forgiveness, if I have erred in writing to you
"""

# Synthesize one sentence (one non-empty line of `text`) at a time.
# Tacotron2 stops decoding after a fixed maximum number of steps
# (`max_decoder_steps`), so feeding the whole letter as a single utterance
# can cut the audio off before the end of the transcript.
sentences = [line.strip() for line in text.splitlines() if line.strip()]

spec_chunks = []
waveform_chunks = []
with torch.inference_mode():
    for sentence in sentences:
        processed, lengths = processor(sentence)
        processed = processed.to(device)
        lengths = lengths.to(device)
        sentence_spec, spec_lengths, _ = tacotron2.infer(processed, lengths)
        # Separate name for the vocoder's output lengths so the token
        # `lengths` above is not silently overwritten.
        sentence_waveform, waveform_lengths = vocoder(sentence_spec, spec_lengths)
        if waveform_lengths is not None:
            # Keep only the samples the vocoder reports as valid.
            sentence_waveform = sentence_waveform[:, : int(waveform_lengths[0])]
        spec_chunks.append(sentence_spec)
        waveform_chunks.append(sentence_waveform)

    # Stitch the per-sentence results together along the time (last) axis so
    # `spec` and `waveforms` still describe the whole passage.
    spec = torch.cat(spec_chunks, dim=-1)
    waveforms = torch.cat(waveform_chunks, dim=-1)

# fig, [ax1, ax2] = plt.subplots(2, 1, figsize=(16, 9))
# ax1.imshow(spec[0].cpu().detach(), origin="lower", aspect="auto")
# ax2.plot(waveforms[0].cpu().detach())

# Last expression of the cell: renders an audio player for the full passage.
IPython.display.Audio(waveforms[0:1].cpu(), rate=vocoder.sample_rate)

In [None]:
# Wavenet

import torch
import torchaudio
from wavenet_vocoder import WaveNet

# Load the pre-trained WaveNet model.
# Use `device` from the setup cell ("cuda" when a GPU is available, else "cpu")
# so the cell also runs on CPU-only runtimes.
wavenet = WaveNet().to(device)
# map_location lets a checkpoint saved on a GPU be loaded on a CPU-only machine.
# NOTE(review): torch.load unpickles the file -- only load checkpoints you trust.
checkpoint = torch.load('wavenet_model.pth', map_location=device)
wavenet.load_state_dict(checkpoint['state_dict'])
wavenet.eval()

# Load and preprocess the input mel-spectrogram
# NOTE(review): torchaudio.load is an audio-file loader returning
# (waveform, sample_rate); a '.pt' mel-spectrogram was presumably written with
# torch.save and would then need torch.load plus an explicit sample rate --
# confirm the real input format before relying on this line.
mel_spectrogram, sample_rate = torchaudio.load('input_mel_spectrogram.pt')  # Replace with your own input
mel_spectrogram = mel_spectrogram.to(device)

# Perform inference
with torch.no_grad():
    # Pass the mel-spectrogram through the WaveNet model
    # NOTE(review): confirm the installed `wavenet_vocoder.WaveNet` exposes `infer`.
    audio = wavenet.infer(mel_spectrogram)

# Save the synthesized audio
torchaudio.save('synthesized_audio.wav', audio.cpu(), sample_rate)  # Replace with your desired output path

print("Synthesized audio saved successfully!")


In [None]:
import torch
import torchaudio
from wavenet_vocoder import WaveNet
from melgan_vocoder import MelGAN

# Define the list of vocoder models.
# Models and checkpoints are placed on `device` from the setup cell ("cuda" when
# a GPU is available, else "cpu") so the benchmark also runs without a GPU.
# NOTE(review): torch.load unpickles the file -- only load checkpoints you trust.
vocoder_models = [
    {
        'name': 'WaveNet',
        'model': WaveNet().to(device),
        'checkpoint': torch.load('wavenet_model.pth', map_location=device)
    },
    {
        'name': 'MelGAN',
        'model': MelGAN().to(device),
        'checkpoint': torch.load('melgan_model.pth', map_location=device)
    },
    # Add more vocoder models as needed
]

# Load and preprocess the input mel-spectrogram
# NOTE(review): torchaudio.load is an audio-file loader returning
# (waveform, sample_rate); confirm that a '.pt' input is really meant to be
# read this way and not with torch.load.
mel_spectrogram, sample_rate = torchaudio.load('input_mel_spectrogram.pt')  # Replace with your own input
mel_spectrogram = mel_spectrogram.to(device)

# CUDA kernels run asynchronously, so the GPU must be synchronized before
# reading the wall clock; on CPU no synchronization is needed (or possible).
use_cuda = str(device).startswith("cuda")

# Perform inference and measure performance for each vocoder model.
# The loop variable is deliberately NOT called `vocoder`, so it does not
# overwrite the torchaudio `vocoder` created by the synthesis cell.
for vocoder_entry in vocoder_models:
    vocoder_model = vocoder_entry['model']
    vocoder_checkpoint = vocoder_entry['checkpoint']

    vocoder_model.load_state_dict(vocoder_checkpoint['state_dict'])
    vocoder_model.eval()

    with torch.no_grad():
        if use_cuda:
            torch.cuda.synchronize()
        start_time = time.perf_counter()

        audio = vocoder_model.infer(mel_spectrogram)

        if use_cuda:
            torch.cuda.synchronize()
        # perf_counter reports seconds; convert to milliseconds for the report.
        inference_time = (time.perf_counter() - start_time) * 1000.0

    output_path = f"synthesized_audio_{vocoder_entry['name'].lower()}.wav"
    torchaudio.save(output_path, audio.cpu(), sample_rate)

    print(f"{vocoder_entry['name']} inference time:", inference_time, "ms")

print("Synthesized audios saved successfully!")
