In [1]:
from synthesizer.inference import Synthesizer
from encoder import inference as encoder
from vocoder import inference as vocoder
import sounddevice as sd
from pathlib import Path
import numpy as np
import wave
import librosa
import time

In [2]:
# Load the encoder and vocoder checkpoints, construct the synthesizer from its
# checkpoint directory, and print how long the whole setup took.
# All paths are relative to the notebook's working directory.
start = time.time()

encoder.load_model(Path("encoder/saved_models/pretrained.pt"))
synthesizer = Synthesizer(Path("synthesizer/saved_models/logs-pretrained/taco_pretrained"))
vocoder.load_model(Path("vocoder/saved_models/pretrained/pretrained.pt"))

stop = time.time()
print(f"Total time: {stop - start}")

Loaded encoder "pretrained.pt" trained to step 1564501
Found synthesizer "pretrained" trained to step 278000
Building Wave-RNN
Trainable Parameters: 4.481M
Loading model weights at vocoder/saved_models/pretrained/pretrained.pt
Total time: 0.09991312026977539


In [None]:
### from the documentation

In [None]:
# Reference .wav recording used as the voice source (path relative to the working directory).
audio_file_path = "samples/elon_voice.wav"

In [None]:
# Read the reference recording with librosa, then run it through the encoder's
# preprocessing step together with the sampling rate librosa reported.
original_wav, sampling_rate = librosa.load(str(audio_file_path))
preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)

In [None]:
#create embedding:

In [None]:
# Embedding of the preprocessed reference audio; handed to the synthesizer later on.
embed = encoder.embed_utterance(preprocessed_wav)

In [None]:
# Sentence to synthesize
text = "When something is important enough, you do it even if the odds are not in your favor."

In [None]:
# The synthesizer is called with lists, so wrap the single sentence and its embedding.
texts = [text]
embeds = [embed]

In [None]:
# Create the mel spectrogram; only one text was passed in, so keep the first result.
specs = synthesizer.synthesize_spectrograms(texts, embeds)
spec = specs[0]
print("Created the mel spectrogram")

In [None]:
## Generating the waveform
print("Synthesizing the waveform:")
# Run the vocoder on the spectrogram and time the call. Remember that the longer the
# spectrogram, the more time-efficient the vocoder.
start = time.time()
generated_wav = vocoder.infer_waveform(spec)
stop = time.time()
print(f"\nTotal time: {stop - start}")

In [None]:
# Append synthesizer.sample_rate zero samples to the end of the waveform so the
# tail of the audio is not cut off.
# NOTE: generated_wav is reassigned from itself, so running this cell again pads a second time.
generated_wav = np.pad(generated_wav, (0, synthesizer.sample_rate), mode="constant")

In [None]:
# Trim excess silences to compensate for gaps in spectrograms (issue #53)
# by passing the generated audio back through the encoder's preprocessing.
generated_wav = encoder.preprocess_wav(generated_wav)

In [125]:
# Shared result list: every text_to_audio call appends its generated waveform here.
global_wav = []

In [126]:
def text_to_audio(text, audio_path):
    """
    Synthesize a single sentence in the voice of a reference recording.

    @Param:
    1. text - sentence to speak. (string)
    2. audio_path - relative path of the reference .wav file. (must be .wav)

    @Return:
    The generated waveform after end-padding and silence trimming.

    Side effect: the returned waveform is also appended to the module-level
    list global_wav.
    """
    # Embed the reference recording.
    reference_wav, reference_sr = librosa.load(str(audio_path))
    embed = encoder.embed_utterance(encoder.preprocess_wav(reference_wav, reference_sr))

    # Create the mel spectrogram (the synthesizer takes lists; this is a batch of one).
    spec = synthesizer.synthesize_spectrograms([text], [embed])[0]
    print("Created the mel spectrogram")

    # Generate the waveform and time the vocoder call. Remember that the longer the
    # spectrogram, the more time-efficient the vocoder.
    print("Synthesizing the waveform:")
    started = time.time()
    waveform = vocoder.infer_waveform(spec)
    elapsed = time.time() - started
    print(f"\nTotal time: {elapsed}")

    # Append synthesizer.sample_rate zero samples so the end of the audio is not cut off.
    waveform = np.pad(waveform, (0, synthesizer.sample_rate), mode="constant")
    # Trim excess silences to compensate for gaps in spectrograms (issue #53)
    waveform = encoder.preprocess_wav(waveform)

    global_wav.append(waveform)
    return waveform

In [113]:
# generated_wav = text_to_audio(text, audio_file_path)

In [None]:
# Play the audio (non-blocking): stop anything already playing, then start the
# generated waveform at the synthesizer's sample rate.
# Only PortAudio failures (e.g. no usable audio device) are handled here; any
# other exception propagates unchanged.
try:
    sd.stop()
    sd.play(generated_wav, synthesizer.sample_rate)
except sd.PortAudioError as e:
    print("\nCaught exception: %s" % repr(e))
    # The notebook has no command-line flags, so the message no longer refers to one.
    print("Continuing without audio playback.\n")

In [None]:
### Implement using multithreading (threading.Thread)

In [127]:
from threading import Thread
import os

In [128]:
# Thread objects, one per sentence; populated by the registration loop.
threads = []

In [129]:
# Reference .wav recording used as the voice source
audio_file_path = "samples/elon_voice.wav"
# Two sample sentences, alternated to form a batch of nine inputs (five of text0, four of text1)
text0 = "When something is important enough, you do it even if the odds are not in your favor."
text1 = "Multiprocessing refers to the ability of a system to support more than one processor at the same time."
text = [text0, text1] * 4 + [text0]

In [130]:
# Create one (not yet started) thread per sentence; each will call
# text_to_audio(sentence, audio_file_path).
# The list is rebuilt from scratch so that re-running this cell never leaves
# stale, already-started Thread objects in it: Thread.start() raises
# RuntimeError when called on a thread that was started before.
# NOTE: text_to_audio appends to global_wav when each call finishes, so the
# order of global_wav follows completion order, not the order of `text`.
threads = []
for i, sentence in enumerate(text):
    print(f"Registering thread {i + 1}")
    threads.append(Thread(target=text_to_audio, args=(sentence, audio_file_path)))

Registering thread 1
Registering thread 2
Registering thread 3
Registering thread 4
Registering thread 5
Registering thread 6
Registering thread 7
Registering thread 8
Registering thread 9


In [131]:
# Start every thread first so they all run concurrently ...
for thread in threads:
    thread.start()
    
# ... then block until each one has finished.
for thread in threads:
    thread.join()

Created the mel spectrogram
Synthesizing the waveform:
Created the mel spectrogram
Synthesizing the waveform:
Created the mel spectrogram
Synthesizing the waveform:
Created the mel spectrogram
Synthesizing the waveform:
Created the mel spectrogram
Synthesizing the waveform:
Created the mel spectrogram
Synthesizing the waveform:
Created the mel spectrogram
Synthesizing the waveform:
Created the mel spectrogram
Synthesizing the waveform:
Created the mel spectrogram
Synthesizing the waveform:
{| ████████████████ 76000/76800 | Batch Size: 8 | Gen Rate: 0.9kHz | }
Total time: 81.52424812316895
{| ████████████████ 84600/86400 | Batch Size: 9 | Gen Rate: 1.0kHz | }
Total time: 82.84767293930054
{| ████████████████ 85500/86400 | Batch Size: 9 | Gen Rate: 1.0kHz | }
Total time: 82.86099910736084
{| ████████████████ 85500/86400 | Batch Size: 9 | Gen Rate: 1.0kHz | }
Total time: 82.85699129104614

Total time: 82.8376817703247

Total time: 82.45955991744995

Total time: 83.23778676986694

Total ti

In [132]:
# Collect the generated waveforms into a 1-D object array, one waveform per slot.
# np.array(global_wav) is deliberately avoided: the waveforms generally differ in
# length, and recent NumPy versions refuse to build an array from such a ragged
# list; if they all happened to have the same length the result would be 2-D and
# the reshape to len(global_wav) would fail instead.
global_list = np.empty(len(global_wav), dtype=object)
for slot, wav in enumerate(global_wav):
    # Integer indexing stores the waveform object itself in the slot.
    global_list[slot] = wav

In [133]:
# Independent copies of the first two collected waveforms. np.array copies the
# samples directly, without first expanding them into a Python list of scalars.
a0 = np.array(global_list[0])
a1 = np.array(global_list[1])

In [134]:
# Stitch the two clips together with 10000 zero samples of silence in between.
# np.concatenate keeps the samples in one contiguous array instead of building a
# Python list with one object per sample; len(temp) is unchanged.
temp = np.concatenate([a0, np.zeros(10000), a1])

In [135]:
# Total number of samples in the stitched clip
len(temp)

137680

In [136]:
# Stop any playback in progress, then play the stitched clip at the synthesizer's sample rate.
sd.stop()
sd.play(temp, synthesizer.sample_rate)

In [None]:
#plz work...