In [1]:
import onnxruntime as ort
import soundfile as sf
import numpy as np

# Session configuration: the options object is left at its defaults and only
# the CPU execution provider is requested.
sess_options = ort.SessionOptions()
providers = ["CPUExecutionProvider"]

# Load the exported model once; later cells reuse this session for every
# synthesis call.
onnx_sess = ort.InferenceSession(
    "vits2-en-Eleven.onnx", sess_options=sess_options, providers=providers
)

In [2]:
import commons
import utils
from text import text_to_sequence
import torch

def get_text(text, hps):
    """Turn a raw sentence into a tensor of symbol ids for the model.

    Args:
        text: The sentence to convert.
        hps: Hyperparameter object; this function reads
            ``hps.data.text_cleaners`` and ``hps.data.add_blank``.

    Returns:
        A ``torch.LongTensor`` built from the id sequence returned by
        ``text_to_sequence``. When ``hps.data.add_blank`` is truthy the
        sequence is first passed through ``commons.intersperse(..., 0)``
        (presumably inserting the blank id 0 around every symbol -- confirm
        against ``commons``).
    """
    data_cfg = hps.data
    symbol_ids = text_to_sequence(text, data_cfg.text_cleaners)
    return torch.LongTensor(
        commons.intersperse(symbol_ids, 0) if data_cfg.add_blank else symbol_ids
    )

# Load the hyperparameters from the training run's config file; get_text reads
# hps.data.text_cleaners and hps.data.add_blank from this object.
hps = utils.get_hparams_from_file("logs/en_eleven_small/config.json")



In [3]:
import os

# Sentences to synthesize; each entry produces one WAV file.
# NOTE(review): these sentences and the output file prefix ("id-ID") do not
# match the "en" model/config names loaded earlier -- confirm the intended
# model is the one in the session.
texts = [
    "feng senang melihat anak itu mengambil gambar feng .",
    "sekarang cabang pohon sudah keras dan tegap .",
    "lori tahu kalau dirinya istimewa , dan lori sangat bangga dengan hal itu .",
    "mereka membuka tas penjual itu dan melihat isinya .",
    "penduduk desa menyukai burung itu dan memberi nama si cantik .",
    "ketika pulang sekolah , aku melihat sebutir benih jatuh dari burung hantu .",
    "aku akan belajar dan kawan kawanku akan senang padaku .",
    "sampai dia mencapai matahari !",
    "ibu menariknya dari kolong tempat tidur !",
    "ada burung kecil terbang dan tanya .",
    "alam.",
    "petualangan.",
    "sains.",
    "mainkref.",
    "baru saja.",
    "perpustakaan.",
    "ensiklopedia."
]

# NOTE(review): hard-coded output sample rate -- confirm it matches the rate
# the model was trained/exported with.
sample_rate = 44100

# soundfile does not create missing directories, so make sure the target
# folder exists before the first write.
os.makedirs("outputs", exist_ok=True)

# Loop-invariant, so built once. Presumably (noise scale, length scale,
# noise-w scale) -- TODO confirm against the export script.
scales = np.array([0.667, 1.0, 0.8], dtype=np.float32)

for idx, sentence in enumerate(texts):
    phoneme_ids = get_text(sentence, hps)
    # Add a leading batch axis of size 1 and record the sequence length.
    text = np.expand_dims(np.array(phoneme_ids, dtype=np.int64), 0)
    text_lengths = np.array([text.shape[1]], dtype=np.int64)

    # No speaker id is supplied here. A None feed entry carries no tensor, so
    # the "sid" key is simply left out; add it if the model declares that input.
    audio = onnx_sess.run(
        None,
        {
            "input": text,
            "input_lengths": text_lengths,
            "scales": scales,
        },
    )[0]
    # The first output is indexed as [batch, channel, samples]; write the
    # single waveform it contains.
    sf.write(f"outputs/vits2-id-ID-Althaf_{idx}.wav", audio[0, 0, :], sample_rate)

In [6]:
import IPython.display as ipd

# Synthesize one demo sentence and play it inline in the notebook.
phoneme_ids = get_text("The quick brown fox jumps over the lazy dog, while the phoneme sounds of pheasants, quails and crickets chirp in the background.", hps)
# Add a leading batch axis of size 1 and record the sequence length.
text = np.expand_dims(np.array(phoneme_ids, dtype=np.int64), 0)
text_lengths = np.array([text.shape[1]], dtype=np.int64)
# Presumably (noise scale, length scale, noise-w scale) -- TODO confirm
# against the export script.
scales = np.array([0.667, 1.0, 0.8], dtype=np.float32)
sid = np.array([1], dtype=np.int64)

feed = {
    "input": text,
    "input_lengths": text_lengths,
    "scales": scales,
}
# Feed the speaker id only when the loaded model actually declares a "sid"
# input: a model exported without one rejects the unknown feed name, and the
# batch cell above runs this same session without a speaker id.
if any(model_input.name == "sid" for model_input in onnx_sess.get_inputs()):
    feed["sid"] = sid

audio = onnx_sess.run(None, feed)[0]
# NOTE(review): playback rate is hard-coded -- confirm it matches the model.
ipd.Audio(audio[0, 0, :], rate=44100)