In [1]:
import numpy as np
import tensorflow as tf
from tensorflow_tts.inference import TFAutoModel, AutoConfig

2024-07-12 13:21:41.673563: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-07-12 13:21:41.696719: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
 The versions of TensorFlow you are currently using is 2.12.0 and is not supported. 
Some things might work, some things might not.
If you were to encounter a bug, do not file an issue.
If you want to make sure you're using a tested and supported configuration, either change the TensorFlow version or the TensorFlow Addons's version. 
You can find the compatibility matrix in TensorFl

In [2]:
def convert_text2mel_tflite(
    model_path: str, save_name: str, config_path: str = None, use_auth_token: bool = False
) -> float:
    """Convert a pretrained text-to-mel model to TensorFlow Lite and save it.

    Args:
        model_path: Identifier or path handed to ``TFAutoModel.from_pretrained``.
        save_name: Destination file for the serialized TFLite model.
        config_path: Optional source for ``AutoConfig.from_pretrained``; when
            falsy, ``model_path`` is used instead.
        use_auth_token: Forwarded to ``TFAutoModel.from_pretrained`` only when
            truthy; otherwise the keyword is omitted entirely.

    Returns:
        Size of the converted model: its length divided by 1024 twice
        (i.e. MiB when the converter output is a bytes object).
    """
    # The configuration falls back to the model location when no explicit
    # config path is supplied.
    config_source = config_path or model_path
    model_config = AutoConfig.from_pretrained(config_source)

    # Only pass the auth keyword along when it was actually requested.
    auth_kwargs = {}
    if use_auth_token:
        auth_kwargs["use_auth_token"] = use_auth_token

    tts_model = TFAutoModel.from_pretrained(
        model_path, config=model_config, enable_tflite_convertible=True, **auth_kwargs
    )

    # Build the converter from the model's TFLite-specific inference function.
    inference_fn = tts_model.inference_tflite.get_concrete_function()
    converter = tf.lite.TFLiteConverter.from_concrete_functions([inference_fn])

    # Default converter optimizations; permit builtin TFLite ops as well as
    # select TensorFlow ops.
    converter.optimizations = [tf.lite.Optimize.DEFAULT]
    converter.target_spec.supported_ops = [
        tf.lite.OpsSet.TFLITE_BUILTINS,
        tf.lite.OpsSet.SELECT_TF_OPS,
    ]

    # Serialize the converted model and persist it.
    serialized_model = converter.convert()
    with open(save_name, "wb") as output_file:
        output_file.write(serialized_model)

    return len(serialized_model) / 1024 / 1024.0

In [3]:
def convert_vocoder_tflite(
    model_path: str, save_name: str, use_auth_token: bool = False
) -> float:
    """Convert a pretrained vocoder to TensorFlow Lite and save it.

    Args:
        model_path: Identifier or path handed to ``TFAutoModel.from_pretrained``.
        save_name: Destination file for the serialized TFLite model.
        use_auth_token: Always forwarded to ``TFAutoModel.from_pretrained``.

    Returns:
        Size of the converted model: its length divided by 1024 twice
        (i.e. MiB when the converter output is a bytes object).
    """
    vocoder_model = TFAutoModel.from_pretrained(model_path, use_auth_token=use_auth_token)

    # Build the converter from the model's TFLite-specific inference function.
    inference_fn = vocoder_model.inference_tflite.get_concrete_function()
    converter = tf.lite.TFLiteConverter.from_concrete_functions([inference_fn])

    # Default converter optimizations, select TensorFlow ops only, and
    # float16 as the supported type.
    converter.optimizations = [tf.lite.Optimize.DEFAULT]
    target_spec = converter.target_spec
    target_spec.supported_ops = [tf.lite.OpsSet.SELECT_TF_OPS]
    target_spec.supported_types = [tf.float16]

    # Serialize the converted model and persist it.
    serialized_model = converter.convert()
    with open(save_name, "wb") as output_file:
        output_file.write(serialized_model)

    return len(serialized_model) / 1024 / 1024.0

In [4]:
# Convert both pretrained models to TFLite. Each call writes the .tflite file
# named by `save_name` and returns its size in MB, so `text2mel` and `vocoder`
# hold sizes (floats), not model objects.
text2mel = convert_text2mel_tflite(
    model_path="bookbot/lightspeech-mfa-sw-v2",
    save_name="lightspeech_quant.tflite",
    use_auth_token=False,
)

# NOTE(review): this call uses use_auth_token=True while the text2mel call
# above uses False -- presumably the vocoder repository needs authentication;
# confirm the difference is intentional.
vocoder = convert_vocoder_tflite(
    model_path="bookbot/mb-melgan-hifi-postnets-sw-v2",
    save_name="mbmelgan.tflite",
    use_auth_token=True,
)

2024-07-12 13:23:46.847758: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-07-12 13:23:46.847860: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-07-12 13:23:46.862297: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysf

In [5]:
# Report the sizes returned by the two conversion calls (values are in MB).
print(f"Text2mel: {text2mel} MBs\nVocoder: {vocoder} MBs")

Text2mel: 4.500587463378906 MBs
Vocoder: 5.052242279052734 MBs


In [6]:
from tensorflow_tts.inference import AutoProcessor

# Load the text processor published alongside the text2mel checkpoint; it is
# used below to turn input text into token IDs.
processor = AutoProcessor.from_pretrained("bookbot/lightspeech-mfa-sw-v2", use_auth_token=True)
processor.mode = "eval" # change processor from train to eval mode

In [7]:
from typing import List, Tuple

def tokenize(text: str, processor: AutoProcessor) -> List[int]:
    """Convert ``text`` into the ID sequence produced by ``processor``.

    Args:
        text: Input text to convert.
        processor: Object exposing ``text_to_sequence``.

    Returns:
        Whatever ``processor.text_to_sequence(text)`` returns (annotated as a
        list of integer IDs).
    """
    token_ids = processor.text_to_sequence(text)
    return token_ids

In [8]:
def prepare_input(
    input_ids: List[int], speaker: int
) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor]:
    """Build the five input tensors fed to the text-to-mel TFLite model.

    Args:
        input_ids: Integer token IDs for one utterance. They are converted to
            an int32 tensor, so they must be integers (not strings).
        speaker: Speaker index.

    Returns:
        A 5-tuple, in the order the interpreter inputs are set:
        the token IDs with a leading batch axis of size 1 (int32), the speaker
        index as a one-element int32 tensor, and three one-element float32
        tensors each holding 1.0.
    """
    # Add a leading batch axis so the IDs have shape [1, len(input_ids)].
    batched_ids = tf.expand_dims(tf.convert_to_tensor(input_ids, dtype=tf.int32), 0)
    speaker_ids = tf.convert_to_tensor([speaker], tf.int32)

    # NOTE(review): the three unit scalars are presumably neutral
    # speed / pitch / energy ratios -- confirm against the model's
    # `inference_tflite` signature.
    first_ratio = tf.convert_to_tensor([1.0], dtype=tf.float32)
    second_ratio = tf.convert_to_tensor([1.0], dtype=tf.float32)
    third_ratio = tf.convert_to_tensor([1.0], dtype=tf.float32)

    return (batched_ids, speaker_ids, first_ratio, second_ratio, third_ratio)

In [9]:
def ls_infer(
    input_ids: List[int], speaker: int, lightspeech_path: str
) -> Tuple[np.ndarray, np.ndarray]:
    """Run the text-to-mel TFLite model on one tokenized utterance.

    Args:
        input_ids: Integer token IDs for a single utterance.
        speaker: Speaker index, passed through to ``prepare_input``.
        lightspeech_path: Path to the converted ``.tflite`` model file.

    Returns:
        The interpreter's first two output tensors, in the order reported by
        ``get_output_details()``, as NumPy arrays.
        NOTE(review): which of the two is the mel-spectrogram depends on the
        exported model -- confirm against the call site.
    """
    # Load the model into a fresh interpreter.
    lightspeech = tf.lite.Interpreter(model_path=lightspeech_path)
    input_details = lightspeech.get_input_details()
    output_details = lightspeech.get_output_details()

    # Input 0 carries the token IDs: one batch row of len(input_ids) entries.
    lightspeech.resize_tensor_input(input_details[0]["index"], [1, len(input_ids)])
    # Inputs 1-4 are the one-element tensors built by prepare_input.
    for position in range(1, 5):
        lightspeech.resize_tensor_input(input_details[position]["index"], [1])

    # Tensors must be (re)allocated after resizing and before setting values.
    lightspeech.allocate_tensors()

    input_data = prepare_input(input_ids, speaker)

    # Feed each prepared tensor to the interpreter input at the same position.
    for i, detail in enumerate(input_details):
        lightspeech.set_tensor(detail["index"], input_data[i])

    lightspeech.invoke()

    return (
        lightspeech.get_tensor(output_details[0]["index"]),
        lightspeech.get_tensor(output_details[1]["index"]),
    )

In [10]:
def melgan_infer(melspectrogram: tf.Tensor, mb_melgan_path: str) -> np.ndarray:
    """Run the vocoder TFLite model on a mel-spectrogram.

    Args:
        melspectrogram: Rank-3 input whose axes 1 and 2 size the interpreter
            input; the leading axis is fixed to 1 when resizing. Anything with
            a ``.shape`` that ``Interpreter.set_tensor`` accepts works (this
            notebook passes the NumPy array returned by ``ls_infer``).
        mb_melgan_path: Path to the converted ``.tflite`` vocoder file.

    Returns:
        The interpreter's first output tensor as a NumPy array.
    """
    # Load the model into a fresh interpreter.
    mb_melgan = tf.lite.Interpreter(model_path=mb_melgan_path)
    input_details = mb_melgan.get_input_details()
    output_details = mb_melgan.get_output_details()

    input_index = input_details[0]["index"]

    # Resize the single input to this spectrogram's trailing dimensions,
    # with strict resizing requested.
    target_shape = [1, melspectrogram.shape[1], melspectrogram.shape[2]]
    mb_melgan.resize_tensor_input(input_index, target_shape, strict=True)

    # Tensors must be (re)allocated after resizing and before setting values.
    mb_melgan.allocate_tensors()

    mb_melgan.set_tensor(input_index, melspectrogram)
    mb_melgan.invoke()

    return mb_melgan.get_tensor(output_details[0]["index"])

In [11]:
# Pick the sentence to synthesize. The commented-out lines are alternative
# test sentences; exactly one `text = ...` assignment should be active.
# text = "The quick brown fox jumps over the lazy dog, while the phoneme sounds of pheasants, quails and crickets chirp in the background."
# text = "Hapo mwanzo Mungu aliumba mbingu na dunia."
text = "Giza lilikuwa juu ya uso wa vilindi vya maji, naye Roho wa Mungu alikuwa ametulia juu ya maji."
# text = "Nimepata kupungukiwa, pia nimepata kuwa na wingi wa vitu."
# Convert the chosen sentence to token IDs with the processor loaded earlier.
input_ids = tokenize(text, processor)

In [12]:
# Earlier FastSpeech2 variant, kept for reference: note that it unpacks the
# mel-spectrogram from the SECOND output, unlike the call below.
# _, mel_output_tflite = ls_infer(
#     input_ids, speaker=0, lightspeech_path="fastspeech2_quant.tflite"
# )

# Text -> mel: the mel-spectrogram is taken from the first returned output;
# the second output is discarded.
mel_output_tflite, _ = ls_infer(
    input_ids, speaker=0, lightspeech_path="lightspeech_quant.tflite"
)

# Mel -> waveform. `[:, :, :]` selects the whole array (no cropping); the
# trailing `[0, :, 0]` keeps the first batch item and first channel, giving a
# 1-D sample array.
audio_tflite = melgan_infer(mel_output_tflite[:, :, :], mb_melgan_path="mbmelgan.tflite")[
    0, :, 0
]

INFO: Created TensorFlow Lite XNNPACK delegate for CPU.
INFO: Created TensorFlow Lite delegate for select TF ops.
2024-07-12 13:24:02.829964: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-07-12 13:24:02.830123: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-07-12 13:24:02.830189: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, s

In [13]:
# Sanity check: shape of the generated mel-spectrogram (rank 3, leading axis 1).
mel_output_tflite.shape

(1, 514, 80)

In [14]:
# Sanity check: the waveform is 1-D; its length is the number of audio samples.
audio_tflite.shape

(263168,)

In [15]:
from IPython.display import Audio

# Play the waveform inline at 44100 Hz.
# NOTE(review): the rate is hard-coded here and again in the wav export cell;
# it presumably must match the vocoder's training sample rate -- confirm.
Audio(data=audio_tflite, rate=44100)

In [None]:
import soundfile as sf

# Save the waveform to ./gen2.wav at 44100 Hz using the "PCM_16" subtype.
# The sample rate should stay in sync with the playback cell.
sf.write("./gen2.wav", audio_tflite, 44100, "PCM_16")