In [1]:
import numpy as np
import tensorflow as tf
from tensorflow_tts.inference import TFAutoModel

 The versions of TensorFlow you are currently using is 2.7.0 and is not supported. 
Some things might work, some things might not.
If you were to encounter a bug, do not file an issue.
If you want to make sure you're using a tested and supported configuration, either change the TensorFlow version or the TensorFlow Addons's version. 
You can find the compatibility matrix in TensorFlow Addon's readme:
https://github.com/tensorflow/addons
  from .autonotebook import tqdm as notebook_tqdm


In [10]:
def convert_text2mel_tflite(
    model_path: str, save_name: str, use_auth_token: bool = False
) -> float:
    """Export a pretrained text-to-mel model to a TensorFlow Lite file.

    Args:
        model_path: Model identifier or path handed to
            `TFAutoModel.from_pretrained`.
        save_name: Path of the `.tflite` file to write.
        use_auth_token: Forwarded unchanged to `TFAutoModel.from_pretrained`.

    Returns:
        Size of the serialized model, computed as bytes / 1024 / 1024.
    """
    # Load the checkpoint with its TFLite-convertible flag switched on.
    text2mel_model = TFAutoModel.from_pretrained(
        model_path,
        enable_tflite_convertible=True,
        use_auth_token=use_auth_token,
    )

    # The converter is built from the model's dedicated TFLite inference function.
    inference_fn = text2mel_model.inference_tflite.get_concrete_function()
    tflite_converter = tf.lite.TFLiteConverter.from_concrete_functions([inference_fn])

    # Default optimizations; allow builtin TFLite ops plus select TensorFlow ops.
    tflite_converter.optimizations = [tf.lite.Optimize.DEFAULT]
    builtin_ops = tf.lite.OpsSet.TFLITE_BUILTINS
    select_tf_ops = tf.lite.OpsSet.SELECT_TF_OPS
    tflite_converter.target_spec.supported_ops = [builtin_ops, select_tf_ops]

    # Serialize the model and persist it to disk.
    serialized_model = tflite_converter.convert()
    with open(save_name, "wb") as out_file:
        out_file.write(serialized_model)

    bytes_per_mb = 1024 * 1024.0
    return len(serialized_model) / bytes_per_mb

In [11]:
def convert_vocoder_tflite(
    model_path: str, save_name: str, use_auth_token: bool = False
) -> float:
    """Export a pretrained vocoder model to a TensorFlow Lite file.

    Args:
        model_path: Model identifier or path handed to
            `TFAutoModel.from_pretrained`.
        save_name: Path of the `.tflite` file to write.
        use_auth_token: Forwarded unchanged to `TFAutoModel.from_pretrained`.

    Returns:
        Size of the serialized model, computed as bytes / 1024 / 1024.
    """
    vocoder_model = TFAutoModel.from_pretrained(
        model_path, use_auth_token=use_auth_token
    )

    # The converter is built from the model's dedicated TFLite inference function.
    inference_fn = vocoder_model.inference_tflite.get_concrete_function()
    tflite_converter = tf.lite.TFLiteConverter.from_concrete_functions([inference_fn])

    # Default optimizations, float16 as the supported type, and only the
    # select-TensorFlow-ops set enabled (no builtin op set is listed here).
    tflite_converter.optimizations = [tf.lite.Optimize.DEFAULT]
    tflite_converter.target_spec.supported_ops = [tf.lite.OpsSet.SELECT_TF_OPS]
    tflite_converter.target_spec.supported_types = [tf.float16]

    # Serialize the model and persist it to disk.
    serialized_model = tflite_converter.convert()
    with open(save_name, "wb") as out_file:
        out_file.write(serialized_model)

    bytes_per_mb = 1024 * 1024.0
    return len(serialized_model) / bytes_per_mb

In [12]:
# Convert both pretrained models to TFLite files on disk.
# NOTE: each converter returns the written file's size (bytes / 1024 / 1024),
# so `text2mel` and `vocoder` hold sizes here, not model objects.
text2mel = convert_text2mel_tflite(
    model_path="bookbot/lightspeech-mfa-en-v6",
    save_name="lightspeech_quant.tflite",
    use_auth_token=True,
)

vocoder = convert_vocoder_tflite(
    model_path="bookbot/mb-melgan-hifi-postnets-en-v13",
    save_name="mbmelgan.tflite",
    use_auth_token=True,
)

2023-07-12 16:52:54.838324: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-07-12 16:53:06.102704: I tensorflow/core/grappler/devices.cc:75] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 0 (Note: TensorFlow was not compiled with CUDA or ROCm support)
2023-07-12 16:53:06.103711: I tensorflow/core/grappler/clusters/single_machine.cc:358] Starting new session
2023-07-12 16:53:06.205651: I tensorflow/core/grappler/optimizers/meta_optimizer.cc:1149] Optimization results for grappler item: graph_to_optimize
  function_optimizer: function_optimizer did nothing. time = 0.039ms.
  function_optimizer: function_optimizer did nothing. time = 0.004ms.

2023-07-12 16:53:19.066171: W tensorflow/compiler/mlir/li

Estimated count of arithmetic ops: 0  ops, equivalently 0  MACs


2023-07-12 16:53:34.787757: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:363] Ignored output_format.
2023-07-12 16:53:34.787779: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:366] Ignored drop_control_dependency.
2023-07-12 16:53:35.141282: W tensorflow/compiler/mlir/lite/flatbuffer_export.cc:1891] TFLite interpreter needs to link Flex delegate in order to run the model since it contains the following Select TFop(s):
Flex ops: FlexAddV2, FlexBatchToSpaceND, FlexBiasAdd, FlexConcatV2, FlexConv2D, FlexConv2DBackpropInput, FlexExpandDims, FlexFloorMod, FlexIdentity, FlexLeakyRelu, FlexMirrorPad, FlexMul, FlexPack, FlexPad, FlexReshape, FlexShape, FlexSpaceToBatchND, FlexSqueeze, FlexStridedSlice, FlexSub, FlexTanh
Details:
	tf.AddV2(tensor<1x?x192xf32>, tensor<1x?x192xf32>) -> (tensor<1x?x192xf32>) : {device = ""}
	tf.AddV2(tensor<1x?x48xf32>, tensor<1x?x48xf32>) -> (tensor<1x?x48xf32>) : {device = ""}
	tf.AddV2(tensor<1x?x96xf32>, tensor<1x?x9

In [13]:
# Report the sizes returned by the two conversion calls.
print(f"Text2mel: {text2mel} MBs\nVocoder: {vocoder} MBs")

Text2mel: 4.908241271972656 MBs
Vocoder: 5.051200866699219 MBs


In [2]:
from tensorflow_tts.inference import AutoProcessor

# Load the text processor published under the same model id as the text2mel model.
processor = AutoProcessor.from_pretrained("bookbot/lightspeech-mfa-en-v6", use_auth_token=True)
processor.mode = "eval" # change processor from train to eval mode



In [3]:
from typing import List, Tuple

def tokenize(text: str, processor: AutoProcessor) -> List[int]:
    """Turn raw text into the ID sequence produced by `processor`.

    Args:
        text: Sentence to convert.
        processor: Processor whose `text_to_sequence` method does the conversion.

    Returns:
        The sequence returned by `processor.text_to_sequence(text)`.
    """
    token_ids = processor.text_to_sequence(text)
    return token_ids

In [4]:
def prepare_input(
    input_ids: List[int], speaker: int
) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor]:
    """Build the five input tensors fed to the text-to-mel model.

    Args:
        input_ids: Integer token IDs of one utterance (converted to int32).
        speaker: Speaker ID.

    Returns:
        A 5-tuple of tensors:
        - token IDs as int32 with a leading axis of size 1 added,
        - the speaker ID as an int32 tensor of one element,
        - three float32 tensors each holding the single value 1.0
          (presumably neutral speed/pitch/energy ratios — TODO confirm
          against the model's input signature).
    """
    # Add the leading axis so the IDs have shape (1, len(input_ids)).
    token_tensor = tf.expand_dims(tf.convert_to_tensor(input_ids, dtype=tf.int32), 0)
    speaker_tensor = tf.convert_to_tensor([speaker], tf.int32)
    return (
        token_tensor,
        speaker_tensor,
        tf.convert_to_tensor([1.0], dtype=tf.float32),
        tf.convert_to_tensor([1.0], dtype=tf.float32),
        tf.convert_to_tensor([1.0], dtype=tf.float32),
    )

In [5]:
def ls_infer(
    input_ids: List[int], speaker: int, lightspeech_path: str, verbose: bool = True
) -> Tuple[np.ndarray, np.ndarray]:
    """Run the TFLite text-to-mel model on one tokenized utterance.

    Args:
        input_ids: Integer token IDs of the utterance.
        speaker: Speaker ID forwarded to the model.
        lightspeech_path: Path to the `.tflite` model file.
        verbose: When True (the default, matching the previous behavior),
            print the interpreter's input details before inference and its
            output details after inference.

    Returns:
        The interpreter's first two output tensors, in the order reported by
        `get_output_details()`, as returned by `Interpreter.get_tensor`.

    Raises:
        ValueError: If the model does not expose exactly as many inputs as
            `prepare_input` produces.
    """
    # load model to Interpreter
    lightspeech = tf.lite.Interpreter(model_path=lightspeech_path)
    input_details = lightspeech.get_input_details()
    output_details = lightspeech.get_output_details()

    if verbose:
        print(input_details)

    input_data = prepare_input(input_ids, speaker)
    if len(input_details) != len(input_data):
        raise ValueError(
            f"Model expects {len(input_details)} inputs but "
            f"{len(input_data)} were prepared."
        )

    # Resize every input to the shape of the tensor that will be fed to it,
    # i.e. [1, len(input_ids)] for the token IDs and [1] for the scalars.
    for detail, tensor in zip(input_details, input_data):
        lightspeech.resize_tensor_input(detail["index"], tensor.shape.as_list())

    # Tensors must be (re)allocated after resizing and before being set.
    lightspeech.allocate_tensors()

    for detail, tensor in zip(input_details, input_data):
        lightspeech.set_tensor(detail["index"], tensor)

    lightspeech.invoke()

    if verbose:
        # These details were captured before resizing, so the shapes shown
        # are the model's placeholder shapes, not the actual output shapes.
        print(output_details)

    return (
        lightspeech.get_tensor(output_details[0]["index"]),
        lightspeech.get_tensor(output_details[1]["index"]),
    )

In [6]:
def melgan_infer(melspectrogram: tf.Tensor, mb_melgan_path: str) -> tf.Tensor:
    """Run the TFLite vocoder stored at `mb_melgan_path` on a mel-spectrogram.

    Args:
        melspectrogram: Input whose axes 1 and 2 give the sizes the model's
            first input is resized to (the leading size is fixed to 1).
        mb_melgan_path: Path to the `.tflite` vocoder file.

    Returns:
        The interpreter's first output tensor.
    """
    vocoder = tf.lite.Interpreter(model_path=mb_melgan_path)
    mel_input_index = vocoder.get_input_details()[0]["index"]
    audio_output_index = vocoder.get_output_details()[0]["index"]

    # Fit the model's input to this utterance before allocating buffers.
    target_shape = [1, melspectrogram.shape[1], melspectrogram.shape[2]]
    vocoder.resize_tensor_input(mel_input_index, target_shape, strict=True)
    vocoder.allocate_tensors()

    # Feed the spectrogram, run the model, and read back the result.
    vocoder.set_tensor(mel_input_index, melspectrogram)
    vocoder.invoke()
    return vocoder.get_tensor(audio_output_index)

In [7]:
# Alternative sample sentence (disabled):
# text = "The quick brown fox jumps over the lazy dog, while the phoneme sounds of pheasants, quails and crickets chirp in the background."
# Sentence to synthesize, converted to IDs with the processor.
text = "The mouse is asleep in his bed, unaware of the devious spirit stalking him in the night."
input_ids = tokenize(text, processor)

In [8]:
# Inspect the ID sequence produced by `tokenize`.
input_ids

[25,
 29,
 13,
 40,
 17,
 51,
 23,
 29,
 17,
 12,
 42,
 16,
 51,
 14,
 8,
 51,
 23,
 3,
 50,
 4,
 71,
 68,
 14,
 29,
 22,
 50,
 34,
 29,
 21,
 25,
 29,
 4,
 42,
 21,
 9,
 29,
 17,
 17,
 16,
 51,
 34,
 33,
 18,
 17,
 18,
 47,
 11,
 33,
 26,
 8,
 51,
 13,
 51,
 14,
 25,
 29,
 14,
 39,
 18,
 72]

In [8]:
# Text -> mel-spectrogram with the TFLite text2mel model; only the first of
# the two returned tensors is used below.
mel_output_tflite, _ = ls_infer(
    input_ids, speaker=2, lightspeech_path="lightspeech_quant.tflite"
)

# Mel-spectrogram -> waveform with the TFLite vocoder. Indexing [0, :, 0]
# drops the leading and trailing axes, leaving a 1-D array of samples.
audio_batch = melgan_infer(mel_output_tflite, mb_melgan_path="mbmelgan.tflite")
audio_tflite = audio_batch[0, :, 0]

[{'name': 'input_ids', 'index': 0, 'shape': array([1, 1], dtype=int32), 'shape_signature': array([ 1, -1], dtype=int32), 'dtype': <class 'numpy.int32'>, 'quantization': (0.0, 0), 'quantization_parameters': {'scales': array([], dtype=float32), 'zero_points': array([], dtype=int32), 'quantized_dimension': 0}, 'sparsity_parameters': {}}, {'name': 'speaker_ids', 'index': 1, 'shape': array([1], dtype=int32), 'shape_signature': array([1], dtype=int32), 'dtype': <class 'numpy.int32'>, 'quantization': (0.0, 0), 'quantization_parameters': {'scales': array([], dtype=float32), 'zero_points': array([], dtype=int32), 'quantized_dimension': 0}, 'sparsity_parameters': {}}, {'name': 'speed_ratios', 'index': 2, 'shape': array([1], dtype=int32), 'shape_signature': array([1], dtype=int32), 'dtype': <class 'numpy.float32'>, 'quantization': (0.0, 0), 'quantization_parameters': {'scales': array([], dtype=float32), 'zero_points': array([], dtype=int32), 'quantized_dimension': 0}, 'sparsity_parameters': {}}, 

2023-07-12 17:02:16.691453: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


[{'name': 'Identity', 'index': 883, 'shape': array([ 1,  1, 80], dtype=int32), 'shape_signature': array([-1, -1, 80], dtype=int32), 'dtype': <class 'numpy.float32'>, 'quantization': (0.0, 0), 'quantization_parameters': {'scales': array([], dtype=float32), 'zero_points': array([], dtype=int32), 'quantized_dimension': 0}, 'sparsity_parameters': {}}, {'name': 'Identity_1', 'index': 582, 'shape': array([1, 1], dtype=int32), 'shape_signature': array([-1, -1], dtype=int32), 'dtype': <class 'numpy.int32'>, 'quantization': (0.0, 0), 'quantization_parameters': {'scales': array([], dtype=float32), 'zero_points': array([], dtype=int32), 'quantized_dimension': 0}, 'sparsity_parameters': {}}, {'name': 'Identity_2', 'index': 566, 'shape': array([1, 1], dtype=int32), 'shape_signature': array([-1, -1], dtype=int32), 'dtype': <class 'numpy.float32'>, 'quantization': (0.0, 0), 'quantization_parameters': {'scales': array([], dtype=float32), 'zero_points': array([], dtype=int32), 'quantized_dimension': 0}

INFO: Created TensorFlow Lite delegate for select TF ops.
INFO: TfLiteFlexDelegate delegate: 496 nodes delegated out of 580 nodes with 1 partitions.



In [9]:
# Inspect the first tensor returned by `ls_infer`.
mel_output_tflite

array([[[ 0.18488696,  0.05659499, -0.16161469, ..., -0.30813986,
         -0.31639284, -0.37568086],
        [ 0.29060948,  0.21695155,  0.07537837, ..., -0.20757365,
         -0.1986492 , -0.2757054 ],
        [ 0.50147986,  0.507498  ,  0.4369136 , ..., -0.06122725,
         -0.05204855, -0.16556312],
        ...,
        [-3.936612  , -3.8412733 , -3.666861  , ..., -3.516617  ,
         -3.547292  , -3.6129918 ],
        [-3.9367673 , -3.8523846 , -3.717854  , ..., -3.5491571 ,
         -3.5621228 , -3.6040351 ],
        [-3.9405804 , -3.8478634 , -3.6982903 , ..., -3.5727773 ,
         -3.5748684 , -3.6324    ]]], dtype=float32)

In [10]:
# Shape of the text2mel output that is fed to the vocoder.
mel_output_tflite.shape

(1, 471, 80)

In [17]:
# Inspect the 1-D slice taken from the vocoder output.
audio_tflite

array([-3.9025468e-05, -3.0453808e-05, -3.9405113e-05, ...,
        1.0678735e-06,  1.6777966e-07, -9.1056029e-07], dtype=float32)

In [11]:
# Number of samples in the synthesized waveform.
audio_tflite.shape

(241152,)

In [15]:
from IPython.display import Audio

# Play the synthesized waveform inline. 44100 is the playback sample rate in Hz
# (assumed to match the vocoder's output rate — TODO confirm).
Audio(data=audio_tflite, rate=44100)

In [42]:
import soundfile as sf

# Save the waveform as a 16-bit PCM WAV file, using the same 44100 Hz rate
# as the playback cell (assumed to match the vocoder — TODO confirm).
sf.write("./natasha2.wav", audio_tflite, 44100, "PCM_16")