In [5]:
from google.cloud import texttospeech
import os
from os import listdir
from os.path import isfile, join, isdir
import numpy as np

In [6]:
# Module-level Text-to-Speech client; text_to_speech_for_model_test() below
# uses this global to send its synthesize_speech request.
client = texttospeech.TextToSpeechClient()

In [16]:
def text_to_speech_for_model_test(sentence, 
                                  voice_config, 
                                  file_path='output.mp3',
                                  effects_profile_id='headphone-class-device',
                                  speaking_rate=0.85):
    """
    Synthesize `sentence` with the module-level Text-to-Speech `client` and
    write the resulting MP3 audio to `file_path`.

    sentence: plain text to be spoken.

    voice_config["gender"]: person gender ('MALE' or 'FEMALE')
    voice_config["accent"]: voice name, e.g. 'en-US-Wavenet-J'
        (https://cloud.google.com/text-to-speech/docs/voices). The language
        code sent with the request is the first two dash-separated parts of
        this name ('en-US-Wavenet-J' -> 'en-US').

    file_path: destination of the MP3 file. Missing parent directories are
        created before the request is sent.

    effects_profile_id: the device the voice will be played on
        (https://cloud.google.com/text-to-speech/docs/audio-profiles)

    speaking_rate: speed passed to AudioConfig; defaults to 0.85, the value
        this function previously hard-coded. See the AudioConfig reference for
        the accepted range
        (https://cloud.google.com/text-to-speech/docs/reference/rest/v1/text/synthesize#audioconfig)

    Raises:
        KeyError: if voice_config lacks 'accent' or 'gender'.
        ValueError: if voice_config['gender'] is neither 'MALE' nor 'FEMALE'.

    Returns None; prints the path the audio was saved to.
    """
    voice_name = voice_config['accent']
    gender = voice_config['gender']

    # Validate the gender up front so a bad configuration fails before any
    # request object is built or any network call is made.
    gender_by_name = {
        'MALE': texttospeech.SsmlVoiceGender.MALE,
        'FEMALE': texttospeech.SsmlVoiceGender.FEMALE,
    }
    # The isinstance guard keeps unhashable values on the ValueError path
    # instead of raising TypeError from the dict lookup.
    ssml_gender = gender_by_name.get(gender) if isinstance(gender, str) else None
    if ssml_gender is None:
        raise ValueError('Unavailable voice configuration')

    # Make sure the destination directory exists before calling the service,
    # so a missing folder cannot discard an already-synthesized response.
    output_dir = os.path.dirname(file_path)
    if output_dir:  # empty when file_path is a bare file name
        os.makedirs(output_dir, exist_ok=True)

    # Set the text input to be synthesized
    synthesis_input = texttospeech.SynthesisInput(text=sentence)

    # Get the language code from the voice name
    language_code = "-".join(voice_name.split("-")[:2])

    # Build the voice request: language code, voice name and SSML gender
    voice = texttospeech.VoiceSelectionParams(
        language_code=language_code,
        name=voice_name,
        ssml_gender=ssml_gender
    )

    # Select the type of audio file (.mp3), optimized for the playback device
    audio_config = texttospeech.AudioConfig(
        audio_encoding=texttospeech.AudioEncoding.MP3,
        effects_profile_id=[effects_profile_id],
        speaking_rate=speaking_rate
    )

    # Perform the text-to-speech request on the text input with the selected
    # voice parameters and audio file type
    response = client.synthesize_speech(
        input=synthesis_input, voice=voice, audio_config=audio_config
    )

    # Write the response to the output file.
    # The response's audio_content is binary.
    with open(file_path, "wb") as f:
        f.write(response.audio_content)
    print(f"Saved in {file_path}")

In [32]:
# Narration text for the SRC_1 clip (adjacent literals are concatenated into
# a single string).
sentence = (
    'In safety remote control task, the robot should decide whether the given '
    'remote command from the user is safe and project the command onto the safe '
    'command set if it is not.'
)

# Voice selection; the original cell tagged this configuration with "#D"
# (meaning not evident from this cell).
voice_config = {
    'gender': 'MALE',
    'accent': 'en-US-Wavenet-J',
}

# Output location and the audio profile to optimize the speech for.
file_path = "Video_material/Speech/SRC_1.mp3"
effects_profile_id = "handset-class-device"

text_to_speech_for_model_test(
    sentence=sentence,
    voice_config=voice_config,
    file_path=file_path,
    effects_profile_id=effects_profile_id,
)

Saved in Video_material/Speech/SRC_1.mp3
