In [7]:
import os
import azure.cognitiveservices.speech as speechsdk

speech_config = speechsdk.SpeechConfig(subscription=os.environ.get('COGNITIVE_SERVICE_KEY'), region=os.environ.get('AZURE_SPEECH_REGION'))
audio_config = speechsdk.audio.AudioOutputConfig(use_default_speaker=True)

# The language of the voice that speaks.
speech_config.speech_synthesis_language='zh-CN'
speech_config.speech_synthesis_voice_name='zh-CN-XiaohanNeural'

speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=audio_config)

text = "今天天气真不错，DJNobody是个好同学。"

speech_synthesizer.speak_text_async(text)

<azure.cognitiveservices.speech.ResultFuture at 0x7fae87fca230>

## 使用男声

In [2]:
speech_config.speech_synthesis_voice_name='zh-CN-YunfengNeural'
speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=audio_config)
speech_synthesizer.speak_text_async(text)

<azure.cognitiveservices.speech.ResultFuture at 0x7fae87fca770>

## 自定义语气和角色

In [3]:
ssml = """<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis"
       xmlns:mstts="https://www.w3.org/2001/mstts" xml:lang="zh-CN">
    <voice name="zh-CN-YunyeNeural">
        儿子看见母亲走了过来，说到：
        <mstts:express-as role="Boy" style="cheerful">
            “妈妈，我想要买个新玩具”
        </mstts:express-as>
    </voice>
    <voice name="zh-CN-XiaomoNeural">
        母亲放下包，说：
        <mstts:express-as role="SeniorFemale" style="angry">
            “我看你长得像个玩具。”
        </mstts:express-as>
    </voice>
</speak>"""

speech_synthesis_result = speech_synthesizer.speak_ssml_async(ssml).get()

In [4]:
ssml = """<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis"
       xmlns:mstts="https://www.w3.org/2001/mstts" xml:lang="en-US">
    <voice name="en-US-JennyNeural">
        <mstts:express-as style="excited">
            That'd be just amazing!
        </mstts:express-as>
        <mstts:express-as style="friendly">
            What's next?
        </mstts:express-as>
    </voice>
</speak>"""

speech_synthesis_result = speech_synthesizer.speak_ssml_async(ssml).get()

## 输出到文件

In [8]:
audio_config = speechsdk.audio.AudioOutputConfig(filename="./data/tts.wav")

speech_config.speech_synthesis_language='zh-CN'
speech_config.speech_synthesis_voice_name='zh-CN-XiaohanNeural'

speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=audio_config)

text = "今天天气真不错，我们一起出去看看风景吧。"

speech_synthesizer.speak_text_async(text)


<azure.cognitiveservices.speech.ResultFuture at 0x7fae87fc9d80>

## 输出成MP3

In [9]:
speech_config.set_speech_synthesis_output_format(speechsdk.SpeechSynthesisOutputFormat.Audio48Khz192KBitRateMonoMp3)

speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=None)
result = speech_synthesizer.speak_text_async(text).get()
stream =speechsdk.AudioDataStream(result)

stream.save_to_wav_file("./data/tts.mp3")

## PaddlePaddle语音合成

In [2]:
from paddlespeech.cli.tts.infer import TTSExecutor

tts_executor = TTSExecutor()

text = "今天天气十分不错，百度也能做语音合成。我们来介绍一下文本到语音的主要技术。"
output_file = "./data/paddlespeech.wav"
tts_executor(text=text, output=output_file)


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/xiaodong/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  dtype=np.complex,
  from .autonotebook import tqdm as notebook_tqdm
[32m[2023-06-02 00:30:57,609] [    INFO][0m - Already cached /home/xiaodong/.paddlenlp/models/bert-base-chinese/bert-base-chinese-vocab.txt[0m
[32m[2023-06-02 00:30:57,615] [    INFO][0m - tokenizer config file saved in /home/xiaodong/.paddlenlp/models/bert-base-chinese/tokenizer_config.json[0m
[32m[2023-06-02 00:30:57,616] [    INFO][0m - Special tokens file saved in /home/xiaodong/.paddlenlp/models/bert-base-chinese/special_tokens_map.json[0m
W0602 00:30:57.789712 27035 gpu_resources.cc:61] Please NOTE: device: 0, GPU Compute Capability: 8.6, Driver API Version: 12.1, Runtime API Version: 11.7
W0602 00:30:57.790331 2703

'/home/xiaodong/Gitroot/geektime-ai-course/data/paddlespeech.wav'

In [3]:
import wave
import pyaudio


def play_wav_audio(wav_file):
    # open the wave file
    wf = wave.open(wav_file, 'rb')

    # instantiate PyAudio
    p = pyaudio.PyAudio()

    # open a stream
    stream = p.open(format=p.get_format_from_width(wf.getsampwidth()),
                    channels=wf.getnchannels(),
                    rate=wf.getframerate(),
                    output=True)

    # read data from the wave file and play it
    data = wf.readframes(1024)
    while data:
        stream.write(data)
        data = wf.readframes(1024)

    # close the stream and terminate PyAudio
    stream.stop_stream()
    stream.close()
    p.terminate()

play_wav_audio(output_file)

ALSA lib pcm_dmix.c:1032:(snd_pcm_dmix_open) unable to open slave
ALSA lib pcm.c:2664:(snd_pcm_open_noupdate) Unknown PCM cards.pcm.rear
ALSA lib pcm.c:2664:(snd_pcm_open_noupdate) Unknown PCM cards.pcm.center_lfe
ALSA lib pcm.c:2664:(snd_pcm_open_noupdate) Unknown PCM cards.pcm.side
ALSA lib pcm_route.c:877:(find_matching_chmap) Found no matching channel map
ALSA lib pcm_oss.c:397:(_snd_pcm_oss_open) Cannot open device /dev/dsp
ALSA lib pcm_oss.c:397:(_snd_pcm_oss_open) Cannot open device /dev/dsp
ALSA lib confmisc.c:160:(snd_config_get_card) Invalid field card
ALSA lib pcm_usb_stream.c:482:(_snd_pcm_usb_stream_open) Invalid card 'card'
ALSA lib confmisc.c:160:(snd_config_get_card) Invalid field card
ALSA lib pcm_usb_stream.c:482:(_snd_pcm_usb_stream_open) Invalid card 'card'
ALSA lib pcm_dmix.c:1032:(snd_pcm_dmix_open) unable to open slave
ALSA lib pcm.c:8568:(snd_pcm_recover) underrun occurred


## 英文内容丢失

In [4]:
tts_executor = TTSExecutor()

text = "今天天气十分不错，PaddleSpeech也能做语音合成。"
output_file = "./data/paddlespeech_missing.wav"
tts_executor(text=text, output=output_file)

play_wav_audio(output_file)

[32m[2023-06-02 00:31:19,694] [    INFO][0m - Already cached /home/xiaodong/.paddlenlp/models/bert-base-chinese/bert-base-chinese-vocab.txt[0m
[32m[2023-06-02 00:31:19,700] [    INFO][0m - tokenizer config file saved in /home/xiaodong/.paddlenlp/models/bert-base-chinese/tokenizer_config.json[0m
[32m[2023-06-02 00:31:19,700] [    INFO][0m - Special tokens file saved in /home/xiaodong/.paddlenlp/models/bert-base-chinese/special_tokens_map.json[0m
ALSA lib pcm_dmix.c:1032:(snd_pcm_dmix_open) unable to open slave
ALSA lib pcm.c:2664:(snd_pcm_open_noupdate) Unknown PCM cards.pcm.rear
ALSA lib pcm.c:2664:(snd_pcm_open_noupdate) Unknown PCM cards.pcm.center_lfe
ALSA lib pcm.c:2664:(snd_pcm_open_noupdate) Unknown PCM cards.pcm.side
ALSA lib pcm_route.c:877:(find_matching_chmap) Found no matching channel map
ALSA lib pcm_oss.c:397:(_snd_pcm_oss_open) Cannot open device /dev/dsp
ALSA lib pcm_oss.c:397:(_snd_pcm_oss_open) Cannot open device /dev/dsp
ALSA lib confmisc.c:160:(snd_config_ge

## 选用合适模型，提供中英文混合效果

In [6]:
tts_executor = TTSExecutor()

text = "早上好, how are you? 百度Paddle Speech一样能做中英文混合的语音合成。"
output_file = "./data/paddlespeech_mix.wav"
tts_executor(text=text, output=output_file, 
             am="fastspeech2_mix", voc="hifigan_csmsc", 
             lang="mix", spk_id=174)

play_wav_audio(output_file)

[32m[2023-06-02 00:31:57,324] [    INFO][0m - Already cached /home/xiaodong/.paddlenlp/models/bert-base-chinese/bert-base-chinese-vocab.txt[0m
[32m[2023-06-02 00:31:57,329] [    INFO][0m - tokenizer config file saved in /home/xiaodong/.paddlenlp/models/bert-base-chinese/tokenizer_config.json[0m
[32m[2023-06-02 00:31:57,329] [    INFO][0m - Special tokens file saved in /home/xiaodong/.paddlenlp/models/bert-base-chinese/special_tokens_map.json[0m
ALSA lib pcm_dmix.c:1032:(snd_pcm_dmix_open) unable to open slave
ALSA lib pcm.c:2664:(snd_pcm_open_noupdate) Unknown PCM cards.pcm.rear
ALSA lib pcm.c:2664:(snd_pcm_open_noupdate) Unknown PCM cards.pcm.center_lfe
ALSA lib pcm.c:2664:(snd_pcm_open_noupdate) Unknown PCM cards.pcm.side
ALSA lib pcm_route.c:877:(find_matching_chmap) Found no matching channel map
ALSA lib pcm_oss.c:397:(_snd_pcm_oss_open) Cannot open device /dev/dsp
ALSA lib pcm_oss.c:397:(_snd_pcm_oss_open) Cannot open device /dev/dsp
ALSA lib confmisc.c:160:(snd_config_ge

In [10]:
tts_executor = TTSExecutor()

text = "早上好, how are you? 百度Paddle Speech一样能做中英文混合的语音合成。"
output_file = "./data/paddlespeech_mix_another.wav"
tts_executor(text=text, output=output_file, 
             am="fastspeech2_mix", voc="hifigan_csmsc", 
             lang="mix", spk_id=175)

play_wav_audio(output_file)

[32m[2023-06-02 00:33:42,360] [    INFO][0m - Already cached /home/xiaodong/.paddlenlp/models/bert-base-chinese/bert-base-chinese-vocab.txt[0m
[32m[2023-06-02 00:33:42,365] [    INFO][0m - tokenizer config file saved in /home/xiaodong/.paddlenlp/models/bert-base-chinese/tokenizer_config.json[0m
[32m[2023-06-02 00:33:42,366] [    INFO][0m - Special tokens file saved in /home/xiaodong/.paddlenlp/models/bert-base-chinese/special_tokens_map.json[0m
ALSA lib pcm_dmix.c:1032:(snd_pcm_dmix_open) unable to open slave
ALSA lib pcm.c:2664:(snd_pcm_open_noupdate) Unknown PCM cards.pcm.rear
ALSA lib pcm.c:2664:(snd_pcm_open_noupdate) Unknown PCM cards.pcm.center_lfe
ALSA lib pcm.c:2664:(snd_pcm_open_noupdate) Unknown PCM cards.pcm.side
ALSA lib pcm_route.c:877:(find_matching_chmap) Found no matching channel map
ALSA lib pcm_oss.c:397:(_snd_pcm_oss_open) Cannot open device /dev/dsp
ALSA lib pcm_oss.c:397:(_snd_pcm_oss_open) Cannot open device /dev/dsp
ALSA lib confmisc.c:160:(snd_config_ge