-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathtext_to_speech.py
More file actions
95 lines (80 loc) · 3.25 KB
/
text_to_speech.py
File metadata and controls
95 lines (80 loc) · 3.25 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import click
import functools
from glob import glob
import logging
import os
import re
from bs4 import BeautifulSoup
from markdown import markdown
from pydub import AudioSegment
from google.cloud import texttospeech
# Configure the root logger once at import time: timestamped records,
# INFO level and above.
logging.basicConfig(
    format='%(asctime)s %(levelname)s %(message)s',
    level=logging.INFO,
)
def clean_text(text):
    """Turn the raw bytes of a Hugo markdown post into plain text.

    The front matter is dropped, the markdown is rendered to HTML, ``<pre>``
    blocks and ``[^...]`` footnote markers are removed, and the remaining
    text nodes are joined with every whitespace run collapsed to one space.

    Args:
        text: UTF-8 encoded bytes of the markdown file.

    Returns:
        The cleaned text as a single-line ``str``.

    Raises:
        UnicodeDecodeError: if ``text`` is not valid UTF-8.
    """
    text = text.decode('utf8')

    # get rid of the Hugo preamble (everything up to the second '---').
    # Only do so when the file really opens with a complete preamble;
    # otherwise keep the whole text instead of silently returning nothing.
    parts = text.split('---')
    if text.lstrip().startswith('---') and len(parts) > 2:
        text = ''.join(parts[2:])
    text = text.strip()

    # get rid of superfluous newlines, as that counts towards our API limits
    text = re.sub('\n+', ' ', text)

    # we're hacking our way around the markdown by converting to html first,
    # just because BeautifulSoup makes life so easy
    html = markdown(text)
    # DOTALL so a code block is still dropped if the rendered html puts
    # newlines inside it
    html = re.sub(r'<pre>(.*?)</pre>', ' ', html, flags=re.DOTALL)
    # this removes some artifacts from Hugo shortcodes
    html = re.sub(r'{{}}', '', html)
    # drop footnote markers such as [^1]
    html = re.sub(r'\[\^.*?\]', ' ', html)

    soup = BeautifulSoup(html, "html.parser")
    # find_all(string=True) is the current spelling of findAll(text=True)
    text = ''.join(soup.find_all(string=True))

    # get rid of superfluous whitespace
    return re.sub(r'\s+', ' ', text)
def _chunk_text(text, max_bytes=4900):
    """Split text on whitespace into chunks that fit the API byte limit.

    Words are never cut, so multi-byte characters stay intact. A single
    word longer than ``max_bytes`` is emitted as its own (oversized) chunk.

    Args:
        text: the text to split.
        max_bytes: UTF-8 byte budget per chunk. The API limit is 5000
            bytes (not characters); the default leaves some margin.

    Returns:
        A list of chunk strings in reading order; empty if ``text`` holds
        no words.
    """
    chunks = []
    current = []
    current_bytes = 0
    for word in text.split():
        # +1 accounts for the space that joins this word to the next one
        word_bytes = len(word.encode('utf-8')) + 1
        if current and current_bytes + word_bytes > max_bytes:
            chunks.append(' '.join(current))
            current = []
            current_bytes = 0
        current.append(word)
        current_bytes += word_bytes
    if current:
        chunks.append(' '.join(current))
    return chunks


@click.command()
@click.argument('filename', type=click.File('rb'))
def text_to_speech(filename):
    """Read the markdown post FILENAME aloud.

    The post is cleaned, synthesized chunk by chunk with the Google
    text-to-speech API, and the stitched result is exported to
    static/audio/<name>.mp3 and static/audio/<name>.ogg.
    """
    # Strip only a trailing '.md'; str.replace would also eat a '.md'
    # occurring in the middle of the file name.
    name = os.path.basename(filename.name)
    if name.endswith('.md'):
        name = name[:-len('.md')]

    text = clean_text(filename.read())

    chunks = _chunk_text(text)
    if not chunks:
        # Fail with a readable message instead of the TypeError that
        # reduce() raises on an empty sequence.
        raise click.ClickException(
            f'No text to synthesize in {filename.name}')

    # initialize the API client
    client = texttospeech.TextToSpeechClient()
    # Voice and encoding are the same for every chunk, so build them once.
    voice = texttospeech.types.VoiceSelectionParams(
        language_code='en-US',
        name='en-US-Wavenet-B'
    )
    audio_config = texttospeech.types.AudioConfig(
        audio_encoding=texttospeech.enums.AudioEncoding.MP3
    )

    # Keep track of the files we write, in order. Globbing and sorting the
    # names puts '<name>_10.mp3' before '<name>_2.mp3' and can pick up
    # stale or unrelated files that happen to match the pattern.
    mp3_segments = []
    try:
        for j, chunk in enumerate(chunks):
            synthesis_input = texttospeech.types.SynthesisInput(text=chunk)
            logging.info(f'Synthesizing speech for {name}_{j}')
            response = client.synthesize_speech(synthesis_input, voice,
                                                audio_config)
            segment_path = f'{name}_{j}.mp3'
            # Register before writing so a partial file is cleaned up too.
            mp3_segments.append(segment_path)
            with open(segment_path, 'wb') as out:
                # Write the response to the output file.
                out.write(response.audio_content)
            logging.info(f'Audio content written to file "{segment_path}"')

        segments = [AudioSegment.from_mp3(f) for f in mp3_segments]
        logging.info(
            f'Stitching together {len(segments)} mp3 files for {name}')
        audio = functools.reduce(lambda a, b: a + b, segments)

        logging.info(f'Exporting {name}.mp3')
        audio.export(f'static/audio/{name}.mp3', format='mp3')
        logging.info(f'Exporting {name}.ogg')
        audio.export(f'static/audio/{name}.ogg', format='ogg')
    finally:
        # Clean up even when synthesis or export fails part-way through.
        logging.info('Removing intermediate files')
        for f in mp3_segments:
            try:
                os.remove(f)
            except FileNotFoundError:
                # open() failed before the file was created
                pass
# Script entry point: invoking the click command lets click parse the
# command line and supply the `filename` argument.
if '__main__' == __name__:
    text_to_speech()