/
segments.py
128 lines (105 loc) · 4.52 KB
/
segments.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
# Copyright 2015-2019 Mathieu Bernard
#
# This file is part of phonemizer: you can redistribute it and/or
# modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# Phonemizer is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with phonemizer. If not, see <http://www.gnu.org/licenses/>.
"""Segments backend for the phonemizer"""
import codecs
import os
import pkg_resources
import segments
from phonemizer.backend.base import BaseBackend
from phonemizer.logger import get_logger
class SegmentsBackend(BaseBackend):
    """Segments backend for the phonemizer

    Converts text to phonemes with a `segments.Tokenizer` built from a
    grapheme to phoneme (g2p) mapping file. The mapping is either one
    of the files bundled with phonemizer or a file given by the user.

    The phonemize method will raise a ValueError when parsing an
    unknown morpheme.

    """
    def __init__(self, language, logger=get_logger()):
        """Initializes the backend for the given `language`

        Parameters
        ----------
        language : str
            Either the name of a supported language (a key of
            `supported_languages()`) or the path to a g2p file.
        logger : logging.Logger, optional
            The logger used to report the backend initialization.

        Raises
        ------
        RuntimeError
            If the g2p file is not found or is badly formatted.

        """
        self.logger = logger
        self.logger.info(
            'initializing backend %s-%s', self.name(), self.version())

        # load the grapheme to phoneme mapping and build the tokenizer
        profile = self._load_g2p_profile(language)
        self.tokenizer = segments.Tokenizer(profile=profile)

    @staticmethod
    def name():
        """Returns the name of the backend"""
        return 'segments'

    @staticmethod
    def version():
        """Returns the version of the underlying `segments` package"""
        return segments.__version__

    @staticmethod
    def is_available():
        """Returns True, this backend is always reported as available"""
        return True

    @staticmethod
    def supported_languages():
        """Returns a dict of language: file supported by the segments backend

        The supported languages have a grapheme to phoneme conversion file
        bundled with phonemizer. Users can also use their own file as
        parameter of the phonemize() function.

        """
        # directory phonemizer/share
        directory = pkg_resources.resource_filename(
            pkg_resources.Requirement.parse('phonemizer'),
            'phonemizer/share')

        # supported languages are files with the 'g2p' extension, the
        # language name is the part of the filename before the first dot
        return {f.split('.')[0]: os.path.join(directory, f)
                for f in os.listdir(directory) if f.endswith('g2p')}

    @classmethod
    def is_supported_language(cls, language):
        """Returns True if `language` can be used with this backend

        When `language` is an existing file, it is supported only if
        it can be loaded as a valid g2p profile. Otherwise it must be
        one of the languages returned by `supported_languages()`.

        """
        if os.path.isfile(language):
            try:
                cls._load_g2p_profile(language)
                return True
            except RuntimeError:
                return False

        return language in cls.supported_languages()

    @classmethod
    def _load_g2p_profile(cls, language):
        """Returns a segments profile from a `language`

        Parameters
        ----------
        language : str
            Either the path to a g2p file or the name of a language
            bundled with phonemizer.

        Raises
        ------
        RuntimeError
            If `language` is neither an existing file nor a supported
            language, or if a line of the g2p file does not have
            exactly two whitespace separated fields.

        """
        # make sure the g2p file exists
        if not os.path.isfile(language):
            try:
                language = cls.supported_languages()[language]
            except KeyError:
                raise RuntimeError(
                    'grapheme to phoneme file not found: {}'.format(language))

        # load the mapping grapheme -> phoneme from the file, make sure all
        # lines are well formatted. The context manager guarantees the file
        # is closed, even when a badly formatted line aborts the parsing.
        g2p = {}
        with codecs.open(language, 'r', encoding='utf8') as g2p_file:
            for n, line in enumerate(g2p_file, start=1):
                elts = line.strip().split()
                if len(elts) != 2:
                    raise RuntimeError(
                        'grapheme to phoneme file, line {} must have 2 rows '
                        'but have {}: {}'.format(n, len(elts), language))

                g2p[elts[0]] = elts[1]

        # build the segments profile from the g2p mapping
        return segments.Profile(
            *[{'Grapheme': k, 'mapping': v} for k, v in g2p.items()])

    def _phonemize_aux(self, text, separator, strip):
        """Phonemizes `text` and returns a list of utterances

        Parameters
        ----------
        text : str
            The text to phonemize, one utterance per line. Empty
            lines are ignored.
        separator : object
            Must expose the `phone` and `word` attributes, the strings
            used to separate phones and words in the output.
        strip : bool
            When False, a phone separator is added at the end of each
            word and a word separator at the end of each utterance.

        Returns
        -------
        list of str
            One phonemized string per non-empty line of `text`.

        """
        # separators in the tokenizer output, as relied upon by the
        # replacements below: phones are delimited by a single space
        # and words by ' # '
        phone_boundary = ' '
        word_boundary = ' # '

        # tokenize the input text per utterance
        phonemized = (
            self.tokenizer(line, column='mapping', errors='strict')
            for line in text.split('\n') if line)

        # the output of segments is always strip, so we need to add
        # token separation at the end when strip is False.
        if not strip:
            # add word separator at end of utterance
            phonemized = (p + word_boundary for p in phonemized)

            # add phoneme separator at end of word: the extra space
            # inserted before each word boundary survives the boundary
            # substitution below and is then turned into separator.phone
            phonemized = (
                p.replace(word_boundary, phone_boundary + word_boundary)
                for p in phonemized)

        # replace default separators by our custom ones
        phonemized = (p.replace(word_boundary, '#') for p in phonemized)
        phonemized = (
            p.replace(phone_boundary, separator.phone) for p in phonemized)
        phonemized = (p.replace('#', separator.word) for p in phonemized)

        # return the result as a list of utterances
        return list(phonemized)