Skip to content

Commit

Permalink
refactor: Changed the merge separator for the "Merge to Translate" feature.
Browse files Browse the repository at this point in the history
  • Loading branch information
bookfere committed Sep 30, 2023
1 parent 48a5d0b commit 702d43c
Show file tree
Hide file tree
Showing 7 changed files with 177 additions and 28 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/stable-release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ jobs:
pattern='^## v[0-9](\.[0-9]){2}$'
while read line && [[ "$line" != '---' ]]; do
if [[ "$line" =~ $pattern ]]; then
if [[ "$line" != ${{ github.ref_name }} ]]; then
if [[ ! "$line" =~ ${{ github.ref_name }} ]]; then
echo "*[The release notes will be populated soon]*"; break
fi
continue
Expand Down
3 changes: 2 additions & 1 deletion advanced.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,8 @@ def clean_cache(self, cache):
@pyqtSlot()
def prepare_ebook_data(self):
input_path = self.ebook.get_input_path()
element_handler = get_element_handler(self.engine_class.placeholder)
element_handler = get_element_handler(
self.engine_class.placeholder, self.engine_class.separator)
merge_length = str(element_handler.get_merge_length())
cache_id = uid(
input_path + self.engine_class.name + self.ebook.target_lang
Expand Down
1 change: 1 addition & 0 deletions engines/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ class Base:
api_key_hint = _('API Keys')
api_key_pattern = r'^[^\s]+$'
api_key_errors = ['401']
separator = '\n\n'
placeholder = ('{{{{id_{}}}}}', r'({{\s*)+id\s*_\s*{}\s*(\s*}})+')

concurrency_limit = 0
Expand Down
4 changes: 3 additions & 1 deletion engines/google.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,14 +37,16 @@ def translate(self, text):
'sl': self._get_source_code(),
'tl': self._get_target_code(),
'dt': 't',
'dj': 1,
'q': text,
}

# The POST method is unstable, despite its ability to send more text.
return self.get_result(self.endpoint, data, headers)

def parse(self, data):
return ''.join(i[0] for i in json.loads(data)[0])
# return ''.join(i[0] for i in json.loads(data)[0])
return ''.join(i['trans'] for i in json.loads(data)['sentences'])


class GoogleTranslate:
Expand Down
3 changes: 2 additions & 1 deletion lib/conversion.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,8 @@ def convert_book(ebook_title, input_path, output_path, source_lang,
translator.set_source_lang(source_lang)
translator.set_target_lang(target_lang)

element_handler = get_element_handler(translator.placeholder)
element_handler = get_element_handler(
translator.placeholder, translator.separator)
element_handler.set_translation_lang(
translator.get_iso639_target_code(target_lang))

Expand Down
36 changes: 24 additions & 12 deletions lib/element.py
Original file line number Diff line number Diff line change
Expand Up @@ -256,8 +256,9 @@ def filter_content(self, element):


class ElementHandler:
def __init__(self, placeholder, merge_length=0):
def __init__(self, placeholder, separator, merge_length=0):
self.placeholder = placeholder
self.separator = separator
self.merge_length = merge_length

self.position = None
Expand Down Expand Up @@ -325,9 +326,10 @@ def prepare_original(self, elements):
if element.ignored:
continue
self.elements[eid] = element
placeholder = ' %s ' % self.placeholder[0].format(eid)
separator = self.separator \
or ' %s ' % self.placeholder[0].format(eid)
code = element.get_raw()
text = element.get_content(self.placeholder) + placeholder
text = element.get_content(self.placeholder) + separator
if len(content + text) < self.merge_length:
raw += code
content += text
Expand All @@ -343,12 +345,23 @@ def prepare_original(self, elements):
return self.original

def add_translations(self, paragraphs):
content = ''.join(
paragraph.translation for paragraph in paragraphs
if paragraph.translation)
content = ''
for paragraph in paragraphs:
tail = paragraph.original[-2:]
tail = tail if tail == self.separator else ''
if paragraph.translation:
content += paragraph.translation + tail
# Check if the translated content contains at least one separator;
# if none is found, use the placeholder to separate paragraphs.
if self.separator and self.separator in content:
pattern = '%s+' % self.separator[0]
content = re.sub(pattern, self.separator, content)
else:
self.separator = None

for eid, element in self.elements.copy().items():
matches = re.search(self.placeholder[1].format(eid), content)
separator = self.separator or self.placeholder[1].format(eid)
matches = re.search(separator, content)
if not matches:
continue
pattern = matches.group(0)
Expand Down Expand Up @@ -383,13 +396,12 @@ def get_page_elements(pages):
return extraction.get_elements()


def get_element_handler(placeholder):
def get_element_handler(placeholder, separator):
config = get_config()
handler = ElementHandler(placeholder)
handler = ElementHandler(placeholder, separator)
if config.get('merge_enabled'):
merge_length = config.get('merge_length')
if merge_length > 0:
handler = ElementHandlerMerge(placeholder, merge_length)
handler = ElementHandlerMerge(
placeholder, separator, config.get('merge_length'))
handler.set_translation_position(
config.get('translation_position'))
handler.set_translation_color(config.get('translation_color'))
Expand Down
156 changes: 144 additions & 12 deletions tests/test_element.py
Original file line number Diff line number Diff line change
Expand Up @@ -274,14 +274,16 @@ def test_need_ignore(self):
'<code xmlns="http://www.w3.org/1999/xhtml">abc</code>',
'<table xmlns="http://www.w3.org/1999/xhtml">abc</table>',
'<p xmlns="http://www.w3.org/1999/xhtml" class="a">abc</p>']
with self.subTest():
for item in items:

for item in items:
with self.subTest(item=item):
self.assertTrue(self.extraction.need_ignore(etree.XML(item)))

items = ['<p xmlns="http://www.w3.org/1999/xhtml">abc</p>',
'<p xmlns="http://www.w3.org/1999/xhtml" id="a">abc</p>']
with self.subTest():
for item in items:

for item in items:
with self.subTest(item=item):
self.assertFalse(self.extraction.need_ignore(etree.XML(item)))

def test_extract_elements(self):
Expand Down Expand Up @@ -418,7 +420,7 @@ def setUp(self):
in self.xhtml.findall('./x:body/*', namespaces=ns)]
self.elements[-1].set_ignored(True)
self.elements[-3].set_ignored(True)
self.handler = ElementHandler(Base.placeholder)
self.handler = ElementHandler(Base.placeholder, Base.separator)

@patch('calibre_plugins.ebook_translator.lib.element.uid')
def test_prepare_original(self, mock_uid):
Expand Down Expand Up @@ -502,17 +504,37 @@ def setUp(self):
in self.xhtml.findall('./x:body/*', namespaces=ns)]
self.elements[-1].set_ignored(True)
self.elements[-3].set_ignored(True)
self.handler = ElementHandlerMerge(Base.placeholder, 1000)
self.handler = ElementHandlerMerge(Base.placeholder, None, 1000)

@patch('calibre_plugins.ebook_translator.lib.element.uid')
def test_prepare_original_merged(self, mock_uid):
def test_prepare_original_merge_placeholder(self, mock_uid):
mock_uid.return_value = 'm1'
self.assertEqual([(
0, 'm1', '<p id="a">a</p><p id="b">b</p><p id="c" class="c">c</p>',
'a {{id_0}} b {{id_1}} c {{id_3}} ', False)],
self.handler.prepare_original(self.elements))

def test_add_translations_merged(self):
# When a separator is configured, the merged text is expected to join the
# non-ignored paragraphs with that separator ('\n\n' in the expected value)
# instead of the ' {{id_N}} ' placeholders.
@patch('calibre_plugins.ebook_translator.lib.element.uid')
def test_prepare_original_merge_separator(self, mock_uid):
# Fix the generated id so the expected tuple can be spelled out literally.
mock_uid.return_value = 'm1'
# setUp builds the handler with separator=None; enable the separator here.
self.handler.separator = Base.separator
self.assertEqual([(
0, 'm1', '<p id="a">a</p><p id="b">b</p><p id="c" class="c">c</p>',
'a\n\nb\n\nc\n\n', False)],
self.handler.prepare_original(self.elements))

# With merge_length lowered to 2, the expected result is one item per
# non-ignored paragraph, each still terminated by the separator.
@patch('calibre_plugins.ebook_translator.lib.element.uid')
def test_prepare_original_merge_separator_multiple(self, mock_uid):
# One id per expected item, handed out in call order.
mock_uid.side_effect = ['m1', 'm2', 'm3']
self.handler.merge_length = 2
self.handler.separator = Base.separator
items = [
(0, 'm1', '<p id="a">a</p>', 'a\n\n', False),
(1, 'm2', '<p id="b">b</p>', 'b\n\n', False),
(2, 'm3', '<p id="c" class="c">c</p>', 'c\n\n', False)]
self.assertEqual(items, self.handler.prepare_original(self.elements))

def test_add_translations_merge_placeholder(self):
self.handler.prepare_original(self.elements)
self.handler.add_translations([Paragraph(
0, 'm1', '<p id="a">a</p><p id="b">b</p><p id="c">c</p>',
Expand All @@ -530,7 +552,69 @@ def test_add_translations_merged(self):
self.assertEqual('c', elements[5].text)
self.assertEqual('C', elements[6].text)

def test_add_translations_merged_missing_id(self):
# The handler has a separator configured, but the paragraph's source and
# translated texts still use {{id_N}} placeholders and contain no separator
# (presumably a cache entry produced before separators were introduced --
# inferred from the test name). The translations are still expected to be
# matched to their paragraphs.
def test_add_translations_merge_cached_placeholder(self):
self.handler.separator = Base.separator
self.handler.prepare_original(self.elements)
self.handler.add_translations([Paragraph(
0, 'm1', '<p id="a">a</p><p id="b">b</p><p id="c">c</p>',
'a {{id_0}} b {{id_1}} c {{id_3}}', False, None, None,
'A {{id_0}} B {{id_1}} C {{id_3}}', 'ENGINE', 'LANG')])

elements = self.xhtml.findall('./x:body/*', namespaces=ns)

# Expect 8 body elements, with each translation right after its original.
self.assertEqual(8, len(elements))
self.assertEqual('a', elements[0].text)
self.assertEqual('A', elements[1].text)
self.assertEqual('b', elements[2].text)
self.assertEqual('B', elements[3].text)

self.assertEqual('c', elements[5].text)
self.assertEqual('C', elements[6].text)

# The translated text comes back with irregular newlines: one between A and
# B, three between B and C, and none at the end. It is still expected to be
# split into three translations, one per paragraph.
def test_add_translations_merge_separator(self):
self.handler.separator = Base.separator
self.handler.prepare_original(self.elements)
self.handler.add_translations([Paragraph(
0, 'm1', '<p id="a">a</p><p id="b">b</p><p id="c">c</p>',
'a\n\nb\n\nc\n\n', False, None, None,
'A\nB\n\n\nC', 'ENGINE', 'LANG')])  # missing or repeated \n

elements = self.xhtml.findall('./x:body/*', namespaces=ns)

# Expect 8 body elements, with each translation right after its original.
self.assertEqual(8, len(elements))
self.assertEqual('a', elements[0].text)
self.assertEqual('A', elements[1].text)
self.assertEqual('b', elements[2].text)
self.assertEqual('B', elements[3].text)

self.assertEqual('c', elements[5].text)
self.assertEqual('C', elements[6].text)

# Three separate paragraphs (merge_length is lowered to 2) are passed in.
# The middle translation ('B') lacks the trailing separator that its source
# text has; all three translations are still expected to be placed.
def test_add_translations_merge_separator_multiple(self):
self.handler.merge_length = 2
self.handler.separator = Base.separator
self.handler.prepare_original(self.elements)
paragraphs = [
Paragraph(0, 'm1', '<p id="a">a</p>', 'a\n\n', False, None, None,
'A\n\n', 'ENGINE', 'LANG'),
Paragraph(1, 'm2', '<p id="b">b</p>', 'b\n\n', False, None, None,
'B', 'ENGINE', 'LANG'),
Paragraph(2, 'm3', '<p id="c" class="c">c</p>', 'c\n\n', False,
None, None, 'C\n\n', 'ENGINE', 'LANG')]
self.handler.add_translations(paragraphs)

elements = self.xhtml.findall('./x:body/*', namespaces=ns)

# Expect 8 body elements, with each translation right after its original.
self.assertEqual(8, len(elements))
self.assertEqual('a', elements[0].text)
self.assertEqual('A', elements[1].text)
self.assertEqual('b', elements[2].text)
self.assertEqual('B', elements[3].text)

self.assertEqual('c', elements[5].text)
self.assertEqual('C', elements[6].text)

def test_add_translations_merge_placeholder_missing_id(self):
self.handler.prepare_original(self.elements)
self.handler.add_translations([Paragraph(
0, 'm1', '<p id="a">a</p><p id="b">b</p><p id="c">c</p>',
Expand All @@ -546,9 +630,25 @@ def test_add_translations_merged_missing_id(self):
self.assertEqual('c', elements[4].text)
self.assertEqual('C', elements[5].text)

def test_add_translations_merged_translation_only(self):
self.handler.position = 'only'
# The separator between the first two translations is lost ('A B'), so only
# two translated chunks remain for three paragraphs. The assertions expect
# 'A B' after 'a' and 'C' after 'b', with 7 body elements in total.
# NOTE(review): despite "placeholder" in the name, this test runs with the
# separator enabled -- confirm the intended name.
def test_add_translations_merge_placeholder_missing_newline(self):
self.handler.separator = Base.separator
self.handler.prepare_original(self.elements)
self.handler.add_translations([Paragraph(
0, 'm1', '<p id="a">a</p><p id="b">b</p><p id="c">c</p>',
'a\n\nb\n\nc\n\n', False, None, None,
'A B\n\nC\n\n', 'ENGINE', 'LANG')])

elements = self.xhtml.findall('./x:body/*', namespaces=ns)
self.assertEqual(7, len(elements))
self.assertEqual('a', elements[0].text)
self.assertEqual('A B', elements[1].text)
self.assertEqual('b', elements[2].text)
self.assertEqual('C', elements[3].text)

self.assertEqual('c', elements[5].text)

def test_add_translations_merge_palceholder_only(self):
self.handler.position = 'only'
self.handler.prepare_original(self.elements)
self.handler.add_translations([Paragraph(
0, 'm1', '<p id="a">a</p><p id="b">b</p><p id="c">c</p>'
Expand All @@ -563,7 +663,24 @@ def test_add_translations_merged_translation_only(self):

self.assertEqual('C', elements[3].text)

def test_add_translations_merged_translation_only_missing_id(self):
# Separator mode with position 'only': the assertions expect 5 body elements
# carrying the translated texts 'A', 'B' and 'C' at indices 0, 1 and 3.
def test_add_translations_merge_separator_only(self):
self.handler.position = 'only'
self.handler.separator = Base.separator
self.handler.prepare_original(self.elements)
# Note the stray space before 'B' in the translation; the expected element
# text below is 'B' without it.
self.handler.add_translations([Paragraph(
0, 'm1', '<p id="a">a</p><p id="b">b</p><p id="c">c</p>'
'<a href="/a">a</a>',
'a\n\nb\n\nc\n\n', False, None, None,
'A\n\n B\n\nC\n\n', 'ENGINE', 'LANG')])

elements = self.xhtml.findall('./x:body/*', namespaces=ns)
self.assertEqual(5, len(elements))
self.assertEqual('A', elements[0].text)
self.assertEqual('B', elements[1].text)

self.assertEqual('C', elements[3].text)

def test_add_translations_merge_placeholder_only_missing_id(self):
self.handler.position = 'only'

self.handler.prepare_original(self.elements)
Expand All @@ -578,3 +695,18 @@ def test_add_translations_merged_translation_only_missing_id(self):
self.assertEqual('A B', elements[0].text)

self.assertEqual('C', elements[2].text)

# Separator mode with position 'only' where the separator between the first
# two translations is lost ('A B'). The assertions expect 4 body elements,
# with 'A B' first and 'C' second.
def test_add_translations_merge_separator_only_missing_id(self):
self.handler.position = 'only'
self.handler.separator = Base.separator
self.handler.prepare_original(self.elements)
self.handler.add_translations([Paragraph(
0, 'm1', '<p id="a">a</p><p id="b">b</p><p id="c">c</p>'
'<a href="/a">a</a>',
'a\n\nb\n\nc\n\n', False, None, None,
'A B\n\nC\n\n', 'ENGINE', 'LANG')])

elements = self.xhtml.findall('./x:body/*', namespaces=ns)
self.assertEqual(4, len(elements))
self.assertEqual('A B', elements[0].text)
self.assertEqual('C', elements[1].text)

0 comments on commit 702d43c

Please sign in to comment.