Handle multilingual strings to improve text shaping results (fix py-pdf#1187) (py-pdf#1193)

* automatically detect unicode script * Add files via upload * Fix fragment width with text shaping * add test and changelog
andersonhc · Jun 6, 2024 · fbbb3f7 · fbbb3f7
1 parent f0bd468
commit fbbb3f7
Show file tree

Hide file tree

Showing 9 changed files with 2,586 additions and 6 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -21,9 +21,11 @@ This can also be enabled programmatically with `warnings.simplefilter('default',
 * [`Templates`](https://py-pdf.github.io/fpdf2/fpdf/Templates.html) can now be also defined in JSON files.
 * support to optionally set `wrapmode` in templates (default `"WORD"` can optionally be set to `"CHAR"` to support wrapping on characters for scripts like Chinese or Japanese) - _cf._ [#1159](https://github.com/py-pdf/fpdf2/issues/1159)
 * support for quadratic and cubic Bézier curves with [`FPDF.bezier()`](https://py-pdf.github.io/fpdf2/fpdf/Shapes.html#fpdf.fpdf.FPDF.bezier)
+* feature to identify the Unicode script of the input text and break it into fragments when different scripts are used, improving text shaping results
 ### Fixed
 * [`fpdf.drawing.DeviceCMYK`](https://py-pdf.github.io/fpdf2/fpdf/drawing.html#fpdf.drawing.DeviceCMYK) objects can now be passed to [`FPDF.set_draw_color()`](https://py-pdf.github.io/fpdf2/fpdf/fpdf.html#fpdf.fpdf.FPDF.set_draw_color), [`FPDF.set_fill_color()`](https://py-pdf.github.io/fpdf2/fpdf/fpdf.html#fpdf.fpdf.FPDF.set_fill_color) and [`FPDF.set_text_color()`](https://py-pdf.github.io/fpdf2/fpdf/fpdf.html#fpdf.fpdf.FPDF.set_text_color) without raising a `ValueError`: [documentation](https://py-pdf.github.io/fpdf2/Text.html#text-formatting).
-* individual `/Resources` directories are now properly created for each document page. This change ensures better compliance with the PDF specification but results in a slight increase in the size of PDF documents. You can still use the old behavior by setting `FPDF().single_resources_object = True`.
+* individual `/Resources` directories are now properly created for each document page. This change ensures better compliance with the PDF specification but results in a slight increase in the size of PDF documents. You can still use the old behavior by setting `FPDF().single_resources_object = True`
+* line size calculation for fragments when text shaping is used
 ### Changed
 * [`FPDF.table()`](https://py-pdf.github.io/fpdf2/Tables.html) now raises an error when a single row is too high to be rendered on a single page
 ### Deprecated

diff --git a/fpdf/fonts.py b/fpdf/fonts.py
@@ -270,9 +270,7 @@ def shaped_text_width(self, text, font_size_pt, text_shaping_parms):
         text_width = 0
         for pos in glyph_positions:
             text_width += (
-                round(self.scale * (pos.x_advance + pos.x_offset) + 0.001)
-                * font_size_pt
-                * 0.001
+                round(self.scale * pos.x_advance + 0.001) * font_size_pt * 0.001
             )
         return (len(glyph_positions), text_width)
 

diff --git a/fpdf/fpdf.py b/fpdf/fpdf.py
@@ -116,6 +116,7 @@ class Image:
 from .syntax import DestinationXYZ, PDFArray, PDFDate
 from .table import Table, draw_box_borders
 from .text_region import TextRegionMixin, TextColumns
+from .unicode_script import UnicodeScript, get_unicode_script
 from .util import get_scale_factor, Padding
 
 # Public global variables:
@@ -3379,7 +3380,7 @@ def get_fallback_font(self, char, style=""):
 
     def _parse_chars(self, text: str, markdown: bool) -> Iterator[Fragment]:
         "Split text into fragments"
-        if not markdown and (not self.is_ttf_font or not self._fallback_font_ids):
+        if not markdown and not self.is_ttf_font:
             yield Fragment(text, self._get_current_graphics_state(), self.k)
             return
         txt_frag, in_bold, in_italics, in_underline = (
@@ -3389,9 +3390,10 @@ def _parse_chars(self, text: str, markdown: bool) -> Iterator[Fragment]:
             bool(self.underline),
         )
         current_fallback_font = None
+        current_text_script = None
 
         def frag():
-            nonlocal txt_frag, current_fallback_font
+            nonlocal txt_frag, current_fallback_font, current_text_script
             gstate = self._get_current_graphics_state()
             gstate["font_style"] = ("B" if in_bold else "") + (
                 "I" if in_italics else ""
@@ -3406,6 +3408,7 @@ def frag():
                 )
                 gstate["current_font"] = self.fonts[current_fallback_font]
                 current_fallback_font = None
+                current_text_script = None
             fragment = Fragment(
                 txt_frag,
                 gstate,
@@ -3426,6 +3429,16 @@ def frag():
                 self.MARKDOWN_UNDERLINE_MARKER,
             )
             half_marker = text[0]
+            text_script = get_unicode_script(text[0])
+            if text_script not in (
+                UnicodeScript.COMMON,
+                UnicodeScript.UNKNOWN,
+                current_text_script,
+            ):
+                if txt_frag and current_text_script:
+                    yield frag()
+                current_text_script = text_script
+
             # Check that previous & next characters are not identical to the marker:
             if markdown:
                 if (
@@ -5132,6 +5145,8 @@ def output(
                 DeprecationWarning,
                 stacklevel=get_stack_level(),
             )
+        # Clear the get_unicode_script() cache to free up memory now that the document is being output
+        get_unicode_script.cache_clear()
         # Finish document if necessary:
         if not self.buffer:
             if self.page == 0: