Add Graphemebreak algorithm

Squashed together and style fixes by Tom Hacohen
adah1972 · Nov 22, 2016 · 21e5b8e · 21e5b8e
1 parent 07ef4f3
commit 21e5b8e
Show file tree

Hide file tree

Showing 11 changed files with 1,844 additions and 2 deletions.
diff --git a/.gitignore b/.gitignore
@@ -32,6 +32,7 @@ tags
 /src/WordBreakProperty.txt
 /src/LineBreakTest.txt
 /src/WordBreakTest.txt
+/src/GraphemeBreakTest.txt
 /src/tests
 /src/filter_dup
 /src/libunibreak.la

diff --git a/Makefile.am b/Makefile.am
@@ -20,5 +20,9 @@ linebreakdata:
 wordbreakdata:
 	cd src && ${MAKE} wordbreakdata
 
+# Regenerate the grapheme break property table by delegating to the
+# graphemebreakdata target of the Makefile in src/.
+graphemebreakdata:
+	cd src && ${MAKE} graphemebreakdata
+
 check:
 	cd src && ${MAKE} check
+
diff --git a/src/Makefile.am b/src/Makefile.am
@@ -4,6 +4,7 @@ include_HEADERS = \
 	unibreakdef.h \
 	linebreak.h \
 	linebreakdef.h \
+	graphemebreak.h \
 	wordbreak.h
 lib_LTLIBRARIES = libunibreak.la
 
@@ -15,6 +16,7 @@ libunibreak_la_SOURCES = \
 	linebreak.c \
 	linebreakdata.c \
 	linebreakdef.c \
+	graphemebreak.c \
 	wordbreak.c
 
 EXTRA_DIST = \
@@ -26,6 +28,9 @@ EXTRA_DIST = \
 	wordbreakdata1.tmpl \
 	wordbreakdata2.tmpl \
 	wordbreakdata.c \
+	graphemebreakdata1.tmpl \
+	graphemebreakdata2.tmpl \
+	graphemebreakdata.c \
 	Makefile.gcc \
 	Makefile.msvc \
 	filter_dup.c \
@@ -46,6 +51,9 @@ LineBreak.txt:
 WordBreakProperty.txt:
 	wget http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakProperty.txt
 
+# Download the Unicode grapheme break property data that the
+# graphemebreakdata target consumes.  The rule has no prerequisites, so it
+# only runs when the file does not exist yet.
+GraphemeBreakProperty.txt:
+	wget http://www.unicode.org/Public/UNIDATA/auxiliary/GraphemeBreakProperty.txt
+
 linebreakdata: ${builddir}/filter_dup LineBreak.txt
 	sed -E -n -f ${srcdir}/LineBreak1.sed LineBreak.txt > tmp.txt
 	sed -E -f ${srcdir}/LineBreak2.sed tmp.txt | ${builddir}/filter_dup > tmp.c
@@ -65,6 +73,19 @@ wordbreakdata: WordBreakProperty.txt
 	cat ${srcdir}/wordbreakdata1.tmpl tmp.txt ${srcdir}/wordbreakdata2.tmpl >> ${srcdir}/wordbreakdata.c
 	rm tmp.txt tmp.txt.bak
 
+
+# Regenerate ${srcdir}/graphemebreakdata.c from GraphemeBreakProperty.txt.
+#
+# Steps, in recipe order:
+#   1. Keep only the lines that start with hex digits or dots (the data
+#      lines) and write them to tmp2.txt.
+#   2. Rewrite single code point entries ("XXXX ") as ranges
+#      ("XXXX..XXXX") in place; sed leaves a tmp2.txt.bak backup.
+#   3. Run sort_numeric_hex.py over the result into tmp.txt (presumably
+#      sorts the ranges by numeric code point value -- confirm against the
+#      script) and remove the tmp2 files.
+#   4. Turn every "START..END ; Property" line into a tab-indented C
+#      initializer "{0xSTART, 0xEND, GBP_Property},".  The $$'\t' is the
+#      make-escaped form of the shell's $'\t' quoting, so the recipe shell
+#      must support that syntax.
+#   5. Write a comment header quoting the first two lines of the Unicode
+#      data file, then append graphemebreakdata1.tmpl, the generated table
+#      rows and graphemebreakdata2.tmpl.
+#   6. Remove the remaining temporary files.
+#
+# NOTE(review): the fixed names tmp.txt/tmp2.txt live in the build directory
+# and tmp.txt is also written by the linebreakdata recipe, so running these
+# data targets in parallel looks unsafe -- confirm before using -j.
+graphemebreakdata: GraphemeBreakProperty.txt
+	sed -E -n 's/(^[0-9A-F.]+)/\1/p' GraphemeBreakProperty.txt > tmp2.txt
+	sed -E -i.bak 's/^([0-9A-F]+) +/\1..\1/' tmp2.txt
+	${srcdir}/sort_numeric_hex.py tmp2.txt > tmp.txt
+	rm tmp2.txt tmp2.txt.bak
+	sed -E -i.bak -n 's/^([0-9A-F]+)..([0-9A-F]+) *; *([A-Za-z_]+).*/'$$'\t''{0x\1, 0x\2, GBP_\3},/p' tmp.txt
+	echo "/* The content of this file is generated from:" > ${srcdir}/graphemebreakdata.c
+	head -2 GraphemeBreakProperty.txt >> ${srcdir}/graphemebreakdata.c
+	echo "*/" >> ${srcdir}/graphemebreakdata.c
+	cat ${srcdir}/graphemebreakdata1.tmpl tmp.txt ${srcdir}/graphemebreakdata2.tmpl >> ${srcdir}/graphemebreakdata.c
+	rm tmp.txt tmp.txt.bak
+
 # Tests
 
 check_PROGRAMS = tests
@@ -75,12 +96,17 @@ tests_CPPFLAGS = -I$(srcdir)
 tests_LDADD = libunibreak.la
 tests_DEPENDENCIES = libunibreak.la
 
-check: LineBreakTest.txt WordBreakTest.txt $(check_PROGRAMS)
+# Build the test program and run it once per argument (word, line,
+# grapheme).  The Unicode test data files are prerequisites, so each one is
+# downloaded first if it is missing.
+check: LineBreakTest.txt WordBreakTest.txt GraphemeBreakTest.txt $(check_PROGRAMS)
 	./tests word
 	./tests line
+	./tests grapheme
 
 LineBreakTest.txt:
 	wget http://www.unicode.org/Public/UNIDATA/auxiliary/LineBreakTest.txt
 
 WordBreakTest.txt:
 	wget http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakTest.txt
+
+# Download the Unicode grapheme break test data, a prerequisite of the
+# check target.  The rule has no prerequisites, so it only runs when the
+# file does not exist yet.
+GraphemeBreakTest.txt:
+	wget http://www.unicode.org/Public/UNIDATA/auxiliary/GraphemeBreakTest.txt
+
diff --git a/src/graphemebreak.c b/src/graphemebreak.c
@@ -0,0 +1,266 @@
+/*
+ * Grapheme breaking in a Unicode sequence.  Designed to be used in a
+ * generic text renderer.
+ *
+ * Copyright (C) 2016 Andreas Röver <roever at users dot sf dot net>
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the author be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute
+ * it freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must
+ *    not claim that you wrote the original software.  If you use this
+ *    software in a product, an acknowledgement in the product
+ *    documentation would be appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must
+ *    not be misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source
+ *    distribution.
+ *
+ * The main reference is Unicode Standard Annex 29 (UAX #29):
+ *      <URL:http://unicode.org/reports/tr29>
+ *
+ * When this library was designed, this annex was at Revision 29, for
+ * Unicode 9.0.0:
+ *      <URL:http://www.unicode.org/reports/tr29/tr29-29.html>
+ *
+ * The Unicode Terms of Use are available at
+ *      <URL:http://www.unicode.org/copyright.html>
+ */
+
+/**
+ * @file    graphemebreak.c
+ *
+ * Implementation of the grapheme breaking algorithm as described in Unicode
+ * Standard Annex 29.
+ *
+ * @version 1.0, 2016
+ * @author  Andreas Roever
+ */
+
+#include "graphemebreak.h"
+#include "graphemebreakdata.c"
+#include "stdbool.h"
+#include "string.h"
+#include "unibreakdef.h"
+
+#define ARRAY_LEN(x) (sizeof(x) / sizeof(x[0]))
+
+/**
+ * Initializes the grapheme break internals.  It currently does nothing,
+ * but it may in the future.
+ */
+void init_graphemebreak(void)
+{
+}
+
+/**
+ * Looks up the grapheme breaking class of a code point.
+ *
+ * Binary-searches the \c gb_prop_default table, treating each entry as an
+ * inclusive [start, end] code point range.
+ *
+ * @param ch   code point to classify
+ * @return     the class of the range that contains \a ch; \c GBP_Other
+ *             if no range contains it
+ */
+static enum GraphemeBreakClass get_char_gb_class(utf32_t ch)
+{
+    int lo = 0;
+    int hi = ARRAY_LEN(gb_prop_default) - 1;
+
+    while (lo <= hi)
+    {
+        int idx = lo + (hi - lo) / 2;
+
+        if (ch < gb_prop_default[idx].start)
+        {
+            // target lies below this range: continue in the lower half
+            hi = idx - 1;
+        }
+        else if (ch > gb_prop_default[idx].end)
+        {
+            // target lies above this range: continue in the upper half
+            lo = idx + 1;
+        }
+        else
+        {
+            // start <= ch <= end: found the containing range
+            return gb_prop_default[idx].prop;
+        }
+    }
+
+    return GBP_Other;
+}
+
+/**
+ * Sets the grapheme breaking information for a generic input string.
+ *
+ * All \a len elements of \a brks are first set to
+ * #GRAPHEMEBREAK_INSIDEACHAR.  Then, for every character read, the element
+ * directly before the start position of the following character is set to
+ * #GRAPHEMEBREAK_BREAK or #GRAPHEMEBREAK_NOBREAK, describing the boundary
+ * behind that character.  The boundary behind the last character is always
+ * #GRAPHEMEBREAK_BREAK.  If no character can be read at all (for example
+ * when \a len is 0), no break is recorded.
+ *
+ * @param[in]  s             input string
+ * @param[in]  len           length of the input; \a brks must have room
+ *                           for at least this many elements
+ * @param[out] brks          pointer to the output breaking data, containing
+ *                           #GRAPHEMEBREAK_BREAK, #GRAPHEMEBREAK_NOBREAK or
+ *                           #GRAPHEMEBREAK_INSIDEACHAR
+ * @param[in] get_next_char  function to get the next UTF-32 character
+ */
+static void set_graphemebreaks(const void *s, size_t len, char *brks,
+                               get_next_char_t get_next_char)
+{
+    size_t posNext = 0;
+    bool rule10Left = false;  // is the left side of rule 10 fulfilled?
+    bool evenRegionalIndicators = true;  // is the number of preceding
+                                         // GBP_Regional_Indicator
+                                         // characters even?
+
+    utf32_t ch = get_next_char(s, len, &posNext);
+    enum GraphemeBreakClass current_class = get_char_gb_class(ch);
+
+    // initialize whole output to inside char
+    memset(brks, GRAPHEMEBREAK_INSIDEACHAR, len);
+
+    // Nothing could be read (e.g. empty input): stop here.  posNext has
+    // presumably not advanced in that case, so the "posNext - 1" below
+    // would wrap around and index brks out of bounds.
+    if (ch == EOS)
+        return;
+
+    while (true)
+    {
+        enum GraphemeBreakClass prev_class = current_class;
+
+        // save the position of the current character so that we can store
+        // the result there later on
+        size_t brksPos = posNext - 1;
+
+        // get next character
+        ch = get_next_char(s, len, &posNext);
+
+        if (ch == EOS)
+        {
+            // done, place one final break after the last character as per
+            // the end-of-text rule of the algorithm
+            brks[brksPos] = GRAPHEMEBREAK_BREAK;
+            break;
+        }
+
+        // get class of current character
+        current_class = get_char_gb_class(ch);
+
+        // update some helper variables
+
+        // GB10 left side: an emoji base, optionally followed by any number
+        // of Extend characters
+        if ((prev_class == GBP_E_Base) || (prev_class == GBP_E_Base_GAZ))
+        {
+            rule10Left = true;
+        }
+        else if (prev_class != GBP_Extend)
+        {
+            rule10Left = false;
+        }
+
+        // parity of the run of regional indicators ending at prev_class;
+        // any other class resets the run
+        if (prev_class == GBP_Regional_Indicator)
+        {
+            evenRegionalIndicators = !evenRegionalIndicators;
+        }
+        else
+        {
+            evenRegionalIndicators = true;
+        }
+
+        // check all rules
+        if (prev_class == GBP_CR && current_class == GBP_LF)
+            brks[brksPos] = GRAPHEMEBREAK_NOBREAK;  // Rule: GB3
+
+        else if ((prev_class == GBP_CR) || (prev_class == GBP_LF) ||
+                 (prev_class == GBP_Control) || (current_class == GBP_CR) ||
+                 (current_class == GBP_LF) ||
+                 (current_class == GBP_Control))
+            brks[brksPos] = GRAPHEMEBREAK_BREAK;  // Rule: GB4 + GB5
+
+        else if ((prev_class == GBP_L) &&
+                 ((current_class == GBP_L) || (current_class == GBP_V) ||
+                  (current_class == GBP_LV) || (current_class == GBP_LVT)))
+            brks[brksPos] = GRAPHEMEBREAK_NOBREAK;  // Rule: GB6
+
+        else if (((prev_class == GBP_LV) || (prev_class == GBP_V)) &&
+                 ((current_class == GBP_V) || (current_class == GBP_T)))
+            brks[brksPos] = GRAPHEMEBREAK_NOBREAK;  // Rule: GB7
+
+        else if (((prev_class == GBP_LVT) || (prev_class == GBP_T)) &&
+                 (current_class == GBP_T))
+            brks[brksPos] = GRAPHEMEBREAK_NOBREAK;  // Rule: GB8
+
+        else if ((current_class == GBP_Extend) ||
+                 (current_class == GBP_ZWJ))
+            brks[brksPos] = GRAPHEMEBREAK_NOBREAK;  // Rule: GB9
+
+        else if (current_class == GBP_SpacingMark)
+            brks[brksPos] = GRAPHEMEBREAK_NOBREAK;  // Rule: GB9a
+
+        else if (prev_class == GBP_Prepend)
+            brks[brksPos] = GRAPHEMEBREAK_NOBREAK;  // Rule: GB9b
+
+        else if (rule10Left && (current_class == GBP_E_Modifier))
+            brks[brksPos] = GRAPHEMEBREAK_NOBREAK;  // Rule: GB10
+
+        else if ((prev_class == GBP_ZWJ) &&
+                 ((current_class == GBP_Glue_After_Zwj) ||
+                  (current_class == GBP_E_Base_GAZ)))
+            brks[brksPos] = GRAPHEMEBREAK_NOBREAK;  // Rule: GB11
+
+        else if (!evenRegionalIndicators &&
+                 (current_class == GBP_Regional_Indicator))
+            brks[brksPos] = GRAPHEMEBREAK_NOBREAK;  // Rule: GB12 + GB13
+
+        else
+            brks[brksPos] = GRAPHEMEBREAK_BREAK;  // Rule: GB999
+    }
+}
+
+/**
+ * Computes the grapheme breaking information for a UTF-8 input string.
+ *
+ * Thin wrapper: runs the generic set_graphemebreaks() with
+ * ub_get_next_char_utf8() as the character reader.
+ *
+ * @param[in]  s     input UTF-8 string
+ * @param[in]  len   length of the input
+ * @param[in]  lang  language of the input; currently ignored
+ * @param[out] brks  pointer to the output breaking data, filled by
+ *                   set_graphemebreaks() with #GRAPHEMEBREAK_BREAK,
+ *                   #GRAPHEMEBREAK_NOBREAK or #GRAPHEMEBREAK_INSIDEACHAR;
+ *                   the array must have room for at least \a len elements
+ */
+void set_graphemebreaks_utf8(const utf8_t *s, size_t len, const char *lang,
+                             char *brks)
+{
+    get_next_char_t read_char = (get_next_char_t)ub_get_next_char_utf8;
+
+    (void)lang;  // not used by the algorithm right now
+    set_graphemebreaks(s, len, brks, read_char);
+}
+
+/**
+ * Computes the grapheme breaking information for a UTF-16 input string.
+ *
+ * Thin wrapper: runs the generic set_graphemebreaks() with
+ * ub_get_next_char_utf16() as the character reader.
+ *
+ * @param[in]  s     input UTF-16 string
+ * @param[in]  len   length of the input
+ * @param[in]  lang  language of the input; currently ignored
+ * @param[out] brks  pointer to the output breaking data, filled by
+ *                   set_graphemebreaks() with #GRAPHEMEBREAK_BREAK,
+ *                   #GRAPHEMEBREAK_NOBREAK or #GRAPHEMEBREAK_INSIDEACHAR;
+ *                   the array must have room for at least \a len elements
+ */
+void set_graphemebreaks_utf16(const utf16_t *s, size_t len,
+                              const char *lang, char *brks)
+{
+    get_next_char_t read_char = (get_next_char_t)ub_get_next_char_utf16;
+
+    (void)lang;  // not used by the algorithm right now
+    set_graphemebreaks(s, len, brks, read_char);
+}
+
+/**
+ * Computes the grapheme breaking information for a UTF-32 input string.
+ *
+ * Thin wrapper: runs the generic set_graphemebreaks() with
+ * ub_get_next_char_utf32() as the character reader.
+ *
+ * @param[in]  s     input UTF-32 string
+ * @param[in]  len   length of the input
+ * @param[in]  lang  language of the input; currently ignored
+ * @param[out] brks  pointer to the output breaking data, filled by
+ *                   set_graphemebreaks() with #GRAPHEMEBREAK_BREAK,
+ *                   #GRAPHEMEBREAK_NOBREAK or #GRAPHEMEBREAK_INSIDEACHAR;
+ *                   the array must have room for at least \a len elements
+ */
+void set_graphemebreaks_utf32(const utf32_t *s, size_t len,
+                              const char *lang, char *brks)
+{
+    get_next_char_t read_char = (get_next_char_t)ub_get_next_char_utf32;
+
+    (void)lang;  // not used by the algorithm right now
+    set_graphemebreaks(s, len, brks, read_char);
+}