Skip to content

Commit

Permalink
Add Graphemebreak algorithm
Browse files Browse the repository at this point in the history
Squashed together and style fixes by Tom Hacohen
  • Loading branch information
roever authored and tasn committed Nov 22, 2016
1 parent 07ef4f3 commit 21e5b8e
Show file tree
Hide file tree
Showing 11 changed files with 1,844 additions and 2 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ tags
/src/WordBreakProperty.txt
/src/LineBreakTest.txt
/src/WordBreakTest.txt
/src/GraphemeBreakTest.txt
/src/tests
/src/filter_dup
/src/libunibreak.la
Expand Down
4 changes: 4 additions & 0 deletions Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -20,5 +20,9 @@ linebreakdata:
wordbreakdata:
cd src && ${MAKE} wordbreakdata

graphemebreakdata:
cd src && ${MAKE} graphemebreakdata

check:
cd src && ${MAKE} check

28 changes: 27 additions & 1 deletion src/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ include_HEADERS = \
unibreakdef.h \
linebreak.h \
linebreakdef.h \
graphemebreak.h \
wordbreak.h
lib_LTLIBRARIES = libunibreak.la

Expand All @@ -15,6 +16,7 @@ libunibreak_la_SOURCES = \
linebreak.c \
linebreakdata.c \
linebreakdef.c \
graphemebreak.c \
wordbreak.c

EXTRA_DIST = \
Expand All @@ -26,6 +28,9 @@ EXTRA_DIST = \
wordbreakdata1.tmpl \
wordbreakdata2.tmpl \
wordbreakdata.c \
graphemebreakdata1.tmpl \
graphemebreakdata2.tmpl \
graphemebreakdata.c \
Makefile.gcc \
Makefile.msvc \
filter_dup.c \
Expand All @@ -46,6 +51,9 @@ LineBreak.txt:
WordBreakProperty.txt:
wget http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakProperty.txt

GraphemeBreakProperty.txt:
wget http://www.unicode.org/Public/UNIDATA/auxiliary/GraphemeBreakProperty.txt

linebreakdata: ${builddir}/filter_dup LineBreak.txt
sed -E -n -f ${srcdir}/LineBreak1.sed LineBreak.txt > tmp.txt
sed -E -f ${srcdir}/LineBreak2.sed tmp.txt | ${builddir}/filter_dup > tmp.c
Expand All @@ -65,6 +73,19 @@ wordbreakdata: WordBreakProperty.txt
cat ${srcdir}/wordbreakdata1.tmpl tmp.txt ${srcdir}/wordbreakdata2.tmpl >> ${srcdir}/wordbreakdata.c
rm tmp.txt tmp.txt.bak


# Regenerates ${srcdir}/graphemebreakdata.c from GraphemeBreakProperty.txt.
#
# Steps, in recipe order:
#   1. Keep only the lines that begin with a hex code point or a code point
#      range (sed -n ... p drops everything else, e.g. comments and blanks).
#   2. Rewrite single code points "XXXX  " as the range "XXXX..XXXX" so that
#      every line has the same START..END shape.
#   3. Run the lines through sort_numeric_hex.py (presumably orders them by
#      code point, which the binary search in graphemebreak.c would need --
#      confirm against the script).
#   4. Turn each "START..END ; Name" line into a tab-indented C initializer
#      "{0xSTART, 0xEND, GBP_Name},"; lines that do not match are dropped.
#   5. Write a C comment holding the first two lines of the data file, then
#      append graphemebreakdata1.tmpl, the generated entries and
#      graphemebreakdata2.tmpl.
# Temporary files (tmp2.txt, tmp.txt and their .bak copies) are removed.
# NOTE(review): the $$'\t' quoting in step 4 is a bash-style construct;
# confirm the shell used by make supports it.
graphemebreakdata: GraphemeBreakProperty.txt
sed -E -n 's/(^[0-9A-F.]+)/\1/p' GraphemeBreakProperty.txt > tmp2.txt
sed -E -i.bak 's/^([0-9A-F]+) +/\1..\1/' tmp2.txt
${srcdir}/sort_numeric_hex.py tmp2.txt > tmp.txt
rm tmp2.txt tmp2.txt.bak
sed -E -i.bak -n 's/^([0-9A-F]+)..([0-9A-F]+) *; *([A-Za-z_]+).*/'$$'\t''{0x\1, 0x\2, GBP_\3},/p' tmp.txt
echo "/* The content of this file is generated from:" > ${srcdir}/graphemebreakdata.c
head -2 GraphemeBreakProperty.txt >> ${srcdir}/graphemebreakdata.c
echo "*/" >> ${srcdir}/graphemebreakdata.c
cat ${srcdir}/graphemebreakdata1.tmpl tmp.txt ${srcdir}/graphemebreakdata2.tmpl >> ${srcdir}/graphemebreakdata.c
rm tmp.txt tmp.txt.bak

# Tests

check_PROGRAMS = tests
Expand All @@ -75,12 +96,17 @@ tests_CPPFLAGS = -I$(srcdir)
tests_LDADD = libunibreak.la
tests_DEPENDENCIES = libunibreak.la

check: LineBreakTest.txt WordBreakTest.txt $(check_PROGRAMS)
check: LineBreakTest.txt WordBreakTest.txt GraphemeBreakTest.txt $(check_PROGRAMS)
./tests word
./tests line
./tests grapheme

LineBreakTest.txt:
wget http://www.unicode.org/Public/UNIDATA/auxiliary/LineBreakTest.txt

WordBreakTest.txt:
wget http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakTest.txt

GraphemeBreakTest.txt:
wget http://www.unicode.org/Public/UNIDATA/auxiliary/GraphemeBreakTest.txt

266 changes: 266 additions & 0 deletions src/graphemebreak.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,266 @@
/*
* Grapheme breaking in a Unicode sequence. Designed to be used in a
* generic text renderer.
*
* Copyright (C) 2016 Andreas Röver <roever at users dot sf dot net>
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the author be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute
* it freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must
* not claim that you wrote the original software. If you use this
* software in a product, an acknowledgement in the product
* documentation would be appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must
* not be misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source
* distribution.
*
* The main reference is Unicode Standard Annex 29 (UAX #29):
* <URL:http://unicode.org/reports/tr29>
*
* When this library was designed, this annex was at Revision 29, for
* Unicode 9.0.0:
* <URL:http://www.unicode.org/reports/tr29/tr29-29.html>
*
* The Unicode Terms of Use are available at
* <URL:http://www.unicode.org/copyright.html>
*/

/**
* @file graphemebreak.c
*
* Implementation of the grapheme breaking algorithm as described in Unicode
* Standard Annex 29.
*
* @version 1.0, 2016
* @author Andreas Roever
*/

#include "graphemebreak.h"
#include "graphemebreakdata.c"
#include "stdbool.h"
#include "string.h"
#include "unibreakdef.h"

/* Number of elements in the array x.  x must be a real array (not a
 * pointer); the argument is parenthesized so that any array-typed
 * expression, not just a plain identifier, is measured correctly. */
#define ARRAY_LEN(x) (sizeof(x) / sizeof((x)[0]))

/**
 * Initializes the graphemebreak internals.  It currently does nothing, but
 * it may in the future.
 */
void init_graphemebreak(void)
{
}

/**
 * Looks up the grapheme breaking class of a code point.
 *
 * The lookup is a binary search over the [start, end] ranges stored in
 * \c gb_prop_default; it assumes that table is sorted by code point and
 * that its ranges do not overlap.
 *
 * @param ch  code point to classify
 * @return    the class of the range containing \a ch, or \c GBP_Other if
 *            no range contains it
 */
static enum GraphemeBreakClass get_char_gb_class(utf32_t ch)
{
    int lo = 0;
    int hi = ARRAY_LEN(gb_prop_default) - 1;

    /* The table is probed at least once before the bounds are compared. */
    for (;;)
    {
        const int probe = (lo + hi) / 2;

        if (ch < gb_prop_default[probe].start)
        {
            /* ch lies below this range: continue in the lower half */
            hi = probe - 1;
        }
        else if (ch > gb_prop_default[probe].end)
        {
            /* ch lies above this range: continue in the upper half */
            lo = probe + 1;
        }
        else
        {
            return gb_prop_default[probe].prop;
        }

        if (lo > hi)
        {
            /* search interval is exhausted: ch is in no listed range */
            return GBP_Other;
        }
    }
}

/**
 * Sets the grapheme breaking information for a generic input string.
 *
 * All \a len elements of \a brks are first set to
 * #GRAPHEMEBREAK_INSIDEACHAR.  Then, for every decoded character, the
 * element at the index of the last unit consumed for that character is
 * set to #GRAPHEMEBREAK_BREAK or #GRAPHEMEBREAK_NOBREAK, describing the
 * boundary between that character and the one following it.  The last
 * decoded character always receives #GRAPHEMEBREAK_BREAK.
 *
 * If \a len is zero nothing is written.  If not even one character can
 * be decoded, \a brks is left filled with #GRAPHEMEBREAK_INSIDEACHAR.
 *
 * @param[in]  s              input string
 * @param[in]  len            length of the input, in the units
 *                            \a get_next_char indexes by
 * @param[out] brks           pointer to the output breaking data; must
 *                            have room for at least \a len elements
 * @param[in]  get_next_char  function to get the next UTF-32 character
 */
static void set_graphemebreaks(const void *s, size_t len, char *brks,
                               get_next_char_t get_next_char)
{
    size_t posNext = 0;
    /* is the left side of rule GB10 fulfilled, i.e. have we seen
     * (E_Base | E_Base_GAZ) Extend* immediately before this position? */
    bool rule10Left = false;
    /* is the number of immediately preceding GBP_Regional_Indicator
     * characters even? */
    bool evenRegionalIndicators = true;
    utf32_t ch;
    enum GraphemeBreakClass current_class;

    /* Nothing to classify and nothing to write for an empty input. */
    if (len == 0)
    {
        return;
    }

    /* initialize whole output to inside char */
    memset(brks, GRAPHEMEBREAK_INSIDEACHAR, len);

    ch = get_next_char(s, len, &posNext);
    if (ch == EOS)
    {
        /* No character could be decoded.  Entering the loop here would
         * compute posNext - 1 with posNext == 0 and write far outside
         * brks, so stop before that happens. */
        return;
    }
    current_class = get_char_gb_class(ch);

    while (true)
    {
        enum GraphemeBreakClass prev_class = current_class;

        /* save the position of the current character (its last unit) so
         * that the result can be stored there later on */
        size_t brksPos = posNext - 1;

        /* get next character */
        ch = get_next_char(s, len, &posNext);

        if (ch == EOS)
        {
            /* done, place one final break after the last character as
             * per algorithm rule GB2 (break at end of text) */
            brks[brksPos] = GRAPHEMEBREAK_BREAK;
            break;
        }

        /* get class of current character */
        current_class = get_char_gb_class(ch);

        /* update the GB10 state: an emoji base starts the sequence,
         * Extend characters keep it alive, anything else ends it */
        if ((prev_class == GBP_E_Base) || (prev_class == GBP_E_Base_GAZ))
        {
            rule10Left = true;
        }
        else if (prev_class != GBP_Extend)
        {
            rule10Left = false;
        }

        /* update the regional indicator parity: each RI flips it, any
         * other class resets the run */
        if (prev_class == GBP_Regional_Indicator)
        {
            evenRegionalIndicators = !evenRegionalIndicators;
        }
        else
        {
            evenRegionalIndicators = true;
        }

        /* check all rules, in the priority order given by UAX #29 */
        if (prev_class == GBP_CR && current_class == GBP_LF)
        {
            brks[brksPos] = GRAPHEMEBREAK_NOBREAK; /* Rule: GB3 */
        }
        else if ((prev_class == GBP_CR) || (prev_class == GBP_LF) ||
                 (prev_class == GBP_Control) || (current_class == GBP_CR) ||
                 (current_class == GBP_LF) ||
                 (current_class == GBP_Control))
        {
            brks[brksPos] = GRAPHEMEBREAK_BREAK; /* Rule: GB4 + GB5 */
        }
        else if ((prev_class == GBP_L) &&
                 ((current_class == GBP_L) || (current_class == GBP_V) ||
                  (current_class == GBP_LV) || (current_class == GBP_LVT)))
        {
            brks[brksPos] = GRAPHEMEBREAK_NOBREAK; /* Rule: GB6 */
        }
        else if (((prev_class == GBP_LV) || (prev_class == GBP_V)) &&
                 ((current_class == GBP_V) || (current_class == GBP_T)))
        {
            brks[brksPos] = GRAPHEMEBREAK_NOBREAK; /* Rule: GB7 */
        }
        else if (((prev_class == GBP_LVT) || (prev_class == GBP_T)) &&
                 (current_class == GBP_T))
        {
            brks[brksPos] = GRAPHEMEBREAK_NOBREAK; /* Rule: GB8 */
        }
        else if ((current_class == GBP_Extend) ||
                 (current_class == GBP_ZWJ))
        {
            brks[brksPos] = GRAPHEMEBREAK_NOBREAK; /* Rule: GB9 */
        }
        else if (current_class == GBP_SpacingMark)
        {
            brks[brksPos] = GRAPHEMEBREAK_NOBREAK; /* Rule: GB9a */
        }
        else if (prev_class == GBP_Prepend)
        {
            brks[brksPos] = GRAPHEMEBREAK_NOBREAK; /* Rule: GB9b */
        }
        else if (rule10Left && (current_class == GBP_E_Modifier))
        {
            brks[brksPos] = GRAPHEMEBREAK_NOBREAK; /* Rule: GB10 */
        }
        else if ((prev_class == GBP_ZWJ) &&
                 ((current_class == GBP_Glue_After_Zwj) ||
                  (current_class == GBP_E_Base_GAZ)))
        {
            brks[brksPos] = GRAPHEMEBREAK_NOBREAK; /* Rule: GB11 */
        }
        else if (!evenRegionalIndicators &&
                 (current_class == GBP_Regional_Indicator))
        {
            brks[brksPos] = GRAPHEMEBREAK_NOBREAK; /* Rule: GB12 + GB13 */
        }
        else
        {
            brks[brksPos] = GRAPHEMEBREAK_BREAK; /* Rule: GB999 */
        }
    }
}

/**
 * Sets the grapheme breaking information for a UTF-8 input string.
 *
 * @param[in]  s     input UTF-8 string
 * @param[in]  len   length of the input, in the units the UTF-8 decoder
 *                   indexes by (presumably bytes -- confirm against
 *                   ub_get_next_char_utf8)
 * @param[in]  lang  language of the input, right now this does not
 *                   influence the algorithm
 * @param[out] brks  pointer to the output breaking data.  It must point
 *                   to an array with at least \a len elements (not merely
 *                   one per character): all \a len elements are first set
 *                   to #GRAPHEMEBREAK_INSIDEACHAR, then the element at the
 *                   index of the last unit of each character is set to
 *                   #GRAPHEMEBREAK_BREAK or #GRAPHEMEBREAK_NOBREAK for
 *                   the boundary behind that character
 */
void set_graphemebreaks_utf8(const utf8_t *s, size_t len, const char *lang,
char *brks)
{
/* lang is not used yet; the cast marks it as intentionally unused */
(void)lang;
set_graphemebreaks(s, len, brks,
(get_next_char_t)ub_get_next_char_utf8);
}

/**
 * Sets the grapheme breaking information for a UTF-16 input string.
 *
 * @param[in]  s     input UTF-16 string
 * @param[in]  len   length of the input, in the units the UTF-16 decoder
 *                   indexes by (presumably 16-bit code units -- confirm
 *                   against ub_get_next_char_utf16)
 * @param[in]  lang  language of the input, right now this does not
 *                   influence the algorithm
 * @param[out] brks  pointer to the output breaking data.  It must point
 *                   to an array with at least \a len elements (not merely
 *                   one per character): all \a len elements are first set
 *                   to #GRAPHEMEBREAK_INSIDEACHAR, then the element at the
 *                   index of the last unit of each character is set to
 *                   #GRAPHEMEBREAK_BREAK or #GRAPHEMEBREAK_NOBREAK for
 *                   the boundary behind that character
 */
void set_graphemebreaks_utf16(const utf16_t *s, size_t len,
const char *lang, char *brks)
{
/* lang is not used yet; the cast marks it as intentionally unused */
(void)lang;
set_graphemebreaks(s, len, brks,
(get_next_char_t)ub_get_next_char_utf16);
}

/**
 * Sets the grapheme breaking information for a UTF-32 input string.
 *
 * @param[in]  s     input UTF-32 string
 * @param[in]  len   length of the input, in the units the UTF-32 decoder
 *                   indexes by (presumably one per code point -- confirm
 *                   against ub_get_next_char_utf32)
 * @param[in]  lang  language of the input, right now this does not
 *                   influence the algorithm
 * @param[out] brks  pointer to the output breaking data.  It must point
 *                   to an array with at least \a len elements: all
 *                   \a len elements are first set to
 *                   #GRAPHEMEBREAK_INSIDEACHAR, then the element at the
 *                   index of the last unit of each character is set to
 *                   #GRAPHEMEBREAK_BREAK or #GRAPHEMEBREAK_NOBREAK for
 *                   the boundary behind that character
 */
void set_graphemebreaks_utf32(const utf32_t *s, size_t len,
const char *lang, char *brks)
{
/* lang is not used yet; the cast marks it as intentionally unused */
(void)lang;
set_graphemebreaks(s, len, brks,
(get_next_char_t)ub_get_next_char_utf32);
}
Loading

0 comments on commit 21e5b8e

Please sign in to comment.