Skip to content

Commit

Permalink
Implement SYRIAC ABBREVIATION MARK with 'stch' feature
Browse files Browse the repository at this point in the history
The feature is enabled for any character in the Arabic shaper.
We should experiment with using it for Arabic subtending marks.
Though, that has a directionality problem as well, since those
are used with digits...

Fixes #141
  • Loading branch information
behdad committed Nov 6, 2015
1 parent c743ec5 commit 6e6f82b
Show file tree
Hide file tree
Showing 4 changed files with 244 additions and 6 deletions.
219 changes: 213 additions & 6 deletions src/hb-ot-shape-complex-arabic.cc
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,16 @@
#include "hb-ot-shape-private.hh"


/* Debug level used by the DEBUG_MSG (ARABIC, ...) calls in this file.
 * Defaults to HB_DEBUG+0 unless HB_DEBUG_ARABIC is already defined. */
#ifndef HB_DEBUG_ARABIC
#define HB_DEBUG_ARABIC (HB_DEBUG+0)
#endif


/* buffer var allocations */
#define arabic_shaping_action() complex_var_u8_0() /* arabic shaping action */

/* Buffer scratch flag set by record_stch() once at least one glyph has been
 * marked for stretching; apply_stch() returns early when it is not set. */
#define HB_BUFFER_SCRATCH_FLAG_ARABIC_HAS_STCH HB_BUFFER_SCRATCH_FLAG_COMPLEX0


/*
* Joining types:
Expand Down Expand Up @@ -84,7 +91,7 @@ static const hb_tag_t arabic_features[] =


/* Same order as the feature array */
enum {
enum arabic_action_t {
ISOL,
FINA,
FIN2,
Expand All @@ -95,7 +102,11 @@ enum {

NONE,

ARABIC_NUM_FEATURES = NONE
ARABIC_NUM_FEATURES = NONE,

/* We abuse the same byte for other things... */
STCH_FIXED,
STCH_REPEATING,
};

static const struct arabic_state_table_entry {
Expand Down Expand Up @@ -139,6 +150,11 @@ arabic_fallback_shape (const hb_ot_shape_plan_t *plan,
hb_font_t *font,
hb_buffer_t *buffer);

static void
record_stch (const hb_ot_shape_plan_t *plan,
hb_font_t *font,
hb_buffer_t *buffer);

static void
collect_features_arabic (hb_ot_shape_planner_t *plan)
{
Expand All @@ -165,6 +181,9 @@ collect_features_arabic (hb_ot_shape_planner_t *plan)

map->add_gsub_pause (nuke_joiners);

map->add_global_bool_feature (HB_TAG('s','t','c','h'));
map->add_gsub_pause (record_stch);

map->add_global_bool_feature (HB_TAG('c','c','m','p'));
map->add_global_bool_feature (HB_TAG('l','o','c','l'));

Expand Down Expand Up @@ -208,8 +227,10 @@ struct arabic_shape_plan_t
* mask_array[NONE] == 0. */
hb_mask_t mask_array[ARABIC_NUM_FEATURES + 1];

bool do_fallback;
arabic_fallback_plan_t *fallback_plan;

unsigned int do_fallback : 1;
unsigned int has_stch : 1;
};

void *
Expand All @@ -220,6 +241,7 @@ data_create_arabic (const hb_ot_shape_plan_t *plan)
return NULL;

arabic_plan->do_fallback = plan->props.script == HB_SCRIPT_ARABIC;
arabic_plan->has_stch = !!plan->map.get_1_mask (HB_TAG ('s','t','c','h'));
for (unsigned int i = 0; i < ARABIC_NUM_FEATURES; i++) {
arabic_plan->mask_array[i] = plan->map.get_1_mask (arabic_features[i]);
arabic_plan->do_fallback = arabic_plan->do_fallback &&
Expand Down Expand Up @@ -320,8 +342,6 @@ setup_masks_arabic_plan (const arabic_shape_plan_t *arabic_plan,
hb_glyph_info_t *info = buffer->info;
for (unsigned int i = 0; i < count; i++)
info[i].mask |= arabic_plan->mask_array[info[i].arabic_shaping_action()];

HB_BUFFER_DEALLOCATE_VAR (buffer, arabic_shaping_action);
}

static void
Expand Down Expand Up @@ -371,6 +391,193 @@ arabic_fallback_shape (const hb_ot_shape_plan_t *plan,
arabic_fallback_plan_shape (fallback_plan, font, buffer);
}

/*
* Stretch feature: "stch".
* See example here:
* https://www.microsoft.com/typography/OpenTypeDev/syriac/intro.htm
* We implement this in a generic way, such that the Arabic subtending
* marks can use it as well.
*/

static void
record_stch (const hb_ot_shape_plan_t *plan,
hb_font_t *font,
hb_buffer_t *buffer)
{
const arabic_shape_plan_t *arabic_plan = (const arabic_shape_plan_t *) plan->data;
if (!arabic_plan->has_stch)
return;

/* 'stch' feature was just applied. Look for anything that multiplied,
* and record it for stch treatment later. Note that rtlm, frac, etc
* are applied before stch, but we assume that they didn't result in
* anything multiplying into 5 pieces, so it's safe-ish... */

unsigned int count = buffer->len;
hb_glyph_info_t *info = buffer->info;
for (unsigned int i = 0; i < count; i++)
if (unlikely (_hb_glyph_info_multiplied (&info[i])))
{
unsigned int comp = _hb_glyph_info_get_lig_comp (&info[i]);
info[i].arabic_shaping_action() = comp % 2 ? STCH_REPEATING : STCH_FIXED;
buffer->scratch_flags |= HB_BUFFER_SCRATCH_FLAG_ARABIC_HAS_STCH;
}
}

/* Stretches the glyphs that record_stch() tagged STCH_FIXED / STCH_REPEATING
 * over the run of glyphs that precedes them in the buffer.
 *
 * Repeating tiles are duplicated as many times as needed, every tile gets its
 * x_offset set, and the buffer is rewritten in place (back to front) with the
 * extra copies inserted.  Returns immediately unless the buffer carries
 * HB_BUFFER_SCRATCH_FLAG_ARABIC_HAS_STCH.  `plan` is not used here. */
static void
apply_stch (const hb_ot_shape_plan_t *plan,
hb_buffer_t *buffer,
hb_font_t *font)
{
if (likely (!(buffer->scratch_flags & HB_BUFFER_SCRATCH_FLAG_ARABIC_HAS_STCH)))
return;

/* The Arabic shaper currently always processes in RTL mode, so we should
* stretch / position the stretched pieces to the left / preceding glyphs. */

/* We do a two pass implementation:
* First pass calculates the exact number of extra glyphs we need,
* We then enlarge buffer to have that much room,
* Second pass applies the stretch, copying things to the end of buffer.
*/

/* 30 = 2048 / 70.
* https://www.microsoft.com/typography/cursivescriptguidelines.mspx */
hb_position_t overlap = font->x_scale / 30;
DEBUG_MSG (ARABIC, NULL, "overlap for stretching is %d", overlap);
/* sign lets the width comparisons below work for a negative x_scale too. */
int sign = font->x_scale < 0 ? -1 : +1;
unsigned int extra_glyphs_needed = 0; // Set during MEASURE, used during CUT

for (enum step_t { MEASURE, CUT } step = MEASURE; step <= CUT; step = (step_t) (step + 1))
{
/* info / pos are fetched again on each pass, after the buffer was enlarged. */
unsigned int count = buffer->len;
hb_glyph_info_t *info = buffer->info;
hb_glyph_position_t *pos = buffer->pos;
unsigned int new_len = count + extra_glyphs_needed; // write head during CUT
unsigned int j = new_len;
/* Walk the buffer back to front; i - 1 is the glyph being looked at. */
for (unsigned int i = count; i; i--)
{
if (!hb_in_range<unsigned> (info[i - 1].arabic_shaping_action(), STCH_FIXED, STCH_REPEATING))
{
/* Not a stretch tile: during CUT it is only moved to its final slot. */
if (step == CUT)
{
--j;
info[j] = info[i - 1];
pos[j] = pos[i - 1];
}
continue;
}

/* Yay, justification! */

hb_position_t w_total = 0; // Total to be filled
hb_position_t w_fixed = 0; // Sum of fixed tiles
hb_position_t w_repeating = 0; // Sum of repeating tiles
int n_fixed = 0;
int n_repeating = 0;

/* Collect the run of consecutive tiles [start, end), adding up the extents
* width of each tile (less the overlap) per tile kind. */
unsigned int end = i;
while (i &&
hb_in_range<unsigned> (info[i - 1].arabic_shaping_action(), STCH_FIXED, STCH_REPEATING))
{
i--;
hb_glyph_extents_t extents;
if (!font->get_glyph_extents (info[i].codepoint, &extents))
extents.width = 0;
extents.width -= overlap;
if (info[i].arabic_shaping_action() == STCH_FIXED)
{
w_fixed += extents.width;
n_fixed++;
}
else
{
w_repeating += extents.width;
n_repeating++;
}
}
unsigned int start = i;
/* Extend context backwards over the non-tile glyphs that are default
* ignorable or pass HB_UNICODE_GENERAL_CATEGORY_IS_WORD; the sum of their
* advances is the width the tiles have to cover. */
unsigned int context = i;
while (context &&
!hb_in_range<unsigned> (info[context - 1].arabic_shaping_action(), STCH_FIXED, STCH_REPEATING) &&
(_hb_glyph_info_is_default_ignorable (&info[context - 1]) ||
HB_UNICODE_GENERAL_CATEGORY_IS_WORD (_hb_glyph_info_get_general_category (&info[context - 1]))))

This comment has been minimized.

Copy link
@jfkthame

jfkthame Nov 6, 2015

Collaborator

This allows the SAM to extend across non-Syriac letters, which it arguably shouldn't do.... silly example:

echo 070F 072B 0718 0061 0062 0063 0712 002e | hb-unicode-encode | hb-view seguihis.ttf

shows an abbreviation mark that spans the embedded "cba" as well as the Syriac letters.

Unlikely to affect most clients, though, if we assume that text will normally be partitioned into script runs before being passed to harfbuzz. So maybe not worth trying to do anything smarter here.

This comment has been minimized.

Copy link
@behdad

behdad Nov 6, 2015

Author Member

Agree with not changing for now. I really like to keep it based on gen-cat.

This comment has been minimized.

Copy link
@behdad

behdad Nov 7, 2015

Author Member

Thinking more about it, since this is in Arabic shaper, I'll remove Ll, Lu, and Lt classes.

{
context--;
w_total += pos[context].x_advance;
}
/* i was left at start; bump it so that the i-- of the enclosing loop resumes
* at start - 1, which is the first context glyph if there is one. */
i++; // Don't touch i again.

DEBUG_MSG (ARABIC, NULL, "%s stretch at (%d,%d,%d)",
step == MEASURE ? "measuring" : "cutting", context, start, end);
DEBUG_MSG (ARABIC, NULL, "rest of word: count=%d width %d", start - context, w_total);
DEBUG_MSG (ARABIC, NULL, "fixed tiles: count=%d width=%d", n_fixed, w_fixed);
DEBUG_MSG (ARABIC, NULL, "repeating tiles: count=%d width=%d", n_repeating, w_repeating);

/* Number of additional times to repeat each repeating tile. */
int n_copies = 0;

/* Width left for the repeating tiles once the fixed tiles are accounted for.
* n_copies is that width divided by the width of one set of repeating tiles,
* rounded to nearest, minus the one set that is already in the buffer. */
hb_position_t w_remaining = w_total - w_fixed - overlap;
if (sign * w_remaining > sign * w_repeating && sign * w_repeating > 0)
n_copies = (sign * w_remaining + sign * w_repeating / 2) / (sign * w_repeating) - 1;

if (step == MEASURE)
{
extra_glyphs_needed += n_copies * n_repeating;
DEBUG_MSG (ARABIC, NULL, "will add extra %d copies of repeating tiles", n_copies);
}
else
{
/* Emit the tiles back to front; x_offset accumulates the (negative) shift
* so that each emitted copy is offset further than the previous one. */
hb_position_t x_offset = -overlap;
for (unsigned int k = end; k > start; k--)
{
hb_glyph_extents_t extents;
if (!font->get_glyph_extents (info[k - 1].codepoint, &extents))
extents.width = 0;
extents.width -= overlap;

unsigned int repeat = 1;
if (info[k - 1].arabic_shaping_action() == STCH_REPEATING)
repeat += n_copies;

DEBUG_MSG (ARABIC, NULL, "appending %d copies of glyph %d; j=%d",
repeat, info[k - 1].codepoint, j);
for (unsigned int n = 0; n < repeat; n++)
{
x_offset -= extents.width;
pos[k - 1].x_offset = x_offset;
/* Append copy. */
--j;
info[j] = info[k - 1];
pos[j] = pos[k - 1];
}
}
}
}

if (step == MEASURE)
{
/* Make room for the copies before the CUT pass; if the buffer cannot grow,
* leave the loop and the buffer is not rewritten. */
if (unlikely (!buffer->ensure (count + extra_glyphs_needed)))
break;
}
else
{
/* The write head must have reached slot 0, i.e. exactly new_len glyphs
* were written during CUT. */
assert (j == 0);
buffer->len = new_len;
}
}
}


/* postprocess_glyphs hook of the Arabic shaper.
 * Runs apply_stch() on the buffer and only afterwards releases the
 * arabic_shaping_action buffer variable, because apply_stch() still reads
 * that variable to find the glyphs tagged for stretching. */
static void
postprocess_glyphs_arabic (const hb_ot_shape_plan_t *plan,
hb_buffer_t *buffer,
hb_font_t *font)
{
apply_stch (plan, buffer, font);

HB_BUFFER_DEALLOCATE_VAR (buffer, arabic_shaping_action);
}

const hb_ot_complex_shaper_t _hb_ot_complex_shaper_arabic =
{
Expand All @@ -380,7 +587,7 @@ const hb_ot_complex_shaper_t _hb_ot_complex_shaper_arabic =
data_create_arabic,
data_destroy_arabic,
NULL, /* preprocess_text */
NULL, /* postprocess_glyphs */
postprocess_glyphs_arabic,
HB_OT_SHAPE_NORMALIZATION_MODE_DEFAULT,
NULL, /* decompose */
NULL, /* compose */
Expand Down
19 changes: 19 additions & 0 deletions src/hb-unicode-private.hh
Original file line number Diff line number Diff line change
Expand Up @@ -362,5 +362,24 @@ extern HB_INTERNAL const hb_unicode_funcs_t _hb_unicode_funcs_nil;
(FLAG (HB_UNICODE_GENERAL_CATEGORY_SPACING_MARK) | \
FLAG (HB_UNICODE_GENERAL_CATEGORY_NON_SPACING_MARK)))

/* Evaluates to non-zero when gen_cat is one of the general categories listed
 * below: unassigned, private use, the letter, mark and number categories, and
 * the symbol categories.  Categories that are not listed do not match.
 * The Arabic shaper uses this to decide which glyphs preceding a stretched
 * 'stch' glyph count towards the width to be covered. */
#define HB_UNICODE_GENERAL_CATEGORY_IS_WORD(gen_cat) \
(FLAG_SAFE (gen_cat) & \
(FLAG (HB_UNICODE_GENERAL_CATEGORY_UNASSIGNED) | \
FLAG (HB_UNICODE_GENERAL_CATEGORY_PRIVATE_USE) | \
FLAG (HB_UNICODE_GENERAL_CATEGORY_LOWERCASE_LETTER) | \
FLAG (HB_UNICODE_GENERAL_CATEGORY_MODIFIER_LETTER) | \
FLAG (HB_UNICODE_GENERAL_CATEGORY_OTHER_LETTER) | \
FLAG (HB_UNICODE_GENERAL_CATEGORY_TITLECASE_LETTER) | \
FLAG (HB_UNICODE_GENERAL_CATEGORY_UPPERCASE_LETTER) | \
FLAG (HB_UNICODE_GENERAL_CATEGORY_SPACING_MARK) | \
FLAG (HB_UNICODE_GENERAL_CATEGORY_ENCLOSING_MARK) | \
FLAG (HB_UNICODE_GENERAL_CATEGORY_NON_SPACING_MARK) | \
FLAG (HB_UNICODE_GENERAL_CATEGORY_DECIMAL_NUMBER) | \
FLAG (HB_UNICODE_GENERAL_CATEGORY_LETTER_NUMBER) | \
FLAG (HB_UNICODE_GENERAL_CATEGORY_OTHER_NUMBER) | \
FLAG (HB_UNICODE_GENERAL_CATEGORY_CURRENCY_SYMBOL) | \
FLAG (HB_UNICODE_GENERAL_CATEGORY_MODIFIER_SYMBOL) | \
FLAG (HB_UNICODE_GENERAL_CATEGORY_MATH_SYMBOL) | \
FLAG (HB_UNICODE_GENERAL_CATEGORY_OTHER_SYMBOL)))

This comment has been minimized.

Copy link
@jfkthame

jfkthame Nov 6, 2015

Collaborator

It seems a bit surprising to see the *_SYMBOL types included here; do we know whether Uniscribe lets the SAM span things like a dollar sign?

This comment has been minimized.

Copy link
@behdad

behdad Nov 6, 2015

Author Member

I have no idea what Uniscribe allows.

This comment has been minimized.

Copy link
@behdad

behdad Nov 7, 2015

Author Member

Actually, for Arabic use, I was hoping that decimal separator / thousands separator could be included, but that's hard as those are punctuation. I leave this as is for now. (I did remove the cased letters).


#endif /* HB_UNICODE_PRIVATE_HH */
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
abbreviation-mark.txt
alaph.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
ܐܒ
ܐ܏
ܐ܏ܒ
ܐ܏ܒܓ
ܐ܏ܒܓܕ
ܐ܏ܒܓܕܐ
ܐ܏ܒܓܕܐܐܐܐܐܐܐܐܐ
ܐ܏ܒܓܕܓܓܓܓܓܓ
ܐ܏ܒܓ


0 comments on commit 6e6f82b

Please sign in to comment.