Skip to content
Permalink
Browse files

Fix multibyte issues in Text::tokenize()

String offset slicing was done bytewise rather than characterwise; slicing
characterwise is necessary for multibyte characters to be used as separators.

Refs #6998
  • Loading branch information...
markstory committed Jul 12, 2015
1 parent ed1ba97 commit 1e2d1b8dc50e05abc428898926c01d9ddea42f71
Showing with 17 additions and 11 deletions.
  1. +12 −11 src/Utility/Text.php
  2. +5 −0 tests/TestCase/Utility/TextTest.php
@@ -75,38 +75,39 @@ public static function tokenize($data, $separator = ',', $leftBound = '(', $righ
$offset = 0;
$buffer = '';
$results = [];
$length = strlen($data);
$length = mb_strlen($data);
$open = false;
while ($offset <= $length) {
$tmpOffset = -1;
$offsets = [
strpos($data, $separator, $offset),
strpos($data, $leftBound, $offset),
strpos($data, $rightBound, $offset)
mb_strpos($data, $separator, $offset),
mb_strpos($data, $leftBound, $offset),
mb_strpos($data, $rightBound, $offset)
];
for ($i = 0; $i < 3; $i++) {
if ($offsets[$i] !== false && ($offsets[$i] < $tmpOffset || $tmpOffset == -1)) {
$tmpOffset = $offsets[$i];
}
}
if ($tmpOffset !== -1) {
$buffer .= substr($data, $offset, ($tmpOffset - $offset));
if (!$depth && $data{$tmpOffset} === $separator) {
$buffer .= mb_substr($data, $offset, ($tmpOffset - $offset));
$char = mb_substr($data, $tmpOffset, 1);
if (!$depth && $char === $separator) {
$results[] = $buffer;
$buffer = '';
} else {
$buffer .= $data{$tmpOffset};
$buffer .= $char;
}
if ($leftBound !== $rightBound) {
if ($data{$tmpOffset} === $leftBound) {
if ($char === $leftBound) {
$depth++;
}
if ($data{$tmpOffset} === $rightBound) {
if ($char === $rightBound) {
$depth--;
}
} else {
if ($data{$tmpOffset} === $leftBound) {
if ($char === $leftBound) {
if (!$open) {
$depth++;
$open = true;
@@ -117,7 +118,7 @@ public static function tokenize($data, $separator = ',', $leftBound = '(', $righ
}
$offset = ++$tmpOffset;
} else {
$results[] = $buffer . substr($data, $offset);
$results[] = $buffer . mb_substr($data, $offset);
$offset = $length + 1;
}
}
@@ -315,6 +315,11 @@ public function testTokenize()
$result = Text::tokenize('tagA "single tag" tagB', ' ', '"', '"');
$expected = ['tagA', '"single tag"', 'tagB'];
$this->assertEquals($expected, $result);
// Ideographic width space.
$result = Text::tokenize("tagA\xe3\x80\x80\"single\xe3\x80\x80tag\"\xe3\x80\x80tagB", "\xe3\x80\x80", '"', '"');
$expected = ['tagA', '"single tag"', 'tagB'];
$this->assertEquals($expected, $result);
}
public function testReplaceWithQuestionMarkInString()

0 comments on commit 1e2d1b8

Please sign in to comment.
You can’t perform that action at this time.