Skip to content

Commit 0aa1c78

Browse files
committed
properly validate Unicode surrogate pairs
previously there were loopholes when JSON-parsing strings with Unicode surrogate pairs: it was not verified if a high surrogate was actually followed by a low surrogate. it was not verified if only a low surrogate occurred.
1 parent 373d0e2 commit 0aa1c78

File tree

2 files changed

+55
-25
lines changed

2 files changed

+55
-25
lines changed

src/Parser.cpp

Lines changed: 38 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -278,6 +278,8 @@ void Parser::parseString() {
278278
checkOverflow(len));
279279
_builderPtr->advance(8);
280280
}
281+
// Did we find a high surrogate in this turn?
282+
bool highSurrogateFound = false;
281283
switch (i) {
282284
case '"':
283285
ValueLength len;
@@ -293,6 +295,10 @@ void Parser::parseString() {
293295
len >>= 8;
294296
}
295297
}
298+
if (VELOCYPACK_UNLIKELY(highSurrogate != 0)) {
299+
throw Exception(Exception::InvalidUtf8Sequence,
300+
"Unexpected end of string after high surrogate");
301+
}
296302
return;
297303
case '\\':
298304
// Handle cases or throw error
@@ -305,27 +311,21 @@ void Parser::parseString() {
305311
case '/':
306312
case '\\':
307313
_builderPtr->appendByte(static_cast<uint8_t>(i));
308-
highSurrogate = 0;
309314
break;
310315
case 'b':
311316
_builderPtr->appendByte('\b');
312-
highSurrogate = 0;
313317
break;
314318
case 'f':
315319
_builderPtr->appendByte('\f');
316-
highSurrogate = 0;
317320
break;
318321
case 'n':
319322
_builderPtr->appendByte('\n');
320-
highSurrogate = 0;
321323
break;
322324
case 'r':
323325
_builderPtr->appendByte('\r');
324-
highSurrogate = 0;
325326
break;
326327
case 't':
327328
_builderPtr->appendByte('\t');
328-
highSurrogate = 0;
329329
break;
330330
case 'u': {
331331
uint32_t v = 0;
@@ -343,34 +343,45 @@ void Parser::parseString() {
343343
v = (v << 4) + i - 'A' + 10;
344344
} else {
345345
throw Exception(Exception::ParseError,
346-
"Illegal \\uXXXX escape sequence");
346+
"Illegal \\uXXXX escape sequence character");
347347
}
348348
}
349349
if (v < 0x80) {
350350
_builderPtr->appendByte(static_cast<uint8_t>(v));
351-
highSurrogate = 0;
352351
} else if (v < 0x800) {
353352
_builderPtr->reserve(2);
354353
_builderPtr->appendByteUnchecked(0xc0 + (v >> 6));
355354
_builderPtr->appendByteUnchecked(0x80 + (v & 0x3f));
356-
highSurrogate = 0;
357-
} else if (v >= 0xdc00 && v < 0xe000 && highSurrogate != 0) {
358-
// Low surrogate, put the two together:
359-
v = 0x10000 + ((highSurrogate - 0xd800) << 10) + v - 0xdc00;
360-
_builderPtr->rollback(3);
361-
_builderPtr->reserve(4);
362-
_builderPtr->appendByteUnchecked(0xf0 + (v >> 18));
363-
_builderPtr->appendByteUnchecked(0x80 + ((v >> 12) & 0x3f));
364-
_builderPtr->appendByteUnchecked(0x80 + ((v >> 6) & 0x3f));
365-
_builderPtr->appendByteUnchecked(0x80 + (v & 0x3f));
366-
highSurrogate = 0;
367-
} else {
368-
if (v >= 0xd800 && v < 0xdc00) {
355+
} else if (v >= 0xdc00 && v < 0xe000) {
356+
if (highSurrogate != 0) {
357+
// Low surrogate, put the two together:
358+
v = 0x10000 + ((highSurrogate - 0xd800) << 10) + v - 0xdc00;
359+
_builderPtr->rollback(3);
360+
_builderPtr->reserve(4);
361+
_builderPtr->appendByteUnchecked(0xf0 + (v >> 18));
362+
_builderPtr->appendByteUnchecked(0x80 + ((v >> 12) & 0x3f));
363+
_builderPtr->appendByteUnchecked(0x80 + ((v >> 6) & 0x3f));
364+
_builderPtr->appendByteUnchecked(0x80 + (v & 0x3f));
365+
highSurrogate = 0;
366+
} else {
367+
// Low surrogate without a high surrogate first
368+
throw Exception(Exception::InvalidUtf8Sequence,
369+
"Unexpected \\uXXXX escape sequence (low surrogate without high surrogate)");
370+
}
371+
} else if (v >= 0xd800 && v < 0xdc00) {
372+
if (highSurrogate == 0) {
369373
// High surrogate:
370374
highSurrogate = v;
375+
highSurrogateFound = true;
376+
_builderPtr->reserve(3);
377+
_builderPtr->appendByteUnchecked(0xe0 + (v >> 12));
378+
_builderPtr->appendByteUnchecked(0x80 + ((v >> 6) & 0x3f));
379+
_builderPtr->appendByteUnchecked(0x80 + (v & 0x3f));
371380
} else {
372-
highSurrogate = 0;
381+
throw Exception(Exception::InvalidUtf8Sequence,
382+
"Unexpected \\uXXXX escape sequence (multiple adjacent high surrogates)");
373383
}
384+
} else {
374385
_builderPtr->reserve(3);
375386
_builderPtr->appendByteUnchecked(0xe0 + (v >> 12));
376387
_builderPtr->appendByteUnchecked(0x80 + ((v >> 6) & 0x3f));
@@ -389,11 +400,9 @@ void Parser::parseString() {
389400
// control character
390401
throw Exception(Exception::UnexpectedControlCharacter);
391402
}
392-
highSurrogate = 0;
393403
_builderPtr->appendByte(static_cast<uint8_t>(i));
394404
} else {
395405
if (!options->validateUtf8Strings) {
396-
highSurrogate = 0;
397406
_builderPtr->appendByte(static_cast<uint8_t>(i));
398407
} else {
399408
// multi-byte UTF-8 sequence!
@@ -423,11 +432,15 @@ void Parser::parseString() {
423432
}
424433
_builderPtr->appendByteUnchecked(static_cast<uint8_t>(i));
425434
}
426-
highSurrogate = 0;
427435
}
428436
}
429437
break;
430438
}
439+
440+
if (VELOCYPACK_UNLIKELY(highSurrogate != 0 && !highSurrogateFound)) {
441+
throw Exception(Exception::InvalidUtf8Sequence,
442+
"Unexpected \\uXXXX escape sequence (high surrogate without low surrogate)");
443+
}
431444
}
432445
}
433446

tests/testsParser.cpp

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
/// @author Copyright 2015, ArangoDB GmbH, Cologne, Germany
2525
////////////////////////////////////////////////////////////////////////////////
2626

27+
#include <array>
2728
#include <ostream>
2829
#include <string>
2930

@@ -1196,6 +1197,22 @@ TEST(ParserTest, StringLiteralWithSurrogatePairs) {
11961197
checkDump(s, valueOut);
11971198
}
11981199

1200+
TEST(ParserTest, StringLiteralWithInvalidSurrogatePairs) {
1201+
constexpr std::array<std::string_view, 6> values = {
1202+
"\"\\udc89\"", // low surrogate, not preceeded by high surrogate
1203+
"\"\\udc89\\udc89\"", // 2 low surrogates
1204+
"\"\\ud801\"", // high surrogate, not followed by low surrogate
1205+
"\"\\ud801a\"", // high surrogate, not followed by low surrogate
1206+
"\"\\ud801ab\"", // high surrogate, not followed by low surrogate
1207+
"\"\\ud801\\ud801\"", // 2 high surrogates
1208+
};
1209+
1210+
Parser parser;
1211+
for (auto const& value : values) {
1212+
ASSERT_VELOCYPACK_EXCEPTION(parser.parse(value), Exception::InvalidUtf8Sequence);
1213+
}
1214+
}
1215+
11991216
TEST(ParserTest, EmptyArray) {
12001217
std::string const value("[]");
12011218

0 commit comments

Comments
 (0)