Skip to content

Commit

Permalink
properly validate Unicode surrogate pairs (#158)
Browse files Browse the repository at this point in the history
properly validate Unicode surrogate pairs

previously there were loopholes when JSON-parsing strings with Unicode surrogate pairs:
- it was not verified if a high surrogate was actually followed by a low surrogate.
- it was not verified if only a low surrogate occurred.
  • Loading branch information
jsteemann committed Apr 27, 2023
1 parent 373d0e2 commit 805abdd
Show file tree
Hide file tree
Showing 2 changed files with 55 additions and 25 deletions.
63 changes: 38 additions & 25 deletions src/Parser.cpp
Expand Up @@ -278,6 +278,7 @@ void Parser::parseString() {
checkOverflow(len));
_builderPtr->advance(8);
}

switch (i) {
case '"':
ValueLength len;
Expand All @@ -293,6 +294,10 @@ void Parser::parseString() {
len >>= 8;
}
}
if (VELOCYPACK_UNLIKELY(highSurrogate != 0)) {
throw Exception(Exception::InvalidUtf8Sequence,
"Unexpected end of string after high surrogate");
}
return;
case '\\':
// Handle cases or throw error
Expand All @@ -305,27 +310,21 @@ void Parser::parseString() {
case '/':
case '\\':
_builderPtr->appendByte(static_cast<uint8_t>(i));
highSurrogate = 0;
break;
case 'b':
_builderPtr->appendByte('\b');
highSurrogate = 0;
break;
case 'f':
_builderPtr->appendByte('\f');
highSurrogate = 0;
break;
case 'n':
_builderPtr->appendByte('\n');
highSurrogate = 0;
break;
case 'r':
_builderPtr->appendByte('\r');
highSurrogate = 0;
break;
case 't':
_builderPtr->appendByte('\t');
highSurrogate = 0;
break;
case 'u': {
uint32_t v = 0;
Expand All @@ -343,34 +342,46 @@ void Parser::parseString() {
v = (v << 4) + i - 'A' + 10;
} else {
throw Exception(Exception::ParseError,
"Illegal \\uXXXX escape sequence");
"Illegal \\uXXXX escape sequence character");
}
}
if (v < 0x80) {
_builderPtr->appendByte(static_cast<uint8_t>(v));
highSurrogate = 0;
} else if (v < 0x800) {
_builderPtr->reserve(2);
_builderPtr->appendByteUnchecked(0xc0 + (v >> 6));
_builderPtr->appendByteUnchecked(0x80 + (v & 0x3f));
highSurrogate = 0;
} else if (v >= 0xdc00 && v < 0xe000 && highSurrogate != 0) {
// Low surrogate, put the two together:
v = 0x10000 + ((highSurrogate - 0xd800) << 10) + v - 0xdc00;
_builderPtr->rollback(3);
_builderPtr->reserve(4);
_builderPtr->appendByteUnchecked(0xf0 + (v >> 18));
_builderPtr->appendByteUnchecked(0x80 + ((v >> 12) & 0x3f));
_builderPtr->appendByteUnchecked(0x80 + ((v >> 6) & 0x3f));
_builderPtr->appendByteUnchecked(0x80 + (v & 0x3f));
highSurrogate = 0;
} else {
if (v >= 0xd800 && v < 0xdc00) {
} else if (v >= 0xdc00 && v < 0xe000) {
if (highSurrogate != 0) {
// Low surrogate, put the two together:
v = 0x10000 + ((highSurrogate - 0xd800) << 10) + v - 0xdc00;
_builderPtr->rollback(3);
_builderPtr->reserve(4);
_builderPtr->appendByteUnchecked(0xf0 + (v >> 18));
_builderPtr->appendByteUnchecked(0x80 + ((v >> 12) & 0x3f));
_builderPtr->appendByteUnchecked(0x80 + ((v >> 6) & 0x3f));
_builderPtr->appendByteUnchecked(0x80 + (v & 0x3f));
highSurrogate = 0;
} else {
// Low surrogate without a high surrogate first
throw Exception(Exception::InvalidUtf8Sequence,
"Unexpected \\uXXXX escape sequence (low surrogate without high surrogate)");
}
} else if (v >= 0xd800 && v < 0xdc00) {
if (highSurrogate == 0) {
// High surrogate:
highSurrogate = v;
_builderPtr->reserve(3);
_builderPtr->appendByteUnchecked(0xe0 + (v >> 12));
_builderPtr->appendByteUnchecked(0x80 + ((v >> 6) & 0x3f));
_builderPtr->appendByteUnchecked(0x80 + (v & 0x3f));

continue;
} else {
highSurrogate = 0;
throw Exception(Exception::InvalidUtf8Sequence,
"Unexpected \\uXXXX escape sequence (multiple adjacent high surrogates)");
}
} else {
_builderPtr->reserve(3);
_builderPtr->appendByteUnchecked(0xe0 + (v >> 12));
_builderPtr->appendByteUnchecked(0x80 + ((v >> 6) & 0x3f));
Expand All @@ -389,11 +400,9 @@ void Parser::parseString() {
// control character
throw Exception(Exception::UnexpectedControlCharacter);
}
highSurrogate = 0;
_builderPtr->appendByte(static_cast<uint8_t>(i));
} else {
if (!options->validateUtf8Strings) {
highSurrogate = 0;
_builderPtr->appendByte(static_cast<uint8_t>(i));
} else {
// multi-byte UTF-8 sequence!
Expand Down Expand Up @@ -423,11 +432,15 @@ void Parser::parseString() {
}
_builderPtr->appendByteUnchecked(static_cast<uint8_t>(i));
}
highSurrogate = 0;
}
}
break;
}

if (VELOCYPACK_UNLIKELY(highSurrogate != 0)) {
throw Exception(Exception::InvalidUtf8Sequence,
"Unexpected \\uXXXX escape sequence (high surrogate without low surrogate)");
}
}
}

Expand Down
17 changes: 17 additions & 0 deletions tests/testsParser.cpp
Expand Up @@ -24,6 +24,7 @@
/// @author Copyright 2015, ArangoDB GmbH, Cologne, Germany
////////////////////////////////////////////////////////////////////////////////

#include <array>
#include <ostream>
#include <string>

Expand Down Expand Up @@ -1196,6 +1197,22 @@ TEST(ParserTest, StringLiteralWithSurrogatePairs) {
checkDump(s, valueOut);
}

TEST(ParserTest, StringLiteralWithInvalidSurrogatePairs) {
constexpr std::array<std::string_view, 6> values = {
"\"\\udc89\"", // low surrogate, not preceeded by high surrogate
"\"\\udc89\\udc89\"", // 2 low surrogates
"\"\\ud801\"", // high surrogate, not followed by low surrogate
"\"\\ud801a\"", // high surrogate, not followed by low surrogate
"\"\\ud801ab\"", // high surrogate, not followed by low surrogate
"\"\\ud801\\ud801\"", // 2 high surrogates
};

Parser parser;
for (auto const& value : values) {
ASSERT_VELOCYPACK_EXCEPTION(parser.parse(value), Exception::InvalidUtf8Sequence);
}
}

TEST(ParserTest, EmptyArray) {
std::string const value("[]");

Expand Down

0 comments on commit 805abdd

Please sign in to comment.