Skip to content

Commit

Permalink
only validate Unicode surrogates if Options.validateUtf8Strings is set (
Browse files Browse the repository at this point in the history
  • Loading branch information
jsteemann committed Apr 27, 2023
1 parent 805abdd commit 3e3e941
Show file tree
Hide file tree
Showing 2 changed files with 31 additions and 6 deletions.
8 changes: 4 additions & 4 deletions src/Parser.cpp
Expand Up @@ -294,7 +294,7 @@ void Parser::parseString() {
len >>= 8;
}
}
if (VELOCYPACK_UNLIKELY(highSurrogate != 0)) {
if (VELOCYPACK_UNLIKELY(options->validateUtf8Strings && highSurrogate != 0)) {
throw Exception(Exception::InvalidUtf8Sequence,
"Unexpected end of string after high surrogate");
}
Expand Down Expand Up @@ -362,7 +362,7 @@ void Parser::parseString() {
_builderPtr->appendByteUnchecked(0x80 + ((v >> 6) & 0x3f));
_builderPtr->appendByteUnchecked(0x80 + (v & 0x3f));
highSurrogate = 0;
} else {
} else if (options->validateUtf8Strings) {
// Low surrogate without a high surrogate first
throw Exception(Exception::InvalidUtf8Sequence,
"Unexpected \\uXXXX escape sequence (low surrogate without high surrogate)");
Expand All @@ -377,7 +377,7 @@ void Parser::parseString() {
_builderPtr->appendByteUnchecked(0x80 + (v & 0x3f));

continue;
} else {
} else if (options->validateUtf8Strings) {
throw Exception(Exception::InvalidUtf8Sequence,
"Unexpected \\uXXXX escape sequence (multiple adjacent high surrogates)");
}
Expand Down Expand Up @@ -437,7 +437,7 @@ void Parser::parseString() {
break;
}

if (VELOCYPACK_UNLIKELY(highSurrogate != 0)) {
if (VELOCYPACK_UNLIKELY(options->validateUtf8Strings && highSurrogate != 0)) {
throw Exception(Exception::InvalidUtf8Sequence,
"Unexpected \\uXXXX escape sequence (high surrogate without low surrogate)");
}
Expand Down
29 changes: 27 additions & 2 deletions tests/testsParser.cpp
Expand Up @@ -27,6 +27,7 @@
#include <array>
#include <ostream>
#include <string>
#include <utility>

#include "tests-common.h"

Expand Down Expand Up @@ -1154,7 +1155,6 @@ TEST(ParserTest, StringLiteralWithSpecials) {

Parser parser;
ValueLength len = parser.parse(value);
std::cout << "GOOD 1 \n";

ASSERT_EQ(1ULL, len);

Expand Down Expand Up @@ -1207,12 +1207,37 @@ TEST(ParserTest, StringLiteralWithInvalidSurrogatePairs) {
"\"\\ud801\\ud801\"", // 2 high surrogates
};

Parser parser;
Options options;
options.validateUtf8Strings = true;

Parser parser(&options);
for (auto const& value : values) {
ASSERT_VELOCYPACK_EXCEPTION(parser.parse(value), Exception::InvalidUtf8Sequence);
}
}

// Counterpart to the strict surrogate test: with Options::validateUtf8Strings
// turned off, broken \uXXXX surrogate escapes must parse without throwing.
// Each case pairs a JSON string literal with the byte length expected for the
// resulting string value.
TEST(ParserTest, StringLiteralWithInvalidSurrogatePairsDisabled) {
  using Case = std::pair<std::string_view, std::size_t>;

  std::array<Case, 6> const cases = {{
      // lone low surrogate, no high surrogate in front of it
      Case("\"\\udc89\"", std::size_t(0)),
      // two low surrogates in a row
      Case("\"\\udc89\\udc89\"", std::size_t(0)),
      // high surrogate directly followed by the end of the string
      Case("\"\\ud801\"", std::size_t(3)),
      // high surrogate followed by one plain character
      Case("\"\\ud801a\"", std::size_t(4)),
      // high surrogate followed by two plain characters
      Case("\"\\ud801ab\"", std::size_t(5)),
      // two high surrogates in a row
      Case("\"\\ud801\\ud801\"", std::size_t(3)),
  }};

  Options opts;
  opts.validateUtf8Strings = false;

  for (auto const& testCase : cases) {
    // A fresh parser per input, so no state carries over between cases.
    Parser parser(&opts);
    parser.parse(testCase.first);

    auto built = parser.steal();
    auto const slice = built->slice();
    ASSERT_TRUE(slice.isString());
    ASSERT_EQ(testCase.second, slice.stringView().size());
  }
}

TEST(ParserTest, EmptyArray) {
std::string const value("[]");

Expand Down

0 comments on commit 3e3e941

Please sign in to comment.