@@ -278,6 +278,8 @@ void Parser::parseString() {
278278 checkOverflow (len));
279279 _builderPtr->advance (8 );
280280 }
281+ // Did we find a high surrogate in this turn?
282+ bool highSurrogateFound = false ;
281283 switch (i) {
282284 case ' "' :
283285 ValueLength len;
@@ -293,6 +295,10 @@ void Parser::parseString() {
293295 len >>= 8 ;
294296 }
295297 }
298+ if (VELOCYPACK_UNLIKELY (highSurrogate != 0 )) {
299+ throw Exception (Exception::InvalidUtf8Sequence,
300+ " Unexpected end of string after high surrogate" );
301+ }
296302 return ;
297303 case ' \\ ' :
298304 // Handle cases or throw error
@@ -305,27 +311,21 @@ void Parser::parseString() {
305311 case ' /' :
306312 case ' \\ ' :
307313 _builderPtr->appendByte (static_cast <uint8_t >(i));
308- highSurrogate = 0 ;
309314 break ;
310315 case ' b' :
311316 _builderPtr->appendByte (' \b ' );
312- highSurrogate = 0 ;
313317 break ;
314318 case ' f' :
315319 _builderPtr->appendByte (' \f ' );
316- highSurrogate = 0 ;
317320 break ;
318321 case ' n' :
319322 _builderPtr->appendByte (' \n ' );
320- highSurrogate = 0 ;
321323 break ;
322324 case ' r' :
323325 _builderPtr->appendByte (' \r ' );
324- highSurrogate = 0 ;
325326 break ;
326327 case ' t' :
327328 _builderPtr->appendByte (' \t ' );
328- highSurrogate = 0 ;
329329 break ;
330330 case ' u' : {
331331 uint32_t v = 0 ;
@@ -343,34 +343,45 @@ void Parser::parseString() {
343343 v = (v << 4 ) + i - ' A' + 10 ;
344344 } else {
345345 throw Exception (Exception::ParseError,
346- " Illegal \\ uXXXX escape sequence" );
346+ " Illegal \\ uXXXX escape sequence character " );
347347 }
348348 }
349349 if (v < 0x80 ) {
350350 _builderPtr->appendByte (static_cast <uint8_t >(v));
351- highSurrogate = 0 ;
352351 } else if (v < 0x800 ) {
353352 _builderPtr->reserve (2 );
354353 _builderPtr->appendByteUnchecked (0xc0 + (v >> 6 ));
355354 _builderPtr->appendByteUnchecked (0x80 + (v & 0x3f ));
356- highSurrogate = 0 ;
357- } else if (v >= 0xdc00 && v < 0xe000 && highSurrogate != 0 ) {
358- // Low surrogate, put the two together:
359- v = 0x10000 + ((highSurrogate - 0xd800 ) << 10 ) + v - 0xdc00 ;
360- _builderPtr->rollback (3 );
361- _builderPtr->reserve (4 );
362- _builderPtr->appendByteUnchecked (0xf0 + (v >> 18 ));
363- _builderPtr->appendByteUnchecked (0x80 + ((v >> 12 ) & 0x3f ));
364- _builderPtr->appendByteUnchecked (0x80 + ((v >> 6 ) & 0x3f ));
365- _builderPtr->appendByteUnchecked (0x80 + (v & 0x3f ));
366- highSurrogate = 0 ;
367- } else {
368- if (v >= 0xd800 && v < 0xdc00 ) {
355+ } else if (v >= 0xdc00 && v < 0xe000 ) {
356+ if (highSurrogate != 0 ) {
357+ // Low surrogate, put the two together:
358+ v = 0x10000 + ((highSurrogate - 0xd800 ) << 10 ) + v - 0xdc00 ;
359+ _builderPtr->rollback (3 );
360+ _builderPtr->reserve (4 );
361+ _builderPtr->appendByteUnchecked (0xf0 + (v >> 18 ));
362+ _builderPtr->appendByteUnchecked (0x80 + ((v >> 12 ) & 0x3f ));
363+ _builderPtr->appendByteUnchecked (0x80 + ((v >> 6 ) & 0x3f ));
364+ _builderPtr->appendByteUnchecked (0x80 + (v & 0x3f ));
365+ highSurrogate = 0 ;
366+ } else {
367+ // Low surrogate without a high surrogate first
368+ throw Exception (Exception::InvalidUtf8Sequence,
369+ " Unexpected \\ uXXXX escape sequence (low surrogate without high surrogate)" );
370+ }
371+ } else if (v >= 0xd800 && v < 0xdc00 ) {
372+ if (highSurrogate == 0 ) {
369373 // High surrogate:
370374 highSurrogate = v;
375+ highSurrogateFound = true ;
376+ _builderPtr->reserve (3 );
377+ _builderPtr->appendByteUnchecked (0xe0 + (v >> 12 ));
378+ _builderPtr->appendByteUnchecked (0x80 + ((v >> 6 ) & 0x3f ));
379+ _builderPtr->appendByteUnchecked (0x80 + (v & 0x3f ));
371380 } else {
372- highSurrogate = 0 ;
381+ throw Exception (Exception::InvalidUtf8Sequence,
382+ " Unexpected \\ uXXXX escape sequence (multiple adjacent high surrogates)" );
373383 }
384+ } else {
374385 _builderPtr->reserve (3 );
375386 _builderPtr->appendByteUnchecked (0xe0 + (v >> 12 ));
376387 _builderPtr->appendByteUnchecked (0x80 + ((v >> 6 ) & 0x3f ));
@@ -389,11 +400,9 @@ void Parser::parseString() {
389400 // control character
390401 throw Exception (Exception::UnexpectedControlCharacter);
391402 }
392- highSurrogate = 0 ;
393403 _builderPtr->appendByte (static_cast <uint8_t >(i));
394404 } else {
395405 if (!options->validateUtf8Strings ) {
396- highSurrogate = 0 ;
397406 _builderPtr->appendByte (static_cast <uint8_t >(i));
398407 } else {
399408 // multi-byte UTF-8 sequence!
@@ -423,11 +432,15 @@ void Parser::parseString() {
423432 }
424433 _builderPtr->appendByteUnchecked (static_cast <uint8_t >(i));
425434 }
426- highSurrogate = 0 ;
427435 }
428436 }
429437 break ;
430438 }
439+
440+ if (VELOCYPACK_UNLIKELY (highSurrogate != 0 && !highSurrogateFound)) {
441+ throw Exception (Exception::InvalidUtf8Sequence,
442+ " Unexpected \\ uXXXX escape sequence (high surrogate without low surrogate)" );
443+ }
431444 }
432445}
433446
0 commit comments