Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
Backport "Fix handling of surrogates on decoding"
  • Loading branch information
adrianeboyd committed Jul 20, 2022
1 parent 63a8a1c commit 953eafc
Show file tree
Hide file tree
Showing 3 changed files with 58 additions and 28 deletions.
38 changes: 37 additions & 1 deletion srsly/tests/ujson/test_ujson.py
@@ -1,4 +1,5 @@
import decimal
import ctypes
import decimal
import json
import math
import sys
Expand Down Expand Up @@ -958,3 +959,38 @@ def test_issue_334(indent):
path = Path(__file__).with_name("334-reproducer.json")
a = ujson.loads(path.read_bytes())
ujson.dumps(a, indent=indent)


@pytest.mark.parametrize(
"test_input, expected",
[
# Normal cases
(r'"\uD83D\uDCA9"', "\U0001F4A9"),
(r'"a\uD83D\uDCA9b"', "a\U0001F4A9b"),
# Unpaired surrogates
(r'"\uD800"', "\uD800"),
(r'"a\uD800b"', "a\uD800b"),
(r'"\uDEAD"', "\uDEAD"),
(r'"a\uDEADb"', "a\uDEADb"),
(r'"\uD83D\uD83D\uDCA9"', "\uD83D\U0001F4A9"),
(r'"\uDCA9\uD83D\uDCA9"', "\uDCA9\U0001F4A9"),
(r'"\uD83D\uDCA9\uD83D"', "\U0001F4A9\uD83D"),
(r'"\uD83D\uDCA9\uDCA9"', "\U0001F4A9\uDCA9"),
(r'"\uD83D \uDCA9"', "\uD83D \uDCA9"),
# No decoding of actual surrogate characters (rather than escaped ones)
('"\uD800"', "\uD800"),
('"\uDEAD"', "\uDEAD"),
('"\uD800a\uDEAD"', "\uD800a\uDEAD"),
('"\uD83D\uDCA9"', "\uD83D\uDCA9"),
],
)
def test_decode_surrogate_characters(test_input, expected):
# FIXME Wrong output (combined char) on platforms with 16-bit wchar_t
if test_input == '"\uD83D\uDCA9"' and ctypes.sizeof(ctypes.c_wchar) == 2:
pytest.skip("Raw surrogate pairs are not supported with 16-bit wchar_t")

assert ujson.loads(test_input) == expected
assert ujson.loads(test_input.encode("utf-8", "surrogatepass")) == expected

# Ensure that this matches stdlib's behaviour
assert json.loads(test_input) == expected
2 changes: 1 addition & 1 deletion srsly/ujson/JSONtoObj.c
Expand Up @@ -161,7 +161,7 @@ PyObject* JSONToObj(PyObject* self, PyObject *args, PyObject *kwargs)
else
if (PyUnicode_Check(arg))
{
sarg = PyUnicode_AsUTF8String(arg);
sarg = PyUnicode_AsEncodedString(arg, NULL, "surrogatepass");
if (sarg == NULL)
{
//Exception raised above us by codec according to docs
Expand Down
46 changes: 20 additions & 26 deletions srsly/ujson/lib/ultrajsondec.c
Expand Up @@ -424,13 +424,15 @@ static const JSUINT8 g_decoderLookup[256] =

FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_string ( struct DecoderState *ds)
{
JSUTF16 sur[2] = { 0 };
int iSur = 0;
int index;
wchar_t *escOffset;
wchar_t *escStart;
size_t escLen = (ds->escEnd - ds->escStart);
JSUINT8 *inputOffset;
JSUTF16 ch = 0;
#if WCHAR_MAX >= 0x10FFFF
JSUINT8 *lastHighSurrogate = NULL;
#endif
JSUINT8 oct;
JSUTF32 ucs;
ds->lastType = JT_INVALID;
Expand Down Expand Up @@ -530,7 +532,7 @@ FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_string ( struct DecoderState *ds)
case '7':
case '8':
case '9':
sur[iSur] = (sur[iSur] << 4) + (JSUTF16) (*inputOffset - '0');
ch = (ch << 4) + (JSUTF16) (*inputOffset - '0');
break;

case 'a':
Expand All @@ -539,7 +541,7 @@ FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_string ( struct DecoderState *ds)
case 'd':
case 'e':
case 'f':
sur[iSur] = (sur[iSur] << 4) + 10 + (JSUTF16) (*inputOffset - 'a');
ch = (ch << 4) + 10 + (JSUTF16) (*inputOffset - 'a');
break;

case 'A':
Expand All @@ -548,39 +550,31 @@ FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_string ( struct DecoderState *ds)
case 'D':
case 'E':
case 'F':
sur[iSur] = (sur[iSur] << 4) + 10 + (JSUTF16) (*inputOffset - 'A');
ch = (ch << 4) + 10 + (JSUTF16) (*inputOffset - 'A');
break;
}

inputOffset ++;
}

if (iSur == 0)
#if WCHAR_MAX >= 0x10FFFF
if ((ch & 0xfc00) == 0xdc00 && lastHighSurrogate == inputOffset - 6 * sizeof(*inputOffset))
{
if((sur[iSur] & 0xfc00) == 0xd800)
{
// First of a surrogate pair, continue parsing
iSur ++;
break;
}
(*escOffset++) = (wchar_t) sur[iSur];
iSur = 0;
// Low surrogate immediately following a high surrogate
// Overwrite existing high surrogate with combined character
*(escOffset-1) = (((*(escOffset-1) - 0xd800) <<10) | (ch - 0xdc00)) + 0x10000;
}
else
{
// Decode pair
if ((sur[1] & 0xfc00) != 0xdc00)
{
return SetError (ds, -1, "Unpaired high surrogate when decoding 'string'");
}
#if WCHAR_MAX == 0xffff
(*escOffset++) = (wchar_t) sur[0];
(*escOffset++) = (wchar_t) sur[1];
#else
(*escOffset++) = (wchar_t) 0x10000 + (((sur[0] - 0xd800) << 10) | (sur[1] - 0xdc00));
#endif
iSur = 0;
{
*(escOffset++) = (wchar_t) ch;
}
#if WCHAR_MAX >= 0x10FFFF
if ((ch & 0xfc00) == 0xd800)
{
lastHighSurrogate = inputOffset;
}
#endif
break;
}

Expand Down

0 comments on commit 953eafc

Please sign in to comment.