Permalink
Browse files

Merge #7892: Add full UTF-8 support to RPC

7982fce doc: Mention full UTF-8 support in release notes (Wladimir J. van der Laan)
6bbb4ef test: test utf-8 for labels in wallet (Wladimir J. van der Laan)
a406fcb test: add ensure_ascii setting to AuthServiceProxy (Wladimir J. van der Laan)
60ab9b2 Squashed 'src/univalue/' changes from 2740c4f..f32df99 (Wladimir J. van der Laan)
  • Loading branch information...
laanwj committed Jun 16, 2016
2 parents 3f89a53 + 7982fce commit 9c3d0fab3623ad6ca6c150ad9ebb5c5c6d4bcec3
View
@@ -43,6 +43,11 @@ RPC low-level changes
32-bit and 64-bit platforms, and the txids were missing in the hashed data. This has been
fixed, but this means that the output will be different than from previous versions.
- Full UTF-8 support in the RPC API. Non-ASCII characters in, for example,
wallet labels have always been malformed because they weren't taken into account
properly in JSON RPC processing. This is no longer the case. This also affects
the GUI debug console.
C++11 and Python 3
-------------------
@@ -67,9 +67,11 @@ def EncodeDecimal(o):
class AuthServiceProxy(object):
__id_count = 0
def __init__(self, service_url, service_name=None, timeout=HTTP_TIMEOUT, connection=None):
# ensure_ascii: escape unicode as \uXXXX, passed to json.dumps
def __init__(self, service_url, service_name=None, timeout=HTTP_TIMEOUT, connection=None, ensure_ascii=True):
self.__service_url = service_url
self._service_name = service_name
self.ensure_ascii = ensure_ascii # can be toggled on the fly by tests
self.__url = urlparse.urlparse(service_url)
if self.__url.port is None:
port = 80
@@ -134,12 +136,12 @@ def __call__(self, *args):
AuthServiceProxy.__id_count += 1
log.debug("-%s-> %s %s"%(AuthServiceProxy.__id_count, self._service_name,
json.dumps(args, default=EncodeDecimal)))
json.dumps(args, default=EncodeDecimal, ensure_ascii=self.ensure_ascii)))
postdata = json.dumps({'version': '1.1',
'method': self._service_name,
'params': args,
'id': AuthServiceProxy.__id_count}, default=EncodeDecimal)
response = self._request('POST', self.__url.path, postdata)
'id': AuthServiceProxy.__id_count}, default=EncodeDecimal, ensure_ascii=self.ensure_ascii)
response = self._request('POST', self.__url.path, postdata.encode('utf-8'))
if response['error'] is not None:
raise JSONRPCException(response['error'])
elif 'result' not in response:
@@ -149,9 +151,9 @@ def __call__(self, *args):
return response['result']
def _batch(self, rpc_call_list):
postdata = json.dumps(list(rpc_call_list), default=EncodeDecimal)
postdata = json.dumps(list(rpc_call_list), default=EncodeDecimal, ensure_ascii=self.ensure_ascii)
log.debug("--> "+postdata)
return self._request('POST', self.__url.path, postdata)
return self._request('POST', self.__url.path, postdata.encode('utf-8'))
def _get_response(self):
http_response = self.__conn.getresponse()
@@ -167,7 +169,7 @@ def _get_response(self):
responsedata = http_response.read().decode('utf8')
response = json.loads(responsedata, parse_float=decimal.Decimal)
if "error" in response and response["error"] is None:
log.debug("<-%s- %s"%(response["id"], json.dumps(response["result"], default=EncodeDecimal)))
log.debug("<-%s- %s"%(response["id"], json.dumps(response["result"], default=EncodeDecimal, ensure_ascii=self.ensure_ascii)))
else:
log.debug("<-- "+responsedata)
return response
View
@@ -309,6 +309,20 @@ def run_test (self):
balance_nodes = [self.nodes[i].getbalance() for i in range(3)]
block_count = self.nodes[0].getblockcount()
# Check modes:
# - True: unicode escaped as \u....
# - False: unicode directly as UTF-8
for mode in [True, False]:
self.nodes[0].ensure_ascii = mode
# unicode check: Basic Multilingual Plane, Supplementary Plane respectively
for s in [u'рыба', u'𝅘𝅥𝅯']:
addr = self.nodes[0].getaccountaddress(s)
label = self.nodes[0].getaccount(addr)
assert_equal(label, s)
assert(s in self.nodes[0].listaccounts().keys())
self.nodes[0].ensure_ascii = True # restore to default
# maintenance tests
maintenance = [
'-rescan',
'-reindex',
View
@@ -3,7 +3,7 @@ ACLOCAL_AMFLAGS = -I build-aux/m4
.INTERMEDIATE: $(GENBIN)
include_HEADERS = include/univalue.h
noinst_HEADERS = lib/univalue_escapes.h
noinst_HEADERS = lib/univalue_escapes.h lib/univalue_utffilter.h
lib_LTLIBRARIES = libunivalue.la
@@ -73,6 +73,10 @@ TEST_FILES = \
$(TEST_DATA_DIR)/fail35.json \
$(TEST_DATA_DIR)/fail36.json \
$(TEST_DATA_DIR)/fail37.json \
$(TEST_DATA_DIR)/fail38.json \
$(TEST_DATA_DIR)/fail39.json \
$(TEST_DATA_DIR)/fail40.json \
$(TEST_DATA_DIR)/fail41.json \
$(TEST_DATA_DIR)/fail3.json \
$(TEST_DATA_DIR)/fail4.json \
$(TEST_DATA_DIR)/fail5.json \
@@ -83,6 +87,7 @@ TEST_FILES = \
$(TEST_DATA_DIR)/pass1.json \
$(TEST_DATA_DIR)/pass2.json \
$(TEST_DATA_DIR)/pass3.json \
$(TEST_DATA_DIR)/round1.json
$(TEST_DATA_DIR)/round1.json \
$(TEST_DATA_DIR)/round2.json
EXTRA_DIST=$(TEST_FILES) $(GEN_SRCS)
@@ -1,7 +1,7 @@
m4_define([libunivalue_major_version], [1])
m4_define([libunivalue_minor_version], [1])
m4_define([libunivalue_micro_version], [1])
m4_define([libunivalue_interface_age], [1])
m4_define([libunivalue_micro_version], [2])
m4_define([libunivalue_interface_age], [2])
# If you need a modifier for the version number.
# Normally empty, but can be used to make "fixup" releases.
m4_define([libunivalue_extraversion], [])
@@ -14,7 +14,7 @@ m4_define([libunivalue_age], [m4_eval(libunivalue_binary_age - libunivalue_inter
m4_define([libunivalue_version], [libunivalue_major_version().libunivalue_minor_version().libunivalue_micro_version()libunivalue_extraversion()])
AC_INIT([univalue], [1.0.1],
AC_INIT([univalue], [1.0.2],
[http://github.com/jgarzik/univalue/])
dnl make the compilation flags quiet unless V=1 is used
@@ -6,6 +6,7 @@
#include <vector>
#include <stdio.h>
#include "univalue.h"
#include "univalue_utffilter.h"
using namespace std;
@@ -174,41 +175,31 @@ enum jtokentype getJsonToken(string& tokenVal, unsigned int& consumed,
raw++; // skip "
string valStr;
JSONUTF8StringFilter writer(valStr);
while (*raw) {
if (*raw < 0x20)
if ((unsigned char)*raw < 0x20)
return JTOK_ERR;
else if (*raw == '\\') {
raw++; // skip backslash
switch (*raw) {
case '"': valStr += "\""; break;
case '\\': valStr += "\\"; break;
case '/': valStr += "/"; break;
case 'b': valStr += "\b"; break;
case 'f': valStr += "\f"; break;
case 'n': valStr += "\n"; break;
case 'r': valStr += "\r"; break;
case 't': valStr += "\t"; break;
case '"': writer.push_back('\"'); break;
case '\\': writer.push_back('\\'); break;
case '/': writer.push_back('/'); break;
case 'b': writer.push_back('\b'); break;
case 'f': writer.push_back('\f'); break;
case 'n': writer.push_back('\n'); break;
case 'r': writer.push_back('\r'); break;
case 't': writer.push_back('\t'); break;
case 'u': {
unsigned int codepoint;
if (hatoui(raw + 1, raw + 1 + 4, codepoint) !=
raw + 1 + 4)
return JTOK_ERR;
if (codepoint <= 0x7f)
valStr.push_back((char)codepoint);
else if (codepoint <= 0x7FF) {
valStr.push_back((char)(0xC0 | (codepoint >> 6)));
valStr.push_back((char)(0x80 | (codepoint & 0x3F)));
} else if (codepoint <= 0xFFFF) {
valStr.push_back((char)(0xE0 | (codepoint >> 12)));
valStr.push_back((char)(0x80 | ((codepoint >> 6) & 0x3F)));
valStr.push_back((char)(0x80 | (codepoint & 0x3F)));
}
writer.push_back_u(codepoint);
raw += 4;
break;
}
@@ -226,11 +217,13 @@ enum jtokentype getJsonToken(string& tokenVal, unsigned int& consumed,
}
else {
valStr += *raw;
writer.push_back(*raw);
raw++;
}
}
if (!writer.finalize())
return JTOK_ERR;
tokenVal = valStr;
consumed = (raw - rawStart);
return JTOK_STRING;
@@ -0,0 +1,119 @@
// Copyright 2016 Wladimir J. van der Laan
// Distributed under the MIT software license, see the accompanying
// file COPYING or http://www.opensource.org/licenses/mit-license.php.
#ifndef UNIVALUE_UTFFILTER_H
#define UNIVALUE_UTFFILTER_H
#include <string>
/**
* Filter that generates and validates UTF-8, as well as collates UTF-16
* surrogate pairs as specified in RFC4627.
*/
class JSONUTF8StringFilter
{
public:
JSONUTF8StringFilter(std::string &s):
str(s), is_valid(true), codepoint(0), state(0), surpair(0)
{
}
// Write single 8-bit char (may be part of UTF-8 sequence)
void push_back(unsigned char ch)
{
if (state == 0) {
if (ch < 0x80) // 7-bit ASCII, fast direct pass-through
str.push_back(ch);
else if (ch < 0xc0) // Mid-sequence character, invalid in this state
is_valid = false;
else if (ch < 0xe0) { // Start of 2-byte sequence
codepoint = (ch & 0x1f) << 6;
state = 6;
} else if (ch < 0xf0) { // Start of 3-byte sequence
codepoint = (ch & 0x0f) << 12;
state = 12;
} else if (ch < 0xf8) { // Start of 4-byte sequence
codepoint = (ch & 0x07) << 18;
state = 18;
} else // Reserved, invalid
is_valid = false;
} else {
if ((ch & 0xc0) != 0x80) // Not a continuation, invalid
is_valid = false;
state -= 6;
codepoint |= (ch & 0x3f) << state;
if (state == 0)
push_back_u(codepoint);
}
}
// Write codepoint directly, possibly collating surrogate pairs
void push_back_u(unsigned int codepoint)
{
if (state) // Only accept full codepoints in open state
is_valid = false;
if (codepoint >= 0xD800 && codepoint < 0xDC00) { // First half of surrogate pair
if (surpair) // Two subsequent surrogate pair openers - fail
is_valid = false;
else
surpair = codepoint;
} else if (codepoint >= 0xDC00 && codepoint < 0xE000) { // Second half of surrogate pair
if (surpair) { // Open surrogate pair, expect second half
// Compute code point from UTF-16 surrogate pair
append_codepoint(0x10000 | ((surpair - 0xD800)<<10) | (codepoint - 0xDC00));
surpair = 0;
} else // Second half doesn't follow a first half - fail
is_valid = false;
} else {
if (surpair) // First half of surrogate pair not followed by second - fail
is_valid = false;
else
append_codepoint(codepoint);
}
}
// Check that we're in a state where the string can be ended
// No open sequences, no open surrogate pairs, etc
bool finalize()
{
if (state || surpair)
is_valid = false;
return is_valid;
}
private:
std::string &str;
bool is_valid;
// Current UTF-8 decoding state
unsigned int codepoint;
int state; // Top bit to be filled in for next UTF-8 byte, or 0
// Keep track of the following state to handle the following section of
// RFC4627:
//
// To escape an extended character that is not in the Basic Multilingual
// Plane, the character is represented as a twelve-character sequence,
// encoding the UTF-16 surrogate pair. So, for example, a string
// containing only the G clef character (U+1D11E) may be represented as
// "\uD834\uDD1E".
//
// Two subsequent \u.... may have to be replaced with one actual codepoint.
unsigned int surpair; // First half of open UTF-16 surrogate pair, or 0
void append_codepoint(unsigned int codepoint)
{
if (codepoint <= 0x7f)
str.push_back((char)codepoint);
else if (codepoint <= 0x7FF) {
str.push_back((char)(0xC0 | (codepoint >> 6)));
str.push_back((char)(0x80 | (codepoint & 0x3F)));
} else if (codepoint <= 0xFFFF) {
str.push_back((char)(0xE0 | (codepoint >> 12)));
str.push_back((char)(0x80 | ((codepoint >> 6) & 0x3F)));
str.push_back((char)(0x80 | (codepoint & 0x3F)));
} else if (codepoint <= 0x1FFFFF) {
str.push_back((char)(0xF0 | (codepoint >> 18)));
str.push_back((char)(0x80 | ((codepoint >> 12) & 0x3F)));
str.push_back((char)(0x80 | ((codepoint >> 6) & 0x3F)));
str.push_back((char)(0x80 | (codepoint & 0x3F)));
}
}
};
#endif
@@ -8,8 +8,6 @@
#include "univalue.h"
#include "univalue_escapes.h"
// TODO: Using UTF8
using namespace std;
static string json_escape(const string& inS)
@@ -23,15 +21,8 @@ static string json_escape(const string& inS)
if (escStr)
outS += escStr;
else if (ch < 0x80)
else
outS += ch;
else { // TODO handle UTF-8 properly
char tmpesc[16];
sprintf(tmpesc, "\\u%04x", ch);
outS += tmpesc;
}
}
return outS;
@@ -0,0 +1 @@
["\ud834"]
@@ -0,0 +1 @@
["\udd61"]
@@ -0,0 +1 @@
["���"]
@@ -0,0 +1 @@
["�"]
@@ -0,0 +1 @@
["a§■𐎒𝅘𝅥𝅯"]
Oops, something went wrong.

0 comments on commit 9c3d0fa

Please sign in to comment.