diff --git a/lib/rb/ext/binary_protocol_accelerated.c b/lib/rb/ext/binary_protocol_accelerated.c index bd1c2da1094..a8ebe7faf33 100644 --- a/lib/rb/ext/binary_protocol_accelerated.c +++ b/lib/rb/ext/binary_protocol_accelerated.c @@ -22,7 +22,8 @@ #include #include #include -#include "macros.h" +#include +#include VALUE rb_thrift_binary_proto_native_qmark(VALUE self) { return Qtrue; @@ -80,6 +81,7 @@ static void write_string_direct(VALUE trans, VALUE str) { if (TYPE(str) != T_STRING) { rb_raise(rb_eStandardError, "Value should be a string"); } + str = convert_to_utf8_byte_buffer(str); write_i32_direct(trans, RSTRING_LEN(str)); rb_funcall(trans, write_method_id, 1, str); } @@ -380,7 +382,8 @@ VALUE rb_thrift_binary_proto_read_double(VALUE self) { VALUE rb_thrift_binary_proto_read_string(VALUE self) { int size = read_i32_direct(self); - return READ(self, size); + VALUE buffer = READ(self, size); + return convert_to_string(buffer); } void Init_binary_protocol_accelerated() { diff --git a/lib/rb/ext/bytes.c b/lib/rb/ext/bytes.c new file mode 100644 index 00000000000..8a6fac4ac43 --- /dev/null +++ b/lib/rb/ext/bytes.c @@ -0,0 +1,36 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#include +#ifdef HAVE_RUBY_ENCODING_H +#include +#endif +#include + +VALUE force_binary_encoding(VALUE buffer) { + return rb_funcall(thrift_bytes_module, force_binary_encoding_id, 1, buffer); +} + +VALUE convert_to_utf8_byte_buffer(VALUE string) { + return rb_funcall(thrift_bytes_module, convert_to_utf8_byte_buffer_id, 1, string); +} + +VALUE convert_to_string(VALUE utf8_buffer) { + return rb_funcall(thrift_bytes_module, convert_to_string_id, 1, utf8_buffer); +} diff --git a/lib/rb/ext/bytes.h b/lib/rb/ext/bytes.h new file mode 100644 index 00000000000..7108d83ffca --- /dev/null +++ b/lib/rb/ext/bytes.h @@ -0,0 +1,31 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include + +/* + * A collection of utilities for working with bytes and byte buffers. + * + * These methods are the native analogies to some of the methods in + * Thrift::Bytes (thrift/bytes.rb). 
+ */ + +VALUE force_binary_encoding(VALUE buffer); +VALUE convert_to_utf8_byte_buffer(VALUE string); +VALUE convert_to_string(VALUE utf8_buffer); diff --git a/lib/rb/ext/compact_protocol.c b/lib/rb/ext/compact_protocol.c index a47fe6c7624..0c054813d83 100644 --- a/lib/rb/ext/compact_protocol.c +++ b/lib/rb/ext/compact_protocol.c @@ -20,9 +20,10 @@ #include #include #include -#include "constants.h" -#include "struct.h" -#include "macros.h" +#include +#include +#include +#include #define LAST_ID(obj) FIX2INT(rb_ary_pop(rb_ivar_get(obj, last_field_id))) #define SET_LAST_ID(obj, val) rb_ary_push(rb_ivar_get(obj, last_field_id), val) @@ -305,6 +306,7 @@ VALUE rb_thrift_compact_proto_write_double(VALUE self, VALUE dub) { VALUE rb_thrift_compact_proto_write_string(VALUE self, VALUE str) { VALUE transport = GET_TRANSPORT(self); + str = convert_to_utf8_byte_buffer(str); write_varint32(transport, RSTRING_LEN(str)); WRITE(transport, RSTRING_PTR(str), RSTRING_LEN(str)); return Qnil; @@ -546,7 +548,8 @@ VALUE rb_thrift_compact_proto_read_double(VALUE self) { VALUE rb_thrift_compact_proto_read_string(VALUE self) { int64_t size = read_varint64(self); - return READ(self, size); + VALUE buffer = READ(self, size); + return convert_to_string(buffer); } static void Init_constants() { diff --git a/lib/rb/ext/constants.h b/lib/rb/ext/constants.h index 9ea00d2ee7b..3bfac886eae 100644 --- a/lib/rb/ext/constants.h +++ b/lib/rb/ext/constants.h @@ -76,6 +76,9 @@ extern ID write_method_id; extern ID read_all_method_id; extern ID read_into_buffer_method_id; extern ID native_qmark_method_id; +extern ID force_binary_encoding_id; +extern ID convert_to_utf8_byte_buffer_id; +extern ID convert_to_string_id; extern ID fields_const_id; extern ID transport_ivar_id; @@ -92,5 +95,6 @@ extern VALUE class_sym; extern VALUE rb_cSet; extern VALUE thrift_module; extern VALUE thrift_types_module; +extern VALUE thrift_bytes_module; extern VALUE class_thrift_protocol; extern VALUE protocol_exception_class; diff 
--git a/lib/rb/ext/memory_buffer.c b/lib/rb/ext/memory_buffer.c index 319b0734a69..e7253dcf2bc 100644 --- a/lib/rb/ext/memory_buffer.c +++ b/lib/rb/ext/memory_buffer.c @@ -19,7 +19,8 @@ #include #include -#include "macros.h" +#include +#include ID buf_ivar_id; ID index_ivar_id; @@ -37,6 +38,7 @@ VALUE rb_thrift_memory_buffer_read_into_buffer(VALUE self, VALUE buffer_value, V VALUE rb_thrift_memory_buffer_write(VALUE self, VALUE str) { VALUE buf = GET_BUF(self); + str = force_binary_encoding(str); rb_str_buf_cat(buf, RSTRING_PTR(str), RSTRING_LEN(str)); return Qnil; } diff --git a/lib/rb/ext/thrift_native.c b/lib/rb/ext/thrift_native.c index 2f6bb1ab1f5..f066d6c9cd2 100644 --- a/lib/rb/ext/thrift_native.c +++ b/lib/rb/ext/thrift_native.c @@ -18,6 +18,7 @@ */ #include +#include #include #include #include @@ -27,6 +28,7 @@ // cached classes/modules VALUE rb_cSet; VALUE thrift_module; +VALUE thrift_bytes_module; VALUE thrift_types_module; // TType constants @@ -90,6 +92,9 @@ ID write_method_id; ID read_all_method_id; ID read_into_buffer_method_id; ID native_qmark_method_id; +ID force_binary_encoding_id; +ID convert_to_utf8_byte_buffer_id; +ID convert_to_string_id; // constant ids ID fields_const_id; @@ -109,6 +114,7 @@ VALUE protocol_exception_class; void Init_thrift_native() { // cached classes thrift_module = rb_const_get(rb_cObject, rb_intern("Thrift")); + thrift_bytes_module = rb_const_get(thrift_module, rb_intern("Bytes")); thrift_types_module = rb_const_get(thrift_module, rb_intern("Types")); rb_cSet = rb_const_get(rb_cObject, rb_intern("Set")); protocol_exception_class = rb_const_get(thrift_module, rb_intern("ProtocolException")); @@ -173,6 +179,9 @@ void Init_thrift_native() { read_all_method_id = rb_intern("read_all"); read_into_buffer_method_id = rb_intern("read_into_buffer"); native_qmark_method_id = rb_intern("native?"); + force_binary_encoding_id = rb_intern("force_binary_encoding"); + convert_to_utf8_byte_buffer_id = 
rb_intern("convert_to_utf8_byte_buffer"); + convert_to_string_id = rb_intern("convert_to_string"); // constant ids fields_const_id = rb_intern("FIELDS"); diff --git a/lib/rb/lib/thrift.rb b/lib/rb/lib/thrift.rb index 72050b1d175..fb9e04a2f11 100644 --- a/lib/rb/lib/thrift.rb +++ b/lib/rb/lib/thrift.rb @@ -22,6 +22,7 @@ $:.unshift File.dirname(__FILE__) +require 'thrift/bytes' require 'thrift/core_ext' require 'thrift/exceptions' require 'thrift/types' diff --git a/lib/rb/lib/thrift/bytes.rb b/lib/rb/lib/thrift/bytes.rb new file mode 100644 index 00000000000..efd4f6440cc --- /dev/null +++ b/lib/rb/lib/thrift/bytes.rb @@ -0,0 +1,131 @@ +# encoding: ascii-8bit +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# + +module Thrift + # A collection of utilities for working with bytes and byte buffers. 
+ module Bytes + if RUBY_VERSION >= '1.9' + # Creates an empty byte buffer (String with BINARY encoding) + # + # size - The Integer size of the buffer (default: nil) to create + # + # Returns a String with BINARY encoding, filled with null characters + # if size is greater than zero + def self.empty_byte_buffer(size = nil) + if (size && size > 0) + "\0".force_encoding(Encoding::BINARY) * size + else + ''.force_encoding(Encoding::BINARY) + end + end + + # Forces the encoding of the buffer to BINARY. If the buffer + # passed is frozen, then it will be duplicated. + # + # buffer - The String to force the encoding of. + # + # Returns the String passed with an encoding of BINARY; returned + # String may be a duplicate. + def self.force_binary_encoding(buffer) + buffer = buffer.dup if buffer.frozen? + buffer.force_encoding(Encoding::BINARY) + end + + # Gets the byte value of a given position in a String. + # + # string - The String to retrieve the byte value from. + # index - The Integer location of the byte value to retrieve. + # + # Returns an Integer value between 0 and 255. + def self.get_string_byte(string, index) + string.getbyte(index) + end + + # Sets the byte value given to a given index in a String. + # + # string - The String to set the byte value in. + # index - The Integer location to set the byte value at. + # byte - The Integer value (0 to 255) to set in the string. + # + # Returns an Integer value of the byte value to set. + def self.set_string_byte(string, index, byte) + string.setbyte(index, byte) + end + + # Converts the given String to a UTF-8 byte buffer. + # + # string - The String to convert. + # + # Returns a new String with BINARY encoding, containing the UTF-8 + # bytes of the original string. 
+ def self.convert_to_utf8_byte_buffer(string) + if string.encoding != Encoding::UTF_8 + # transcode to UTF-8 + string = string.encode(Encoding::UTF_8) + else + # encoding is already UTF-8, but a duplicate is needed + string = string.dup + end + string.force_encoding(Encoding::BINARY) + end + + # Converts the given UTF-8 byte buffer into a String + # + # utf8_buffer - A String, with BINARY encoding, containing UTF-8 bytes + # + # Returns a new String with UTF-8 encoding, + def self.convert_to_string(utf8_buffer) + # duplicate the buffer, force encoding to UTF-8 + utf8_buffer.dup.force_encoding(Encoding::UTF_8) + end + else + def self.empty_byte_buffer(size = nil) + if (size && size > 0) + "\0" * size + else + '' + end + end + + def self.force_binary_encoding(buffer) + buffer + end + + def self.get_string_byte(string, index) + string[index] + end + + def self.set_string_byte(string, index, byte) + string[index] = byte + end + + def self.convert_to_utf8_byte_buffer(string) + # This assumes $KCODE is 'UTF8'/'U', which would mean the String is already a UTF-8 byte buffer + # TODO consider handling other $KCODE values and transcoding with iconv + string + end + + def self.convert_to_string(utf8_buffer) + # See comment in 'convert_to_utf8_byte_buffer' for relevant assumptions. + utf8_buffer + end + end + end +end diff --git a/lib/rb/lib/thrift/protocol/base_protocol.rb b/lib/rb/lib/thrift/protocol/base_protocol.rb index b19909d5ffc..a5a174d7375 100644 --- a/lib/rb/lib/thrift/protocol/base_protocol.rb +++ b/lib/rb/lib/thrift/protocol/base_protocol.rb @@ -114,6 +114,13 @@ def write_double(dub) raise NotImplementedError end + # Writes a Thrift String. In Ruby 1.9+, the String passed will be transcoded to UTF-8. + # + # str - The String to write. + # + # Raises EncodingError if the transcoding to UTF-8 fails. + # + # Returns nothing. 
def write_string(str) raise NotImplementedError end @@ -178,6 +185,9 @@ def read_double raise NotImplementedError end + # Reads a Thrift String. In Ruby 1.9+, all Strings will be returned with an Encoding of UTF-8. + # + # Returns a String. def read_string raise NotImplementedError end diff --git a/lib/rb/lib/thrift/protocol/binary_protocol.rb b/lib/rb/lib/thrift/protocol/binary_protocol.rb index f9adb20364b..252827615a8 100644 --- a/lib/rb/lib/thrift/protocol/binary_protocol.rb +++ b/lib/rb/lib/thrift/protocol/binary_protocol.rb @@ -32,8 +32,7 @@ def initialize(trans, strict_read=true, strict_write=true) # Pre-allocated read buffer for fixed-size read methods. Needs to be at least 8 bytes long for # read_i64() and read_double(). - @rbuf = "\0" * 8 - @rbuf.force_encoding("BINARY") if @rbuf.respond_to?(:force_encoding) + @rbuf = Bytes.empty_byte_buffer(8) end def write_message_begin(name, type, seqid) @@ -108,6 +107,7 @@ def write_double(dub) end def write_string(str) + str = Bytes.convert_to_utf8_byte_buffer(str) write_i32(str.length) trans.write(str) end @@ -214,9 +214,9 @@ def read_double end def read_string - sz = read_i32 - dat = trans.read_all(sz) - dat + size = read_i32 + buffer = trans.read_all(size) + Bytes.convert_to_string(buffer) end end diff --git a/lib/rb/lib/thrift/protocol/compact_protocol.rb b/lib/rb/lib/thrift/protocol/compact_protocol.rb index ede82f2bd75..758e1ae8648 100644 --- a/lib/rb/lib/thrift/protocol/compact_protocol.rb +++ b/lib/rb/lib/thrift/protocol/compact_protocol.rb @@ -100,8 +100,7 @@ def initialize(transport) @boolean_value = nil # Pre-allocated read buffer for read_double(). 
- @rbuf = "\0" * 8 - @rbuf.force_encoding("BINARY") if @rbuf.respond_to?(:force_encoding) + @rbuf = Bytes.empty_byte_buffer(8) end def write_message_begin(name, type, seqid) @@ -211,6 +210,7 @@ def write_double(dub) end def write_string(str) + str = Bytes.convert_to_utf8_byte_buffer(str) write_varint32(str.length) @trans.write(str) end @@ -333,7 +333,8 @@ def read_double def read_string size = read_varint32() - trans.read_all(size) + buffer = trans.read_all(size) + Bytes.convert_to_string(buffer) end diff --git a/lib/rb/lib/thrift/protocol/json_protocol.rb b/lib/rb/lib/thrift/protocol/json_protocol.rb index ddbf193f381..6f8d1d170f6 100644 --- a/lib/rb/lib/thrift/protocol/json_protocol.rb +++ b/lib/rb/lib/thrift/protocol/json_protocol.rb @@ -1,3 +1,4 @@ +# encoding: UTF-8 # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file @@ -482,13 +483,21 @@ def read_json_syntax_char(ch) end # Decodes the four hex parts of a JSON escaped string character and returns - # the character via out. The first two characters must be "00". + # the character via out. 
+ # + # Note - this only supports Unicode characters in the BMP (U+0000 to U+FFFF); + # characters above the BMP are encoded as two escape sequences (surrogate pairs), + # which is not yet implemented def read_json_escape_char - read_json_syntax_char('0') - read_json_syntax_char('0') str = @reader.read str += @reader.read - str.hex.chr + str += @reader.read + str += @reader.read + if RUBY_VERSION >= '1.9' + str.hex.chr(Encoding::UTF_8) + else + str.hex.chr + end end # Decodes a JSON string, including unescaping, and returns the string via str diff --git a/lib/rb/lib/thrift/transport/base_transport.rb b/lib/rb/lib/thrift/transport/base_transport.rb index 0a12cea3a89..879032644ff 100644 --- a/lib/rb/lib/thrift/transport/base_transport.rb +++ b/lib/rb/lib/thrift/transport/base_transport.rb @@ -35,22 +35,14 @@ def initialize(type=UNKNOWN, message=nil) end module TransportUtils - if RUBY_VERSION >= '1.9' - def self.get_string_byte(string, index) - string.getbyte(index) - end - - def self.set_string_byte(string, index, byte) - string.setbyte(index, byte) - end - else - def self.get_string_byte(string, index) - string[index] - end + # Deprecated: Use Thrift::Bytes instead + def self.get_string_byte(string, index) + Bytes.get_string_byte(string, index) + end - def self.set_string_byte(string, index, byte) - string[index] = byte - end + # Deprecated: Use Thrift::Bytes instead + def self.set_string_byte(string, index, byte) + Bytes.set_string_byte(string, index, byte) end end @@ -61,6 +53,11 @@ def open; end def close; end + # Reads a number of bytes from the transport. In Ruby 1.9+, the String returned will have a BINARY (aka ASCII8BIT) encoding. + # + # sz - The number of bytes to read from the transport. + # + # Returns a String acting as a byte buffer. def read(sz) raise NotImplementedError end @@ -68,7 +65,7 @@ def read(sz) # Returns an unsigned byte as a Fixnum in the range (0..255). 
def read_byte buf = read_all(1) - return ::Thrift::TransportUtils.get_string_byte(buf, 0) + return Bytes.get_string_byte(buf, 0) end # Reads size bytes and copies them into buffer[0..size]. @@ -76,14 +73,14 @@ def read_into_buffer(buffer, size) tmp = read_all(size) i = 0 tmp.each_byte do |byte| - ::Thrift::TransportUtils.set_string_byte(buffer, i, byte) + Bytes.set_string_byte(buffer, i, byte) i += 1 end i end def read_all(size) - return '' if size <= 0 + return Bytes.empty_byte_buffer if size <= 0 buf = read(size) while (buf.length < size) chunk = read(size - buf.length) @@ -92,7 +89,12 @@ def read_all(size) buf end - + + # Writes the byte buffer to the transport. In Ruby 1.9+, the buffer will be forced into BINARY encoding. + # + # buf - A String acting as a byte buffer. + # + # Returns nothing. def write(buf); end alias_method :<<, :write @@ -104,4 +106,4 @@ def get_transport(trans) return trans end end -end \ No newline at end of file +end diff --git a/lib/rb/lib/thrift/transport/buffered_transport.rb b/lib/rb/lib/thrift/transport/buffered_transport.rb index 676a4d306bc..781d3c69cfa 100644 --- a/lib/rb/lib/thrift/transport/buffered_transport.rb +++ b/lib/rb/lib/thrift/transport/buffered_transport.rb @@ -24,8 +24,8 @@ class BufferedTransport < BaseTransport def initialize(transport) @transport = transport - @wbuf = '' - @rbuf = '' + @wbuf = Bytes.empty_byte_buffer + @rbuf = Bytes.empty_byte_buffer @index = 0 end @@ -44,12 +44,12 @@ def close def read(sz) @index += sz - ret = @rbuf.slice(@index - sz, sz) || '' + ret = @rbuf.slice(@index - sz, sz) || Bytes.empty_byte_buffer if ret.length == 0 @rbuf = @transport.read([sz, DEFAULT_BUFFER].max) @index = sz - ret = @rbuf.slice(0, sz) || '' + ret = @rbuf.slice(0, sz) || Bytes.empty_byte_buffer end ret @@ -65,9 +65,15 @@ def read_byte # The read buffer has some data now, read a single byte. Using get_string_byte() avoids # allocating a temp string of size 1 unnecessarily. 
@index += 1 - return ::Thrift::TransportUtils.get_string_byte(@rbuf, @index - 1) + return Bytes.get_string_byte(@rbuf, @index - 1) end + # Reads a number of bytes from the transport into the buffer passed. + # + # buffer - The String (byte buffer) to write data to; this is assumed to have a BINARY encoding. + # size - The number of bytes to read from the transport and write to the buffer. + # + # Returns the number of bytes read. def read_into_buffer(buffer, size) i = 0 while i < size @@ -78,8 +84,8 @@ def read_into_buffer(buffer, size) end # The read buffer has some data now, so copy bytes over to the output buffer. - byte = ::Thrift::TransportUtils.get_string_byte(@rbuf, @index) - ::Thrift::TransportUtils.set_string_byte(buffer, i, byte) + byte = Bytes.get_string_byte(@rbuf, @index) + Bytes.set_string_byte(buffer, i, byte) @index += 1 i += 1 end @@ -87,13 +93,13 @@ def read_into_buffer(buffer, size) end def write(buf) - @wbuf << buf + @wbuf << Bytes.force_binary_encoding(buf) end def flush - if @wbuf != '' + unless @wbuf.empty? 
@transport.write(@wbuf) - @wbuf = '' + @wbuf = Bytes.empty_byte_buffer end @transport.flush diff --git a/lib/rb/lib/thrift/transport/framed_transport.rb b/lib/rb/lib/thrift/transport/framed_transport.rb index e7630d05c49..d806ce022b6 100644 --- a/lib/rb/lib/thrift/transport/framed_transport.rb +++ b/lib/rb/lib/thrift/transport/framed_transport.rb @@ -22,8 +22,8 @@ module Thrift class FramedTransport < BaseTransport def initialize(transport, read=true, write=true) @transport = transport - @rbuf = '' - @wbuf = '' + @rbuf = Bytes.empty_byte_buffer + @wbuf = Bytes.empty_byte_buffer @read = read @write = write @index = 0 @@ -44,12 +44,12 @@ def close def read(sz) return @transport.read(sz) unless @read - return '' if sz <= 0 + return Bytes.empty_byte_buffer if sz <= 0 read_frame if @index >= @rbuf.length @index += sz - @rbuf.slice(@index - sz, sz) || '' + @rbuf.slice(@index - sz, sz) || Bytes.empty_byte_buffer end def read_byte @@ -60,7 +60,7 @@ def read_byte # The read buffer has some data now, read a single byte. Using get_string_byte() avoids # allocating a temp string of size 1 unnecessarily. @index += 1 - return ::Thrift::TransportUtils.get_string_byte(@rbuf, @index - 1) + return Bytes.get_string_byte(@rbuf, @index - 1) end def read_into_buffer(buffer, size) @@ -69,18 +69,18 @@ def read_into_buffer(buffer, size) read_frame if @index >= @rbuf.length # The read buffer has some data now, so copy bytes over to the output buffer. - byte = ::Thrift::TransportUtils.get_string_byte(@rbuf, @index) - ::Thrift::TransportUtils.set_string_byte(buffer, i, byte) + byte = Bytes.get_string_byte(@rbuf, @index) + Bytes.set_string_byte(buffer, i, byte) @index += 1 i += 1 end i end - - def write(buf,sz=nil) + def write(buf, sz=nil) return @transport.write(buf) unless @write + buf = Bytes.force_binary_encoding(buf) @wbuf << (sz ? 
buf[0...sz] : buf) end @@ -92,10 +92,11 @@ def flush return @transport.flush unless @write out = [@wbuf.length].pack('N') + # Array#pack should return a BINARY encoded String, so it shouldn't be necessary to force encoding out << @wbuf @transport.write(out) @transport.flush - @wbuf = '' + @wbuf = Bytes.empty_byte_buffer end private diff --git a/lib/rb/lib/thrift/transport/http_client_transport.rb b/lib/rb/lib/thrift/transport/http_client_transport.rb index 1ef0fab5a99..07f74bc42bc 100644 --- a/lib/rb/lib/thrift/transport/http_client_transport.rb +++ b/lib/rb/lib/thrift/transport/http_client_transport.rb @@ -29,12 +29,12 @@ class HTTPClientTransport < BaseTransport def initialize(url) @url = URI url @headers = {'Content-Type' => 'application/x-thrift'} - @outbuf = "" + @outbuf = Bytes.empty_byte_buffer end def open?; true end def read(sz); @inbuf.read sz end - def write(buf); @outbuf << buf end + def write(buf); @outbuf << Bytes.force_binary_encoding(buf) end def add_headers(headers) @headers = @headers.merge(headers) @@ -42,11 +42,12 @@ def add_headers(headers) def flush http = Net::HTTP.new @url.host, @url.port - http.use_ssl = @url.scheme == "https" + http.use_ssl = @url.scheme == 'https' resp = http.post(@url.request_uri, @outbuf, @headers) data = resp.body + data = Bytes.force_binary_encoding(data) @inbuf = StringIO.new data - @outbuf = "" + @outbuf = Bytes.empty_byte_buffer end end end diff --git a/lib/rb/lib/thrift/transport/io_stream_transport.rb b/lib/rb/lib/thrift/transport/io_stream_transport.rb index be348aa09fb..e3c8379da65 100644 --- a/lib/rb/lib/thrift/transport/io_stream_transport.rb +++ b/lib/rb/lib/thrift/transport/io_stream_transport.rb @@ -32,7 +32,7 @@ def initialize(input, output) def open?; not @input.closed? or not @output.closed? 
end def read(sz); @input.read(sz) end - def write(buf); @output.write(buf) end + def write(buf); @output.write(Bytes.force_binary_encoding(buf)) end def close; @input.close; @output.close end def to_io; @input end # we're assuming this is used in a IO.select for reading end diff --git a/lib/rb/lib/thrift/transport/memory_buffer_transport.rb b/lib/rb/lib/thrift/transport/memory_buffer_transport.rb index 62c529232d3..ad5ad855589 100644 --- a/lib/rb/lib/thrift/transport/memory_buffer_transport.rb +++ b/lib/rb/lib/thrift/transport/memory_buffer_transport.rb @@ -28,7 +28,7 @@ class MemoryBufferTransport < BaseTransport # this behavior is no longer required. If you wish to change it # go ahead, just make sure the specs pass def initialize(buffer = nil) - @buf = buffer || '' + @buf = buffer ? Bytes.force_binary_encoding(buffer) : Bytes.empty_byte_buffer @index = 0 end @@ -48,7 +48,7 @@ def peek # this method does not use the passed object directly but copies it def reset_buffer(new_buf = '') - @buf.replace new_buf + @buf.replace Bytes.force_binary_encoding(new_buf) @index = 0 end @@ -72,7 +72,7 @@ def read(len) def read_byte raise EOFError.new("Not enough bytes remain in buffer") if @index >= @buf.size - val = ::Thrift::TransportUtils.get_string_byte(@buf, @index) + val = Bytes.get_string_byte(@buf, @index) @index += 1 if @index >= GARBAGE_BUFFER_SIZE @buf = @buf.slice(@index..-1) @@ -87,8 +87,8 @@ def read_into_buffer(buffer, size) raise EOFError.new("Not enough bytes remain in buffer") if @index >= @buf.size # The read buffer has some data now, so copy bytes over to the output buffer. 
- byte = ::Thrift::TransportUtils.get_string_byte(@buf, @index) - ::Thrift::TransportUtils.set_string_byte(buffer, i, byte) + byte = Bytes.get_string_byte(@buf, @index) + Bytes.set_string_byte(buffer, i, byte) @index += 1 i += 1 end @@ -100,7 +100,7 @@ def read_into_buffer(buffer, size) end def write(wbuf) - @buf << wbuf + @buf << Bytes.force_binary_encoding(wbuf) end def flush diff --git a/lib/rb/lib/thrift/transport/socket.rb b/lib/rb/lib/thrift/transport/socket.rb index 36461e9a549..2b7ca095c44 100644 --- a/lib/rb/lib/thrift/transport/socket.rb +++ b/lib/rb/lib/thrift/transport/socket.rb @@ -61,6 +61,7 @@ def open? def write(str) raise IOError, "closed stream" unless open? + str = Bytes.force_binary_encoding(str) begin if @timeout.nil? or @timeout == 0 @handle.write(str) diff --git a/lib/rb/spec/binary_protocol_spec_shared.rb b/lib/rb/spec/binary_protocol_spec_shared.rb index ce4931f2622..c49ff1ff65c 100644 --- a/lib/rb/spec/binary_protocol_spec_shared.rb +++ b/lib/rb/spec/binary_protocol_spec_shared.rb @@ -1,3 +1,4 @@ +# encoding: ascii-8bit # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. 
See the NOTICE file @@ -192,13 +193,41 @@ it "should error gracefully when trying to write a nil double" do lambda { @prot.write_double(nil) }.should raise_error end - - it "should write a string" do - str = "hello world" - @prot.write_string(str) - @trans.read(@trans.available).should == [str.size].pack("N") + str + + if RUBY_VERSION >= '1.9' + it 'should write a string' do + str = 'abc' + @prot.write_string(str) + a = @trans.read(@trans.available) + a.encoding.should == Encoding::BINARY + a.unpack('C*').should == [0x00, 0x00, 0x00, 0x03, 0x61, 0x62, 0x63] + end + + it 'should write a string with unicode characters' do + str = "abc \u20AC \u20AD".encode('UTF-8') + @prot.write_string(str) + a = @trans.read(@trans.available) + a.encoding.should == Encoding::BINARY + a.unpack('C*').should == [0x00, 0x00, 0x00, 0x0B, 0x61, 0x62, 0x63, 0x20, + 0xE2, 0x82, 0xAC, 0x20, 0xE2, 0x82, 0xAD] + end + + it 'should write should write a string with unicode characters and transcoding' do + str = "abc \u20AC".encode('ISO-8859-15') + @prot.write_string(str) + a = @trans.read(@trans.available) + a.encoding.should == Encoding::BINARY + a.unpack('C*').should == [0x00, 0x00, 0x00, 0x07, 0x61, 0x62, 0x63, 0x20, 0xE2, 0x82, 0xAC] + end + else + it 'should write a string' do + str = 'abc' + @prot.write_string(str) + a = @trans.read(@trans.available) + a.unpack('C*').should == [0x00, 0x00, 0x00, 0x03, 0x61, 0x62, 0x63] + end end - + it "should error gracefully when trying to write a nil string" do lambda { @prot.write_string(nil) }.should raise_error end @@ -294,11 +323,32 @@ @prot.read_double.should == f end end - - it "should read a string" do - str = "hello world" - @trans.write([str.size].pack("N") + str) - @prot.read_string.should == str + + if RUBY_VERSION >= '1.9' + it 'should read a string' do + # i32 of value 3, followed by three characters/UTF-8 bytes 'a', 'b', 'c' + buffer = [0x00, 0x00, 0x00, 0x03, 0x61, 0x62, 0x63].pack('C*') + @trans.write(buffer) + a = @prot.read_string + 
a.should == 'abc'.encode('UTF-8') + a.encoding.should == Encoding::UTF_8 + end + + it 'should read a string containing unicode characters from UTF-8 encoded buffer' do + # i32 of value 3, followed by one character U+20AC made up of three bytes + buffer = [0x00, 0x00, 0x00, 0x03, 0xE2, 0x82, 0xAC].pack('C*') + @trans.write(buffer) + a = @prot.read_string + a.should == "\u20AC".encode('UTF-8') + a.encoding.should == Encoding::UTF_8 + end + else + it 'should read a string' do + # i32 of value 3, followed by three characters/UTF-8 bytes 'a', 'b', 'c' + buffer = [0x00, 0x00, 0x00, 0x03, 0x61, 0x62, 0x63].pack('C*') + @trans.write(buffer) + @prot.read_string.should == 'abc' + end end it "should perform a complete rpc with no args or return" do diff --git a/lib/rb/spec/bytes_spec.rb b/lib/rb/spec/bytes_spec.rb new file mode 100644 index 00000000000..b82e304b758 --- /dev/null +++ b/lib/rb/spec/bytes_spec.rb @@ -0,0 +1,160 @@ +# encoding: UTF-8 +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# + +require 'spec_helper' + +describe Thrift::Bytes do + if RUBY_VERSION >= '1.9' + describe '.empty_byte_buffer' do + it 'should create an empty buffer' do + b = Thrift::Bytes.empty_byte_buffer + b.length.should == 0 + b.encoding.should == Encoding::BINARY + end + + it 'should create an empty buffer of given size' do + b = Thrift::Bytes.empty_byte_buffer 2 + b.length.should == 2 + b.getbyte(0).should == 0 + b.getbyte(1).should == 0 + b.encoding.should == Encoding::BINARY + end + end + + describe '.force_binary_encoding' do + it 'should change encoding' do + e = 'STRING'.encode('UTF-8') + e.encoding.should_not == Encoding::BINARY + a = Thrift::Bytes.force_binary_encoding e + a.encoding.should == Encoding::BINARY + end + end + + describe '.get_string_byte' do + it 'should get the byte at index' do + s = "\x41\x42" + Thrift::Bytes.get_string_byte(s, 0).should == 0x41 + Thrift::Bytes.get_string_byte(s, 1).should == 0x42 + end + end + + describe '.set_string_byte' do + it 'should set byte value at index' do + s = "\x41\x42" + Thrift::Bytes.set_string_byte(s, 0, 0x43) + s.getbyte(0).should == 0x43 + s.should == 'CB' + end + end + + describe '.convert_to_utf8_byte_buffer' do + it 'should convert UTF-8 String to byte buffer' do + e = "\u20AC".encode('UTF-8') # a string with euro sign character U+20AC + e.length.should == 1 + + a = Thrift::Bytes.convert_to_utf8_byte_buffer e + a.encoding.should == Encoding::BINARY + a.length.should == 3 + a.unpack('C*').should == [0xE2, 0x82, 0xAC] + end + + it 'should convert ISO-8859-15 String to UTF-8 byte buffer' do + # Assumptions + e = "\u20AC".encode('ISO-8859-15') # a string with euro sign character U+20AC, then converted to ISO-8859-15 + e.length.should == 1 + e.unpack('C*').should == [0xA4] # euro sign is a different code point in ISO-8859-15 + + a = Thrift::Bytes.convert_to_utf8_byte_buffer e + a.encoding.should == Encoding::BINARY + a.length.should == 3 + a.unpack('C*').should == [0xE2, 0x82, 0xAC] + end + end + + describe 
'.convert_to_string' do + it 'should convert UTF-8 byte buffer to a UTF-8 String' do + e = [0xE2, 0x82, 0xAC].pack("C*") + e.encoding.should == Encoding::BINARY + a = Thrift::Bytes.convert_to_string e + a.encoding.should == Encoding::UTF_8 + a.should == "\u20AC" + end + end + + else # RUBY_VERSION + describe '.empty_byte_buffer' do + it 'should create an empty buffer' do + b = Thrift::Bytes.empty_byte_buffer + b.length.should == 0 + end + + it 'should create an empty buffer of given size' do + b = Thrift::Bytes.empty_byte_buffer 2 + b.length.should == 2 + b[0].should == 0 + b[1].should == 0 + end + end + + describe '.force_binary_encoding' do + it 'should be a no-op' do + e = 'STRING' + a = Thrift::Bytes.force_binary_encoding e + a.should == e + a.should be(e) + end + end + + describe '.get_string_byte' do + it 'should get the byte at index' do + s = "\x41\x42" + Thrift::Bytes.get_string_byte(s, 0).should == 0x41 + Thrift::Bytes.get_string_byte(s, 1).should == 0x42 + end + end + + describe '.set_string_byte' do + it 'should set byte value at index' do + s = "\x41\x42" + Thrift::Bytes.set_string_byte(s, 0, 0x43) + s[0].should == 0x43 + s.should == 'CB' + end + end + + describe '.convert_to_utf8_byte_buffer' do + it 'should be a no-op' do + e = 'STRING' + a = Thrift::Bytes.convert_to_utf8_byte_buffer e + a.should == e + a.should be(e) + end + end + + describe '.convert_to_string' do + it 'should be a no-op' do + e = 'STRING' + a = Thrift::Bytes.convert_to_string e + a.should == e + a.should be(e) + end + end + end +end diff --git a/lib/rb/spec/compact_protocol_spec.rb b/lib/rb/spec/compact_protocol_spec.rb index 13c6b83d107..91dfe4402ea 100644 --- a/lib/rb/spec/compact_protocol_spec.rb +++ b/lib/rb/spec/compact_protocol_spec.rb @@ -1,3 +1,4 @@ +# encoding: UTF-8 # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. 
See the NOTICE file @@ -25,7 +26,7 @@ :i16 => (0..14).map {|shift| [1 << shift, -(1 << shift)]}.flatten.sort, :i32 => (0..30).map {|shift| [1 << shift, -(1 << shift)]}.flatten.sort, :i64 => (0..62).map {|shift| [1 << shift, -(1 << shift)]}.flatten.sort, - :string => ["", "1", "short", "fourteen123456", "fifteen12345678", "1" * 127, "1" * 3000], + :string => ["", "1", "short", "fourteen123456", "fifteen12345678", "unicode characters: \u20AC \u20AD", "1" * 127, "1" * 3000], :binary => ["", "\001", "\001" * 5, "\001" * 14, "\001" * 15, "\001" * 127, "\001" * 3000], :double => [0.0, 1.0, -1.0, 1.1, -1.1, 10000000.1, 1.0/0.0, -1.0/0.0], :bool => [true, false] diff --git a/lib/rb/spec/json_protocol_spec.rb b/lib/rb/spec/json_protocol_spec.rb index 3945925f8dc..a294ac5b908 100644 --- a/lib/rb/spec/json_protocol_spec.rb +++ b/lib/rb/spec/json_protocol_spec.rb @@ -1,3 +1,4 @@ +# encoding: UTF-8 # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. 
See the NOTICE file @@ -220,9 +221,25 @@ @trans.read(@trans.available).should == "\"-Infinity\"" end - it "should write string" do - @prot.write_string("this is a test string") - @trans.read(@trans.available).should == "\"this is a test string\"" + if RUBY_VERSION >= '1.9' + it 'should write string' do + @prot.write_string('this is a test string') + a = @trans.read(@trans.available) + a.should == '"this is a test string"'.force_encoding(Encoding::BINARY) + a.encoding.should == Encoding::BINARY + end + + it 'should write string with unicode characters' do + @prot.write_string("this is a test string with unicode characters: \u20AC \u20AD") + a = @trans.read(@trans.available) + a.should == "\"this is a test string with unicode characters: \u20AC \u20AD\"".force_encoding(Encoding::BINARY) + a.encoding.should == Encoding::BINARY + end + else + it 'should write string' do + @prot.write_string('this is a test string') + @trans.read(@trans.available).should == '"this is a test string"' + end end it "should write binary" do @@ -461,9 +478,25 @@ @prot.read_double.should == 12.23 end - it "should read string" do - @trans.write("\"this is a test string\"") - @prot.read_string.should == "this is a test string" + if RUBY_VERSION >= '1.9' + it 'should read string' do + @trans.write('"this is a test string"'.force_encoding(Encoding::BINARY)) + a = @prot.read_string + a.should == 'this is a test string' + a.encoding.should == Encoding::UTF_8 + end + + it 'should read string with unicode characters' do + @trans.write('"this is a test string with unicode characters: \u20AC \u20AD"'.force_encoding(Encoding::BINARY)) + a = @prot.read_string + a.should == "this is a test string with unicode characters: \u20AC \u20AD" + a.encoding.should == Encoding::UTF_8 + end + else + it 'should read string' do + @trans.write('"this is a test string"') + @prot.read_string.should == 'this is a test string' + end end it "should read binary" do