Skip to content

Commit

Permalink
Thrift-1023:Thrift encoding (UTF-8) issue with Ruby 1.9.2
Browse files Browse the repository at this point in the history
Client: rb
Patch: Nathan Beyer 

Fixes encoding issue for UTF-8 strings in ruby client.



git-svn-id: https://svn.apache.org/repos/asf/thrift/trunk@1395832 13f79535-47bb-0310-9956-ffa450edef68
  • Loading branch information
jfarrell committed Oct 9, 2012
1 parent fc35612 commit b5a18a1
Show file tree
Hide file tree
Showing 24 changed files with 583 additions and 88 deletions.
7 changes: 5 additions & 2 deletions lib/rb/ext/binary_protocol_accelerated.c
Expand Up @@ -22,7 +22,8 @@
#include <stdint.h>
#include <constants.h>
#include <struct.h>
#include "macros.h"
#include <macros.h>
#include <bytes.h>

VALUE rb_thrift_binary_proto_native_qmark(VALUE self) {
return Qtrue;
Expand Down Expand Up @@ -80,6 +81,7 @@ static void write_string_direct(VALUE trans, VALUE str) {
if (TYPE(str) != T_STRING) {
rb_raise(rb_eStandardError, "Value should be a string");
}
str = convert_to_utf8_byte_buffer(str);
write_i32_direct(trans, RSTRING_LEN(str));
rb_funcall(trans, write_method_id, 1, str);
}
Expand Down Expand Up @@ -380,7 +382,8 @@ VALUE rb_thrift_binary_proto_read_double(VALUE self) {

/*
 * Reads a length-prefixed string from the protocol.
 *
 * The length is obtained from read_i32_direct, that many bytes are fetched
 * through READ, and the resulting byte buffer is passed to convert_to_string
 * so the caller receives the converted String rather than the raw buffer.
 *
 * A stale early `return READ(self, size);` previously preceded the
 * conversion and made it unreachable; it has been removed.
 */
VALUE rb_thrift_binary_proto_read_string(VALUE self) {
  int size = read_i32_direct(self);
  VALUE buffer = READ(self, size);
  return convert_to_string(buffer);
}

void Init_binary_protocol_accelerated() {
Expand Down
36 changes: 36 additions & 0 deletions lib/rb/ext/bytes.c
@@ -0,0 +1,36 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

#include <ruby.h>
#ifdef HAVE_RUBY_ENCODING_H
#include <ruby/encoding.h>
#endif
#include <constants.h>

/*
 * Native entry point for forcing a buffer to binary encoding.
 *
 * Dispatches, via rb_funcall, the method identified by
 * force_binary_encoding_id on thrift_bytes_module with `buffer` as its single
 * argument, and returns whatever that Ruby method returns. Both globals must
 * have been initialised before this is called.
 */
VALUE force_binary_encoding(VALUE buffer) {
return rb_funcall(thrift_bytes_module, force_binary_encoding_id, 1, buffer);
}

/*
 * Native entry point for turning a String into a UTF-8 byte buffer.
 *
 * Dispatches, via rb_funcall, the method identified by
 * convert_to_utf8_byte_buffer_id on thrift_bytes_module with `string` as its
 * single argument, and returns that method's result. Any Ruby exception
 * raised by the callee propagates to the caller.
 */
VALUE convert_to_utf8_byte_buffer(VALUE string) {
return rb_funcall(thrift_bytes_module, convert_to_utf8_byte_buffer_id, 1, string);
}

/*
 * Native entry point for turning a UTF-8 byte buffer back into a String.
 *
 * Dispatches, via rb_funcall, the method identified by convert_to_string_id
 * on thrift_bytes_module with `utf8_buffer` as its single argument, and
 * returns that method's result.
 */
VALUE convert_to_string(VALUE utf8_buffer) {
return rb_funcall(thrift_bytes_module, convert_to_string_id, 1, utf8_buffer);
}
31 changes: 31 additions & 0 deletions lib/rb/ext/bytes.h
@@ -0,0 +1,31 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

#include <ruby.h>

/*
* A collection of utilities for working with bytes and byte buffers.
*
* These functions are the native analogues of some of the methods in
* Thrift::Bytes (thrift/bytes.rb).
*/

VALUE force_binary_encoding(VALUE buffer);
VALUE convert_to_utf8_byte_buffer(VALUE string);
VALUE convert_to_string(VALUE utf8_buffer);
11 changes: 7 additions & 4 deletions lib/rb/ext/compact_protocol.c
Expand Up @@ -20,9 +20,10 @@
#include <ruby.h>
#include <stdbool.h>
#include <stdint.h>
#include "constants.h"
#include "struct.h"
#include "macros.h"
#include <constants.h>
#include <struct.h>
#include <macros.h>
#include <bytes.h>

#define LAST_ID(obj) FIX2INT(rb_ary_pop(rb_ivar_get(obj, last_field_id)))
#define SET_LAST_ID(obj, val) rb_ary_push(rb_ivar_get(obj, last_field_id), val)
Expand Down Expand Up @@ -305,6 +306,7 @@ VALUE rb_thrift_compact_proto_write_double(VALUE self, VALUE dub) {

VALUE rb_thrift_compact_proto_write_string(VALUE self, VALUE str) {
VALUE transport = GET_TRANSPORT(self);
str = convert_to_utf8_byte_buffer(str);
write_varint32(transport, RSTRING_LEN(str));
WRITE(transport, RSTRING_PTR(str), RSTRING_LEN(str));
return Qnil;
Expand Down Expand Up @@ -546,7 +548,8 @@ VALUE rb_thrift_compact_proto_read_double(VALUE self) {

/*
 * Reads a length-prefixed string from the compact protocol.
 *
 * The length is obtained from read_varint64, that many bytes are fetched
 * through READ, and the resulting byte buffer is passed to convert_to_string
 * so the caller receives the converted String rather than the raw buffer.
 *
 * A stale early `return READ(self, size);` previously preceded the
 * conversion and made it unreachable; it has been removed.
 */
VALUE rb_thrift_compact_proto_read_string(VALUE self) {
  int64_t size = read_varint64(self);
  VALUE buffer = READ(self, size);
  return convert_to_string(buffer);
}

static void Init_constants() {
Expand Down
4 changes: 4 additions & 0 deletions lib/rb/ext/constants.h
Expand Up @@ -76,6 +76,9 @@ extern ID write_method_id;
extern ID read_all_method_id;
extern ID read_into_buffer_method_id;
extern ID native_qmark_method_id;
extern ID force_binary_encoding_id;
extern ID convert_to_utf8_byte_buffer_id;
extern ID convert_to_string_id;

extern ID fields_const_id;
extern ID transport_ivar_id;
Expand All @@ -92,5 +95,6 @@ extern VALUE class_sym;
extern VALUE rb_cSet;
extern VALUE thrift_module;
extern VALUE thrift_types_module;
extern VALUE thrift_bytes_module;
extern VALUE class_thrift_protocol;
extern VALUE protocol_exception_class;
4 changes: 3 additions & 1 deletion lib/rb/ext/memory_buffer.c
Expand Up @@ -19,7 +19,8 @@

#include <ruby.h>
#include <constants.h>
#include "macros.h"
#include <bytes.h>
#include <macros.h>

ID buf_ivar_id;
ID index_ivar_id;
Expand All @@ -37,6 +38,7 @@ VALUE rb_thrift_memory_buffer_read_into_buffer(VALUE self, VALUE buffer_value, V

/*
 * Appends the bytes of `str` to this memory buffer's backing string.
 *
 * The backing string is looked up with GET_BUF, `str` is passed through
 * force_binary_encoding, and the resulting bytes are appended with
 * rb_str_buf_cat. Always returns Qnil.
 */
VALUE rb_thrift_memory_buffer_write(VALUE self, VALUE str) {
  VALUE backing = GET_BUF(self);
  VALUE bytes = force_binary_encoding(str);

  /* Read pointer and length from the same (encoding-forced) object. */
  rb_str_buf_cat(backing, RSTRING_PTR(bytes), RSTRING_LEN(bytes));
  return Qnil;
}
Expand Down
9 changes: 9 additions & 0 deletions lib/rb/ext/thrift_native.c
Expand Up @@ -18,6 +18,7 @@
*/

#include <ruby.h>
#include <bytes.h>
#include <struct.h>
#include <binary_protocol_accelerated.h>
#include <compact_protocol.h>
Expand All @@ -27,6 +28,7 @@
// cached classes/modules
VALUE rb_cSet;
VALUE thrift_module;
VALUE thrift_bytes_module;
VALUE thrift_types_module;

// TType constants
Expand Down Expand Up @@ -90,6 +92,9 @@ ID write_method_id;
ID read_all_method_id;
ID read_into_buffer_method_id;
ID native_qmark_method_id;
ID force_binary_encoding_id;
ID convert_to_utf8_byte_buffer_id;
ID convert_to_string_id;

// constant ids
ID fields_const_id;
Expand All @@ -109,6 +114,7 @@ VALUE protocol_exception_class;
void Init_thrift_native() {
// cached classes
thrift_module = rb_const_get(rb_cObject, rb_intern("Thrift"));
thrift_bytes_module = rb_const_get(thrift_module, rb_intern("Bytes"));
thrift_types_module = rb_const_get(thrift_module, rb_intern("Types"));
rb_cSet = rb_const_get(rb_cObject, rb_intern("Set"));
protocol_exception_class = rb_const_get(thrift_module, rb_intern("ProtocolException"));
Expand Down Expand Up @@ -173,6 +179,9 @@ void Init_thrift_native() {
read_all_method_id = rb_intern("read_all");
read_into_buffer_method_id = rb_intern("read_into_buffer");
native_qmark_method_id = rb_intern("native?");
force_binary_encoding_id = rb_intern("force_binary_encoding");
convert_to_utf8_byte_buffer_id = rb_intern("convert_to_utf8_byte_buffer");
convert_to_string_id = rb_intern("convert_to_string");

// constant ids
fields_const_id = rb_intern("FIELDS");
Expand Down
1 change: 1 addition & 0 deletions lib/rb/lib/thrift.rb
Expand Up @@ -22,6 +22,7 @@

$:.unshift File.dirname(__FILE__)

require 'thrift/bytes'
require 'thrift/core_ext'
require 'thrift/exceptions'
require 'thrift/types'
Expand Down
131 changes: 131 additions & 0 deletions lib/rb/lib/thrift/bytes.rb
@@ -0,0 +1,131 @@
# encoding: ascii-8bit
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#

module Thrift
  # A collection of utilities for working with bytes and byte buffers.
  #
  # Two implementations are provided. On Ruby 1.9+ Strings carry an Encoding,
  # so byte buffers are Strings tagged as BINARY. On older Rubies Strings are
  # plain byte arrays and most helpers are pass-throughs.
  module Bytes
    if RUBY_VERSION >= '1.9'
      # Creates an empty byte buffer (String with BINARY encoding).
      #
      # size - The Integer size of the buffer to create (default: nil).
      #
      # Returns a new, unfrozen String with BINARY encoding, filled with null
      # characters if size is greater than zero, empty otherwise.
      def self.empty_byte_buffer(size = nil)
        length = (size && size > 0) ? size : 0
        # String#* always allocates a fresh String, so the literal itself is
        # never mutated. Calling force_encoding directly on a literal raises
        # FrozenError when string literals are frozen.
        ("\0" * length).force_encoding(Encoding::BINARY)
      end

      # Forces the encoding of the buffer to BINARY. If the buffer
      # passed is frozen, then it will be duplicated.
      #
      # buffer - The String to force the encoding of.
      #
      # Returns the String passed with an encoding of BINARY; returned
      # String may be a duplicate.
      def self.force_binary_encoding(buffer)
        buffer = buffer.dup if buffer.frozen?
        buffer.force_encoding(Encoding::BINARY)
      end

      # Gets the byte value of a given position in a String.
      #
      # string - The String to retrieve the byte value from.
      # index  - The Integer location of the byte value to retrieve.
      #
      # Returns an Integer value between 0 and 255.
      def self.get_string_byte(string, index)
        string.getbyte(index)
      end

      # Sets the byte value given to a given index in a String.
      #
      # string - The String to set the byte value in.
      # index  - The Integer location to set the byte value at.
      # byte   - The Integer value (0 to 255) to set in the string.
      #
      # Returns an Integer value of the byte value to set.
      def self.set_string_byte(string, index, byte)
        string.setbyte(index, byte)
      end

      # Converts the given String to a UTF-8 byte buffer.
      #
      # string - The String to convert. It is never modified.
      #
      # Returns a new String with BINARY encoding, containing the UTF-8
      # bytes of the original string.
      # Raises EncodingError if the String cannot be transcoded to UTF-8.
      def self.convert_to_utf8_byte_buffer(string)
        utf8 =
          if string.encoding == Encoding::UTF_8
            # Already UTF-8; duplicate so the caller's String keeps its encoding.
            string.dup
          else
            # Transcode to UTF-8 (returns a new String).
            string.encode(Encoding::UTF_8)
          end
        utf8.force_encoding(Encoding::BINARY)
      end

      # Converts the given UTF-8 byte buffer into a String.
      #
      # utf8_buffer - A String, with BINARY encoding, containing UTF-8 bytes.
      #               It is never modified.
      #
      # Returns a new String with UTF-8 encoding. The bytes are relabelled,
      # not validated.
      def self.convert_to_string(utf8_buffer)
        utf8_buffer.dup.force_encoding(Encoding::UTF_8)
      end
    else
      # Creates an empty byte buffer.
      #
      # size - The Integer size of the buffer to create (default: nil).
      #
      # Returns a String filled with null characters if size is greater than
      # zero, empty otherwise.
      def self.empty_byte_buffer(size = nil)
        if size && size > 0
          "\0" * size
        else
          ''
        end
      end

      # No-op on this branch; the buffer is returned unchanged.
      #
      # buffer - The String byte buffer.
      #
      # Returns the String passed.
      def self.force_binary_encoding(buffer)
        buffer
      end

      # Gets the byte value of a given position in a String.
      #
      # string - The String to retrieve the byte value from.
      # index  - The Integer location of the byte value to retrieve.
      #
      # Returns the result of String#[] for the index.
      def self.get_string_byte(string, index)
        string[index]
      end

      # Sets the byte value given to a given index in a String.
      #
      # string - The String to set the byte value in.
      # index  - The Integer location to set the byte value at.
      # byte   - The Integer value (0 to 255) to set in the string.
      #
      # Returns the byte value assigned.
      def self.set_string_byte(string, index, byte)
        string[index] = byte
      end

      # Returns the String unchanged.
      #
      # This assumes $KCODE is 'UTF8'/'U', which would mean the String is
      # already a UTF-8 byte buffer.
      # TODO: consider handling other $KCODE values and transcoding with iconv.
      def self.convert_to_utf8_byte_buffer(string)
        string
      end

      # Returns the buffer unchanged, under the same $KCODE assumption as
      # convert_to_utf8_byte_buffer on this branch.
      def self.convert_to_string(utf8_buffer)
        utf8_buffer
      end
    end
  end
end
10 changes: 10 additions & 0 deletions lib/rb/lib/thrift/protocol/base_protocol.rb
Expand Up @@ -114,6 +114,13 @@ def write_double(dub)
raise NotImplementedError
end

# Writes a Thrift String. In Ruby 1.9+, the String passed will be transcoded to UTF-8.
#
# This base implementation is a placeholder that always raises
# NotImplementedError; the behaviour described here is the contract for
# implementations that override it.
#
# str - The String to write.
#
# Raises EncodingError if the transcoding to UTF-8 fails.
# Raises NotImplementedError when called on this base implementation.
#
# Returns nothing.
def write_string(str)
raise NotImplementedError
end
Expand Down Expand Up @@ -178,6 +185,9 @@ def read_double
raise NotImplementedError
end

# Reads a Thrift String. In Ruby 1.9+, all Strings will be returned with an Encoding of UTF-8.
#
# This base implementation is a placeholder that always raises
# NotImplementedError; the behaviour described here is the contract for
# implementations that override it.
#
# Raises NotImplementedError when called on this base implementation.
#
# Returns a String.
def read_string
raise NotImplementedError
end
Expand Down

0 comments on commit b5a18a1

Please sign in to comment.