Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
Browse files

more work on parser, getting UTF-8 unescaping working

  • Loading branch information...
commit 36f20ec8651760190620f79905e8642d0bc79599 1 parent c0aec93
@brianmario authored
View
14 benchmark/parser.rb
@@ -14,21 +14,21 @@
Benchmark.bm do |x|
puts "yajl-ruby"
x.report do
- Yajl::Parser.parse(json)
+ yajl = Yajl::Parser.parse(json)
end
puts "JsonMachine"
x.report do
- JsonMachine::Parser.new.parse(json)
+ json_machine = JsonMachine::Parser.new.parse(json)
end
- puts "JSON (pure)"
- x.report do
- JSON.parse(json, :max_nesting => false)
- end
+ # puts "JSON (pure)"
+ # x.report do
+ # JSON.parse(json, :max_nesting => false)
+ # end
# puts "ActiveSupport"
# x.report do
# active_support = ActiveSupport::JSON.decode(json)
# end
-end
+end
View
60 lib/core_ext/string.rb
@@ -0,0 +1,60 @@
+# encoding: UTF-8
+
class String
  # Decodes JSON-style "\uXXXX" escape sequences into real UTF-8 characters.
  #
  # The logic follows Yajl (http://github.com/lloyd/yajl): a high surrogate
  # escape immediately followed by a low surrogate escape is combined into a
  # single supplementary-plane code point, and a surrogate that is not part
  # of such a pair is replaced with "?".
  #
  # All text that is not a "\uXXXX" escape is copied through unchanged. An
  # escaped backslash (two consecutive backslashes) is also left as-is, so
  # the "u" that follows it is not mistaken for the start of an escape.
  # Sequences that are not followed by exactly four hex digits are left
  # untouched rather than being decoded as garbage.
  #
  # Example: the 30-character input
  #   \u004d\u0430\u4e8c\ud800\udf02
  # becomes the four characters U+004D ("M"), U+0430, U+4E8C and U+10302.
  #
  # Returns self when the receiver contains no "\u" sequence at all,
  # otherwise a new String with the escapes decoded.
  def unescape_utf8
    return self unless include?('\\u')

    escape_pattern = /
      \\\\                                      # escaped backslash, kept verbatim
      | \\u(d[89ab]\h{2}) \\u(d[c-f]\h{2})      # high and low surrogate pair
      | \\u(\h{4})                              # any other four digit escape
    /xi

    gsub(escape_pattern) do |escape|
      high = Regexp.last_match(1)
      low = Regexp.last_match(2)
      single = Regexp.last_match(3)

      if high
        # Each surrogate carries 10 payload bits; the pair addresses the
        # range starting at U+10000.
        [0x10000 + ((high.hex & 0x3FF) << 10) + (low.hex & 0x3FF)].pack('U')
      elsif single
        codepoint = single.hex
        # A surrogate reaching this branch has no partner and cannot be
        # represented in UTF-8 on its own.
        (0xD800..0xDFFF).cover?(codepoint) ? '?' : [codepoint].pack('U')
      else
        escape
      end
    end
  end
end
View
1  lib/json_machine.rb
@@ -1,5 +1,6 @@
# encoding: UTF-8
+require File.join(File.dirname(__FILE__), 'core_ext', 'string')
require File.join(File.dirname(__FILE__), 'json_machine', 'parser')
require File.join(File.dirname(__FILE__), 'json_machine', 'encoder')
View
16 lib/json_machine/parser.rb
@@ -1,4 +1,5 @@
# encoding: UTF-8
+
require 'strscan'
module JsonMachine
@@ -117,18 +118,9 @@ def internal_parse(str)
# grabs the contents of a string between " and ", even escaped strings
scanner.pos += 1 # don't need the wrapping " char
current = scanner.scan_until(/\"|\\\".+\"/m)
- current.gsub!(/\\[\\bfnrt]/) { |match| u if u = UNESCAPE_MAP[$&[1]] }
- current.gsub!(/\\([\\\/]|u[[:xdigit:]]{4})/) do
- ustr = $1
- if ustr[0,1] == 'u'
- [ustr[1..-1].to_i(16)].pack("U")
- elsif ustr == '\\'
- '\\\\'
- else
- ustr
- end
- end
- current = current[0,current.size-1]
+ current.gsub!(/\\[\\bfnrt]/) { |match| match if match = UNESCAPE_MAP[$&[1]] }
+ current = current.unescape_utf8
+ current = current[0,current.size-1] if current[current.size-1,1] == "\""
if @state == :wants_hash_key
found_hash_key(current)
else
Please sign in to comment.
Something went wrong with that request. Please try again.