Skip to content
This repository has been archived by the owner on Nov 10, 2017. It is now read-only.

Commit

Permalink
more work on parser, getting UTF-8 unescaping working
Browse files Browse the repository at this point in the history
  • Loading branch information
brianmario committed Aug 3, 2009
1 parent c0aec93 commit 36f20ec
Show file tree
Hide file tree
Showing 4 changed files with 72 additions and 19 deletions.
14 changes: 7 additions & 7 deletions benchmark/parser.rb
Original file line number Diff line number Diff line change
Expand Up @@ -14,21 +14,21 @@
Benchmark.bm do |x|
puts "yajl-ruby"
x.report do
Yajl::Parser.parse(json)
yajl = Yajl::Parser.parse(json)
end

puts "JsonMachine"
x.report do
JsonMachine::Parser.new.parse(json)
json_machine = JsonMachine::Parser.new.parse(json)
end

puts "JSON (pure)"
x.report do
JSON.parse(json, :max_nesting => false)
end
# puts "JSON (pure)"
# x.report do
# JSON.parse(json, :max_nesting => false)
# end

# puts "ActiveSupport"
# x.report do
# active_support = ActiveSupport::JSON.decode(json)
# end
end
end
60 changes: 60 additions & 0 deletions lib/core_ext/string.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
# encoding: UTF-8

class String

  # Decodes JSON-style "\uXXXX" escape sequences into UTF-8 characters.
  #
  # The decoding rules are modelled on Yajl (http://github.com/lloyd/yajl):
  #
  # * "\uXXXX" (exactly four hex digits) becomes the character with that
  #   codepoint.
  # * An escaped high surrogate (D800-DBFF) immediately followed by an escaped
  #   low surrogate (DC00-DFFF) is combined into one supplementary-plane
  #   character.
  # * A surrogate that is not part of such a pair has no valid UTF-8 form and
  #   is replaced by "?".
  # * Everything else is copied through untouched. That includes plain text,
  #   other backslash escapes such as "\n", and an escaped backslash followed
  #   by "u" ("\\u0041" is NOT decoded).
  #
  # Example:
  #
  #   '\u004d\u0430\u4e8c\ud800\udf02'.unescape_utf8
  #   # => "Mа二𐌂"   (U+004D, U+0430, U+4E8C, U+10302)
  #
  # Returns self when the string contains no "\u" at all; otherwise returns a
  # new String and leaves the receiver unmodified.
  def unescape_utf8
    # Fast path: nothing that could possibly be an escape.
    return self unless include?('\\u')

    # The pattern consumes one backslash sequence at a time, so an escaped
    # backslash can never be mistaken for the start of a \u escape:
    #   group 1 + 2 : high surrogate followed by low surrogate
    #   group 3     : any other \uXXXX
    #   otherwise   : some other backslash + character pair (kept verbatim)
    gsub(/\\(?:u([dD][89abAB][0-9a-fA-F]{2})\\u([dD][c-fC-F][0-9a-fA-F]{2})|u([0-9a-fA-F]{4})|.)/m) do
      match = Regexp.last_match

      if match[1]
        high = match[1].hex
        low  = match[2].hex
        # Standard UTF-16 surrogate pair -> codepoint conversion.
        [0x10000 + ((high & 0x3FF) << 10) + (low & 0x3FF)].pack('U')
      elsif match[3]
        codepoint = match[3].hex
        if codepoint >= 0xD800 && codepoint <= 0xDFFF
          # Unpaired surrogate: cannot be encoded as valid UTF-8.
          '?'
        else
          [codepoint].pack('U')
        end
      else
        # Not a unicode escape; leave it exactly as written.
        match[0]
      end
    end
  end
end
1 change: 1 addition & 0 deletions lib/json_machine.rb
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# encoding: UTF-8

require File.join(File.dirname(__FILE__), 'core_ext', 'string')
require File.join(File.dirname(__FILE__), 'json_machine', 'parser')
require File.join(File.dirname(__FILE__), 'json_machine', 'encoder')

Expand Down
16 changes: 4 additions & 12 deletions lib/json_machine/parser.rb
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# encoding: UTF-8

require 'strscan'

module JsonMachine
Expand Down Expand Up @@ -117,18 +118,9 @@ def internal_parse(str)
# grabs the contents of a string between " and ", even escaped strings
scanner.pos += 1 # don't need the wrapping " char
current = scanner.scan_until(/\"|\\\".+\"/m)
current.gsub!(/\\[\\bfnrt]/) { |match| u if u = UNESCAPE_MAP[$&[1]] }
current.gsub!(/\\([\\\/]|u[[:xdigit:]]{4})/) do
ustr = $1
if ustr[0,1] == 'u'
[ustr[1..-1].to_i(16)].pack("U")
elsif ustr == '\\'
'\\\\'
else
ustr
end
end
current = current[0,current.size-1]
current.gsub!(/\\[\\bfnrt]/) { |match| match if match = UNESCAPE_MAP[$&[1]] }
current = current.unescape_utf8
current = current[0,current.size-1] if current[current.size-1,1] == "\""
if @state == :wants_hash_key
found_hash_key(current)
else
Expand Down

0 comments on commit 36f20ec

Please sign in to comment.