Skip to content

Commit

Permalink
Improve Utils.normalize_uri
Browse files Browse the repository at this point in the history
Globally replacing generally unsafe characters in a URL would not fix
invalid authorities and paths, so use Addressable::URI to normalize them
when necessary.

This should fix #1701.
  • Loading branch information
knu committed Oct 4, 2016
1 parent fd5bd09 commit 62f661c
Show file tree
Hide file tree
Showing 3 changed files with 41 additions and 8 deletions.
37 changes: 32 additions & 5 deletions lib/utils.rb
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
require 'jsonpath'
require 'cgi'
require 'addressable/uri'

module Utils
def self.unindent(s)
Expand All @@ -25,14 +26,40 @@ def self.normalize_uri(uri)
begin
URI(uri)
rescue URI::Error
URI(uri.to_s.gsub(/[^\-_.!~*'()a-zA-Z\d;\/?:@&=+$,\[\]]+/) { |unsafe|
unsafe.bytes.each_with_object(String.new) { |uc, s|
s << sprintf('%%%02X', uc)
}
}.force_encoding(Encoding::US_ASCII))
begin
URI(uri.to_s.gsub(/[^\-_.!~*'()a-zA-Z\d;\/?:@&=+$,\[\]]+/) { |unsafe|
unsafe.bytes.each_with_object(String.new) { |uc, s|
s << sprintf('%%%02X', uc)
}
}.force_encoding(Encoding::US_ASCII))
rescue URI::Error => e
begin
auri = Addressable::URI.parse(uri.to_s)
rescue
# Do not leak Addressable::URI::InvalidURIError which
# callers might not expect.
raise e
else
# Addressable::URI#normalize! modifies the query and
# fragment components beyond escaping unsafe characters, so
# avoid using it. Otherwise `?a[]=%2F` would be normalized
# as `?a%5B%5D=/`, for example.
auri.site = auri.normalized_site
auri.path = auri.normalized_path
URI(auri.to_s)
end
end
end
end

def self.percent_encode(string, pattern = /[^\-_.!~*'()a-zA-Z\d;\/?:@&=+$,\[\]]+/)
string.gsub(pattern) { |unsafe|
unsafe.bytes.each_with_object(String.new) { |b, s|
s << sprintf('%%%02X', b)
}
}
end

def self.interpolate_jsonpaths(value, data, options = {})
if options[:leading_dollarsign_is_jsonpath] && value[0] == '$'
Utils.values_at(data, value).first.to_s
Expand Down
3 changes: 2 additions & 1 deletion spec/data_fixtures/urlTest.html
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
<li><a href="https://www.google.ca/search?q=위키백과:대문">unicode param</a></li>
<li><a href="http://ko.wikipedia.org/wiki/%EC%9C%84%ED%82%A4%EB%B0%B1%EA%B3%BC:%EB%8C%80%EB%AC%B8">percent encoded url</a></li>
<li><a href="https://www.google.ca/search?q=%EC%9C%84%ED%82%A4%EB%B0%B1%EA%B3%BC:%EB%8C%80%EB%AC%B8">percent encoded param</a></li>
<li><a href="http://example.org/path[]?query[]=foo">brackets</a></li>
</ul>
</body>
</html>
</html>
9 changes: 7 additions & 2 deletions spec/models/agents/website_agent_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -1105,8 +1105,8 @@

describe "#check" do
before do
expect { @checker.check }.to change { Event.count }.by(7)
@events = Event.last(7)
expect { @checker.check }.to change { Event.count }.by(8)
@events = Event.last(8)
end

it "should check hostname" do
Expand Down Expand Up @@ -1143,6 +1143,11 @@
event = @events[6]
expect(event.payload['url']).to eq("https://www.google.ca/search?q=%EC%9C%84%ED%82%A4%EB%B0%B1%EA%B3%BC:%EB%8C%80%EB%AC%B8")
end

it "should check url with unescaped brackets in the path component" do
event = @events[7]
expect(event.payload['url']).to eq("http://example.org/path%5B%5D?query[]=foo")
end
end
end
end

0 comments on commit 62f661c

Please sign in to comment.