Skip to content

Commit

Permalink
Introduce Utils.normalize_uri and use it in WebsiteAgent
Browse files Browse the repository at this point in the history
This fixes #938, and the specs are from #958. (Thanks @irfancharania!)
  • Loading branch information
knu committed Nov 14, 2015
1 parent 8e9013d commit cdfdc7f
Show file tree
Hide file tree
Showing 4 changed files with 96 additions and 3 deletions.
7 changes: 4 additions & 3 deletions app/models/agents/website_agent.rb
Expand Up @@ -264,8 +264,9 @@ def check_url(url, payload = {})
error "Ignoring a non-HTTP url: #{url.inspect}"
return
end
log "Fetching #{url}"
response = faraday.get(url)
uri = Utils.normalize_uri(url)
log "Fetching #{uri}"
response = faraday.get(uri)
raise "Failed: #{response.inspect}" unless response.success?

interpolation_context.stack {
Expand Down Expand Up @@ -303,7 +304,7 @@ def check_url(url, payload = {})
interpolated['extract'].keys.each do |name|
result[name] = output[name][index]
if name.to_s == 'url'
result[name] = (response.env[:url] + result[name]).to_s
result[name] = (response.env[:url] + Utils.normalize_uri(result[name])).to_s
end
end

Expand Down
12 changes: 12 additions & 0 deletions lib/utils.rb
Expand Up @@ -21,6 +21,18 @@ def self.pretty_print(struct, indent = true)
end
end

# Parses +uri+ into a URI object, percent-encoding unsafe characters if the
# input is not parseable as given.
#
# The value is first passed to Kernel#URI unchanged. Only when that raises
# a URI::Error is the string form escaped and parsed a second time.
#
# When escaping, well-formed percent-escapes (a "%" followed by two hex
# digits) are kept as they are, so a URI that mixes escaped and unescaped
# parts is not double-encoded ("a b%20c" becomes "a%20b%20c", not
# "a%20b%2520c"). A "%" that does not start such a triplet is escaped
# as "%25".
#
# @param uri [String, URI, #to_s] the URI to normalize
# @return [URI::Generic] the parsed URI
# @raise [URI::Error] if the escaped string still cannot be parsed
def self.normalize_uri(uri)
  URI(uri)
rescue URI::Error
  # First alternative: a stray "%" that is not part of a valid escape.
  # Second alternative: a run of characters outside the safe set; "%" is
  # excluded from the run so that valid escapes are never swallowed by it.
  escaped = uri.to_s.gsub(/%(?![0-9A-Fa-f]{2})|[^\-_.!~*'()a-zA-Z\d;\/?:@&=+$,\[\]%]+/) do |unsafe|
    # Encode byte by byte so multibyte characters become one escape per byte.
    unsafe.each_byte.map { |byte| format('%%%02X', byte) }.join
  end
  # After escaping only ASCII characters remain, so tag the string as such.
  URI(escaped.force_encoding(Encoding::US_ASCII))
end

def self.interpolate_jsonpaths(value, data, options = {})
if options[:leading_dollarsign_is_jsonpath] && value[0] == '$'
Utils.values_at(data, value).first.to_s
Expand Down
17 changes: 17 additions & 0 deletions spec/data_fixtures/urlTest.html
@@ -0,0 +1,17 @@
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
<title>test</title>
</head>
<body>
<ul>
<li><a href="http://google.com">google</a></li>
<li><a href="https://www.google.ca/search?q=some query">broken</a></li>
<li><a href="https://www.google.ca/search?q=some%20query">escaped</a></li>
<li><a href="http://ko.wikipedia.org/wiki/위키백과:대문">unicode url</a></li>
<li><a href="https://www.google.ca/search?q=위키백과:대문">unicode param</a></li>
<li><a href="http://ko.wikipedia.org/wiki/%EC%9C%84%ED%82%A4%EB%B0%B1%EA%B3%BC:%EB%8C%80%EB%AC%B8">percent encoded url</a></li>
<li><a href="https://www.google.ca/search?q=%EC%9C%84%ED%82%A4%EB%B0%B1%EA%B3%BC:%EB%8C%80%EB%AC%B8">percent encoded param</a></li>
</ul>
</body>
</html>
63 changes: 63 additions & 0 deletions spec/models/agents/website_agent_spec.rb
Expand Up @@ -911,4 +911,67 @@
end
end
end

# Checks the 'url' payloads of the events WebsiteAgent creates from the
# links in spec/data_fixtures/urlTest.html.
describe "checking urls" do
  before do
    fixture_html = File.read(Rails.root.join("spec/data_fixtures/urlTest.html"))
    # Any request whose URL matches /example/ is answered with the fixture page.
    stub_request(:any, /example/).to_return(:body => fixture_html, :status => 200)

    @valid_options = {
      'name' => "Url Test",
      'expected_update_period_in_days' => "2",
      'type' => "html",
      'url' => "http://www.example.com",
      'mode' => 'all',
      'extract' => {
        'url' => { 'css' => "a", 'value' => "@href" },
      }
    }

    @checker = Agents::WebsiteAgent.new(:name => "ua", :options => @valid_options)
    @checker.user = users(:bob)
    @checker.save!
  end

  describe "#check" do
    before do
      # The fixture contains seven links, so one check must emit seven events.
      expect { @checker.check }.to change { Event.count }.by(7)
      @events = Event.last(7)
    end

    # Each row is [example description, expected 'url' payload]; the row's
    # position is the index of the event it is compared against.
    [
      ["should check hostname",
       "http://google.com"],
      ["should check unescaped query",
       "https://www.google.ca/search?q=some%20query"],
      ["should check properly escaped query",
       "https://www.google.ca/search?q=some%20query"],
      ["should check unescaped unicode url",
       "http://ko.wikipedia.org/wiki/%EC%9C%84%ED%82%A4%EB%B0%B1%EA%B3%BC:%EB%8C%80%EB%AC%B8"],
      ["should check unescaped unicode query",
       "https://www.google.ca/search?q=%EC%9C%84%ED%82%A4%EB%B0%B1%EA%B3%BC:%EB%8C%80%EB%AC%B8"],
      ["should check properly escaped unicode url",
       "http://ko.wikipedia.org/wiki/%EC%9C%84%ED%82%A4%EB%B0%B1%EA%B3%BC:%EB%8C%80%EB%AC%B8"],
      ["should check properly escaped unicode query",
       "https://www.google.ca/search?q=%EC%9C%84%ED%82%A4%EB%B0%B1%EA%B3%BC:%EB%8C%80%EB%AC%B8"],
    ].each_with_index do |(description, expected_url), position|
      it description do
        event = @events[position]
        expect(event.payload['url']).to eq(expected_url)
      end
    end
  end
end
end

0 comments on commit cdfdc7f

Please sign in to comment.