Skip to content

Commit

Permalink
Merge pull request #1743 from cantino/website_agent_can_interpolate_a…
Browse files Browse the repository at this point in the history
…fter_extraction

WebsiteAgent can interpolate after extraction

Incorporating feedback from @cantino and @dsander.
  • Loading branch information
knu committed Nov 1, 2016
2 parents e3f1429 + 58fabb8 commit 91f096b
Show file tree
Hide file tree
Showing 6 changed files with 222 additions and 15 deletions.
5 changes: 5 additions & 0 deletions app/concerns/liquid_interpolatable.rb
Original file line number Diff line number Diff line change
Expand Up @@ -189,6 +189,11 @@ def uri_expand(url, limit = 5)
url
end

# Liquid filter: rebase the URIs found in attributes of an HTML fragment
# onto base_uri by delegating to Utils.rebase_hrefs.
#
# This is best-effort: if the delegate raises a StandardError, the input is
# handed back unchanged.
def rebase_hrefs(input, base_uri)
  Utils.rebase_hrefs(input, base_uri)
rescue StandardError
  input
end

# Unescape (basic) HTML entities in a string
#
# This currently decodes the following entities only: "'",
Expand Down
70 changes: 58 additions & 12 deletions app/models/agents/website_agent.rb
Original file line number Diff line number Diff line change
Expand Up @@ -111,24 +111,41 @@ class WebsiteAgent < Agent
Set `http_success_codes` to an array of status codes (e.g., `[404, 422]`) to treat HTTP response codes beyond 200 as successes.
If a `template` option is given, it is used as a Liquid template for each event created by this Agent, instead of directly emitting the results of extraction as events. In the template, keys of extracted data can be interpolated, and some additional variables are also available as explained in the next section. For example:
"template": {
"url": "{{ url }}",
"title": "{{ title }}",
"description": "{{ body_text }}",
"last_modified": "{{ _response_.headers.Last-Modified | date: '%FT%T' }}"
}
In the `on_change` mode, change is detected based on the resulting event payload after applying this option. If you want to add some keys to each event but ignore any change in them, set `mode` to `all` and put a DeDuplicationAgent downstream.
# Liquid Templating
In Liquid templating, the following variable is available:
In Liquid templating, the following variables are available except when invoked by `data_from_event`:
* `_url_`: The URL specified to fetch the content from.
* `_response_`: A response object with the following keys:
* `status`: HTTP status as integer. (Almost always 200)
* `headers`: Response headers; for example, `{{ _response_.headers.Content-Type }}` expands to the value of the Content-Type header. Keys are case-insensitive, and `-` and `_` are treated as equivalent.
* `url`: The final URL of the fetched page, following redirects. Using this in the `template` option, you can resolve relative URLs extracted from a document like `{{ link | to_uri: _response_.url }}` and `{{ content | rebase_hrefs: _response_.url }}`.
# Ordering Events
#{description_events_order}
MD

event_description do
keys = options['template'].presence || options['extract'].keys

"Events will have the following fields:\n\n %s" % [
Utils.pretty_print(Hash[options['extract'].keys.map { |key|
Utils.pretty_print(Hash[keys.map { |key|
[key, "..."]
}])
]
Expand Down Expand Up @@ -157,6 +174,7 @@ def validate_options
errors.add(:base, "either url, url_from_event, or data_from_event are required") unless options['url'].present? || options['url_from_event'].present? || options['data_from_event'].present?
errors.add(:base, "expected_update_period_in_days is required") unless options['expected_update_period_in_days'].present?
validate_extract_options!
validate_template_options!
validate_http_success_codes!

# Check for optional fields
Expand Down Expand Up @@ -281,6 +299,15 @@ def validate_extract_options!
end
end

# Validates the optional `template` option.
#
# Nothing happens when the option is blank. Otherwise it must be a Hash
# whose values are all Strings; if not, an error is added on :base.
def validate_template_options!
  template = options['template'].presence
  return unless template

  well_formed = template.is_a?(Hash) &&
                template.values.all? { |value| value.is_a?(String) }
  return if well_formed

  errors.add(:base, 'template must be a hash of strings.')
end

# Runs a check by handing the interpolated `url` option to check_urls.
def check
check_urls(interpolated['url'])
end
Expand All @@ -305,6 +332,7 @@ def check_url(url, existing_payload = {})
raise "Failed: #{response.inspect}" unless consider_response_successful?(response)

interpolation_context.stack {
interpolation_context['_url_'] = uri.to_s
interpolation_context['_response_'] = ResponseDrop.new(response)
handle_data(response.body, response.env[:url], existing_payload)
}
Expand Down Expand Up @@ -343,20 +371,33 @@ def handle_data(body, url, existing_payload)
extract_xml(doc)
end

num_unique_lengths = interpolated['extract'].keys.map { |name| output[name].length }.uniq

if num_unique_lengths.length != 1
if output.each_value.each_cons(2).any? { |m, n| m.size != n.size }
raise "Got an uneven number of matches for #{interpolated['name']}: #{interpolated['extract'].inspect}"
end

old_events = previous_payloads num_unique_lengths.first
num_unique_lengths.first.times do |index|
result = {}
interpolated['extract'].keys.each do |name|
result[name] = output[name][index]
if name.to_s == 'url' && url.present?
result[name] = (url + Utils.normalize_uri(result[name])).to_s
num_tuples = output.each_value.first.size

old_events = previous_payloads num_tuples

template = options['template'].presence

num_tuples.times do |index|
extracted = {}
interpolated['extract'].each_key do |name|
extracted[name] = output[name][index]
end

result =
if template
interpolate_with(extracted) do
interpolate_options(template)
end
else
extracted
end

if payload_url = result['url'].presence
result['url'] = (url + Utils.normalize_uri(payload_url)).to_s
end

if store_payload!(old_events, result)
Expand Down Expand Up @@ -567,6 +608,11 @@ def headers
# Status of the wrapped response object, exposed to templates as
# `_response_.status`.
def status
@object.status
end

# The URL recorded in the wrapped response's env, converted to a String.
# The agent description documents this as the final URL of the fetched
# page, following redirects.
def url
@object.env.url.to_s
end
end

# Wraps Faraday::Utils::Headers
Expand Down
89 changes: 89 additions & 0 deletions lib/utils.rb
Original file line number Diff line number Diff line change
Expand Up @@ -170,4 +170,93 @@ def self.if_present(string, method)
nil
end
end

# Helpers for rewriting URI-valued attributes of an HTML document or
# fragment while leaving the rest of the markup alone.
module HTMLTransformer
  # Formats an attribute value can take.
  SINGLE = 1           # exactly one URI
  MULTIPLE = 2         # whitespace-separated list of URIs
  COMMA_SEPARATED = 3  # comma-separated list of URIs
  SRCSET = 4           # comma-separated candidates, each "URI [descriptor]"

  # element name => { attribute name => value format }
  URI_ATTRIBUTES = {
    'a' => { 'href' => SINGLE },
    'applet' => { 'archive' => COMMA_SEPARATED, 'codebase' => SINGLE },
    'area' => { 'href' => SINGLE },
    'audio' => { 'src' => SINGLE },
    'base' => { 'href' => SINGLE },
    'blockquote' => { 'cite' => SINGLE },
    'body' => { 'background' => SINGLE },
    'button' => { 'formaction' => SINGLE },
    'command' => { 'icon' => SINGLE },
    'del' => { 'cite' => SINGLE },
    'embed' => { 'src' => SINGLE },
    'form' => { 'action' => SINGLE },
    'frame' => { 'longdesc' => SINGLE, 'src' => SINGLE },
    'head' => { 'profile' => SINGLE },
    'html' => { 'manifest' => SINGLE },
    'iframe' => { 'longdesc' => SINGLE, 'src' => SINGLE },
    'img' => { 'longdesc' => SINGLE, 'src' => SINGLE, 'srcset' => SRCSET, 'usemap' => SINGLE },
    'input' => { 'formaction' => SINGLE, 'src' => SINGLE, 'usemap' => SINGLE },
    'ins' => { 'cite' => SINGLE },
    'link' => { 'href' => SINGLE },
    'object' => { 'archive' => MULTIPLE, 'classid' => SINGLE, 'codebase' => SINGLE, 'data' => SINGLE, 'usemap' => SINGLE },
    'q' => { 'cite' => SINGLE },
    'script' => { 'src' => SINGLE },
    'source' => { 'src' => SINGLE, 'srcset' => SRCSET },
    'video' => { 'poster' => SINGLE, 'src' => SINGLE },
  }.freeze

  # Selects every element whose name is a key of URI_ATTRIBUTES.
  URI_ELEMENTS_XPATH = ('//*[%s]' % URI_ATTRIBUTES.keys.map { |name| "name()='#{name}'" }.join(' or ')).freeze

  # One item of a comma-separated URI list. Commas are separators here, so
  # they must not be captured as part of the URI ("a.jar,b.jar" is two URIs).
  COMMA_SEPARATED_URI = /((?:\A|,)\s*)([^\s,]+)/

  # The URI of one srcset candidate. A URI runs up to the next whitespace and
  # may contain commas, but a trailing comma is the candidate separator
  # ("a.png, b.png"), so the capture must end on a non-comma character.
  SRCSET_URI = /((?:\A|,)\s*)(\S*[^\s,])/

  module_function

  # Parses html, yields the parsed document to the block for in-place
  # modification, and returns the serialized result.
  #
  # The input may be a full document (XML declaration or DOCTYPE), a document
  # starting at <html>/<head>/<body>, or a bare fragment; the output is
  # serialized at the same level so no wrapper markup is added.
  #
  # Raises ArgumentError when no block is given.
  def transform(html, &block)
    raise ArgumentError, 'block must be given' unless block

    case html
    when /\A\s*(?:<\?xml[\s?]|<!DOCTYPE\s)/i
      doc = Nokogiri.parse(html)
      yield doc
      doc.to_s
    when /\A\s*<(html|head|body)[\s>]/i
      # Libxml2 automatically adds DOCTYPE and <html>, so we need to
      # skip them.
      #
      # The match above is case-insensitive, but the HTML parser lowercases
      # element names and XPath name tests are case-sensitive; without the
      # downcase, input such as "<BODY>" would make at_xpath return nil.
      element_name = Regexp.last_match(1).downcase
      doc = Nokogiri::HTML::Document.parse(html)
      yield doc
      doc.at_xpath("//#{element_name}").xpath('self::node() | following-sibling::node()').to_s
    else
      doc = Nokogiri::HTML::Document.parse("<html><body>#{html}")
      yield doc
      doc.xpath("/html/body/node()").to_s
    end
  end

  # Replaces every URI found in the attributes listed in URI_ATTRIBUTES with
  # the value the block returns for it, and returns the rewritten HTML.
  #
  # The block is called once per URI with the URI string and must return a
  # String. Raises ArgumentError when no block is given.
  def replace_uris(html, &block)
    raise ArgumentError, 'block must be given' unless block

    transform(html) do |doc|
      doc.xpath(URI_ELEMENTS_XPATH).each do |element|
        uri_attrs = URI_ATTRIBUTES[element.name]
        next unless uri_attrs

        uri_attrs.each do |name, format|
          attr = element.attribute(name)
          next unless attr

          case format
          when SINGLE
            attr.value = block.call(attr.value.strip)
          when MULTIPLE
            attr.value = attr.value.gsub(/\S+/) { |uri| block.call(uri) }
          when COMMA_SEPARATED
            attr.value = attr.value.gsub(COMMA_SEPARATED_URI) do
              Regexp.last_match(1) + block.call(Regexp.last_match(2))
            end
          when SRCSET
            attr.value = attr.value.gsub(SRCSET_URI) do
              Regexp.last_match(1) + block.call(Regexp.last_match(2))
            end
          end
        end
      end
    end
  end
end

# Resolves every URI attribute in html against base_uri and returns the
# rewritten HTML. Both the base and each URI found are passed through
# normalize_uri before being merged.
def self.rebase_hrefs(html, base_uri)
  base = normalize_uri(base_uri)

  HTMLTransformer.replace_uris(html) do |url|
    base.merge(normalize_uri(url)).to_s
  end
end
end
38 changes: 38 additions & 0 deletions spec/concerns/liquid_interpolatable_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -323,4 +323,42 @@ def ensure_safety(obj)
end
end
end

# Exercises the `rebase_hrefs` Liquid filter through an agent's interpolated
# options.
describe 'rebase_hrefs' do
let(:agent) { Agents::InterpolatableAgent.new(name: "test") }

# Input fragment: relative `href`, root-relative `src`, and a two-candidate
# `srcset` on each item.
let(:fragment) { <<HTML }
<ul>
<li>
<a href="downloads/file1"><img src="/images/iconA.png" srcset="/images/iconA.png 1x, /images/iconA@2x.png 2x">file1</a>
</li>
<li>
<a href="downloads/file2"><img src="/images/iconA.png" srcset="/images/iconA.png 1x, /images/iconA@2x.png 2x">file2</a>
</li>
<li>
<a href="downloads/file3"><img src="/images/iconB.png" srcset="/images/iconB.png 1x, /images/iconB@2x.png 2x">file3</a>
</li>
</ul>
HTML

# Expected output: the same fragment with every URI made absolute against
# http://example.com/support/files.html.
let(:replaced_fragment) { <<HTML }
<ul>
<li>
<a href="http://example.com/support/downloads/file1"><img src="http://example.com/images/iconA.png" srcset="http://example.com/images/iconA.png 1x, http://example.com/images/iconA@2x.png 2x">file1</a>
</li>
<li>
<a href="http://example.com/support/downloads/file2"><img src="http://example.com/images/iconA.png" srcset="http://example.com/images/iconA.png 1x, http://example.com/images/iconA@2x.png 2x">file2</a>
</li>
<li>
<a href="http://example.com/support/downloads/file3"><img src="http://example.com/images/iconB.png" srcset="http://example.com/images/iconB.png 1x, http://example.com/images/iconB@2x.png 2x">file3</a>
</li>
</ul>
HTML

# The fragment is supplied via the interpolation context and piped through
# the filter inside a template option.
it 'rebases relative URLs in a fragment' do
agent.interpolation_context['content'] = fragment
agent.options['template'] = "{{ content | rebase_hrefs: 'http://example.com/support/files.html' }}"
expect(agent.interpolated['template']).to eq(replaced_fragment)
end
end
end
30 changes: 28 additions & 2 deletions spec/models/agents/website_agent_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,10 @@
headers: {
'X-Status-Message' => 'OK'
})
stub_request(:any, /xkcd\.com\/index$/).to_return(status: 301,
headers: {
'Location' => 'http://xkcd.com/'
})
@valid_options = {
'name' => "XKCD",
'expected_update_period_in_days' => "2",
Expand Down Expand Up @@ -729,14 +733,36 @@
end

it "should interpolate _response_" do
@valid_options['url'] = 'http://xkcd.com/index'
@valid_options['extract']['response_info'] =
@valid_options['extract']['url'].merge(
'value' => '"{{ "The reponse was " | append:_response_.status | append:" " | append:_response_.headers.X-Status-Message | append:"." }}"'
'value' => '{{ "The reponse from " | append:_response_.url | append:" was " | append:_response_.status | append:" " | append:_response_.headers.X-Status-Message | append:"." | to_xpath }}'
)
@valid_options['extract']['original_url'] =
@valid_options['extract']['url'].merge(
'value' => '{{ _url_ | to_xpath }}'
)
@checker.options = @valid_options
@checker.check
event = Event.last
expect(event.payload['response_info']).to eq('The reponse from http://xkcd.com/ was 200 OK.')
expect(event.payload['original_url']).to eq('http://xkcd.com/index')
end

it "should be formatted by template after extraction" do
@valid_options['template'] = {
'url' => '{{url}}',
'title' => '{{title | upcase}}',
'summary' => '{{title}}: {{hovertext | truncate: 20}}',
}
@checker.options = @valid_options
@checker.check
event = Event.last
expect(event.payload['response_info']).to eq('The reponse was 200 OK.')
expect(event.payload).to eq({
'title' => 'EVOLVING',
'url' => 'http://imgs.xkcd.com/comics/evolving.png',
'summary' => 'Evolving: Biologists play r...',
})
end

describe "XML" do
Expand Down
5 changes: 4 additions & 1 deletion spec/support/shared_examples/agent_controller_concern.rb
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,10 @@
end

it "should configure targets with nested objects" do
agent.control_targets << agents(:bob_data_output_agent)
agent.control_targets = [
agents(:bob_basecamp_agent), # does not support a `template` option, but anyway
agents(:bob_data_output_agent)
]
agent.options['action'] = 'configure'
agent.options['configure_options'] = {
template: {
Expand Down

0 comments on commit 91f096b

Please sign in to comment.