Add a repeat option to extractors
This allows users to include a value that appears only once in the content in every event created from that content.
knu committed Nov 2, 2016
1 parent 71feb75 commit f67da0b
Showing 2 changed files with 69 additions and 18 deletions.
65 changes: 51 additions & 14 deletions app/models/agents/website_agent.rb
@@ -33,7 +33,9 @@ class WebsiteAgent < Agent
To tell the Agent how to parse the content, specify `extract` as a hash with keys naming the extractions and values of hashes.
Note that for all of the formats, whatever you extract MUST have the same number of matches for each extractor. E.g., if you're extracting rows, all extractors must match all rows. For generating CSS selectors, something like [SelectorGadget](http://selectorgadget.com) may be helpful.
Note that for all of the formats, whatever you extract MUST have the same number of matches for each extractor except when it has `repeat` set to true. E.g., if you're extracting rows, all extractors must match all rows. For generating CSS selectors, something like [SelectorGadget](http://selectorgadget.com) may be helpful.
For extractors with `repeat` set to true, the first match will be included in all extracts. This is useful, for example, when you want to include the title of a page in all events created from the page.
# Scraping HTML and XML
@@ -42,7 +44,8 @@ class WebsiteAgent < Agent
"extract": {
"url": { "css": "#comic img", "value": "@src" },
"title": { "css": "#comic img", "value": "@title" },
"body_text": { "css": "div.main", "value": "string(.)" }
"body_text": { "css": "div.main", "value": "string(.)" },
"page_title": { "css": "title", "value": "string(.)" }
}
"@_attr_" is the XPath expression to extract the value of an attribute named _attr_ from a node, and `string(.)` gives a string with all the enclosed text nodes concatenated without entity escaping (such as `&amp;`). To extract the innerHTML, use `./node()`; and to extract the outer HTML, use `.`.
@@ -371,21 +374,24 @@ def handle_data(body, url, existing_payload)
extract_xml(doc)
end

if output.each_value.each_cons(2).any? { |m, n| m.size != n.size }
raise "Got an uneven number of matches for #{interpolated['name']}: #{interpolated['extract'].inspect}"
end

num_tuples = output.each_value.first.size
num_tuples = output.each_value.inject(nil) { |num, value|
case size = value.size
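# A Repeater reports its size as Float::INFINITY: it repeats its value for any number of tuples, so it places no constraint on the count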
when Float::INFINITY
num
when Integer
if num && num != size
raise "Got an uneven number of matches for #{interpolated['name']}: #{interpolated['extract'].inspect}"
end
size
end
} or raise "At least one non-repeat key is required"

old_events = previous_payloads num_tuples

template = options['template'].presence

num_tuples.times do |index|
extracted = {}
interpolated['extract'].each_key do |name|
extracted[name] = output[name][index]
end
num_tuples.times.zip(*output.values) do |index, *values|
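# Pair extractor names with this tuple's values; a Repeater contributes its single captured value to every tuple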
extracted = output.each_key.lazy.zip(values).to_h

result =
if template
@@ -510,8 +516,14 @@ def use_namespaces?

def extract_each(&block)
interpolated['extract'].each_with_object({}) { |(name, extraction_details), output|
values = []
block.call(extraction_details, values)
if boolify(extraction_details['repeat'])
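# Capture only the first value the extractor yields and repeat it for every tuple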
values = Repeater.new { |repeater|
block.call(extraction_details, repeater)
}
else
values = []
block.call(extraction_details, values)
end
log "Values extracted: #{values}"
output[name] = values
}
@@ -599,6 +611,31 @@ def is_positive_integer?(value)
false
end

class Repeater < Enumerator
# Repeater.new { |y|
# # ...
# y << value
# } #=> [value, ...]
def initialize(&block)
@value = nil
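# Report an infinite size so the tuple-count check treats this extractor as matching any number of tuples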
super(Float::INFINITY) { |y|
loop { y << @value }
}
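# Run the block only until the first value is pushed; #<< throws :done to stop it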
catch(:done) {
block.call(self)
}
end

def <<(value)
@value = value
throw :done
end

def to_s
"[#{@value.inspect}, ...]"
end
end
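In isolation, a Repeater behaves like an infinite enumeration of the single value pushed into it (a minimal sketch; the `take` call is only for illustration):

    repeater = Repeater.new { |y| y << "Recent Commits" }
    repeater.size     #=> Float::INFINITY
    repeater.take(3)  #=> ["Recent Commits", "Recent Commits", "Recent Commits"]
    repeater.to_s     #=> '["Recent Commits", ...]'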

# Wraps Faraday::Response
class ResponseDrop < LiquidDroppable::Drop
def headers
22 changes: 18 additions & 4 deletions spec/models/agents/website_agent_spec.rb
@@ -782,6 +782,7 @@
'title' => { 'xpath' => '/feed/entry', 'value' => 'normalize-space(./title)' },
'url' => { 'xpath' => '/feed/entry', 'value' => './link[1]/@href' },
'thumbnail' => { 'xpath' => '/feed/entry', 'value' => './thumbnail/@url' },
'page_title' => { 'xpath' => '/feed/title', 'value' => 'string(.)', 'repeat' => true }
}
}, keep_events_for: 2.days)
@checker.user = users(:bob)
@@ -792,7 +793,10 @@
expect {
@checker.check
}.to change { Event.count }.by(20)
event = Event.last
events = Event.last(20)
expect(events.size).to eq(20)
expect(events.map { |event| event.payload['page_title'] }.uniq).to eq(['Recent Commits to huginn:master'])
event = events.last
expect(event.payload['title']).to eq('Shift to dev group')
expect(event.payload['url']).to eq('https://github.com/cantino/huginn/commit/d465158f77dcd9078697e6167b50abbfdfa8b1af')
expect(event.payload['thumbnail']).to eq('https://avatars3.githubusercontent.com/u/365751?s=30')
@@ -935,6 +939,7 @@
it "can handle arrays" do
json = {
'response' => {
'status' => 'ok',
'data' => [
{'title' => "first", 'version' => 2},
{'title' => "second", 'version' => 2.5}
@@ -949,8 +954,9 @@
'url' => "http://json-site.com",
'mode' => 'on_change',
'extract' => {
:title => {'path' => "response.data[*].title"},
:version => {'path' => "response.data[*].version"}
'title' => { 'path' => "response.data[*].title" },
'version' => { 'path' => "response.data[*].version" },
'status' => { 'path' => "response.status", 'repeat' => true },
}
}
checker = Agents::WebsiteAgent.new(:name => "Weather Site", :options => site)
@@ -962,9 +968,11 @@
}.to change { Event.count }.by(2)

(event2, event1) = Event.last(2)
expect(event1.payload['status']).to eq('ok')
expect(event1.payload['version']).to eq(2.5)
expect(event1.payload['title']).to eq("second")

expect(event2.payload['status']).to eq('ok')
expect(event2.payload['version']).to eq(2)
expect(event2.payload['title']).to eq("first")
end
@@ -998,6 +1006,7 @@
describe "text parsing" do
before do
stub_request(:any, /text-site/).to_return(body: <<-EOF, status: 200)
VERSION 1
water: wet
fire: hot
EOF
@@ -1008,6 +1017,7 @@
'url' => 'http://text-site.com',
'mode' => 'on_change',
'extract' => {
'version' => { 'regexp' => '^VERSION (.+)$', index: 1, repeat: true },
'word' => { 'regexp' => '^(.+?): (.+)$', index: 1 },
'property' => { 'regexp' => '^(.+?): (.+)$', index: '2' },
}
@@ -1018,7 +1028,7 @@
end

it "works with regexp with named capture" do
@checker.options = @checker.options.merge('extract' => {
@checker.options = @checker.options.deep_merge('extract' => {
'word' => { 'regexp' => '^(?<word>.+?): (?<property>.+)$', index: 'word' },
'property' => { 'regexp' => '^(?<word>.+?): (?<property>.+)$', index: 'property' },
})
@@ -1028,8 +1038,10 @@
}.to change { Event.count }.by(2)

event1, event2 = Event.last(2)
expect(event1.payload['version']).to eq('1')
expect(event1.payload['word']).to eq('water')
expect(event1.payload['property']).to eq('wet')
expect(event2.payload['version']).to eq('1')
expect(event2.payload['word']).to eq('fire')
expect(event2.payload['property']).to eq('hot')
end
@@ -1040,8 +1052,10 @@
}.to change { Event.count }.by(2)

event1, event2 = Event.last(2)
expect(event1.payload['version']).to eq('1')
expect(event1.payload['word']).to eq('water')
expect(event1.payload['property']).to eq('wet')
expect(event2.payload['version']).to eq('1')
expect(event2.payload['word']).to eq('fire')
expect(event2.payload['property']).to eq('hot')
end
