Add a repeat option to extractors
This allows users to include a value that appears only once in the content in every event created from that content.
knu committed Nov 2, 2016
1 parent 71feb75 commit f67da0b
Showing 2 changed files with 69 additions and 18 deletions.
65 changes: 51 additions & 14 deletions app/models/agents/website_agent.rb
@@ -33,7 +33,9 @@ class WebsiteAgent < Agent
To tell the Agent how to parse the content, specify `extract` as a hash with keys naming the extractions and values of hashes.
Note that for all of the formats, whatever you extract MUST have the same number of matches for each extractor. E.g., if you're extracting rows, all extractors must match all rows. For generating CSS selectors, something like [SelectorGadget](http://selectorgadget.com) may be helpful.
Note that for all of the formats, whatever you extract MUST have the same number of matches for each extractor except when it has `repeat` set to true. E.g., if you're extracting rows, all extractors must match all rows. For generating CSS selectors, something like [SelectorGadget](http://selectorgadget.com) may be helpful.
For extractors with `repeat` set to true, the first match will be included in all extracts. This is useful, for example, when you want to include the title of a page in all events created from the page.
# Scraping HTML and XML
@@ -42,7 +44,8 @@ class WebsiteAgent < Agent
"extract": {
"url": { "css": "#comic img", "value": "@src" },
"title": { "css": "#comic img", "value": "@title" },
"body_text": { "css": "div.main", "value": "string(.)" }
"body_text": { "css": "div.main", "value": "string(.)" },
"page_title": { "css": "title", "value": "string(.)" }
}
"@_attr_" is the XPath expression to extract the value of an attribute named _attr_ from a node, and `string(.)` gives a string with all the enclosed text nodes concatenated without entity escaping (such as `&amp;`). To extract the innerHTML, use `./node()`; and to extract the outer HTML, use `.`.
@@ -371,21 +374,24 @@ def handle_data(body, url, existing_payload)
extract_xml(doc)
end

if output.each_value.each_cons(2).any? { |m, n| m.size != n.size }
raise "Got an uneven number of matches for #{interpolated['name']}: #{interpolated['extract'].inspect}"
end

num_tuples = output.each_value.first.size
num_tuples = output.each_value.inject(nil) { |num, value|
case size = value.size
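# A Repeater reports its size as Float::INFINITY: it repeats its value for any number of tuples, so it places no constraint on the count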
when Float::INFINITY
num
when Integer
if num && num != size
raise "Got an uneven number of matches for #{interpolated['name']}: #{interpolated['extract'].inspect}"
end
size
end
} or raise "At least one non-repeat key is required"

old_events = previous_payloads num_tuples

template = options['template'].presence

num_tuples.times do |index|
extracted = {}
interpolated['extract'].each_key do |name|
extracted[name] = output[name][index]
end
num_tuples.times.zip(*output.values) do |index, *values|
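# Pair extractor names with this tuple's values; a Repeater contributes its single captured value to every tuple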
extracted = output.each_key.lazy.zip(values).to_h

result =
if template
@@ -510,8 +516,14 @@ def use_namespaces?

def extract_each(&block)
interpolated['extract'].each_with_object({}) { |(name, extraction_details), output|
values = []
block.call(extraction_details, values)
if boolify(extraction_details['repeat'])
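# Capture only the first value the extractor yields and repeat it for every tuple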
values = Repeater.new { |repeater|
block.call(extraction_details, repeater)
}
else
values = []
block.call(extraction_details, values)
end
log "Values extracted: #{values}"
output[name] = values
}
@@ -599,6 +611,31 @@ def is_positive_integer?(value)
false
end

class Repeater < Enumerator
# Repeater.new { |y|
# # ...
# y << value
# } #=> [value, ...]
def initialize(&block)
@value = nil
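# Report an infinite size so the tuple-count check treats this extractor as matching any number of tuples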
super(Float::INFINITY) { |y|
loop { y << @value }
}
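# Run the block only until the first value is pushed; #<< throws :done to stop it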
catch(:done) {
block.call(self)
}
end

def <<(value)
@value = value
throw :done
end

def to_s
"[#{@value.inspect}, ...]"
end
end
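In isolation, a Repeater behaves like an infinite enumeration of the single value pushed into it (a minimal sketch; the `take` call is only for illustration):

    repeater = Repeater.new { |y| y << "Recent Commits" }
    repeater.size     #=> Float::INFINITY
    repeater.take(3)  #=> ["Recent Commits", "Recent Commits", "Recent Commits"]
    repeater.to_s     #=> '["Recent Commits", ...]'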

# Wraps Faraday::Response
class ResponseDrop < LiquidDroppable::Drop
def headers
22 changes: 18 additions & 4 deletions spec/models/agents/website_agent_spec.rb
@@ -782,6 +782,7 @@
'title' => { 'xpath' => '/feed/entry', 'value' => 'normalize-space(./title)' },
'url' => { 'xpath' => '/feed/entry', 'value' => './link[1]/@href' },
'thumbnail' => { 'xpath' => '/feed/entry', 'value' => './thumbnail/@url' },
'page_title' => { 'xpath' => '/feed/title', 'value' => 'string(.)', 'repeat' => true }
}
}, keep_events_for: 2.days)
@checker.user = users(:bob)
@@ -792,7 +793,10 @@
expect {
@checker.check
}.to change { Event.count }.by(20)
event = Event.last
events = Event.last(20)
expect(events.size).to eq(20)
expect(events.map { |event| event.payload['page_title'] }.uniq).to eq(['Recent Commits to huginn:master'])
event = events.last
expect(event.payload['title']).to eq('Shift to dev group')
expect(event.payload['url']).to eq('https://github.com/cantino/huginn/commit/d465158f77dcd9078697e6167b50abbfdfa8b1af')
expect(event.payload['thumbnail']).to eq('https://avatars3.githubusercontent.com/u/365751?s=30')
@@ -935,6 +939,7 @@
it "can handle arrays" do
json = {
'response' => {
'status' => 'ok',
'data' => [
{'title' => "first", 'version' => 2},
{'title' => "second", 'version' => 2.5}
@@ -949,8 +954,9 @@
'url' => "http://json-site.com",
'mode' => 'on_change',
'extract' => {
:title => {'path' => "response.data[*].title"},
:version => {'path' => "response.data[*].version"}
'title' => { 'path' => "response.data[*].title" },
'version' => { 'path' => "response.data[*].version" },
'status' => { 'path' => "response.status", 'repeat' => true },
}
}
checker = Agents::WebsiteAgent.new(:name => "Weather Site", :options => site)
@@ -962,9 +968,11 @@
}.to change { Event.count }.by(2)

(event2, event1) = Event.last(2)
expect(event1.payload['status']).to eq('ok')
expect(event1.payload['version']).to eq(2.5)
expect(event1.payload['title']).to eq("second")

expect(event2.payload['status']).to eq('ok')
expect(event2.payload['version']).to eq(2)
expect(event2.payload['title']).to eq("first")
end
@@ -998,6 +1006,7 @@
describe "text parsing" do
before do
stub_request(:any, /text-site/).to_return(body: <<-EOF, status: 200)
VERSION 1
water: wet
fire: hot
EOF
@@ -1008,6 +1017,7 @@
'url' => 'http://text-site.com',
'mode' => 'on_change',
'extract' => {
'version' => { 'regexp' => '^VERSION (.+)$', index: 1, repeat: true },
'word' => { 'regexp' => '^(.+?): (.+)$', index: 1 },
'property' => { 'regexp' => '^(.+?): (.+)$', index: '2' },
}
@@ -1018,7 +1028,7 @@
end

it "works with regexp with named capture" do
@checker.options = @checker.options.merge('extract' => {
@checker.options = @checker.options.deep_merge('extract' => {
'word' => { 'regexp' => '^(?<word>.+?): (?<property>.+)$', index: 'word' },
'property' => { 'regexp' => '^(?<word>.+?): (?<property>.+)$', index: 'property' },
})
@@ -1028,8 +1038,10 @@
}.to change { Event.count }.by(2)

event1, event2 = Event.last(2)
expect(event1.payload['version']).to eq('1')
expect(event1.payload['word']).to eq('water')
expect(event1.payload['property']).to eq('wet')
expect(event2.payload['version']).to eq('1')
expect(event2.payload['word']).to eq('fire')
expect(event2.payload['property']).to eq('hot')
end
@@ -1040,8 +1052,10 @@
}.to change { Event.count }.by(2)

event1, event2 = Event.last(2)
expect(event1.payload['version']).to eq('1')
expect(event1.payload['word']).to eq('water')
expect(event1.payload['property']).to eq('wet')
expect(event2.payload['version']).to eq('1')
expect(event2.payload['word']).to eq('fire')
expect(event2.payload['property']).to eq('hot')
end
