Skip to content
Browse files

#154 Improvements to website deduplication logic

as discussed with @cantino
  • Loading branch information...
1 parent 7b38df6 commit b1898cc7ff92ab0b9c3c9e06a8751e55f4b0dad0 @Alex-Ikanow Alex-Ikanow committed Feb 6, 2014
Showing with 50 additions and 6 deletions.
  1. +37 −5 app/models/agents/website_agent.rb
  2. +13 −1 spec/models/agents/website_agent_spec.rb
View
42 app/models/agents/website_agent.rb
@@ -36,7 +36,7 @@ class WebsiteAgent < Agent
Set `expected_update_period_in_days` to the maximum amount of time that you'd expect to pass between Events being created by this Agent (only used to set the "working" status).
- Set `uniqueness_look_back` (defaults to 10000) to limit the number of events checked for uniqueness (typically for performance).
+ Set `uniqueness_look_back` (defaults to the larger of 200 and 3x the number of received events) to limit the number of events checked for uniqueness (typically for performance).
MD
event_description do
@@ -45,7 +45,8 @@ class WebsiteAgent < Agent
default_schedule "every_12h"
- UNIQUENESS_LOOK_BACK = 10000
+ UNIQUENESS_LOOK_BACK = 200
+ UNIQUENESS_FACTOR = 3
def working?
event_created_within?(options['expected_update_period_in_days']) && !recent_error_logs?
@@ -65,10 +66,32 @@ def default_options
end
def validate_options
+ # Check required fields are present
errors.add(:base, "url and expected_update_period_in_days are required") unless options['expected_update_period_in_days'].present? && options['url'].present?
if !options['extract'].present? && extraction_type != "json"
errors.add(:base, "extract is required for all types except json")
end
+ # Check options:
+ if options['mode'].present?
+ if options['mode'] != "on_change" && options['mode'] != "all"
+ errors.add(:base, "mode should be all or on_change")
+ end
+ end
+ # Check integer variables:
+ if options['expected_update_period_in_days'].present?
+ begin
+ Integer(options['expected_update_period_in_days'])
+ rescue
+ errors.add(:base, "Invalid expected_update_period_in_days format")
+ end
+ end
+ if options['uniqueness_look_back'].present?
+ begin
+ Integer(options['uniqueness_look_back'])
+ rescue
+ errors.add(:base, "Invalid uniqueness_look_back format")
+ end
+ end
end
def check
@@ -84,9 +107,9 @@ def check
end
request.on_success do |response|
doc = parse(response.body)
- old_events = previous_payloads
if extract_full_json?
+ old_events = previous_payloads 1
result = doc
if store_payload? old_events, result
log "Storing new result for '#{name}': #{result.inspect}"
@@ -119,6 +142,7 @@ def check
return
end
+ old_events = previous_payloads num_unique_lengths.first
num_unique_lengths.first.times do |index|
result = {}
options['extract'].keys.each do |name|
@@ -159,8 +183,16 @@ def store_payload?(old_events, result)
raise "Illegal options[mode]: " + options['mode'].to_s
end
- def previous_payloads
- look_back = options['uniqueness_look_back'] ? options['uniqueness_look_back'].to_i : UNIQUENESS_LOOK_BACK
+ def previous_payloads(num_events)
+ if options['uniqueness_look_back'].present?
+ look_back = options['uniqueness_look_back'].to_i
+ else
+ # Larger of UNIQUENESS_FACTOR*num_events and UNIQUENESS_LOOK_BACK
+ look_back = UNIQUENESS_FACTOR*num_events
+ if look_back < UNIQUENESS_LOOK_BACK
+ look_back = UNIQUENESS_LOOK_BACK
+ end
+ end
events.order("id desc").limit(look_back) if options['mode'].to_s == "on_change"
end
View
14 spec/models/agents/website_agent_spec.rb
@@ -21,6 +21,18 @@
end
describe "#check" do
+
+ it "should validate the integer fields" do
+ @checker.options['expected_update_period_in_days'] = "nonsense"
+ lambda { @checker.save! }.should raise_error;
+ @checker.options['expected_update_period_in_days'] = "2"
+ @checker.options['uniqueness_look_back'] = "nonsense"
+ lambda { @checker.save! }.should raise_error;
+ @checker.options['mode'] = "nonsense"
+ lambda { @checker.save! }.should raise_error;
+ @checker.options = @site
+ end
+
it "should check for changes (and update Event.expires_at)" do
lambda { @checker.check }.should change { Event.count }.by(1)
event = Event.last
@@ -107,7 +119,7 @@
'expected_update_period_in_days' => 2,
'type' => "html",
'url' => "http://xkcd.com",
- 'mode' => :on_change,
+ 'mode' => "on_change",
'extract' => {
'url' => {'css' => "#topLeft a", 'attr' => "href"},
'title' => {'css' => "#topLeft a", 'text' => "true"}

0 comments on commit b1898cc

Please sign in to comment.
Something went wrong with that request. Please try again.