Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Alex ikanow website deduplication improvement #164

Merged
merged 6 commits into from
Feb 13, 2014
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
92 changes: 72 additions & 20 deletions app/models/agents/website_agent.rb
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,11 @@ module Agents
class WebsiteAgent < Agent
cannot_receive_events!

default_schedule "every_12h"

UNIQUENESS_LOOK_BACK = 200
UNIQUENESS_FACTOR = 3

description <<-MD
The WebsiteAgent scrapes a website, XML document, or JSON feed and creates Events based on the results.

Expand Down Expand Up @@ -34,17 +39,15 @@ class WebsiteAgent < Agent

Can be configured to use HTTP basic auth by including the `basic_auth` parameter with `username:password`.

Set `expected_update_period_in_days` to the maximum amount of time that you'd expect to pass between Events being created by this Agent.
Set `expected_update_period_in_days` to the maximum amount of time that you'd expect to pass between Events being created by this Agent. This is only used to set the "working" status.

Set `uniqueness_look_back` to limit the number of events checked for uniqueness (typically for performance). This defaults to the larger of #{UNIQUENESS_LOOK_BACK} or #{UNIQUENESS_FACTOR}x the number of detected received results.
MD

event_description do
"Events will have the fields you specified. Your options look like:\n\n #{Utils.pretty_print options['extract']}"
end

default_schedule "every_12h"

UNIQUENESS_LOOK_BACK = 30

def working?
event_created_within?(options['expected_update_period_in_days']) && !recent_error_logs?
end
Expand All @@ -54,7 +57,7 @@ def default_options
'expected_update_period_in_days' => "2",
'url' => "http://xkcd.com",
'type' => "html",
'mode' => :on_change,
'mode' => "on_change",
'extract' => {
'url' => {'css' => "#comic img", 'attr' => "src"},
'title' => {'css' => "#comic img", 'attr' => "title"}
Expand All @@ -63,31 +66,44 @@ def default_options
end

def validate_options
# Check for required fields
errors.add(:base, "url and expected_update_period_in_days are required") unless options['expected_update_period_in_days'].present? && options['url'].present?
if !options['extract'].present? && extraction_type != "json"
errors.add(:base, "extract is required for all types except json")
end

# Check for optional fields
if options['mode'].present?
errors.add(:base, "mode must be set to on_change or all") unless %w[on_change all].include?(options['mode'])
end

if options['expected_update_period_in_days'].present?
errors.add(:base, "Invalid expected_update_period_in_days format") unless is_positive_integer?(options['expected_update_period_in_days'])
end

if options['uniqueness_look_back'].present?
errors.add(:base, "Invalid uniqueness_look_back format") unless is_positive_integer?(options['uniqueness_look_back'])
end
end

def check
hydra = Typhoeus::Hydra.new
log "Fetching #{options['url']}"
request_opts = {:followlocation => true}
if options['basic_auth'].present?
request_opts[:userpwd] = options['basic_auth']
end
request_opts = { :followlocation => true }
request_opts[:userpwd] = options['basic_auth'] if options['basic_auth'].present?
request = Typhoeus::Request.new(options['url'], request_opts)

request.on_failure do |response|
error "Failed: #{response.inspect}"
end

request.on_success do |response|
doc = parse(response.body)

if extract_full_json?
result = doc
if store_payload? result
log "Storing new result for '#{name}': #{result.inspect}"
create_event :payload => result
if store_payload!(previous_payloads(1), doc)
log "Storing new result for '#{name}': #{doc.inspect}"
create_event :payload => doc
end
else
output = {}
Expand Down Expand Up @@ -116,6 +132,7 @@ def check
return
end

old_events = previous_payloads num_unique_lengths.first
num_unique_lengths.first.times do |index|
result = {}
options['extract'].keys.each do |name|
Expand All @@ -125,7 +142,7 @@ def check
end
end

if store_payload? result
if store_payload!(old_events, result)
log "Storing new parsed result for '#{name}': #{result.inspect}"
create_event :payload => result
end
Expand All @@ -138,16 +155,43 @@ def check

private

def store_payload? result
!options['mode'] || options['mode'].to_s == "all" || (options['mode'].to_s == "on_change" && !previous_payloads.include?(result.to_json))
# This method returns true if the result should be stored as a new event.
# If mode is set to 'on_change', this method may return false and update an existing
# event to expire further in the future.
def store_payload!(old_events, result)
if !options['mode'].present?
return true
elsif options['mode'].to_s == "all"
return true
elsif options['mode'].to_s == "on_change"
result_json = result.to_json
old_events.each do |old_event|
if old_event.payload.to_json == result_json
old_event.expires_at = new_event_expiration_date
old_event.save!
return false
end
end
return true
end
raise "Illegal options[mode]: " + options['mode'].to_s
end

def previous_payloads
events.order("id desc").limit(UNIQUENESS_LOOK_BACK).pluck(:payload).map(&:to_json) if options['mode'].to_s == "on_change"
def previous_payloads(num_events)
if options['uniqueness_look_back'].present?
look_back = options['uniqueness_look_back'].to_i
else
# Larger of UNIQUENESS_FACTOR * num_events and UNIQUENESS_LOOK_BACK
look_back = UNIQUENESS_FACTOR * num_events
if look_back < UNIQUENESS_LOOK_BACK
look_back = UNIQUENESS_LOOK_BACK
end
end
events.order("id desc").limit(look_back) if options['mode'].present? && options['mode'].to_s == "on_change"
end

def extract_full_json?
(!options['extract'].present? && extraction_type == "json")
!options['extract'].present? && extraction_type == "json"
end

def extraction_type
Expand All @@ -174,5 +218,13 @@ def parse(data)
raise "Unknown extraction type #{extraction_type}"
end
end

def is_positive_integer?(value)
begin
Integer(value) >= 0
rescue
false
end
end
end
end
49 changes: 44 additions & 5 deletions spec/models/agents/website_agent_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -15,15 +15,31 @@
'title' => {'css' => "#comic img", 'attr' => "title"}
}
}
@checker = Agents::WebsiteAgent.new(:name => "xkcd", :options => @site)
@checker = Agents::WebsiteAgent.new(:name => "xkcd", :options => @site, :keep_events_for => 2)
@checker.user = users(:bob)
@checker.save!
end

describe "#check" do
it "should check for changes" do

it "should validate the integer fields" do
@checker.options['expected_update_period_in_days'] = "nonsense"
lambda { @checker.save! }.should raise_error;
@checker.options['expected_update_period_in_days'] = "2"
@checker.options['uniqueness_look_back'] = "nonsense"
lambda { @checker.save! }.should raise_error;
@checker.options['mode'] = "nonsense"
lambda { @checker.save! }.should raise_error;
@checker.options = @site
end

it "should check for changes (and update Event.expires_at)" do
lambda { @checker.check }.should change { Event.count }.by(1)
event = Event.last
sleep 2
lambda { @checker.check }.should_not change { Event.count }
update_event = Event.last
update_event.expires_at.should_not == event.expires_at
end

it "should always save events when in :all mode" do
Expand All @@ -35,6 +51,30 @@
}.should change { Event.count }.by(2)
end

it "should take uniqueness_look_back into account during deduplication" do
@site['mode'] = 'all'
@checker.options = @site
@checker.check
@checker.check
event = Event.last
event.payload = "{}"
event.save

lambda {
@site['mode'] = 'on_change'
@site['uniqueness_look_back'] = 2
@checker.options = @site
@checker.check
}.should_not change { Event.count }

lambda {
@site['mode'] = 'on_change'
@site['uniqueness_look_back'] = 1
@checker.options = @site
@checker.check
}.should change { Event.count }.by(1)
end

it "should log an error if the number of results for a set of extraction patterns differs" do
@site['extract']['url']['css'] = "div"
@checker.options = @site
Expand Down Expand Up @@ -79,7 +119,7 @@
'expected_update_period_in_days' => 2,
'type' => "html",
'url' => "http://xkcd.com",
'mode' => :on_change,
'mode' => "on_change",
'extract' => {
'url' => {'css' => "#topLeft a", 'attr' => "href"},
'title' => {'css' => "#topLeft a", 'text' => "true"}
Expand Down Expand Up @@ -216,5 +256,4 @@
end
end
end

end
end