Permalink
Browse files

Merge pull request #144 from alphagov/anonymous-feedback-deduplication

Anonymous feedback de-duplication batch processing
  • Loading branch information...
2 parents 84366d7 + 80ac815 commit 8685615340d8b4c4bf44df1cff9324648c45b581 @alext alext committed Mar 4, 2014
@@ -0,0 +1,23 @@
+require 'support/requests/anonymous/anonymous_contact'
+
+class DeduplicationWorker
+ include Sidekiq::Worker
+ include Support::Requests::Anonymous
+
+ sidekiq_retry_in do |count|
+ 60 * 60 * 2 # every 2 hours
+ end
+
+ def perform(year, month, day)
+ logger.info("Deduping anonymous feedback for #{year}-#{month}-#{day}")
+ day = Date.new(year, month, day)
+
+ AnonymousContact.deduplicate_contacts_created_between(day.beginning_of_day..day.end_of_day)
+ end
+
+ def self.run
+ yesterday = Date.yesterday
+ perform_async(yesterday.year, yesterday.month, yesterday.day)
+ logger.info("Deduplication scheduled for anonymous feedback created on #{yesterday}")
+ end
+end
@@ -1,5 +1,6 @@
require 'support/requests/requester'
require 'support/requests/anonymous/field_which_may_contain_personal_information'
+require 'support/requests/anonymous/duplicate_detector'
module Support
module Requests
@@ -40,6 +41,23 @@ def self.find_all_starting_with_path(path)
select { |pr| pr.path && pr.path.start_with?(path) }
end
+ def self.deduplicate_contacts_created_between(interval)
+ contacts = where(created_at: interval).order("created_at asc")
+ duplicate_detector = DuplicateDetector.new
+ contacts.each do |contact|
+ if duplicate_detector.duplicate?(contact)
+ contact.mark_as_duplicate
+ contact.save!
+ end
+ end
+ end
+
+ def mark_as_duplicate
+ self.is_actionable = false
+ self.reason_why_not_actionable = "duplicate"
+ self
+ end
+
private
def detect_personal_information
self.personal_information_status ||= personal_info_present? ? "suspected" : "absent"
@@ -0,0 +1,46 @@
+require 'support/requests/anonymous/anonymous_contact'
+
+module Support
+ module Requests
+ module Anonymous
+ # This class uses a sliding window to identify duplicates within a certain period of time
+ # This assumes that data is being fed into the class in chronological order
+ class DuplicateDetector
+ def initialize
+ @comparator = AnonymousFeedbackComparator.new
+ @unique_records = []
+ end
+
+ def duplicate?(record)
+ is_dupe = @unique_records.any? {|saved_record| @comparator.same?(saved_record, record)}
+ unless is_dupe
+ @unique_records.select! {|r| @comparator.created_within_a_short_interval?(r, record)}
+ @unique_records << record
+ end
+ is_dupe
+ end
+ end
+
+ class AnonymousFeedbackComparator
+ DUPLICATION_INTERVAL_IN_SECONDS = 5
+
+ def initialize
+ @fields_to_compare = AnonymousContact.attribute_names - ["id", "created_at", "updated_at"]
+ end
+
+ def same?(r1, r2)
+ fields_same?(r1, r2) && created_within_a_short_interval?(r1, r2)
+ end
+
+ def created_within_a_short_interval?(r1, r2)
+ (r1["created_at"] - r2["created_at"]).abs < DUPLICATION_INTERVAL_IN_SECONDS
+ end
+
+ private
+ def fields_same?(r1, r2)
+ @fields_to_compare.all? {|field| r1[field] == r2[field] }
+ end
+ end
+ end
+ end
+end
@@ -0,0 +1,12 @@
+desc "Trigger deduplication for yesterday's anonymous feedback"
+task :anonymous_feedback_deduplication => :environment do
+ require File.join(Rails.root, 'app', 'workers', 'deduplication_worker')
+ require 'volatile_lock'
+
+ if VolatileLock.new('support:anonymous_feedback_deduplication').obtained?
+ DeduplicationWorker.run
+ puts "DeduplicationWorker invoked"
+ else
+ puts "DeduplicationWorker: skipping, couldn't obtain lock (probably the task has already run on another node)"
+ end
+end
@@ -0,0 +1,75 @@
+require 'test_helper'
+require 'time'
+require 'json'
+require 'deduplication_worker'
+
+class DeduplicationTest < ActionDispatch::IntegrationTest
+ def setup
+ User.first
+ rescue
+ User.create!(
+ "uid" => 'dummy-user',
+ "name" => 'Ms Example',
+ "email" => 'example@example.com',
+ "permissions" => ['single_points_of_contact', 'feedex', 'api_users']
+ )
+ end
+
+ test "duplicate service feedback is flagged and removed from results" do
+ # create service feedback
+
+ Timecop.travel Time.parse("2013-01-15 12:00:00")
+
+ create_service_feedback_with(
+ service_satisfaction_rating: 5,
+ details: "this service is great",
+ slug: "some-tx",
+ url: "https://www.gov.uk/done/some-tx"
+ )
+
+ create_service_feedback_with(
+ service_satisfaction_rating: 3,
+ details: "this service is meh",
+ slug: "some-tx",
+ url: "https://www.gov.uk/done/some-tx"
+ )
+
+ Timecop.travel Time.parse("2013-01-15 12:00:01")
+
+ create_service_feedback_with(
+ service_satisfaction_rating: 3,
+ details: "this service is meh",
+ slug: "some-tx",
+ url: "https://www.gov.uk/done/some-tx"
+ )
+
+ get '/anonymous_feedback?path=/done/some-tx', "", {"CONTENT_TYPE" => 'application/json', 'HTTP_ACCEPT' => 'application/json'}
+
+ results = JSON.parse(response.body)
+ assert_equal 3, results.size
+
+ # deduplicate
+ Timecop.travel Time.parse("2013-01-16 00:30:00")
+ DeduplicationWorker.run
+
+ # rerun query, one piece of feedback should now be suppressed
+ get '/anonymous_feedback?path=/done/some-tx', "", {"CONTENT_TYPE" => 'application/json', 'HTTP_ACCEPT' => 'application/json'}
+
+ results = JSON.parse(response.body)
+ assert_equal 2, results.size
+ assert_equal ["this service is great", "this service is meh"], results.map {|r| r["details"]}.sort
+ end
+
+ def teardown
+ Timecop.return
+ end
+
+ def create_service_feedback_with(options)
+ defaults = {
+ service_satisfaction_rating: 5,
+ details: "this service is great",
+ javascript_enabled: true,
+ }
+ Support::Requests::Anonymous::ServiceFeedback.create!(defaults.merge(options))
+ end
+end
@@ -53,6 +53,14 @@ def path_for(url)
refute new_contact(personal_information_status: "abcde").valid?
end
+ should "mark duplicates as non-actionable" do
+ contact = new_contact
+ contact.mark_as_duplicate
+
+ refute contact.is_actionable
+ assert_equal "duplicate", contact.reason_why_not_actionable
+ end
+
context "#find_all_starting_with_path" do
should "find urls beginning with the given path" do
a = contact(url: "https://www.gov.uk/some-calculator/y/abc")
@@ -100,6 +108,24 @@ def teardown
TestContact.delete_all
end
end
+
+ context "#deduplicate_contacts_created_between" do
+ should "update contacts created in the given interval as they are marked as dupes" do
+ interval = mock("some time interval")
+ original_contact = mock("anonymous contact")
+ duplicate = mock("anonymous contact")
+ contacts = [ original_contact, duplicate ]
+
+ AnonymousContact.expects(:where).with(created_at: interval).returns(stub(order: contacts))
+ DuplicateDetector.any_instance.stubs(:duplicate?).with(original_contact).returns(false)
+ DuplicateDetector.any_instance.stubs(:duplicate?).with(duplicate).returns(true)
+
+ duplicate.expects(:mark_as_duplicate)
+ duplicate.expects(:save!)
+
+ AnonymousContact.deduplicate_contacts_created_between(interval)
+ end
+ end
end
end
end
@@ -0,0 +1,45 @@
+require 'test_helper'
+require 'support/requests/anonymous/duplicate_detector'
+
+module Support
+ module Requests
+ module Anonymous
+ class DuplicateDetectorTest < Test::Unit::TestCase
+ should "identify duplicate records" do
+ r1 = stub("record 1")
+ r2 = stub("record 2")
+ r3 = stub("record 3")
+
+ AnonymousFeedbackComparator.any_instance.stubs(:same?).with(r1, r2).returns(false)
+ AnonymousFeedbackComparator.any_instance.stubs(:same?).with(r1, r3).returns(true)
+ AnonymousFeedbackComparator.any_instance.stubs(:same?).with(r2, r3).returns(false)
+ AnonymousFeedbackComparator.any_instance.stubs(:created_within_a_short_interval?).returns(true)
+
+ detector = DuplicateDetector.new
+
+ refute detector.duplicate?(r1)
+ refute detector.duplicate?(r2)
+ assert detector.duplicate?(r3)
+ end
+
+ context "the comparator" do
+ should "detect if two identical pieces of feedback are created within a short time of each other" do
+ comparator = AnonymousFeedbackComparator.new
+ defaults = { "service_satisfaction_rating" => 5, "details" => "it's ace" }
+
+ time = Time.now
+ r1 = defaults.merge("created_at" => time)
+ r2 = r1.clone
+ r3 = defaults.merge("created_at" => time + 4.seconds)
+ r4 = defaults.merge("created_at" => time + 8.seconds)
+
+ assert comparator.same?(r1, r2)
+ assert comparator.same?(r1, r3)
+ refute comparator.same?(r1, r4)
+ assert comparator.same?(r3, r4)
+ end
+ end
+ end
+ end
+ end
+end
@@ -0,0 +1,15 @@
+require 'test_helper'
+
+class DeduplicationWorkerTest < ActiveSupport::TestCase
+ include Support::Requests::Anonymous
+
+ should "remove duplicate anonymous feedback for yesterday" do
+ Date.stubs(:yesterday).returns(Date.new(2013,2,10))
+
+ AnonymousContact.
+ expects(:deduplicate_contacts_created_between).
+ with(Date.new(2013,2,10).beginning_of_day..Date.new(2013,2,10).end_of_day)
+
+ DeduplicationWorker.run
+ end
+end

0 comments on commit 8685615

Please sign in to comment.