Skip to content

Commit

Permalink
Add script for PDF attachment report
Browse files Browse the repository at this point in the history
This script produces a CSV of counts of published PDF attachments per
organisation owning the associated manual. This was produced for the
Content Operating Model team.
  • Loading branch information
neilvanbeinum committed Mar 8, 2017
1 parent 1adcd46 commit 95f3264
Showing 1 changed file with 84 additions and 0 deletions.
84 changes: 84 additions & 0 deletions bin/content_model_pdf_report
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
#!/usr/bin/env ruby

require File.expand_path("../../config/environment", __FILE__)
require 'csv'

# Edition states that count as "has been published" for this report.
POST_PUBLICATION_STATES = %w(published archived).freeze

# Reporting windows, overridable from the environment:
#   FIRST_PERIOD_START_DATE - start of the first window (default 2016-01-01)
#   LAST_TIME_PERIOD_DAYS   - length in days of the trailing window (default 30)
#
# ENV values are always Strings, so both are coerced explicitly; otherwise an
# override would break `.days` below and the date comparisons later on.
# Date.parse and Integer() both raise ArgumentError on malformed input, so a
# bad override fails fast instead of producing a misleading report.
first_period_start_date = Date.parse(ENV.fetch('FIRST_PERIOD_START_DATE', '2016-01-01'))
last_time_period_days = Integer(ENV.fetch('LAST_TIME_PERIOD_DAYS', '30'))
last_time_period_start_date = last_time_period_days.days.ago

# The same collection of manuals feeds both the organisation list here and the
# counting loop further down.
manual_records = ManualRecord.all
unique_owning_organisation_slugs = manual_records.map(&:organisation_slug).uniq

# One zero-initialised counter per organisation slug for each column of the report:
# all-time total, since the first period start date, and within the trailing window.
organisation_published_pdfs_total_counts_hash =
  unique_owning_organisation_slugs.each_with_object({}) { |slug, counts| counts[slug] = 0 }
organisation_published_pdfs_since_first_period_counts_hash =
  unique_owning_organisation_slugs.each_with_object({}) { |slug, counts| counts[slug] = 0 }
organisation_published_pdfs_since_second_period_counts_hash =
  unique_owning_organisation_slugs.each_with_object({}) { |slug, counts| counts[slug] = 0 }

# True when the edition's timestamp is on or after +date+.
# exported_at is used when it is set; otherwise updated_at stands in for it.
def document_published_after_date?(document_edition, date)
  publication_time = document_edition.exported_at
  publication_time = document_edition.updated_at unless publication_time
  publication_time >= date
end

# True when the edition's state is not one of POST_PUBLICATION_STATES.
def document_edition_never_published?(document_edition)
  current_state = document_edition.state
  POST_PUBLICATION_STATES.none? { |published_state| published_state == current_state }
end

# Collects the document ids of every edition of +manual+ into a single
# de-duplicated array, in first-seen order.
def all_unique_document_ids_for_manual(manual)
  ids_per_edition = manual.editions.map { |edition| edition.document_ids }
  ids_per_edition.flatten.uniq
end

# Tally, per owning organisation, the unique PDF attachments found on the
# post-publication document editions of each manual: overall, since the first
# period start date, and within the trailing window.
manual_records.to_a.each do |manual|
  next unless manual.has_ever_been_published?

  # File ids already counted for this manual, so an attachment that appears on
  # several document editions is only counted once (at its earliest edition).
  unique_pdf_attachment_file_ids_for_manual = Set.new

  # Rather than examine each manual edition and its set of document editions and attachments in turn,
  # we instead get all unique document ids associated with this manual, then walk through
  # the editions of these documents in version order to find unique PDF attachments and their
  # publication times.
  all_unique_document_ids_for_manual(manual).each do |document_id|
    document_editions = SpecialistDocumentEdition.where(document_id: document_id).order(:version_number)

    document_editions.each do |document_edition|
      next if document_edition_never_published?(document_edition)

      document_edition.attachments.each do |attachment|
        next if unique_pdf_attachment_file_ids_for_manual.include? attachment.file_id
        # Case-insensitive so "REPORT.PDF" is counted too; \z anchors at the true
        # end of the string (unlike $, which also matches before a newline).
        # A nil filename simply fails to match and is skipped.
        next unless /\.pdf\z/i =~ attachment.filename

        organisation_published_pdfs_total_counts_hash[manual.organisation_slug] += 1

        if document_published_after_date?(document_edition, first_period_start_date)
          organisation_published_pdfs_since_first_period_counts_hash[manual.organisation_slug] += 1
        end

        if document_published_after_date?(document_edition, last_time_period_start_date)
          organisation_published_pdfs_since_second_period_counts_hash[manual.organisation_slug] += 1
        end

        unique_pdf_attachment_file_ids_for_manual << attachment.file_id
      end
    end
  end
end

# Write the report into the Rails root, with today's date in the file name.
document_report_filename = Rails.root.join("content-operating-report-for-pdf-documents-#{Time.zone.today.strftime('%Y-%m-%d')}.csv")

# Column headings; the two period columns describe the windows actually used.
report_header_row = [
  "Organisation",
  "Total published PDF attachments",
  "#{first_period_start_date} - present PDF attachments",
  "Last #{last_time_period_days} days PDF attachments"
]

CSV.open(document_report_filename, 'w') do |document_csv|
  document_csv << report_header_row

  # One row per owning organisation, in the order the slugs were first seen.
  unique_owning_organisation_slugs.each do |slug|
    total_count = organisation_published_pdfs_total_counts_hash[slug]
    first_period_count = organisation_published_pdfs_since_first_period_counts_hash[slug]
    last_period_count = organisation_published_pdfs_since_second_period_counts_hash[slug]

    document_csv << [slug.titleize, total_count, first_period_count, last_period_count]
  end
end

0 comments on commit 95f3264

Please sign in to comment.