From 80a1ecf5bf4c0fe1ab541a9e252a595e6a78eade Mon Sep 17 00:00:00 2001
From: Mike Dalessio
Date: Tue, 10 Feb 2009 09:42:04 -0500
Subject: [PATCH] sanitize_document is here. willkommen, bienvenue.

---
Review notes (free text in this region is skipped when the patch is applied):
 * The patch had been collapsed onto a single line; its line structure is
   restored here. Leading indentation inside the hunks was lost and has been
   reconstructed by convention -- confirm it against blob 6d19184 (lib) and
   08430ed (test) before applying.
 * sanitize_document no longer calls #strip on inputs that do not respond to
   it. The parameter is named string_or_io and is handed to
   Nokogiri::HTML.parse, but the original blank check raised NoMethodError
   for IO objects before parsing was ever reached.
 * A doc comment was added to sanitize_document; the hunk length and diffstat
   below are adjusted for it. The post-image blob ids on the index lines
   still describe the original commit.

 lib/dryopteris/sanitize.rb | 18 ++++++++++++++++++
 test/test_sanitizer.rb     |  7 +++++++
 2 files changed, 25 insertions(+)

diff --git a/lib/dryopteris/sanitize.rb b/lib/dryopteris/sanitize.rb
index 6d19184..21b2ef2 100644
--- a/lib/dryopteris/sanitize.rb
+++ b/lib/dryopteris/sanitize.rb
@@ -32,6 +32,24 @@ def sanitize(string, encoding=nil)
       body.children.map { |x| x.to_xml }.join
     end
 
+    # Parses +string_or_io+ as a whole HTML document, walks each element found
+    # directly under <head> and <body> with sanitize_node, and returns the
+    # serialized root element. Returns nil for nil input and "" when the input
+    # is blank or has no such elements. The blank check is skipped for inputs
+    # without #strip (e.g. IO objects), which are passed to the parser as-is.
+    def sanitize_document(string_or_io, encoding=nil)
+      return nil if string_or_io.nil?
+      return "" if string_or_io.respond_to?(:strip) && string_or_io.strip.size == 0
+
+      doc = Nokogiri::HTML.parse(string_or_io, nil, encoding)
+      elements = doc.xpath("/html/head/*","/html/body/*")
+      return "" if (elements.nil? || elements.empty?)
+      elements.each do |node|
+        traverse_conditionally_top_down(node, :sanitize_node)
+      end
+      doc.root.to_xml
+    end
+
   private
     def traverse_conditionally_top_down(node, method_name)
       return if send(method_name, node)
diff --git a/test/test_sanitizer.rb b/test/test_sanitizer.rb
index 08430ed..4588065 100644
--- a/test/test_sanitizer.rb
+++ b/test/test_sanitizer.rb
@@ -7,9 +7,16 @@ def sanitize_html stream
     Dryopteris.sanitize(stream)
   end
 
+  def sanitize_doc stream
+    Dryopteris.sanitize_document(stream)
+  end
+
   def check_sanitization(input, htmloutput, xhtmloutput, rexmloutput)
     # libxml uses double-quotes, so let's swappo-boppo our quotes before comparing.
     assert_equal htmloutput, sanitize_html(input).gsub(/"/,"'"), input
+
+    doc = sanitize_doc(input).gsub(/"/,"'")
+    assert doc.include?(htmloutput), "#{input}:\n#{doc}\nshould include:\n#{htmloutput}"
   end
 
   WhiteList::ALLOWED_ELEMENTS.each do |tag_name|