Skip to content

Commit

Permalink
Moving files into lib/dryopteris
Browse files Browse the repository at this point in the history
  • Loading branch information
brynary committed Dec 5, 2008
1 parent 7422732 commit 037b14a
Show file tree
Hide file tree
Showing 3 changed files with 108 additions and 106 deletions.
108 changes: 2 additions & 106 deletions lib/dryopteris.rb
@@ -1,107 +1,3 @@
$LOAD_PATH.unshift(File.expand_path(File.dirname(__FILE__))) unless $LOAD_PATH.include?(File.expand_path(File.dirname(__FILE__)))

require 'rubygems'
gem 'nokogiri', '>=1.0.5'
require 'nokogiri'
require 'cgi'

require File.join(File.dirname(__FILE__), 'whitelist')

module Dryopteris

class << self
# Parses the input as HTML and returns only the document's text content.
#
# string_or_io - markup to parse; handed straight to Nokogiri::HTML.parse.
# encoding     - optional encoding name forwarded to the parser (default nil).
def strip_tags(string_or_io, encoding=nil)
  Nokogiri::HTML.parse(string_or_io, nil, encoding).text
end

# Parses the input as HTML, runs every child of <body> through
# sanitize_node (descending while nodes are kept), and returns the
# resulting inner HTML of <body>.
#
# string_or_io - markup to parse; handed straight to Nokogiri::HTML.parse.
# encoding     - optional encoding name forwarded to the parser (default nil).
#
# Returns the sanitized markup, or "" when the parsed document has no body.
def sanitize(string_or_io, encoding=nil)
  doc = Nokogiri::HTML.parse(string_or_io, nil, encoding)
  sanitizer = method(:sanitize_node).to_proc
  # node() rather than *: the XPath "*" test selects element children only,
  # which would let comments sitting directly under <body> skip
  # sanitize_node entirely, unlike comments nested inside an element.
  doc.xpath("html/body/node()").each do |node|
    traverse_conditionally_top_down(node, sanitizer)
  end
  body = doc.xpath("html/body").first
  body.nil? ? "" : body.inner_html
end

private
# Pre-order walk: calls proc with node first, and only when that call
# returns a falsy value recurses into each of node's children.
# A truthy result from proc prunes the whole subtree below node.
def traverse_conditionally_top_down(node, proc)
  unless proc.call(node)
    node.children.each do |child|
      traverse_conditionally_top_down(child, proc)
    end
  end
end


# Sanitizes a single node in place.
#
# Whitelisted elements keep only whitelisted attributes; URI-valued
# attributes whose scheme is not whitelisted are dropped, and a style
# attribute is rewritten through sanitize_css. Text and CDATA nodes are
# left alone. Anything else is replaced by a text node holding the node's
# own markup.
#
# Returns false when the node was kept and true when it was replaced, which
# is the "stop descending" signal traverse_conditionally_top_down checks.
def sanitize_node(node)
  case node.type
  when 1 # Nokogiri::XML::Node::ELEMENT_NODE
    if HashedWhiteList::ALLOWED_ELEMENTS[node.name]
      node.attributes.each do |attr_name, attr_value|
        unless HashedWhiteList::ALLOWED_ATTRIBUTES[attr_name]
          node.remove_attribute(attr_name)
          next
        end
        next unless HashedWhiteList::ATTR_VAL_IS_URI[attr_name]

        # Adapted from the HTML5 sanitizer: strip backticks, control
        # characters, whitespace and U+0080..U+00A0 before looking at the
        # scheme, so "java\tscript:" cannot hide it. Written with \u
        # code-point escapes because the byte-oriented \302[\200-\240] form
        # is rejected as an invalid multibyte escape on Ruby 1.9+.
        # NOTE(review): assumes values are UTF-8 compatible strings; .to_s
        # covers attribute objects as well as plain strings.
        val_unescaped = CGI.unescapeHTML(attr_value.to_s).gsub(/[`\u0000-\u0020\u007f\u0080-\u00a0\s]+/, '').downcase
        if val_unescaped =~ /\A[a-z0-9][-+.a-z0-9]*:/ && HashedWhiteList::ALLOWED_PROTOCOLS[val_unescaped.split(':')[0]].nil?
          node.remove_attribute(attr_name)
        end
      end
      if node.attributes['style']
        node['style'] = sanitize_css(node.attributes['style'])
      end
      return false
    end
  when 3, 4 # Nokogiri::XML::Node::TEXT_NODE, CDATA_SECTION_NODE
    return false
  end
  # Disallowed element or any other node type: swap it for a text node
  # carrying its serialized form.
  replacement_killer = Nokogiri::XML::Text.new(node.to_s, node.document)
  node.add_next_sibling(replacement_killer)
  node.remove
  return true
end


# Lifted nearly verbatim from the html5 sanitizer.
#
# Reduces an inline style declaration list to the declarations whose
# property is whitelisted (CSS or SVG), or whose property is in the
# background/border/margin/padding families with only whitelisted keywords,
# colours or simple lengths as values.
#
# style - anything responding to to_s (nil becomes "").
#
# Returns the cleaned declarations joined by single spaces, or "" when the
# input fails either structural check.
def sanitize_css(style)
  # disallow urls
  style = style.to_s.gsub(/url\s*\(\s*[^\s)]+?\s*\)\s*/, ' ')

  # gauntlet
  # Anchored with \A/\z, not ^/$: in Ruby ^ and $ match at every line
  # boundary, so a multi-line value would pass as soon as any single line
  # (even an empty one) looked valid.
  return '' unless style =~ /\A([:,;#%.\sa-zA-Z0-9!]|\w-\w|\'[\s\w]+\'|\"[\s\w]+\"|\([\d,\s]+\))*\z/
  return '' unless style =~ /\A\s*([-\w]+\s*:[^:;]*(;\s*|\z))*\z/

  clean = []
  style.scan(/([-\w]+)\s*:\s*([^:;]*)/) do |prop, val|
    next if val.empty?
    prop = prop.downcase
    if HashedWhiteList::ALLOWED_CSS_PROPERTIES[prop]
      clean << "#{prop}: #{val};"
    elsif %w[background border margin padding].include?(prop.split('-')[0])
      # Shorthand families: every whitespace-separated token must be a
      # whitelisted keyword, a hex/rgb colour, or a short numeric length.
      rejected = val.split.any? do |keyword|
        HashedWhiteList::ALLOWED_CSS_KEYWORDS[keyword].nil? &&
          keyword !~ /\A(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)\z/
      end
      clean << "#{prop}: #{val};" unless rejected
    elsif HashedWhiteList::ALLOWED_SVG_PROPERTIES[prop]
      clean << "#{prop}: #{val};"
    end
  end

  clean.join(' ')
end

end # self

module HashedWhiteList
  # Turn each of the whitelist arrays into a hash for faster lookup: every
  # Array constant on WhiteList gets a same-named Hash constant here, keyed
  # by each entry and by its downcased form, all mapping to true.
  # Uses const_get/const_set instead of evaluating generated source strings.
  WhiteList.constants.each do |constant|
    entries = WhiteList.const_get(constant)
    next unless entries.is_a?(Array)

    lookup = {}
    entries.each do |entry|
      lookup[entry] = true
      lookup[entry.downcase] = true
    end
    const_set(constant, lookup)
  end
end

end
require "dryopteris/sanitize"
106 changes: 106 additions & 0 deletions lib/dryopteris/sanitize.rb
@@ -0,0 +1,106 @@
require 'rubygems'
gem 'nokogiri', '>=1.0.5'
require 'nokogiri'
require 'cgi'

require "dryopteris/whitelist"

module Dryopteris

class << self
# Parses the input as HTML and returns only the document's text content.
#
# string_or_io - markup to parse; handed straight to Nokogiri::HTML.parse.
# encoding     - optional encoding name forwarded to the parser (default nil).
def strip_tags(string_or_io, encoding=nil)
  Nokogiri::HTML.parse(string_or_io, nil, encoding).text
end

# Parses the input as HTML, runs every child of <body> through
# sanitize_node (descending while nodes are kept), and returns the
# resulting inner HTML of <body>.
#
# string_or_io - markup to parse; handed straight to Nokogiri::HTML.parse.
# encoding     - optional encoding name forwarded to the parser (default nil).
#
# Returns the sanitized markup, or "" when the parsed document has no body.
def sanitize(string_or_io, encoding=nil)
  doc = Nokogiri::HTML.parse(string_or_io, nil, encoding)
  sanitizer = method(:sanitize_node).to_proc
  # node() rather than *: the XPath "*" test selects element children only,
  # which would let comments sitting directly under <body> skip
  # sanitize_node entirely, unlike comments nested inside an element.
  doc.xpath("html/body/node()").each do |node|
    traverse_conditionally_top_down(node, sanitizer)
  end
  body = doc.xpath("html/body").first
  body.nil? ? "" : body.inner_html
end

private
# Pre-order walk: calls proc with node first, and only when that call
# returns a falsy value recurses into each of node's children.
# A truthy result from proc prunes the whole subtree below node.
def traverse_conditionally_top_down(node, proc)
  unless proc.call(node)
    node.children.each do |child|
      traverse_conditionally_top_down(child, proc)
    end
  end
end


# Sanitizes a single node in place.
#
# Whitelisted elements keep only whitelisted attributes; URI-valued
# attributes whose scheme is not whitelisted are dropped, and a style
# attribute is rewritten through sanitize_css. Text and CDATA nodes are
# left alone. Anything else is replaced by a text node holding the node's
# own markup.
#
# Returns false when the node was kept and true when it was replaced, which
# is the "stop descending" signal traverse_conditionally_top_down checks.
def sanitize_node(node)
  case node.type
  when 1 # Nokogiri::XML::Node::ELEMENT_NODE
    if HashedWhiteList::ALLOWED_ELEMENTS[node.name]
      node.attributes.each do |attr_name, attr_value|
        unless HashedWhiteList::ALLOWED_ATTRIBUTES[attr_name]
          node.remove_attribute(attr_name)
          next
        end
        next unless HashedWhiteList::ATTR_VAL_IS_URI[attr_name]

        # Adapted from the HTML5 sanitizer: strip backticks, control
        # characters, whitespace and U+0080..U+00A0 before looking at the
        # scheme, so "java\tscript:" cannot hide it. Written with \u
        # code-point escapes because the byte-oriented \302[\200-\240] form
        # is rejected as an invalid multibyte escape on Ruby 1.9+.
        # NOTE(review): assumes values are UTF-8 compatible strings; .to_s
        # covers attribute objects as well as plain strings.
        val_unescaped = CGI.unescapeHTML(attr_value.to_s).gsub(/[`\u0000-\u0020\u007f\u0080-\u00a0\s]+/, '').downcase
        if val_unescaped =~ /\A[a-z0-9][-+.a-z0-9]*:/ && HashedWhiteList::ALLOWED_PROTOCOLS[val_unescaped.split(':')[0]].nil?
          node.remove_attribute(attr_name)
        end
      end
      if node.attributes['style']
        node['style'] = sanitize_css(node.attributes['style'])
      end
      return false
    end
  when 3, 4 # Nokogiri::XML::Node::TEXT_NODE, CDATA_SECTION_NODE
    return false
  end
  # Disallowed element or any other node type: swap it for a text node
  # carrying its serialized form.
  replacement_killer = Nokogiri::XML::Text.new(node.to_s, node.document)
  node.add_next_sibling(replacement_killer)
  node.remove
  return true
end


# Lifted nearly verbatim from the html5 sanitizer.
#
# Reduces an inline style declaration list to the declarations whose
# property is whitelisted (CSS or SVG), or whose property is in the
# background/border/margin/padding families with only whitelisted keywords,
# colours or simple lengths as values.
#
# style - anything responding to to_s (nil becomes "").
#
# Returns the cleaned declarations joined by single spaces, or "" when the
# input fails either structural check.
def sanitize_css(style)
  # disallow urls
  style = style.to_s.gsub(/url\s*\(\s*[^\s)]+?\s*\)\s*/, ' ')

  # gauntlet
  # Anchored with \A/\z, not ^/$: in Ruby ^ and $ match at every line
  # boundary, so a multi-line value would pass as soon as any single line
  # (even an empty one) looked valid.
  return '' unless style =~ /\A([:,;#%.\sa-zA-Z0-9!]|\w-\w|\'[\s\w]+\'|\"[\s\w]+\"|\([\d,\s]+\))*\z/
  return '' unless style =~ /\A\s*([-\w]+\s*:[^:;]*(;\s*|\z))*\z/

  clean = []
  style.scan(/([-\w]+)\s*:\s*([^:;]*)/) do |prop, val|
    next if val.empty?
    prop = prop.downcase
    if HashedWhiteList::ALLOWED_CSS_PROPERTIES[prop]
      clean << "#{prop}: #{val};"
    elsif %w[background border margin padding].include?(prop.split('-')[0])
      # Shorthand families: every whitespace-separated token must be a
      # whitelisted keyword, a hex/rgb colour, or a short numeric length.
      rejected = val.split.any? do |keyword|
        HashedWhiteList::ALLOWED_CSS_KEYWORDS[keyword].nil? &&
          keyword !~ /\A(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)\z/
      end
      clean << "#{prop}: #{val};" unless rejected
    elsif HashedWhiteList::ALLOWED_SVG_PROPERTIES[prop]
      clean << "#{prop}: #{val};"
    end
  end

  clean.join(' ')
end

end # self

module HashedWhiteList
  # Turn each of the whitelist arrays into a hash for faster lookup: every
  # Array constant on WhiteList gets a same-named Hash constant here, keyed
  # by each entry and by its downcased form, all mapping to true.
  # Uses const_get/const_set instead of evaluating generated source strings.
  WhiteList.constants.each do |constant|
    entries = WhiteList.const_get(constant)
    next unless entries.is_a?(Array)

    lookup = {}
    entries.each do |entry|
      lookup[entry] = true
      lookup[entry.downcase] = true
    end
    const_set(constant, lookup)
  end
end

end
File renamed without changes.

0 comments on commit 037b14a

Please sign in to comment.