-
Notifications
You must be signed in to change notification settings - Fork 6
/
sanitize.rb
135 lines (116 loc) · 4.5 KB
/
sanitize.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
require 'rubygems'
gem 'nokogiri', '>=1.0.5'
require 'nokogiri'
require 'cgi'
require "dryopteris/whitelist"
module Dryopteris
class << self
# Parses the input as HTML and returns the text content of its <body>,
# i.e. the markup with all tags removed.
#
# @param string_or_io [String, IO, nil] HTML source handed to
#   Nokogiri::HTML.parse
# @param encoding [String, nil] passed through to Nokogiri::HTML.parse
# @return [String, nil] nil for nil input; "" for a blank string or when the
#   parsed document has no /html/body; otherwise the body's inner text
def strip_tags(string_or_io, encoding = nil)
  return nil if string_or_io.nil?
  # IO-like sources do not respond to #strip, so the blank-input shortcut
  # only applies to string-like arguments.
  return "" if string_or_io.respond_to?(:strip) && string_or_io.strip.empty?

  doc = Nokogiri::HTML.parse(string_or_io, nil, encoding)
  body_element = doc.at("/html/body")
  return "" if body_element.nil?

  body_element.inner_text
end
# Sanitizes an HTML fragment.
#
# The fragment is wrapped in <html><body>…</body></html>, parsed, and every
# child of the resulting body is walked top-down with sanitize_node. The
# result is the concatenated XML serialization of the body's children.
#
# @param string [String, nil] HTML fragment
# @param encoding [String, nil] passed through to Nokogiri::HTML.parse
# @return [String, nil] nil for nil input; "" for blank input or when no
#   body element is found; otherwise the sanitized markup
def sanitize(string, encoding = nil)
  return nil if string.nil?
  return "" if string.strip.empty?

  wrapped = "<html><body>" + string + "</body></html>"
  body = Nokogiri::HTML.parse(wrapped, nil, encoding).xpath("/html/body").first
  return "" if body.nil?

  body.children.each { |child| traverse_conditionally_top_down(child, :sanitize_node) }
  body.children.map(&:to_xml).join
end
# Sanitizes a complete HTML document.
#
# Every element directly under /html/head and /html/body is walked top-down
# with sanitize_node, then the document root is serialized with to_xml.
#
# @param string_or_io [String, IO, nil] HTML source handed to
#   Nokogiri::HTML.parse
# @param encoding [String, nil] passed through to Nokogiri::HTML.parse
# @return [String, nil] nil for nil input; "" for a blank string or when the
#   document has no head/body child elements; otherwise the sanitized
#   document markup
def sanitize_document(string_or_io, encoding = nil)
  return nil if string_or_io.nil?
  # IO-like sources do not respond to #strip, so the blank-input shortcut
  # only applies to string-like arguments.
  return "" if string_or_io.respond_to?(:strip) && string_or_io.strip.empty?

  doc = Nokogiri::HTML.parse(string_or_io, nil, encoding)
  elements = doc.xpath("/html/head/*", "/html/body/*")
  return "" if elements.nil? || elements.empty?

  elements.each do |node|
    traverse_conditionally_top_down(node, :sanitize_node)
  end
  doc.root.to_xml
end
private
# Pre-order walk of a node tree. The visitor named by +method_name+ is
# invoked (via send) on +node+ first; a truthy result means the subtree is
# skipped, otherwise each child is visited the same way.
def traverse_conditionally_top_down(node, method_name)
  handled = send(method_name, node)
  return if handled

  node.children.each do |child|
    traverse_conditionally_top_down(child, method_name)
  end
end
# Replaces +node+ with a text node holding node.text: the text node is
# inserted as the next sibling and the original node is then removed.
# Always returns true.
def remove_tags_from_node(node)
  node.add_next_sibling(Nokogiri::XML::Text.new(node.text, node.document))
  node.remove
  true
end
# Sanitizes a single node in place; used as the visitor passed to
# traverse_conditionally_top_down.
#
# * A whitelisted element keeps only whitelisted attributes, loses any
#   URI-valued attribute whose scheme is not an allowed protocol, and has
#   its style attribute rewritten by sanitize_css.
# * Text and CDATA nodes are left untouched.
# * Anything else (non-whitelisted elements and other node types) is
#   replaced by a text node holding the node's serialized form (node.to_s).
#
# @param node [Nokogiri::XML::Node] node to inspect
# @return [Boolean] false when the node was kept (traversal continues into
#   its children), true when it was replaced by a text node
def sanitize_node(node)
  case node.type
  when 1 # Nokogiri::XML::Node::ELEMENT_NODE
    if HashedWhiteList::ALLOWED_ELEMENTS[node.name]
      # node.attributes is a name => attribute mapping built on each call,
      # so removing attributes while iterating it is safe.
      node.attributes.each do |attr_name, attr|
        if !HashedWhiteList::ALLOWED_ATTRIBUTES[attr_name]
          node.remove_attribute(attr_name)
        elsif HashedWhiteList::ATTR_VAL_IS_URI[attr_name]
          # Adapted from the HTML5 sanitizer: decode entities and drop
          # backticks, ASCII control characters/whitespace, DEL and
          # U+0080..U+00A0 before looking for a scheme, so an obfuscated
          # "java\tscript:" is still recognised. The character class is
          # written with \u ranges because the original byte escapes
          # (\302[\200-\240]) are rejected by Ruby 1.9+.
          val_unescaped = CGI.unescapeHTML(attr.to_s)
                             .gsub(/[`\u0000-\u0020\u007f-\u00a0]+/, '')
                             .downcase
          if val_unescaped =~ /\A[a-z0-9][-+.a-z0-9]*:/ &&
             HashedWhiteList::ALLOWED_PROTOCOLS[val_unescaped.split(':').first].nil?
            node.remove_attribute(attr_name)
          end
        end
      end

      style = node.attributes['style']
      node['style'] = sanitize_css(style) if style
      return false
    end
  when 3, 4 # Nokogiri::XML::Node::TEXT_NODE, CDATA_SECTION_NODE
    return false
  end

  # Not allowed: swap the node for a text node containing its markup.
  replacement_killer = Nokogiri::XML::Text.new(node.to_s, node.document)
  node.add_next_sibling(replacement_killer)
  node.remove
  true
end
# Filters an inline CSS declaration list down to whitelisted declarations.
# Lifted nearly verbatim from the html5 sanitizer.
#
# @param style [#to_s] raw value of a style attribute
# @return [String] the surviving declarations, each rendered as
#   "prop: value;" and joined with single spaces; '' when the input fails
#   the syntax checks or nothing survives
def sanitize_css(style)
  # disallow urls
  style = style.to_s.gsub(/url\s*\(\s*[^\s)]+?\s*\)\s*/, ' ')

  # gauntlet: the WHOLE string must consist of harmless tokens and be shaped
  # like "prop: value; ...". \A/\z are required here; ^/$ match per line,
  # which would let a multi-line value pass on the strength of a single
  # clean line.
  return '' unless style =~ /\A([:,;#%.\sa-zA-Z0-9!]|\w-\w|\'[\s\w]+\'|\"[\s\w]+\"|\([\d,\s]+\))*\z/
  return '' unless style =~ /\A\s*([-\w]+\s*:[^:;]*(;\s*|\z))*\z/

  clean = []
  style.scan(/([-\w]+)\s*:\s*([^:;]*)/) do |prop, val|
    next if val.empty?

    prop = prop.downcase
    if HashedWhiteList::ALLOWED_CSS_PROPERTIES[prop]
      clean << "#{prop}: #{val};"
    elsif %w[background border margin padding].include?(prop.split('-')[0])
      # Shorthand families: keep the declaration only if every
      # whitespace-separated token is a whitelisted keyword, a hex/rgb
      # colour, or a short number with an optional unit.
      unsafe = val.split.any? do |keyword|
        HashedWhiteList::ALLOWED_CSS_KEYWORDS[keyword].nil? &&
          keyword !~ /\A(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)\z/
      end
      clean << "#{prop}: #{val};" unless unsafe
    elsif HashedWhiteList::ALLOWED_SVG_PROPERTIES[prop]
      clean << "#{prop}: #{val};"
    end
  end

  clean.join(' ')
end
end # self
module HashedWhiteList
  # Mirror every Array-valued constant of WhiteList as a same-named Hash
  # constant in this module for faster lookup. Each entry is registered
  # under both its original spelling and its downcased form, mapped to true.
  WhiteList.constants.each do |name|
    entries = WhiteList.const_get(name)
    next unless entries.is_a?(Array)

    lookup = {}
    entries.each do |entry|
      lookup[entry] = true
      lookup[entry.downcase] = true
    end
    const_set(name, lookup)
  end
end
end