"""
This is an updated version of the HTML parsing logic that uses a simple HTML Lexer (tokenizer) to step through the
HTML and make all the modifications we need. The plus side is we don't have a full blown parser that makes all kinds
of changes to our HTML (closing tags etc.) that we don't want. We only alter the parts that we need, and everything
else is untouched. The bad part is, we have to make numerous passes through the HTML to get all the info we need, so
this will be a lot slower
"""
from urlparse import urlparse, urljoin
#from BeautifulSoup import BeautifulSoup, ICantBelieveItsBeautifulSoup, MinimalSoup, BeautifulStoneSoup
import urllib2, re, time
from models import Page, CSSAsset, CachedPage
from datetime import datetime
from pygments.lexers import HtmlLexer
from pygments.token import Token, Text, Comment, Operator, Keyword, Name, String, \
Number, Other, Punctuation, Literal
delimiter = '--REPLACE--'
css_types = {"STYLESHEET":1,"INLINE":2,"ATTRIBUTE":3}
t1 = -1
def mark(name=None):
"""
Timing function
"""
global t1
if t1 == -1:
t1 = time.time()
else:
t2 = time.time()
print "%s took %0.3f ms" % (name,(t2-t1) * 1000)
t1 = t2
def clear():
global t1
t1 = -1
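
#mark()/clear() act as a simple stopwatch, e.g.:
#   mark()            #the first call only starts the timer
#   do_work()         #(hypothetical)
#   mark("do work")   #prints "do work took N ms" and restarts the timer
#   clear()           #reset so the next mark() starts fresh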
def processPage(page_url):
"""
Main entry point in this class, Grabs a web page given a URL and parses out all the styles,
creating classes for each one
"""
mark()
try:
        #Check if urlparse can find a scheme for us; if not, we just put http:// in front
        parsed = urlparse(page_url)
        if parsed.scheme == '':
            page_url = u'http://' + unicode(page_url)
    except ValueError: #covers UnicodeDecodeError from unicode() as well
        return None
try:
cached_page = CachedPage.objects.get(url=page_url)
page_content = cached_page.original
#print "fetched cached page %d" % cached_page.id
    except CachedPage.DoesNotExist: #this page hasn't been cached yet, so fetch and cache it now
try:
mark("check cache")
f = urllib2.urlopen(page_url) #TODO rate limit this or find some way to stop ourselves from being used as a DOS tool
page_content = unicode(f.read(),'utf-8')
#Create a cached page that we can fetch by URL later
cached_page = CachedPage()
cached_page.url = page_url
cached_page.original = page_content
cached_page.date = datetime.now()
cached_page.save()
mark("download page")
#print "saved cached page %d" % cached_page.id
        except urllib2.HTTPError:
            raise #nothing sensible to do with the error here yet, so let the caller see it
page = Page.create()
page.url = page_url
page.original = page_content
page.save()
css_stylesheets = []
css_tags = []
mark("save page")
page_content = makeLinksAbsolute(page_content,[u'href',u'src'], page_url)
mark("make links absolute")
page_content = parseStyleAttributes(page_content, css_stylesheets, page)
mark("parse style attributes")
page_content = parseStyleTags(page_content, css_stylesheets, page)
mark("parse style tags")
page_content = parseLinkedStylesheets(page_content, css_stylesheets, page)
mark("parse linked stylesheets")
clear()
#save all the replacements to the page
page.raw = page_content
page.save()
return page
def parseStyleAttributes(document, css_stylesheets, page):
"""
Grabs any style="" attributes on normal html tags and saves the CSS therein.
Replaces the style with a delimited UUID tag that we can use later to re-insert the style
"""
attr_regex = re.compile(r'style\s*=',re.I)
output_tokens = []
tokens = HtmlLexer().get_tokens_unprocessed(document)
for index,token,value in tokens:
output_tokens.append(value)
if token == Token.Name.Attribute and attr_regex.match(value):
index,token,value = tokens.next() # get the attribute value
css_content = value.strip("\"' ")
css_name = 'style=""'#TODO get the ID attribute from the same tag (could be difficult)
css_asset = createCSSAsset(css_content, page, css_types['ATTRIBUTE'], name=css_name)
css_stylesheets.append(css_asset)#TODO css_stylesheets is not necessary anymore but perhape we can save cycles by passing it diretly to editpage instead of having to get that all from the DB again
output_tokens.append('"' + delimiter + css_asset.uuid + delimiter + '"')
return "".join(output_tokens)
def parseStyleTags(document, css_stylesheets, page):
"""
Grabs any <style> tags and saves the CSS therein. replaces with a
uuid that we can use later to re-insert the style.
"""
output_tokens = []
tokens = HtmlLexer().get_tokens_unprocessed(document)
intag = False
instyle = False
for index,token,value in tokens:
if not instyle:
output_tokens.append(value)
        if not intag and token == Token.Name.Tag and re.match(r'<\s*style\s*',value,re.I): #re.I because HtmlLexer is case-insensitive, so <STYLE> must match too
intag = True
elif intag and token == Token.Name.Tag and re.match(r'\s*>',value):
intag = False
instyle = True
stylesheet_tokens = []
        elif instyle and token == Token.Name.Tag and re.match(r'<\s*/\s*style\s*>',value,re.I):
instyle = False
css_content = "".join(stylesheet_tokens)
css_content = makeCSSURLsAbsolute(css_content,page.url)
css_asset = createCSSAsset(css_content, page, css_types['INLINE'], name='<style/>')
css_stylesheets.append(css_asset)
parseNestedStylesheets(css_asset, css_stylesheets, page)
output_tokens.append( delimiter + css_asset.uuid + delimiter )
output_tokens.append(value)
elif instyle:
stylesheet_tokens.append(value)
return "".join(output_tokens)
def parseLinkedStylesheets(document, css_stylesheets, page):
"""
Grabs any <link> tags that point to stylesheets, downloads and saves the
linked stylesheet, and replaces the link to our own saved version
"""
output_tokens = []
tokens = HtmlLexer().get_tokens_unprocessed(document)
tag_regex = re.compile(r'<\s*link',re.I)
for index,token,value in tokens:
output_tokens.append(value)
if token == Token.Name.Tag and tag_regex.match(value):
attr_dict,close_tag = parseTagAttributes(tokens)
            if attr_dict.get('href') and (attr_dict.get('rel') or '').lower() == 'stylesheet':
css_url = attr_dict['href']
css_name = urlparse(css_url).path
                try:
                    f = urllib2.urlopen(css_url)
                except urllib2.URLError: #URLError covers HTTPError as well as connection failures
                    #Re-emit the attributes we already consumed so the HTML stays intact
                    output_tokens.append(" " + serializeTagAttributes(attr_dict) + " ")
                    output_tokens.append(close_tag)
                    continue
                css_content = unicode(f.read(),'utf-8')
css_content = makeCSSURLsAbsolute(css_content, css_url)
css_asset = createCSSAsset(css_content, page, css_types['STYLESHEET'], css_url, css_name)
css_stylesheets.append(css_asset)
                attr_dict['href'] = u'/css/%s' % css_asset.uuid #No need to save a delimited value to regex out later; the link to /css/{uuid} will be constant
parseNestedStylesheets(css_asset, css_stylesheets, page)
output_tokens.append(" " + serializeTagAttributes(attr_dict) + " ")
output_tokens.append(close_tag)
return "".join(output_tokens)
def parseNestedStylesheets(css_asset, css_stylesheets, page):
"""
Looks through a CSS stylesheet for any @import tags and downloads the imported stylesheets,
replacing their reference in the parent stylesheet with the link to our own saves version
"""
#Group(1) is everything between the @import and the actual URL,
#group(2) is the URL, and group(3) is any trailing characers
regex = re.compile(r'''(?<=@import)(\s+(?:url)?\(?\s*['"]?)((?:[^'"()\\]|\\.)*)(['"]?\s*\)?)''',re.I)
#This replacement function gets called on every match and downloads/parses the stylesheet at that location.
#TODO we might want to do this asynchronously
def replace(match):
css_url = match.group(2)
css_name = urlparse(css_url).path
        try:
            f = urllib2.urlopen(css_url)
        except urllib2.URLError: #URLError covers HTTPError as well as connection failures
            return match.group(0) #leave the @import untouched if we can't fetch it
        css_content = unicode(f.read(),'utf-8')
css_content = makeCSSURLsAbsolute(css_content,css_url)
css_sub_asset = createCSSAsset(css_content, page, css_types['STYLESHEET'], css_url, css_name)
css_stylesheets.append(css_sub_asset)
return match.group(1) + u'/css/%s' % css_sub_asset.uuid + match.group(3)
css_asset.raw = regex.sub(replace,css_asset.raw)
css_asset.save()
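
#For example (the UUID is illustrative), an import such as
#   @import url('http://example.com/theme.css');
#is rewritten in the parent stylesheet to
#   @import url('/css/3fa4c0de');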
def scrubCSS(css):
"""
Makes sure CSS doesn't contain any strings that might allow us to get pwned.
like closing the style and setting a script tag
"""
regex = re.compile(r'<\s*/?')
#TODO we should really just delete all content after an attempted pwn
return regex.sub('NO PWN, NO PWN, I JUST WANT TO BE ALONE',css)
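
#For example, scrubCSS('</style><script>') neutralizes both tag openers, yielding
#   'NO PWN, NO PWN, I JUST WANT TO BE ALONEstyle>NO PWN, NO PWN, I JUST WANT TO BE ALONEscript>'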
def createCSSAsset(content,page,asset_type,url='',name=''):
    """
    Creates a CSSAsset instance from the given values
    """
    css_asset = CSSAsset.create()
    css_asset.type = asset_type
css_asset.url = url
css_asset.original = content
css_asset.raw = content
css_asset.page = page
css_asset.name = name
css_asset.save()
return css_asset
def makeLinksAbsolute(document, attrs, root_url):
"""
Looks through document for tags with attributes in attrs, and uses urljoin to make those
attribute values into absolute URLs.
"""
    if isinstance(attrs,basestring): #accept a single attribute name, str or unicode
        attrs = [attrs]
    attr_regex = re.compile(r'(?:'+'|'.join(attrs)+r')\s*=',re.I) #group the alternation so \s*= applies to every name
output_tokens = []
tokens = HtmlLexer().get_tokens_unprocessed(document)
for index,token,value in tokens:
output_tokens.append(value)
if token == Token.Name.Attribute and attr_regex.match(value):
index,token,value = tokens.next() # get the attribute value
output_tokens.append('"'+urljoin(root_url,value.strip("\"' "))+'"')
return u''.join(output_tokens)
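
#For example, with root_url http://example.com/blog/ :
#   <a href="about.html">  becomes  <a href="http://example.com/blog/about.html">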
def makeCSSURLsAbsolute(css_content,root_url):
"""
Looks through a CSS document for any @import or url() rules, and uses urljoin to change the URL
into an absolute one
"""
regex = re.compile(r'''((?:@import\s+(?:url\(?)?|\burl\()\s*['"]?)((?:[^'"()\\]|\\.)*)(['"]?\s*\)?)''',re.I)
#regex = re.compile(r'''\burl\(\s*['"]?((?:[^'"()\\]|\\.)*)['"]?\s*\)''',re.I)
def replace(match):
#print "matched %s : making absolute" % match.group(0)
return match.group(1) + unicode(urljoin(root_url,match.group(2))) + match.group(3)
return regex.sub(replace,css_content)
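
#For example, with root_url http://example.com/css/site.css :
#   url(img/bg.png)      becomes  url(http://example.com/css/img/bg.png)
#   @import "reset.css"  becomes  @import "http://example.com/css/reset.css"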
def parseTagAttributes(tokens):
"""
This is the first step to building a full parser from our lexer, while in an HTML tag, this function saves
all attributes and their values into a dictionary, until we reach the end of the tag. It returns the attribute
dictionary and the closing tag. Tokens is an iterator produced by the lexer whose current position should be at
the beginning of a tag, having just consumed the tag name.
"""
attr_dict = {}
attr_regex = re.compile(r'([a-zA-Z0-9_:-]+)(\s*=)?')
end_regex = re.compile(r'(/?\s*>)')
    for index,token,value in tokens:
        if token == Token.Name.Attribute:
            attr_match = attr_regex.match(value)
            if attr_match.group(2):
                index,token,value = tokens.next() # get the attribute value
                attr_dict[attr_match.group(1).lower()] = value.strip("\"' ")
            else:
                attr_dict[attr_match.group(1).lower()] = None
        else:
            close_match = end_regex.match(value)
            if close_match:
                return (attr_dict,close_match.group(1))
    return (attr_dict,'') #the document ended before the tag was closed
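
#For example, for a tag like <link rel="stylesheet" href="/site.css">, once the
#lexer has consumed "<link" this returns
#   ({'rel': 'stylesheet', 'href': '/site.css'}, '>')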
def serializeTagAttributes(attr_dict):
"""
Takes an attribute dictionary produced by parseTagAttributes, and returns the HTML string representation
of those attributes
"""
return " ".join(k + (('="' + v + '"') if v != None else '') for k,v in attr_dict.iteritems())