Permalink
Browse files

Add error handling to catch the various ambiguous errors that might

come from lxml.html. It throws ValueError for everything, and the only
way to find out what really happened is to parse the string.
ValueError "unknown url type:" means it hit an improperly formed URL,
like one with no scheme like http:.
ValueError of "Unicode strings with encoding declaration" in
lxml.html.fromstring() means that urllib2 has inserted a preface like
<?xml version="1.0" encoding="utf-8"?>
which causes lxml to barf.

Use xmlcharrefreplace when writing output: lxml.html insists on replacing
all HTML entities with unicode characters. You can't get back the original
HTML entity -- that info has been thrown away -- but at least you can
write the numerical char ref replacement.

Add a warning class to MsgLog.
  • Loading branch information...
1 parent 4188fd1 commit 737114bbe41011f4ade13328dcd738b170774824 @akkana committed Oct 8, 2012
Showing with 83 additions and 32 deletions.
  1. +26 −8 feedme
  2. +57 −24 feedmeparser.py
View
@@ -182,12 +182,16 @@ class MsgLog :
def msg(self, s) :
self.msgstr += "\n" + s
- print "", s.encode('ascii', 'backslashreplace')
+ print "MESSAGE:", s.encode('ascii', 'backslashreplace')
+
+ def warn(self, s) :
+ self.msgstr += "\n" + s
+ print "WARNING:", s.encode('ascii', 'backslashreplace')
def err(self, s) :
self.errstr += "\n" + s
print "ERROR:", s.encode('ascii', 'backslashreplace')
- traceback.print_stack()
+ #traceback.print_stack()
def get_msgs(self) :
return self.msgstr
@@ -564,13 +568,27 @@ Which (default = s): """)
except ValueError, e :
# urllib2 is supposed to throw a urllib2.URLError for
- # "unknown url type", but in practice it throws a ValueError.
- # See this most often for doubleclick ad links in the latimes
+ # "unknown url type", but in practice it throws ValueError.
+ # See this e.g. for doubleclick ad links in the latimes
# that have no spec, e.g. //ad.doubleclick.net/...
- errmsg = "Couldn't open url " + item.link + "\n"
- errmsg += "Title: " + item.title.encode('utf-8', 'replace')
- print >>sys.stderr, errmsg
- msglog.err(errmsg)
+ # Unfortunately it seems to happen in other cases too,
+ # so there's no way to separate out the urllib2 ones
+ # except by string: str(sys.exc_info()[1]) starts with
+ # "unknown url type:"
+ errmsg = "ValueError on title "
+ errmsg += item.title.encode('utf-8', 'replace')
+ errmsg += "\n"
+ # print >>sys.stderr, errmsg
+ # msglog.err will print it, no need to print it again.
+ if str(sys.exc_info()[1]).startswith("unknown url type:") :
+ # Don't show stack trace for unknown URL types,
+ # since it's a known error.
+ errmsg += str(sys.exc_info()[1]) + " - couldn't load\n"
+ msglog.warn(errmsg)
+ else :
+ errmsg += "ValueError on url " + item.link + "\n"
+ errmsg += traceback.format_exc(sys.exc_info()[2])
+ msglog.err(errmsg)
continue
except Exception as e :
View
@@ -144,7 +144,6 @@ def fetch_url(self, url, newdir, newname, title=None, author=None,
html = None
try :
html = response.read()
- print >>sys.stderr, "successully read", url
# XXX Need to guard against IncompleteRead -- but what class owns it??
#except httplib.IncompleteRead, e :
# print >>sys.stderr, "Ignoring IncompleteRead on", url
@@ -158,7 +157,6 @@ def fetch_url(self, url, newdir, newname, title=None, author=None,
#print >>sys.stderr, "response.read() returned type", type(html)
# Want to end up with unicode. In case it's str, decode it:
if type(html) is str :
- #print >>sys.stderr, "decoding html using charset", self.encoding
# But sometimes this raises errors anyway, even using
# the page's own encoding, so use 'replace':
html = html.decode(self.encoding, 'replace')
@@ -262,12 +260,29 @@ def fetch_url(self, url, newdir, newname, title=None, author=None,
self.single_page_url
print >>sys.stderr, e
- def feed(self, html) :
+ def feed(self, uhtml) :
"""Duplicate, in a half-assed way, HTMLParser.feed() but
using lxml.html since it handles real-world documents.
+ Input is expected to be unicode.
"""
- # Parse the whole document
- tree = lxml.html.fromstring(html)
+ # Parse the whole document.
+ # (Trying valiantly to recover from lxml errors.)
+ try :
+ tree = lxml.html.fromstring(uhtml)
+ except ValueError :
+ print "ValueError!"
+ # Idiot lxml.html that doesn't give any sensible way
+ # to tell what really went wrong:
+ if str(sys.exc_info()[1]).startswith(
+ "Unicode strings with encoding declaration") :
+ # This seems to happen because somehow the html gets
+ # something like this inserted at the beginning:
+ # <?xml version="1.0" encoding="utf-8"?>
+ # So if we've hit the error, try to remove it:
+ tree = lxml.html.fromstring(re.sub(
+ '<\?xml .*encoding=[\'"].*?[\'"]\?>', '', uhtml))
+ else :
+ raise ValueError
# Iterate over the DOM tree:
self.crawl_tree(tree)
@@ -318,7 +333,7 @@ def handle_starttag(self, tag, attrs):
return
#print "type(tag) =", type(tag)
- self.outfile.write('<' + tag.encode(self.encoding, 'replace'))
+ self.outfile.write('<' + tag.encode(self.encoding, 'xmlcharrefreplace'))
if tag == 'a' :
if 'href' in attrs.keys() :
@@ -339,7 +354,8 @@ def handle_starttag(self, tag, attrs):
self.make_absolute(href[m.start():m.end()])
print >>sys.stderr, \
"\nFound single-page pattern:", \
- self.single_page_url.encode('utf-8', 'replace')
+ self.single_page_url.encode('utf-8',
+ 'xmlcharrefreplace')
# But continue fetching the regular pattern,
# since the single-page one may fail
@@ -400,16 +416,20 @@ def handle_starttag(self, tag, attrs):
except Exception, e :
print "Error downloading image:", str(e), src
else :
+ # Looks like it's probably a nonlocal image.
+ # Possibly this could be smarter about finding similar domains,
+ # or having a list of allowed image domains.
print >>sys.stderr, req.get_host(), "and", self.host, "are too different -- not fetching"
# Now we've done any needed processing to the tag and its attrs.
# t's time to write them to the output file.
for attr in attrs.keys() :
- self.outfile.write(' ' + attr.encode(self.encoding, 'replace'))
+ self.outfile.write(' ' + attr.encode(self.encoding,
+ 'xmlcharrefreplace'))
if attrs[attr] and type(attrs[attr]) is str :
# make sure attr[1] doesn't have any embedded double-quotes
val = attrs[attr].replace('"', '\"').encode(self.encoding,
- 'replace')
+ 'xmlcharrefreplace')
self.outfile.write('="' + val + '"')
self.outfile.write('>')
@@ -433,32 +453,45 @@ def handle_endtag(self, tag):
if tag == "body" or tag == 'html' :
return
- self.outfile.write('</' + tag.encode(self.encoding, 'replace') + '>\n')
+ self.outfile.write('</' + tag.encode(self.encoding,
+ 'xmlcharrefreplace') + '>\n')
def handle_data(self, data):
+ # XXX lxml.etree.tostring() might be a cleaner way of printing
+ # these nodes: http://lxml.de/tutorial.html
if self.skipping :
#print >>sys.stderr, "Skipping data"
return
if type(data) is unicode :
- #print >>sys.stderr, "Writing unicode"
- self.outfile.write(data.encode(self.encoding, 'replace'))
+ #print >>sys.stderr, "Unicode data is", \
+ # data.encode(self.encoding, 'xmlcharrefreplace')
+ self.outfile.write(data.encode(self.encoding, 'xmlcharrefreplace'))
elif type(data) is str :
- #print >>sys.stderr, "Writing text", data
+ #print >>sys.stderr, "Text data is", data
self.outfile.write(data)
else :
print >>sys.stderr, "Data isn't str or unicode! type =", type(title)
- def handle_charref(self, num) :
- if self.skipping :
- #print "Skipping charref"
- return
- self.outfile.write('&#' + num.encode(self.encoding, 'replace') + ';')
-
- def handle_entityref(self, name) :
- if self.skipping :
- #print "Skipping entityref"
- return
- self.outfile.write('&' + name.encode(self.encoding, 'replace') + ';')
+ # def handle_charref(self, num) :
+ # # I don't think we ever actually get here -- lxml.html.fromstring()
+ # # already replaces all html entities with the numeric unicode
+ # # equivalent whether we want that or not, and we have to write
+ # # them out in handle_data with xmlcharrefreplace.
+ # # If we really really wanted to we might be able to keep the
+ # # page's original entities by calling fromstring(cgi.urlescape(html))
+ # # html before
+ # if self.skipping :
+ # #print "Skipping charref"
+ # return
+ # self.outfile.write('&#' + num.encode(self.encoding,
+ # 'xmlcharrefreplace') + ';')
+
+ # def handle_entityref(self, name) :
+ # if self.skipping :
+ # #print "Skipping entityref"
+ # return
+ # self.outfile.write('&' + name.encode(self.encoding,
+ # 'xmlcharrefreplace') + ';')
def same_host(self, host1, host2) :
"""Are two hosts close enough for the purpose of downloading images?"""

0 comments on commit 737114b

Please sign in to comment.