From 699bf70357abae1ef8e9b05f16a6f68a8e88153a Mon Sep 17 00:00:00 2001
From: Akkana Peck
Date: Thu, 5 Dec 2013 18:27:42 -0800
Subject: [PATCH 1/4] Don't fail to download links just because they contain #

A link that starts with # is a jump to a named anchor, but many links
contain # later on. Also, report failed links more comprehensively, and
batch up errors to show the user at the end (even though the user won't
see them if they click Dismiss at the SL4A prompt ... oh well).
---
 feedfetcher/feedfetcher.py | 49 +++++++++++++++++++++++++++++---------
 1 file changed, 38 insertions(+), 11 deletions(-)

diff --git a/feedfetcher/feedfetcher.py b/feedfetcher/feedfetcher.py
index bea37a3..eb2535a 100755
--- a/feedfetcher/feedfetcher.py
+++ b/feedfetcher/feedfetcher.py
@@ -13,10 +13,12 @@
 import sys, os
 import time
 
+# import traceback
+
 ############# CONFIGURATION ########################################
 # Put your server base URL here, the dir that will contain
-# both feedme and feeds directories. It must end with a slash.
+# both feedme/ and feeds/ directories. It must end with a slash.
 serverurl = 'http://localhost/'
 
 # Where to download feeds if running locally.
@@ -29,6 +31,8 @@
 
 ############# END CONFIGURATION ####################################
 
+errstr = ''
+
 # Are we on Android? Make it optional, for easier testing.
 try:
     import android
@@ -41,9 +45,12 @@
     is_android = False
 
 def perror(s):
-    if is_android:
-        droid.makeToast(s)
+    global errstr
+    # makeToast is cool but takes way too long and might still not be seen.
+    #if is_android:
+    #    droid.makeToast(s)
     print s
+    errstr += '\n' + s
 
 def fetch_url_to(url, outfile):
     if os.path.exists(outfile):
@@ -53,14 +60,21 @@ def fetch_url_to(url, outfile):
     print "Fetching", url, "to", outfile
 
     # Read the URL. It may fail: not all referenced links
-    # are always successfully downloaded.
+    # were successfully downloaded to the server.
+    # But sometimes it fails for other reasons too,
+    # so we need to distinguish a "not found" from other causes.
     try:
         infile = urllib2.urlopen(url)
         contents = infile.read()
         infile.close()
-    except urllib2.HTTPError:
-        print "Couldn't fetch " + url
-        # Don't do perror because droid.makeToast() delays way too long.
+    except urllib2.HTTPError, e:
+        perror("Couldn't fetch %s: HTTPError code %s" % (url, str(e.code)))
+        return
+    except urllib2.URLError, e:
+        perror("Couldn't fetch %s: URLError args %s" % (url, str(e.args)))
+        return
+    except ValueError, e:
+        perror("Couldn't fetch %s: ValueError, %s" % (url, str(e)))
        return
 
     # Copy to the output file
@@ -81,7 +95,10 @@ def fetch_url_to(url, outfile):
     return
 
 def not_a_local_link(l):
-    if not link or ':' in link or '#' in link or link[0] == '/' \
+    '''Return True if this is not a local relative link: i.e. it is empty,
+       has a scheme (contains ':'), is only a named anchor starting with #,
+       or is an absolute or ../ path.'''
+    if not link or ':' in link or link.startswith('#') or link[0] == '/' \
        or link.startswith('../'):
        # I don't know why we see ../ links, but we do.
        return True
@@ -170,11 +187,11 @@ def parse_directory_page(urldir):
 def fetch_feeds_dir_recursive(urldir, outdir):
     feeddirs = parse_directory_page(urldir)
     if feeddirs == None:
-        errstr = "Couldn't find %s on server" % os.path.basename(urldir)
-        perror(errstr)
+        err = "Couldn't find %s on server" % os.path.basename(urldir)
+        perror(err)
         if is_android:
             droid.vibrate()
-            droid.notify(errstr)
+            droid.notify(err)
         return
 
     # now feeddirs[] should contain the subdirs we want to fetch.
@@ -238,6 +255,11 @@ def url_exists(url):
         print "\nOops, got some HTTP error other than a 404"
         raise(e)
 
+    # We can also get various other errors, such as httplib.BadStatusLine
+    except Exception, e:
+        print "Problem checking whether URL exists!"
+        raise(e)
+
 def wait_for_feeds(baseurl):
     # When the server is done running feedme, it should create a file
     # inside the date directory called LOG.
@@ -312,6 +334,7 @@
         print "Feedme is running already"
     else:
         print "Feedme already ran to completion"
+    sys.stdout.flush()
 
     try:
         if already_ran == 0:
@@ -327,3 +350,7 @@
         print "KeyboardInterrupt"
     except urllib2.URLError, e:
         print "Couldn't access server: " + str(e)
+
+    if errstr:
+        print "\n\n====== ERRORS ============"
+        print errstr

From 18abecf402277969c9785bd91849b89bc08b0d67 Mon Sep 17 00:00:00 2001
From: Akkana Peck
Date: Tue, 10 Dec 2013 18:46:11 -0800
Subject: [PATCH 2/4] More robust handling of URLErrors: try to get the error
 code when it's available.

Eventually we'd like to be able to detect network-down conditions and
loop prompting the user to try to reset the network. (If sl4a allows
prompting ...)

Also, strip # named anchors from both the fetched URL and the saved
filename.
---
 feedfetcher/feedfetcher.py | 41 +++++++++++++++++++++++++++++++++++---
 1 file changed, 38 insertions(+), 3 deletions(-)

diff --git a/feedfetcher/feedfetcher.py b/feedfetcher/feedfetcher.py
index eb2535a..f9291f8 100755
--- a/feedfetcher/feedfetcher.py
+++ b/feedfetcher/feedfetcher.py
@@ -57,6 +57,15 @@ def fetch_url_to(url, outfile):
         print os.path.basename(outfile), "already exists -- not re-fetching"
         return
 
+    # If there's a named anchor appended, strip it from both url and filename.
+    # Pro Publica in particular has a lot of these.
+    if '#' in url:
+        print "Stripping named anchor from url", url
+        url = url[:url.find('#')]
+    if '#' in outfile:
+        print "Stripping named anchor from outfile", outfile
+        outfile = outfile[:outfile.find('#')]
+
     print "Fetching", url, "to", outfile
 
     # Read the URL. It may fail: not all referenced links
@@ -160,7 +169,9 @@
         dirpage = f.read()
         # dirlines = dirpage.split('\n')
         f.close()
-    except urllib2.HTTPError:
+    except urllib2.HTTPError, e:
+        perror("HTTP error parsing directory page: code is %d" \
+               % (e.code))
         return None
 
     # Parse the directory contents to get the list of feeds
@@ -252,12 +263,36 @@
     except urllib2.HTTPError, e:
         if e.code == 404:
             return False
-        print "\nOops, got some HTTP error other than a 404"
+        perror("HTTP error checking whether URL %s exists! code %d" % (url, e.code))
+        raise(e)
+
+    except urllib2.URLError, e:
+        # type(e) is urllib2.URLError
+        # e.args is type tuple
+        # e.args[0] is type socket.gaierror
+        # e.args[1] is None
+        # e.reason is "[Errno 2] temporary failure in name resolution"
+        # e.reason is type socket.gaierror
+        # There is NO documentation on how to handle these,
+        # but from fiddling around, a socket.gaierror can be treated
+        # as a tuple where the 0th element is the errno.
+        #perror("URL error checking whether URL %s exists! type %s, arg type %s, args %s, reason: %s, reason is type %s" % (url, str(type(e)), str(type(e.args)), str(e.args), str(e.reason), type(e.reason)))
+        perror("URL error checking whether URL %s exists! errno %d" \
+               % (url, e.reason[0]))
+        # Was it because we don't have a network at all?
+        # 2 = failure in address resolution, -2 = name or service not known.
+        # If it's one of these, give the user a chance to notice it and
+        # restart the network.
+        if e.reason[0] == 2 or e.reason[0] == -2:
+            droid.vibrate()
+            droid.makeToast("Network may be down!")
+            return False
         raise(e)
 
     # We can also get various other errors, such as httplib.BadStatusLine
     except Exception, e:
-        print "Problem checking whether URL exists!"
+        perror("Problem checking whether URL %s exists!\nException: %s" \
+               % (url, str(e)))
         raise(e)
 
 def wait_for_feeds(baseurl):

From 9b60acd8d21dcb7da154fbb6cb8e04ba83407f9e Mon Sep 17 00:00:00 2001
From: Akkana Peck
Date: Tue, 10 Dec 2013 18:48:25 -0800
Subject: [PATCH 3/4] Remove some of the verbose comments on URLError and how
 to debug URLError conditions.

I left the comments in intentionally the first time, so those
suggestions will be available in the git history if anyone ever needs
them. But they don't need to clog the running code forever.
---
 feedfetcher/feedfetcher.py | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

diff --git a/feedfetcher/feedfetcher.py b/feedfetcher/feedfetcher.py
index f9291f8..7f9a18a 100755
--- a/feedfetcher/feedfetcher.py
+++ b/feedfetcher/feedfetcher.py
@@ -267,15 +267,10 @@ def url_exists(url):
         raise(e)
 
     except urllib2.URLError, e:
-        # type(e) is urllib2.URLError
-        # e.args is type tuple
-        # e.args[0] is type socket.gaierror
-        # e.args[1] is None
-        # e.reason is "[Errno 2] temporary failure in name resolution"
-        # e.reason is type socket.gaierror
-        # There is NO documentation on how to handle these,
-        # but from fiddling around, a socket.gaierror can be treated
-        # as a tuple where the 0th element is the errno.
+        # There is NO documentation on how to handle URLErrors
+        # or how to find out the code,
+        # but from fiddling around, it's a socket.gaierror,
+        # which can be treated as a tuple where the 0th element is the errno.
         #perror("URL error checking whether URL %s exists! type %s, arg type %s, args %s, reason: %s, reason is type %s" % (url, str(type(e)), str(type(e.args)), str(e.args), str(e.reason), type(e.reason)))
         perror("URL error checking whether URL %s exists! errno %d" \
                % (url, e.reason[0]))

From 6861febd1b156c876da850c853f7127694a3a1ca Mon Sep 17 00:00:00 2001
From: Akkana Peck
Date: Tue, 10 Dec 2013 19:47:24 -0800
Subject: [PATCH 4/4] Remove a commented-out print
---
 feedfetcher/feedfetcher.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/feedfetcher/feedfetcher.py b/feedfetcher/feedfetcher.py
index 7f9a18a..8cd5bf6 100755
--- a/feedfetcher/feedfetcher.py
+++ b/feedfetcher/feedfetcher.py
@@ -271,7 +271,6 @@ def url_exists(url):
         # or how to find out the code,
         # but from fiddling around, it's a socket.gaierror,
         # which can be treated as a tuple where the 0th element is the errno.
-        #perror("URL error checking whether URL %s exists! type %s, arg type %s, args %s, reason: %s, reason is type %s" % (url, str(type(e)), str(type(e.args)), str(e.args), str(e.reason), type(e.reason)))
         perror("URL error checking whether URL %s exists! errno %d" \
                % (url, e.reason[0]))
         # Was it because we don't have a network at all?
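The error-batching pattern patch 1 introduces -- print each failure immediately,
but also accumulate it for one summary at exit, with each urllib2 failure mode
caught separately -- can be exercised on its own. A minimal standalone sketch in
the same Python 2 idiom as feedfetcher.py; fetch() is a simplified stand-in for
fetch_url_to(), and the probe URLs are illustrative, not from the patches:

    import urllib2

    errstr = ''

    def perror(s):
        # Print immediately, but also accumulate the message so all
        # errors can be shown in one batch at the end of the run.
        global errstr
        print s
        errstr += '\n' + s

    def fetch(url):
        # Catch each failure mode separately: HTTPError carries an
        # HTTP status code, URLError wraps socket-level failures,
        # and ValueError signals a malformed URL.
        try:
            return urllib2.urlopen(url).read()
        except urllib2.HTTPError, e:
            perror("Couldn't fetch %s: HTTPError code %s" % (url, str(e.code)))
        except urllib2.URLError, e:
            perror("Couldn't fetch %s: URLError args %s" % (url, str(e.args)))
        except ValueError, e:
            perror("Couldn't fetch %s: ValueError, %s" % (url, str(e)))
        return None

    for u in ('http://localhost/feeds/', 'not-a-url'):
        fetch(u)

    if errstr:
        print "\n\n====== ERRORS ============"
        print errstr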
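Patch 2's anchor stripping is just truncation at the first '#'. A sketch of
that one step, using a hypothetical helper strip_anchor() (the patch inlines
the same logic twice, once for url and once for outfile); the example paths
are made up:

    def strip_anchor(path):
        # '#' and everything after it only name a position within a
        # page, so they can be dropped from both the URL to fetch and
        # the filename to save to.
        if '#' in path:
            return path[:path.find('#')]
        return path

    print strip_anchor('http://example.com/story.html#section2')
    # -> http://example.com/story.html
    print strip_anchor('feeds/story.html#p3')
    # -> feeds/story.html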
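The e.reason handling that patches 2 through 4 settle on rests on undocumented
behavior: a socket.gaierror indexes like a tuple whose 0th element is the
errno, with 2 or -2 meaning name resolution failed. A sketch under that same
assumption; network_seems_down() is a hypothetical probe, not part of
feedfetcher.py, and returns True only for DNS-level failures:

    import urllib2

    def network_seems_down(url):
        # Probe a URL; treat a name-resolution failure as "network down".
        try:
            urllib2.urlopen(url)
            return False
        except urllib2.HTTPError:
            # The server answered, even if with an error,
            # so the network itself is up.
            return False
        except urllib2.URLError, e:
            try:
                err = e.reason[0]   # socket.gaierror indexes like a tuple
            except (TypeError, IndexError):
                return False
            # 2 = failure in address resolution,
            # -2 = name or service not known.
            return err in (2, -2)

    print network_seems_down('http://no-such-host.invalid/')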