Handle redirects and referrer url
akkana committed Apr 19, 2012
1 parent faeb57e commit 062ffb8
Showing 1 changed file with 59 additions and 21 deletions.
feedmeparser.py (59 additions, 21 deletions)

@@ -39,14 +39,15 @@ def __init__(self, config, feedname) :
         self.outfile = None
 
-    def fetch_url(self, url, newdir, newname, title=None, author=None,
-                  footer='') :
+    def fetch_url(self, url, newdir, newname, title=None, author=None,
+                  footer='', referrer=None) :
         """Read a URL from the web. Parse it, rewriting any links,
         downloading any images and making any other changes needed
         according to the config file and current feed name.
         Write the modified HTML output to $newdir/$newname,
         and download any images into $newdir.
         """
-        if self.config.getboolean(self.feedname, 'verbose') :
+        verbose = self.config.getboolean(self.feedname, 'verbose')
+        if verbose :
             print >>sys.stderr, "Fetching link", url, \
                 "to", newdir, "/", newname
 
@@ -66,6 +67,12 @@ def fetch_url(self, url, newdir, newname, title=None, author=None,
         # Nobody seems to have RSS pointing to RSS.
         request = urllib2.Request(url)
 
+        # If we're after the single-page URL, we may need a referrer
+        if referrer :
+            if verbose :
+                print >>sys.stderr, "Adding referrer", referrer
+            request.add_header('Referer', referrer)
+
         response = urllib2.urlopen(request)
 
         # At this point it would be lovely to check whether the
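
For readers unfamiliar with urllib2: the Referer header (the misspelling is part of the HTTP spec) is attached with Request.add_header() before urlopen() is called. A minimal standalone sketch, not part of this commit, with placeholder URLs:

    import urllib2

    url = 'http://example.com/article?page=all'       # placeholder
    referrer = 'http://example.com/article'           # placeholder

    request = urllib2.Request(url)
    request.add_header('Referer', referrer)           # header name per the HTTP spec
    response = urllib2.urlopen(request)
    print response.geturl()
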
@@ -83,10 +90,19 @@ def fetch_url(self, url, newdir, newname, title=None, author=None,
             response.close()
             return
 
+        # Were we redirected? geturl() will tell us that.
+        self.cururl = response.geturl()
+
+        # but sadly, that means we need another request object
+        # to parse out the host and prefix:
+        real_request = urllib2.Request(self.cururl)
+
         # feed() is going to need to know the host, to rewrite urls.
-        # So save it:
-        self.host = request.get_host()
-        self.prefix = request.get_type() + '://' + self.host + '/'
+        # So save it, based on any redirects we've had:
+        #self.host = request.get_host()
+        #self.prefix = request.get_type() + '://' + self.host + '/'
+        self.host = real_request.get_host()
+        self.prefix = real_request.get_type() + '://' + self.host + '/'
 
         outfilename = os.path.join(self.newdir, self.newname)
         self.outfile = open(outfilename, "w")
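
The point of response.geturl() is that urlopen() follows HTTP redirects silently, so the URL that was requested and the URL that actually answered can differ; geturl() reports the final one. A small sketch with a placeholder URL; urlparse is used here only as an alternative illustration of pulling the host and scheme out of that final URL:

    import urllib2
    import urlparse

    response = urllib2.urlopen('http://example.com/feed-item')   # may redirect
    final_url = response.geturl()        # URL after any redirects
    parts = urlparse.urlparse(final_url)
    host = parts.netloc                  # roughly what Request.get_host() returns
    prefix = parts.scheme + '://' + host + '/'
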
@@ -122,7 +138,15 @@ def fetch_url(self, url, newdir, newname, title=None, author=None,
 
         # Read the content of the link:
         # This can die with socket.error, "connection reset by peer"
-        html = response.read()
+        try :
+            html = response.read()
+            print >>sys.stderr, "successfully read", url
+        # XXX Need to guard against IncompleteRead -- but what class owns it??
+        #except httplib.IncompleteRead, e :
+        #    print >>sys.stderr, "Ignoring IncompleteRead on", url
+        except Exception, e :
+            print >>sys.stderr, "Unknown error from response.read()", url
+
         #print >>sys.stderr, "response.read() returned type", type(html)
         # Want to end up with unicode. In case it's str, decode it:
         if type(html) is str :
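
On the XXX question above: in Python 2 that exception is httplib.IncompleteRead, so a narrower guard is possible. A sketch, assuming an `import httplib` at the top of the file and a placeholder URL; the exception's partial attribute holds whatever bytes did arrive before the connection dropped:

    import httplib
    import sys
    import urllib2

    response = urllib2.urlopen('http://example.com/')   # placeholder URL
    try :
        html = response.read()
    except httplib.IncompleteRead, e :
        html = e.partial                 # keep the partial body rather than dying
        print >>sys.stderr, "IncompleteRead; using partial content"
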
@@ -144,7 +168,7 @@ def fetch_url(self, url, newdir, newname, title=None, author=None,
                 print "looking for page_start", page_start
                 match = html.find(page_start)
                 if match >= 0:
-                    if self.config.getboolean(self.feedname, 'verbose') :
+                    if verbose :
                         print "Found page_start", page_start
                     html = html[match:]
                     break
@@ -154,7 +178,7 @@ def fetch_url(self, url, newdir, newname, title=None, author=None,
                 print "looking for page_end", page_end
                 match = html.find(page_end)
                 if match >= 0:
-                    if self.config.getboolean(self.feedname, 'verbose') :
+                    if verbose :
                         print "Found page_end", page_end
                     html = html[0 : match]
 
@@ -164,7 +188,7 @@ def fetch_url(self, url, newdir, newname, title=None, author=None,
         if len(skip_pats) > 0 :
             print len(skip_pats), "skip pats"
         for skip in skip_pats :
-            if self.config.getboolean(self.feedname, 'verbose') :
+            if verbose :
                 print >>sys.stderr, "Trying to skip", skip
                 #print >>sys.stderr, "in", html.encode('utf-8')
                 #sys.stderr.flush()
@@ -204,18 +228,32 @@ def fetch_url(self, url, newdir, newname, title=None, author=None,
             # when we're called recursively, url will be the single
             # page url so we won't make another recursive call.
             singlefile = outfilename + ".single"
-            self.fetch_url(self.single_page_url, newdir, singlefile,
-                           title=title, footer=footer)
-            # If the fetch succeeded and we have a single-page file,
-            # replace the original file with it
-            # and remove the original.
-            if os.path.exists(singlefile) :
-                #os.rename(outfilename, outfilename + '.1')
-                os.remove(outfilename)
-                os.rename(singlefile, outfilename)
-                if self.config.getboolean(self.feedname, 'verbose') :
-                    print >>sys.stderr, "Removing", outfilename, \
-                        "and renaming", singlefile
+            try :
+                if verbose :
+                    print >>sys.stderr, \
+                        "Trying to fetch single-page url with referrer =", \
+                        response.geturl(), "instead of", url
+                self.fetch_url(self.single_page_url, newdir, singlefile,
+                               title=title, footer=footer,
+                               referrer=response.geturl())
+
+                # If the fetch succeeded and we have a single-page file,
+                # replace the original file with it
+                # and remove the original.
+                if os.path.exists(singlefile) :
+                    #os.rename(outfilename, outfilename + '.1')
+                    os.remove(outfilename)
+                    os.rename(singlefile, outfilename)
+                    if verbose :
+                        print >>sys.stderr, "Removing", outfilename, \
+                            "and renaming", singlefile
+                else :
+                    print >>sys.stderr, \
+                        "Tried to fetch single-page file but apparently failed"
+            except (IOError, urllib2.HTTPError) as e :
+                print >>sys.stderr, "Couldn't read single-page URL", \
+                    self.single_page_url
+                print >>sys.stderr, e
 
     def feed(self, html) :
         """Duplicate, in a half-assed way, HTMLParser.feed() but