Permalink
Browse files

For falls_between comparisons, get the last-modified date of the
cache file just once, at the start of the run. Otherwise, for all
but the first feed, the last-modified time will be only a few minutes
ago (the time at which the previous feed was written).

Also add some (hopefully temporary) debugging output regarding
falls_between, and clean up the output from the most common
"unknown url" message (common in the LA Times feed, anyway).
  • Loading branch information...
1 parent 7a0ff9a commit 546fd18ef74cca4b580a2176c7b369e1cd4a175b @akkana committed Sep 20, 2012
Showing with 38 additions and 12 deletions.
  1. +37 −9 feedme
  2. +1 −3 feedmeparser.py
View
46 feedme
@@ -249,7 +249,8 @@ def falls_between(when, time1, time2) :
If when == none, return True.
If when matches time2, return True.
"""
- if not when or type(len) is str and len(when) <= 0 :
+ print >>sys.stderr, "Does", when, "fall between", time1, "and", time2, "?"
+ if not when or type(when) is str and len(when) <= 0 :
return True
# We need both times both in seconds since epoch and in struct_time:
@@ -264,13 +265,20 @@ def falls_between(when, time1, time2) :
else : raise ValueError("%s not int or struct_time" % str(t))
(t1, st1) = both_time_types(time1)
+ print >>sys.stderr, "Time1:", t1, st1
(t2, st2) = both_time_types(time2)
+ print >>sys.stderr, "Time2:", t2, st2
daysdiff = (t2 - t1) / 60. / 60. / 24.
+ print >>sys.stderr, "daysdiff is", daysdiff
+ if daysdiff < 0 :
+ msglog.err("daysdiff < 0!!! " + str(daysdiff))
# Is it a day of the month?
try :
day_of_month = int(when)
+ print >>sys.stderr, "It's a day of the month:", day_of_month,
+ print >>sys.stderr, "compared to", st1.tm_mday, "and", st2.tm_mday
# It is a day of the month! How many days in between the two dates?
if daysdiff > 31 :
@@ -304,15 +312,19 @@ def falls_between(when, time1, time2) :
# Has more than a week passed? Then it encompasses all weekdays.
if daysdiff > 7 :
+ print >>sys.stderr, "More than a week has passed"
return True
day_of_week = weekdays.index(when)
- return (st2.tm_wday - day_of_week) % 7 < daysdiff
+ print >>sys.stderr, when, "is weekday #", day_of_week,
+ print >>sys.stderr, "compared to", st1.tm_wday, "and", st2.tm_wday
+ print >>sys.stderr, "Will return", (st2.tm_wday - day_of_week) % 7, '<', daysdiff
+ return (st2.tm_wday - day_of_week) % 7 < daysdiff
#
# Get a single feed
#
-def get_feed(feedname, config, cache, cachefile, msglog) :
+def get_feed(feedname, config, cache, cachefile, last_time, msglog) :
"""Fetch a single feed"""
# Mandatory arguments:
try :
@@ -338,9 +350,8 @@ def get_feed(feedname, config, cache, cachefile, msglog) :
cache file was written?
"""
when = config.get(feedname, "when")
- if when and when != '' :
- statbuf = os.stat(cachefile)
- if not falls_between(when, statbuf.st_mtime, time.localtime()) :
+ if when and when != '' and last_time :
+ if not falls_between(when, last_time, time.localtime()) :
print >>sys.stderr, "Skipping", feedname, "-- not", when
return
print >>sys.stderr, "Yes, it's time to feed:", when
@@ -544,12 +555,24 @@ Which (default = s): """)
print >>sys.stderr, "=============="
#raise # so this entry won't get stored or cached
continue # Move on to next story
+
+ except ValueError, e :
+ # urllib2 is supposed to throw a urllib2.URLError for
+ # "unknown url type", but in practice it throws a ValueError.
+ # See this most often for doubleclick ad links in the latimes
+ # that have no spec, e.g. //ad.doubleclick.net/...
+ errmsg = "Couldn't open url " + item.link + "\n"
+ errmsg += "Title: " + item.title.encode('utf-8', 'replace')
+ print >>sys.stderr, errmsg
+ msglog.err(errmsg)
+ continue
+
except Exception as e :
# An unknown error, so report it complete with traceback.
errmsg = "Unknown error reading " + item.link + "\n"
errmsg += "Title: " + item.title.encode('utf-8', 'replace')
if verbose :
- errmsg += "Item summary was:\n------\n"
+ errmsg += "\nItem summary was:\n------\n"
errmsg += item.summary + "\n------\n"
errmsg += str(e) + '<br>\n'
errmsg += str(sys.exc_info()[0]) + '<br>\n'
@@ -770,8 +793,13 @@ Copyright 2011 by Akkana Peck; share and enjoy under the GPL v2 or later."
if options.nocache :
cache = None
cachefile = None
+ last_time = None
else :
cache, cachefile = init_cache()
+ # Figure out the last time we ran feedme.
+ # We'll use this for feeds that only update at certain times.
+ statbuf = os.stat(cachefile)
+ last_time = statbuf.st_mtime
logfilename = config.get('DEFAULT', 'logfile')
if logfilename :
@@ -785,11 +813,11 @@ Copyright 2011 by Akkana Peck; share and enjoy under the GPL v2 or later."
try :
if len(args) == 0 :
for feedname in sections :
- get_feed(feedname, config, cache, cachefile, msglog)
+ get_feed(feedname, config, cache, cachefile, last_time, msglog)
else :
for arg in args :
print >>sys.stderr, 'Getting feed for', arg
- get_feed(arg, config, cache, cachefile, msglog)
+ get_feed(arg, config, cache, cachefile, last_time, msglog)
# This causes a lot of premature exits. Not sure why we end up
# here rather than in the inner KeyboardInterrupt section.
View
4 feedmeparser.py
@@ -85,8 +85,7 @@ def fetch_url(self, url, newdir, newname, title=None, author=None,
# text/something, that's bad.
ctype = response.headers['content-type']
if ctype and ctype != '' and ctype[0:4] != 'text' :
- #msglog.error(url + " isn't text -- skipping")
- print url + " isn't text -- skipping"
+ print >>sys.stderr, url, "isn't text -- skipping"
response.close()
return
@@ -207,7 +206,6 @@ def fetch_url(self, url, newdir, newname, title=None, author=None,
# Another way would be to use (.|\\n) in place of .
# For some reason [.\n] doesn't work.
#html = re.sub(skip, '', html, flags=re.DOTALL)
- else : print "no skip pats"
self.single_page_url = None

0 comments on commit 546fd18

Please sign in to comment.