Revert "Feed cleaning. Coerce everything to unicode."

This reverts commit af2b102.

Conflicts:

	clean.py
	main.py
	util.py
1 parent bd03b16 · commit f2d16b31cf8d2ce92095917b2d0227918a5b32f6 · @arantius committed Aug 31, 2010
Showing with 36 additions and 155 deletions.
  1. +1 −5 CREDITS
  2. +5 −41 clean.py
  3. +3 −3 extract_content.py
  4. +18 −1 extract_feed.py
  5. +8 −22 main.py
  6. +0 −52 templates/feed.xml
  7. +1 −13 templates/main.html
  8. +0 −18 util.py
CREDITS
@@ -1,7 +1,3 @@
-Partially derived from "hn.py", originally by Nirmal Patel.
+Includes "Hacker News" feed cleaning code, originally by Nirmal Patel.
http://nirmalpatel.com/fcgi/hn.py
Reused under terms of the GPLv3 license.
-
-Partially derived from "Readable Feeds" by Andrew Trusty.
-http://github.com/scyclops/Readable-Feeds
-Reused under terms of the GPLv3 license.
clean.py
@@ -27,10 +27,7 @@
import os
import re
-from google.appengine.ext import deferred
-
from third_party import BeautifulSoup
-from third_party import feedparser
import extract_content
import extract_feed
@@ -63,34 +60,7 @@
))
-def CleanFeed(feed_url, keep_contents):
- feed_source, _ = util.Fetch(feed_url)
- feed = feedparser.parse(feed_source)
-
- # Sort and limit maximum number of entries.
- feed.entries = sorted(feed.entries, key=lambda e: e.updated_parsed)[0:15]
-
- # Pre-clean (and cache) entries in parallel, via deferred.
- # (This will be slightly wasteful; we might double up calls here that
- # start but don't finish in deferred.)
- if not util.IS_DEV_APPSERVER:
- for entry in feed.entries:
- deferred.defer(_CleanUrlDeferred, entry.link)
-
- # For those left, clean up the contents.
- for entry in feed.entries:
- clean_content = CleanUrl(entry.link)
- if keep_contents:
- entry.content = u'%s<hr>%s' % (util.EntryContent(entry), clean_content)
- else:
- entry.content = clean_content
-
- return feed
-if not util.IS_DEV_APPSERVER:
- CleanFeed = util.Memoize('Clean_%s_%d', 1800)(CleanFeed)
-
-
-def CleanUrl(url):
+def Clean(url):
"""Clean the contents of a given URL to only the "readable part".
Handle special cases like YouTube, PDF, images directly. Delegate out to
@@ -131,15 +101,7 @@ def CleanUrl(url):
return note + Munge(content)
if not util.IS_DEV_APPSERVER:
- CleanUrl = util.Memoize('Clean_%s', 3600*24)(CleanUrl)
-
-
-def _CleanUrlDeferred(url):
- """Call CleanUrl() but catch any possible exception, to avoid retry loops."""
- try:
- CleanUrl(url)
- except:
- pass # pylint: disable-msg=W0702
+ Clean = util.Memoize('Clean_%s', 3600*24)(Clean) # pylint: disable-msg=C6409
def Munge(html):
@@ -178,4 +140,6 @@ def Munge(html):
for tag in soup.findAll(name='img', attrs={'src': RE_FEEDBURNER_LINK}):
tag.extract()
- return unicode(soup)
+ content = soup.renderContents()
+
+ return content
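
Note on the Memoize pattern above: the half-hour feed cache ('Clean_%s_%d', 1800) goes away with the feed path, leaving only the day-long page cache ('Clean_%s', 3600*24). util.Memoize's body is not part of this diff; judging from the Decorator/InnerDecorator context lines in the util.py hunk below, a minimal memcache-backed sketch might look like this (an assumption, not the repo's actual code):

    # Sketch only -- util.Memoize's body is not in this diff. Assumes a
    # memcache-backed decorator factory taking a key template and an
    # expiry in seconds, matching the call sites above.
    from google.appengine.api import memcache

    def Memoize(key_template, seconds=0):
      def Decorator(func):
        def InnerDecorator(*args, **kwargs):
          key = key_template % args  # e.g. 'Clean_%s' % (url,)
          value = memcache.get(key)
          if value is None:
            value = func(*args, **kwargs)
            # Cached values must be picklable and under memcache's ~1MB limit.
            memcache.set(key, value, time=seconds)
          return value
        return InnerDecorator
      return Decorator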
extract_content.py
@@ -115,7 +115,7 @@ def ExtractFromHtml(url, html):
soup = BeautifulSoup.BeautifulSoup(html)
except HTMLParser.HTMLParseError, e:
logging.exception(e)
- return u''
+ return ''
_Strip(soup, _UnwantedTagPre)
@@ -152,14 +152,14 @@ def ExtractFromHtml(url, html):
logging.debug('%10.2f %s', parent['score'], util.SoupTagOnly(parent))
if not top_parent:
- return u''
+ return ''
# Strip pieces with negative scores here?
_Strip(soup, _UnwantedTagPost)
_FixUrls(top_parent, url)
- return unicode(top_parent)
+ return top_parent.renderContents()
def _FixUrls(parent, base_url):
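
Note on the extract_content.py change: swapping unicode(top_parent) for top_parent.renderContents() (and u'' for '' to match) changes the return type. In BeautifulSoup 3, unicode(tag) serializes the tag itself to a unicode string, while renderContents() serializes only the tag's children, as UTF-8 bytes by default. A small illustration, assuming the standalone BS3 module rather than the repo's third_party copy:

    # Illustration only, assuming the standalone BeautifulSoup 3 API.
    from BeautifulSoup import BeautifulSoup

    soup = BeautifulSoup('<div><p>hello</p></div>')
    div = soup.find('div')
    print type(unicode(div))          # <type 'unicode'>; includes the <div> tag
    print type(div.renderContents())  # <type 'str'>; UTF-8 bytes
    print div.renderContents()        # <p>hello</p> -- children only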
extract_feed.py
@@ -98,7 +98,7 @@ def __init__(self, url=None, final_url=None, html=None):
self.feed = feedparser.parse(feed_source)
self._FindEntry()
- self.content = util.EntryContent(self.entry)
+ self.content = self._GetContent()
# Now, we've found content. Check if it's legit.
soup = BeautifulSoup.BeautifulSoup(self.content)
@@ -140,3 +140,20 @@ def _UrlsMatch(self, url1, url2, trim_query):
if trim_query:
url1 = TrimQuery(url1)
return url1 == url2
+
+ def _GetContent(self):
+ """Figure out the best content for this entry."""
+ # Prefer "content".
+ if 'content' in self.entry:
+ # If there's only one, use it.
+ if len(self.entry.content) == 1:
+ return self.entry.content[0]['value']
+ # Or, use the text/html type if there's more than one.
+ for content in self.entry.content:
+ if 'text/html' == content.type:
+ return content['value']
+ # Otherwise try "summary_detail" and "summary".
+ if 'summary_detail' in self.entry:
+ return self.entry.summary_detail['value']
+ if 'summary' in self.entry:
+ return self.entry.summary
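
Note: _GetContent re-inlines the preference order from the reverted util.EntryContent (see the util.py hunk below), minus the unicode() coercion: prefer the entry's single content element, or its text/html variant when there are several, then fall back to summary_detail and summary. A minimal feedparser illustration (the inline Atom snippet is invented for the example):

    # Invented single-entry Atom feed, for illustration only.
    import feedparser

    feed = feedparser.parse(
        '<feed xmlns="http://www.w3.org/2005/Atom"><entry>'
        '<title>t</title>'
        '<content type="html">&lt;p&gt;body&lt;/p&gt;</content>'
        '<summary>short</summary>'
        '</entry></feed>')
    entry = feed.entries[0]
    print entry.content[0]['value']  # u'<p>body</p>' -- preferred
    print entry.summary              # u'short' -- the fallback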
main.py
@@ -21,12 +21,10 @@
import logging
import os
-import urllib
from google.appengine.ext import webapp
from google.appengine.ext.webapp import template
from google.appengine.ext.webapp.util import run_wsgi_app
-from google.appengine import runtime
import clean
import util
@@ -46,32 +44,20 @@ def get(self):
class Clean(webapp.RequestHandler):
def get(self):
- feed = self.request.get('feed')
- if feed:
- try:
- keep_contents = self.request.get('keep_contents', 'False') == 'True'
- clean_feed = clean.CleanFeed(feed, keep_contents)
- output = util.RenderTemplate('feed.xml', clean_feed)
- except runtime.DeadlineExceededError:
- # If we run out of time, we've probably processed (and cached) at least
- # one item. Tell the client to redirect back here again, to resume
- # processing, and pick up after the cached items.
- self.response.clear()
- self.redirect('/clean?feed=' + urllib.quote(feed))
+ url = self.request.get('url') or self.request.get('link')
+ html_wrap = self.request.get('html_wrap', 'False') == 'True'
+ if url:
+ output = clean.Clean(url)
+ if html_wrap:
+ output = u'<html><body>\n%s\n</body></html>' % output
else:
- url = self.request.get('url') or self.request.get('link')
- html_wrap = self.request.get('html_wrap', 'False') == 'True'
- if url:
- output = clean.CleanUrl(url)
- if html_wrap:
- output = u'<html><body>\n%s\n</body></html>' % output
- else:
- output = 'Provide either "url" or "feed" parameters!'
+ output = 'Provide either "url" or "feed" parameters!'
self.response.headers['Content-Type'] = 'text/html; charset=UTF-8'
self.response.out.write(output)
+
def main():
application = webapp.WSGIApplication(
[('/', MainPage), ('/clean', Clean)],
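
Note: after the revert, /clean accepts only url (or its alias link) plus the optional html_wrap flag; the feed/keep_contents path and its DeadlineExceededError resume-by-redirect trick are gone. The restored error message still mentions "feed" even though that parameter is no longer handled. A hypothetical client call (the host and port assume a default dev_appserver, and are not part of this commit):

    # Hypothetical client call; host and port are assumptions
    # (dev_appserver defaults), not anything in this commit.
    import urllib

    params = urllib.urlencode({'url': 'http://example.com/article',
                               'html_wrap': 'True'})
    print urllib.urlopen('http://localhost:8080/clean?' + params).read()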
templates/feed.xml
@@ -1,52 +0,0 @@
-<?xml version="1.0" encoding="utf-8"?>
-<rss version="2.0">
-<channel>
-<title>{{ feed.title|escape }}</title>
-<link>{{ feed.link|escape }}</link>
-<description>{{ feed.subtitle|escape }}</description>
-{% for entry in entries %}
-<item>
-<title>{{ entry.title|escape }}</title>
-<link>{{ entry.link|escape }}</link>
-<pubDate>{{ entry.updated }}</pubDate>
-<description><![CDATA[{{ entry.content }}]]></description>
-</item>
-{% endfor %}
-</channel>
-</rss>
-
-<!--
-<item>
-
-<title>Star City</title>
-<link>http://liftoff.msfc.nasa.gov/news/2003/news-starcity.asp</link>
-<description>How do Americans get ready to work with Russians aboard the International Space Station? They take a crash course in culture, language and protocol at Russia's &lt;a href="http://howe.iki.rssi.ru/GCTC/gctc_e.htm"&gt;Star City&lt;/a&gt;.</description>
-<pubDate>Tue, 03 Jun 2003 09:39:21 GMT</pubDate>
-<guid>http://liftoff.msfc.nasa.gov/2003/06/03.html#item573</guid>
-
-</item>
-<item>
-<description>Sky watchers in Europe, Asia, and parts of Alaska and Canada will experience a &lt;a href="http://science.nasa.gov/headlines/y2003/30may_solareclipse.htm"&gt;partial eclipse of the Sun&lt;/a&gt; on Saturday, May 31st.</description>
-<pubDate>Fri, 30 May 2003 11:06:42 GMT</pubDate>
-<guid>http://liftoff.msfc.nasa.gov/2003/05/30.html#item572</guid>
-
-</item>
-<item>
-<title>The Engine That Does More</title>
-<link>http://liftoff.msfc.nasa.gov/news/2003/news-VASIMR.asp</link>
-<description>Before man travels to Mars, NASA hopes to design new engines that will let us fly through the Solar System more quickly. The proposed VASIMR engine would do that.</description>
-<pubDate>Tue, 27 May 2003 08:37:32 GMT</pubDate>
-<guid>http://liftoff.msfc.nasa.gov/2003/05/27.html#item571</guid>
-
-</item>
-<item>
-<title>Astronauts' Dirty Laundry</title>
-<link>http://liftoff.msfc.nasa.gov/news/2003/news-laundry.asp</link>
-<description>Compared to earlier spacecraft, the International Space Station has many luxuries, but laundry facilities are not one of them. Instead, astronauts have other options.</description>
-<pubDate>Tue, 20 May 2003 08:56:02 GMT</pubDate>
-<guid>http://liftoff.msfc.nasa.gov/2003/05/20.html#item570</guid>
-
-</item>
-</channel>
-</rss>
--->
templates/main.html
@@ -6,8 +6,7 @@
<meta http-equiv='Content-Type' content='text/html; charset=utf-8' />
</head>
<body>
-
-<p>Clean up a URL:</p>
+<p>Clean it up:</p>
<form method='get' action='/clean'>
URL: <input type='text' name='url' size='50'>
<input type='submit'><br>
@@ -16,16 +15,5 @@
Wrap contents in &lt;html&gt; and &lt;body&gt; tags
</label>
</form>
-
-<p>Clean up a feed:</p>
-<form method='get' action='/clean'>
-URL: <input type='text' name='feed' size='50'>
-<input type='submit'><br>
-<label>
- <input type='checkbox' name='keep_contents' value='True'>
- Keep original feed contents
-</label>
-</form>
-
</body>
</html>
util.py
@@ -46,24 +46,6 @@ def InnerDecorator(*args, **kwargs):
return Decorator
-def EntryContent(entry):
- """Figure out the best content for this (feedparser) entry."""
- # Prefer "content".
- if 'content' in entry:
- # If there's only one, use it.
- if len(entry.content) == 1:
- return unicode(entry.content[0]['value'])
- # Or, use the text/html type if there's more than one.
- for content in entry.content:
- if 'text/html' == content.type:
- return unicode(content['value'])
- # Otherwise try "summary_detail" and "summary".
- if 'summary_detail' in entry:
- return unicode(entry.summary_detail['value'])
- if 'summary' in entry:
- return unicode(entry.summary)
-
-
@Memoize('Fetch_%s')
def Fetch(url):
"""Fetch a URL, return its contents and any final-after-redirects URL."""
