Permalink
Browse files

Apply special-case hack for CNN.

  • Loading branch information...
1 parent ce88874 commit eda502e019031829ff2a2728170a709dfc7803a8 @arantius committed Aug 29, 2011
Showing with 4 additions and 0 deletions.
  1. +4 −0 util.py
View
@@ -36,6 +36,7 @@
EMBED_NAMES = set(('embed', 'object'))
IS_DEV_APPSERVER = 'Development' in os.environ.get('SERVER_SOFTWARE', '')
MAX_SCORE_DEPTH = 5
+RE_CNN_HACK = re.compile(r'<!-- with(out)? htc -->')
RE_DOCTYPE = re.compile(r'<!DOCTYPE.*?>', re.S)
RE_HTML_COMMENTS = re.compile(r'<!--.*?-->', re.S)
RE_SCRIPT_STYLE = re.compile(r'<(script|style)[^>]*>.*?</\1>\s*', re.S)
@@ -201,6 +202,9 @@ def ParseFeedAtUrl(url):
def PreCleanHtml(html):
+ # CNN improperly nests comments; this special-case hack strips its "<!-- with htc -->" / "<!-- without htc -->" markers before the generic comment removal below.
+ html = re.sub(RE_CNN_HACK, '', html)
+
html = re.sub(RE_HTML_COMMENTS, '', html)
html = re.sub(RE_DOCTYPE, '', html)
html = re.sub(RE_SCRIPT_STYLE, '', html)

0 comments on commit eda502e

Please sign in to comment.